├── Diagrams ├── PNG │ ├── spo-connector-aad-summary.PNG │ ├── HighLevelComponentArchitecture.png │ ├── spo-connector-aad-client_secret.PNG │ └── spo-connector-aad-Graph_API_permissions.PNG └── Visio │ └── HighLevelComponentArchitecture.vsdx ├── AzureSearch.SharepointOnline.Connector ├── SearchDefinitions │ ├── blobSynonymMap.json │ ├── blobIndexer.json │ ├── blobIndex.json │ └── blobSkillset.json ├── Properties │ └── launchSettings.json ├── Helpers │ ├── AuthHandler.cs │ ├── MsalAuthenticationProvider.cs │ ├── AzureSearchServiceHelper.cs │ ├── SearchIndexHelper.cs │ ├── ProtectedApiCallHelper.cs │ ├── AzureTableStorage.cs │ ├── AzureBLOBStorage.cs │ ├── SearchServiceHelper.cs │ └── SharePointOnlineHelper.cs ├── appSettings.json ├── AzureSearch.SharePointOnline.Connector.csproj └── Program.cs ├── AzureSearch.SharepointOnline.Connector.CustomSkills ├── appsettings.Development.json ├── Services │ ├── BlobStorageService.cs │ ├── ISharePointMetadataService.cs │ ├── SecretServices.cs │ ├── SharePointFileMetadata.cs │ └── SharePointMetadataService.cs ├── appsettings.json ├── Config │ ├── AppSettingsEnvironmentConfig.cs │ ├── ConnectionStringsConfig.cs │ ├── MappingConfig.cs │ └── EnvironmentConfig.cs ├── Fields │ ├── InputRecord.cs │ └── OutputRecord.cs ├── Dockerfile ├── Mapping │ ├── metadatatoindexmapping.json │ └── metadatatoindexmapping_old.json ├── Properties │ └── launchSettings.json ├── AzureSearch.SharepointOnline.Connector.CustomSkills.csproj ├── Program.cs ├── Startup.cs └── Controllers │ └── CustomSkillsController.cs ├── .dockerignore ├── LICENSE.md ├── AzureSearch.SharePointOnline.sln ├── .gitignore └── README.md /Diagrams/PNG/spo-connector-aad-summary.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anevjes/AzureSearch.SharePointOnline/HEAD/Diagrams/PNG/spo-connector-aad-summary.PNG -------------------------------------------------------------------------------- 
/Diagrams/PNG/HighLevelComponentArchitecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anevjes/AzureSearch.SharePointOnline/HEAD/Diagrams/PNG/HighLevelComponentArchitecture.png -------------------------------------------------------------------------------- /Diagrams/PNG/spo-connector-aad-client_secret.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anevjes/AzureSearch.SharePointOnline/HEAD/Diagrams/PNG/spo-connector-aad-client_secret.PNG -------------------------------------------------------------------------------- /Diagrams/Visio/HighLevelComponentArchitecture.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anevjes/AzureSearch.SharePointOnline/HEAD/Diagrams/Visio/HighLevelComponentArchitecture.vsdx -------------------------------------------------------------------------------- /Diagrams/PNG/spo-connector-aad-Graph_API_permissions.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anevjes/AzureSearch.SharePointOnline/HEAD/Diagrams/PNG/spo-connector-aad-Graph_API_permissions.PNG -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector/SearchDefinitions/blobSynonymMap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anevjes/AzureSearch.SharePointOnline/HEAD/AzureSearch.SharepointOnline.Connector/SearchDefinitions/blobSynonymMap.json -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector.CustomSkills/appsettings.Development.json: -------------------------------------------------------------------------------- 1 | { 2 | "Logging": { 3 | "LogLevel": { 4 | "Default": "Debug", 5 | 
"System": "Information", 6 | "Microsoft": "Information" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector/Properties/launchSettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "profiles": { 3 | "AzureSearch.SharePointConnector": { 4 | "commandName": "Project", 5 | "commandLineArgs": "-fullcrawl" 6 | }, 7 | "Docker": { 8 | "commandName": "Docker" 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | **/.classpath 2 | **/.dockerignore 3 | **/.env 4 | **/.git 5 | **/.gitignore 6 | **/.project 7 | **/.settings 8 | **/.toolstarget 9 | **/.vs 10 | **/.vscode 11 | **/*.*proj.user 12 | **/*.dbmdl 13 | **/*.jfm 14 | **/azds.yaml 15 | **/bin 16 | **/charts 17 | **/docker-compose* 18 | **/Dockerfile* 19 | **/node_modules 20 | **/npm-debug.log 21 | **/obj 22 | **/secrets.dev.yaml 23 | **/values.dev.yaml 24 | LICENSE 25 | README.md -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector.CustomSkills/Services/BlobStorageService.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Threading.Tasks; 5 | 6 | namespace BishopBlobCustomSkill.Services 7 | { 8 | public class BlobStorageService 9 | { 10 | public void ReadMetadata(string sasToken, string blobUrl) 11 | { 12 | var connectionString = $"BlobEndpoint{blobUrl}?{sasToken}"; 13 | 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector.CustomSkills/appsettings.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"ConnectionStrings": { 3 | "MetadataStorageConnectionString": "DefaultEndpointsProtocol=https;AccountName=YOUR_ACCOUNTNAME;AccountKey=YOUR_STORAGE_KEY;EndpointSuffix=core.windows.net" 4 | }, 5 | "EnvironmentConfig": { 6 | "ApiKey": "abcd", 7 | "MappingFile": "c:\\temp\\mapping.json" 8 | }, 9 | "Logging": { 10 | "LogLevel": { 11 | "Default": "Warning" 12 | } 13 | }, 14 | "AllowedHosts": "*" 15 | } 16 | -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector.CustomSkills/Config/AppSettingsEnvironmentConfig.cs: -------------------------------------------------------------------------------- 1 | //THIS CODE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. 2 | 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Linq; 6 | using System.Threading.Tasks; 7 | 8 | namespace AzureSearch.SharepointOnline.Connector.CustomSkills.Config 9 | { 10 | public class AppSettingsEnvironmentConfig : EnvironmentConfig 11 | { 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector.CustomSkills/Config/ConnectionStringsConfig.cs: -------------------------------------------------------------------------------- 1 | //THIS CODE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. 
2 | 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Linq; 6 | using System.Threading.Tasks; 7 | 8 | namespace BishopBlobCustomSkill.Config 9 | { 10 | public class ConnectionStringsConfig 11 | { 12 | public string MetadataStorageConnectionString { get; set; } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector.CustomSkills/Config/MappingConfig.cs: -------------------------------------------------------------------------------- 1 | //THIS CODE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. 2 | 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Linq; 6 | using System.Threading.Tasks; 7 | 8 | namespace AzureSearch.SharepointOnline.Connector.CustomSkills.Config 9 | { 10 | public class MappingConfig 11 | { 12 | public string MetadataOutputMappingFile { get; set; } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector.CustomSkills/Services/ISharePointMetadataService.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Threading.Tasks; 5 | 6 | namespace BishopBlobCustomSkill.Services 7 | { 8 | public interface ISharePointMetadataService 9 | { 10 | Task GetMetadata(Uri metadataUri); 11 | Dictionary MapMetadataToOutput(IDictionary metadata); 12 | 13 | Task> GetMetadataAsDictionary(Uri metadataUri); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector.CustomSkills/Config/EnvironmentConfig.cs: -------------------------------------------------------------------------------- 1 | //THIS CODE IS PROVIDED AS IS WITHOUT 
WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. 2 | 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Linq; 6 | using System.Threading.Tasks; 7 | 8 | namespace AzureSearch.SharepointOnline.Connector.CustomSkills.Config 9 | { 10 | public class EnvironmentConfig 11 | { 12 | public string MappingFile { get; set; } 13 | public string ApiKey { get; set; } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector.CustomSkills/Services/SecretServices.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.Azure.KeyVault; 2 | using Microsoft.EntityFrameworkCore.Metadata.Internal; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Linq; 6 | using System.Threading.Tasks; 7 | 8 | namespace BishopBlobCustomSkill.Services 9 | { 10 | public class SecretServices 11 | { 12 | private KeyVaultClient kvc; 13 | public SecretServices(String securityToken) 14 | { 15 | // kvc = new KeyVaultClient(new KeyVaultClient.AuthenticationCallback(securityToken)); 16 | //var kvc = new KeyVaultClient() 17 | 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector.CustomSkills/Fields/InputRecord.cs: -------------------------------------------------------------------------------- 1 | //THIS CODE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. 
# Multi-stage image for the AzureSearch.SharepointOnline.Connector.CustomSkills web API.
# NOTE(review): paths below assume the Docker build context is the solution root
# (the directory containing AzureSearch.SharePointOnline.sln) - confirm against the build command.

# Runtime-only base image used by the final stage.
FROM mcr.microsoft.com/dotnet/core/aspnet:2.2-stretch-slim AS base
WORKDIR /app
EXPOSE 80

# SDK image used to restore and build the project.
FROM mcr.microsoft.com/dotnet/core/sdk:2.2-stretch AS build
WORKDIR /src
# Copy only the project file first so the restore layer is cached until the csproj changes.
COPY ["AzureSearch.SharepointOnline.Connector.CustomSkills/AzureSearch.SharepointOnline.Connector.CustomSkills.csproj", "AzureSearch.SharepointOnline.Connector.CustomSkills/"]
RUN dotnet restore "AzureSearch.SharepointOnline.Connector.CustomSkills/AzureSearch.SharepointOnline.Connector.CustomSkills.csproj"
COPY . .
WORKDIR "/src/AzureSearch.SharepointOnline.Connector.CustomSkills"
RUN dotnet build "AzureSearch.SharepointOnline.Connector.CustomSkills.csproj" -c Release -o /app

FROM build AS publish
RUN dotnet publish "AzureSearch.SharepointOnline.Connector.CustomSkills.csproj" -c Release -o /app

# Final image: runtime base plus the published output.
FROM base AS final
WORKDIR /app
COPY --from=publish /app .
# The assembly name defaults to the project file name (no AssemblyName override is visible in the csproj).
ENTRYPOINT ["dotnet", "AzureSearch.SharepointOnline.Connector.CustomSkills.dll"]
/AzureSearch.SharepointOnline.Connector.CustomSkills/Services/SharePointFileMetadata.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Threading.Tasks; 6 | 7 | namespace BishopBlobCustomSkill.Services 8 | { 9 | public class SharePointFileMetadata 10 | { 11 | public string SPWebUrl { get; set; } = ""; 12 | public string CreatedAuthorDisplayName { get; set; } = ""; 13 | public string DocumentType { get; set; } = ""; 14 | public IList Region { get; set; } = new List(); 15 | public IList Country { get; set; } = new List(); 16 | public IList AustraliaState { get; set; } = new List(); 17 | public IList Asset { get; set; } = new List(); 18 | public string LinkFilename { get; set; } = ""; 19 | 20 | 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector.CustomSkills/Mapping/metadatatoindexmapping_old.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "", 3 | "outputMapping": [ 4 | { 5 | "metadataFieldName": "createdAuthorDisplayName", 6 | "outputFieldName": "createdAuthorDisplayName" 7 | }, 8 | { 9 | "metadataFieldName": "SPWebUrl", 10 | "outputFieldName": "SPWebUrl" 11 | }, 12 | { 13 | "metadataFieldName": "Documenttype", 14 | "outputFieldName": "documentType" 15 | }, 16 | { 17 | "metadataFieldName": "Region", 18 | "outputFieldName": "region" 19 | }, 20 | { 21 | "metadataFieldName": "Country", 22 | "outputFieldName": "country" 23 | }, 24 | { 25 | "metadataFieldName": "AustraliaState_x0028_ifapplicable_x0029_", 26 | "outputFieldName": "australiaState" 27 | }, 28 | { 29 | "metadataFieldName": "Asset", 30 | "outputFieldName": "asset" 31 | }, 32 | { 33 | "metadataFieldName": "LinkFilename", 34 | "outputFieldName": "linkFilename" 35 | } 36 | ] 37 | } 
namespace BishopBlobCustomSkill
{
    /// <summary>
    /// Entry point of the custom-skills web application.
    /// </summary>
    public class Program
    {
        /// <summary>
        /// Builds the web host from the command-line arguments and runs it.
        /// </summary>
        /// <param name="args">Command-line arguments forwarded to the host builder.</param>
        public static void Main(string[] args)
        {
            IWebHost host = CreateWebHostBuilder(args).Build();
            host.Run();
        }

        /// <summary>
        /// Creates the default web host builder, adds un-prefixed environment variables as an
        /// additional configuration source and registers <see cref="Startup"/> as the startup class.
        /// </summary>
        /// <param name="args">Command-line arguments passed to <c>WebHost.CreateDefaultBuilder</c>.</param>
        /// <returns>The configured, not yet built, web host builder.</returns>
        public static IWebHostBuilder CreateWebHostBuilder(string[] args)
        {
            IWebHostBuilder hostBuilder = WebHost.CreateDefaultBuilder(args);

            hostBuilder = hostBuilder.ConfigureAppConfiguration((context, configurationBuilder) =>
            {
                // An empty prefix means every environment variable is loaded.
                configurationBuilder.AddEnvironmentVariables("");
            });

            return hostBuilder.UseStartup<Startup>();
        }
    }
}
namespace AzureSearch.SharePointConnector
{
    /// <summary>
    /// A <see cref="DelegatingHandler"/> that lets an <see cref="IAuthenticationProvider"/>
    /// authenticate every outgoing request before it is forwarded to the inner handler.
    /// </summary>
    public class AuthHandler : DelegatingHandler
    {
        private readonly IAuthenticationProvider _authenticationProvider;

        /// <summary>
        /// Creates the handler.
        /// </summary>
        /// <param name="authenticationProvider">Provider invoked for each request before it is sent.</param>
        /// <param name="innerHandler">Handler that receives the request after authentication.</param>
        public AuthHandler(IAuthenticationProvider authenticationProvider, HttpMessageHandler innerHandler)
            : base(innerHandler)
        {
            _authenticationProvider = authenticationProvider;
        }

        /// <summary>
        /// Authenticates <paramref name="request"/> through the provider, then sends it down the pipeline.
        /// </summary>
        /// <param name="request">The outgoing HTTP request.</param>
        /// <param name="cancellationToken">Token forwarded to the inner handler.</param>
        /// <returns>The response produced by the inner handler.</returns>
        protected override async Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
        {
            Task authentication = _authenticationProvider.AuthenticateRequestAsync(request);
            await authentication;

            HttpResponseMessage response = await base.SendAsync(request, cancellationToken);
            return response;
        }
    }
}
2 | 3 | using System; 4 | using System.Collections; 5 | using System.Collections.Generic; 6 | using Newtonsoft.Json; 7 | 8 | namespace BishopBlobCustomSkill.Fields 9 | { 10 | class OutputRecord 11 | { 12 | public class OutputRecordData 13 | { 14 | [JsonProperty(PropertyName = "tags")] 15 | public string Tags { get; set; } = ""; 16 | public string ACLS { get; set; } = ""; 17 | public string SourceUrl { get; set; } = ""; 18 | public string CreatedAuthorDisplayName { get; set; } = ""; 19 | public string SPWebUrl { get; set; } = ""; 20 | public string DocumentType { get; set; } = ""; 21 | public IList Region { get; set; } = new List(); 22 | public IList Country { get; set; } = new List(); 23 | public IList AustraliaState { get; set; } = new List(); 24 | public IList Asset { get; set; } = new List(); 25 | public string LinkFilename { get; set; } = ""; 26 | 27 | } 28 | 29 | public class OutputRecordMessage 30 | { 31 | public string Message { get; set; } 32 | } 33 | 34 | public string RecordId { get; set; } 35 | public OutputRecordData Data { get; set; } 36 | public List Errors { get; set; } 37 | public List Warnings { get; set; } 38 | } 39 | } -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector/Helpers/MsalAuthenticationProvider.cs: -------------------------------------------------------------------------------- 1 | //THIS CODE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. 
namespace AzureSearch.SharePointConnector
{
    /// <summary>
    /// <see cref="IAuthenticationProvider"/> implementation that obtains an access token from an
    /// MSAL confidential client application and attaches it to outgoing requests.
    /// </summary>
    public class MsalAuthenticationProvider : IAuthenticationProvider
    {
        private readonly IConfidentialClientApplication _clientApplication;
        private readonly string[] _scopes;

        /// <summary>
        /// Creates the provider.
        /// </summary>
        /// <param name="clientApplication">MSAL confidential client used to acquire tokens.</param>
        /// <param name="scopes">Scopes requested for each token.</param>
        public MsalAuthenticationProvider(IConfidentialClientApplication clientApplication, string[] scopes)
        {
            _scopes = scopes;
            _clientApplication = clientApplication;
        }

        /// <summary>
        /// Sets the Authorization header of <paramref name="request"/> to a bearer token.
        /// </summary>
        /// <param name="request">The request to authenticate.</param>
        public async Task AuthenticateRequestAsync(HttpRequestMessage request)
        {
            string accessToken = await GetTokenAsync();
            var authorization = new AuthenticationHeaderValue("bearer", accessToken);
            request.Headers.Authorization = authorization;
        }

        /// <summary>
        /// Acquires a token for the client application itself using the configured scopes.
        /// </summary>
        /// <returns>The access token string of the authentication result.</returns>
        public async Task<string> GetTokenAsync()
        {
            var tokenRequest = _clientApplication.AcquireTokenForClient(_scopes);
            AuthenticationResult authentication = await tokenRequest.ExecuteAsync();
            return authentication.AccessToken;
        }
    }
}
2 | 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Linq; 6 | using System.Threading.Tasks; 7 | using AzureSearch.SharepointOnline.Connector.CustomSkills.Config; 8 | using BishopBlobCustomSkill.Config; 9 | using BishopBlobCustomSkill.Services; 10 | using Microsoft.AspNetCore.Builder; 11 | using Microsoft.AspNetCore.Hosting; 12 | using Microsoft.AspNetCore.Mvc; 13 | using Microsoft.Extensions.Configuration; 14 | using Microsoft.Extensions.DependencyInjection; 15 | using Microsoft.Extensions.Logging; 16 | using Microsoft.Extensions.Options; 17 | 18 | namespace BishopBlobCustomSkill 19 | { 20 | public class Startup 21 | { 22 | public Startup(IConfiguration configuration) 23 | { 24 | Configuration = configuration; 25 | } 26 | 27 | public IConfiguration Configuration { get; } 28 | 29 | // This method gets called by the runtime. Use this method to add services to the container. 30 | public void ConfigureServices(IServiceCollection services) 31 | { 32 | services.AddMvc().SetCompatibilityVersion(CompatibilityVersion.Version_2_2); 33 | 34 | services.Configure(this.Configuration.GetSection("ConnectionStrings")); 35 | services.Configure(this.Configuration.GetSection("EnvironmentConfig")); 36 | services.Configure(this.Configuration); 37 | services.AddSingleton(); 38 | } 39 | 40 | // This method gets called by the runtime. Use this method to configure the HTTP request pipeline. 
namespace AzureSearch.SharePointOnline.Connector.Helpers
{
    /// <summary>
    /// Helper around <see cref="SearchServiceClient"/> that removes single documents from a search index
    /// by posting a delete action to the service's documents endpoint.
    /// </summary>
    public class AzureSearchServiceHelper
    {
        // API version used for the raw documents request (unchanged from the original URL).
        private const string DocumentsApiVersion = "2017-11-11-Preview";

        private readonly SearchServiceClient client;

        /// <summary>
        /// Creates the helper for the given search service.
        /// </summary>
        /// <param name="searchServiceName">Name of the search service.</param>
        /// <param name="searchServiceAdminKey">Admin key used both for the SDK credentials and the raw HTTP calls.</param>
        public AzureSearchServiceHelper(string searchServiceName, string searchServiceAdminKey)
        {
            client = new SearchServiceClient(searchServiceName, new SearchCredentials(searchServiceAdminKey));

            // DeleteItemFromIndexAsync posts through client.HttpClient directly, so the key is added as a default header.
            client.HttpClient.DefaultRequestHeaders.Add("api-key", searchServiceAdminKey);
        }

        /// <summary>
        /// Deletes the document whose <c>id</c> field equals <paramref name="itemId"/> from <paramref name="indexName"/>.
        /// A non-success HTTP status is reported on the console; it is not thrown.
        /// </summary>
        /// <param name="itemName">Display name of the item, used only for console messages.</param>
        /// <param name="indexName">Name of the index to delete the document from.</param>
        /// <param name="itemId">Key value of the document; <c>null</c> is sent as an empty string.</param>
        public async Task DeleteItemFromIndexAsync(string itemName, string indexName, string itemId)
        {
            Console.WriteLine($"Deleting '{itemName}' item from index source...");

            // Look up the index that is actually targeted (previously hard-coded to "demo-index").
            // NOTE(review): presumably the SDK throws here when the index does not exist - confirm.
            await client.Indexes.GetAsync(indexName);

            // POST /indexes/[index name]/docs/index?api-version=[api-version]
            var uri = $"https://{client.SearchServiceName}.{client.SearchDnsSuffix}/indexes/{indexName}/docs/index?api-version={DocumentsApiVersion}";

            // Strict JSON with an escaped key value; assumes the index key field is named "id" (as in the original payload).
            var json = "{\"value\":[{\"@search.action\":\"delete\",\"id\":" + ToJsonString(itemId) + "}]}";

            using (var content = new StringContent(json, Encoding.UTF8, "application/json"))
            using (var response = await client.HttpClient.PostAsync(uri, content))
            {
                if (!response.IsSuccessStatusCode)
                {
                    // Report instead of throwing so that a failed delete does not abort the caller.
                    var details = await response.Content.ReadAsStringAsync();
                    Console.WriteLine($"Failed to delete '{itemName}' from index '{indexName}': {(int)response.StatusCode} {response.ReasonPhrase} {details}");
                }
            }
        }

        // Renders a string as a JSON string literal, escaping quotes, backslashes and control characters.
        private static string ToJsonString(string value)
        {
            var builder = new StringBuilder("\"");

            foreach (var ch in value ?? string.Empty)
            {
                if (ch == '"')
                {
                    builder.Append("\\\"");
                }
                else if (ch == '\\')
                {
                    builder.Append("\\\\");
                }
                else if (ch < ' ')
                {
                    builder.Append("\\u").Append(((int)ch).ToString("x4", System.Globalization.CultureInfo.InvariantCulture));
                }
                else
                {
                    builder.Append(ch);
                }
            }

            return builder.Append('"').ToString();
        }
    }
}
"spoIncrementalCrawlerTokens", 28 | "spoItemStorageTableName": "spoItems" 29 | }, 30 | "SPODetails": { 31 | "spoHostName": "YOURSPOHOST.sharepoint.com", 32 | "siteUrl": "/", 33 | "metadataJSONStore": true, 34 | "metadataFieldsToIgnore": [ 35 | "@odata.context", 36 | "@odata.id", 37 | "FileLeafRef", 38 | "@odata.etag", 39 | "LinkFilenameNoMenu", 40 | "DocIcon", 41 | "FolderChildCount", 42 | "_UIVersionString", 43 | "ParentVersionStringLookupId", 44 | "ParentLeafNameLookupId", 45 | "responseHeaders", 46 | "statusCode", 47 | "_ComplianceFlags", 48 | "_ComplianceTag", 49 | "_ComplianceTagWrittenTime", 50 | "_ComplianceTagUserId", 51 | "_CommentCount", 52 | "_LikeCount", 53 | "ItemChildCount", 54 | "Edit", 55 | "_CheckinComment" 56 | ], 57 | "docLibExclusions": [ 58 | ] 59 | } 60 | }, 61 | "Logging": { 62 | "key": "APP_INSIGHTS_KEY", 63 | "LogLevel": { 64 | "Default": "Warning" 65 | } 66 | } 67 | } -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector/Helpers/SearchIndexHelper.cs: -------------------------------------------------------------------------------- 1 | //THIS CODE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. 
namespace AzureSearch.SharePointOnline.Connector.Helpers
{
    /// <summary>
    /// Queries a search index and prints the returned documents to the console.
    /// </summary>
    public class SearchIndexHelper
    {
        private const string Separator = "===========================================================================";

        private readonly SearchIndexClient client;

        /// <summary>
        /// Creates a client bound to one index of the given search service.
        /// </summary>
        /// <param name="searchServiceName">Name of the search service.</param>
        /// <param name="searchServiceQueryKey">Key used as the client's credentials.</param>
        /// <param name="indexName">Name of the index to query.</param>
        public SearchIndexHelper(string searchServiceName, string searchServiceQueryKey, string indexName)
        {
            var credentials = new SearchCredentials(searchServiceQueryKey);
            client = new SearchIndexClient(searchServiceName, indexName, credentials);
        }

        /// <summary>
        /// Runs a match-all ("*") search restricted to the <paramref name="select"/> fields and writes
        /// every field of every result to the console. String values have newlines removed and are
        /// cut to 200 characters; string arrays are printed element by element.
        /// </summary>
        /// <param name="select">Field names to return for each document.</param>
        public async Task SearchIndexAsync(string[] select)
        {
            Console.WriteLine("Querying the index...");

            var parameters = new SearchParameters() { Select = select };
            var response = await client.Documents.SearchAsync(
                searchText: "*",
                searchParameters: parameters
            );

            Console.WriteLine("Results:");
            foreach (var hit in response.Results)
            {
                Console.WriteLine(Separator);
                foreach (string field in hit.Document.Keys)
                {
                    object value = hit.Document[field];
                    switch (value)
                    {
                        case string text:
                            string flattened = text.Replace("\n", "");
                            string shown = flattened.Length > 200
                                ? $"{flattened.Substring(0, 200)}..."
                                : flattened;
                            Console.WriteLine($" {field}: '{shown}'");
                            break;

                        case string[] texts:
                            // Same output as writing the label, each quoted element, then a newline.
                            string quoted = string.Concat(Array.ConvertAll(texts, t => $"'{t}' "));
                            Console.WriteLine($" {field}: " + quoted);
                            break;

                        default:
                            Console.WriteLine($" {field}: {value}");
                            break;
                    }
                }
            }

            Console.WriteLine(Separator + "\n");
        }
    }
}
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System;
using System.Linq;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Threading.Tasks;

namespace AzureSearch.SharePointConnector
{
    /// <summary>
    /// Issues bearer-authenticated GET requests against a protected Web API and hands the
    /// parsed JSON response to a caller-supplied callback.
    /// </summary>
    public class ProtectedApiCallHelper
    {
        /// <summary>
        /// Constructor
        /// </summary>
        /// <param name="httpClient">HttpClient used to call the protected API</param>
        public ProtectedApiCallHelper(HttpClient httpClient)
        {
            HttpClient = httpClient;
        }

        protected HttpClient HttpClient { get; private set; }


        /// <summary>
        /// Calls the protected Web API and processes the result.
        /// Does nothing when <paramref name="accessToken"/> is null or empty.
        /// </summary>
        /// <param name="webApiUrl">Url of the Web API to call (supposed to return Json)</param>
        /// <param name="accessToken">Access token used as a bearer security token to call the Web API</param>
        /// <param name="processResult">
        /// Callback used to process the result of the call to the Web API. It receives null when the
        /// response body is not a JSON object.
        /// </param>
        public async Task CallWebApiAndProcessResultASync(string webApiUrl, string accessToken, Action<JObject> processResult)
        {
            if (string.IsNullOrEmpty(accessToken))
            {
                return;
            }

            // The Accept and Authorization headers are set on the client's default headers,
            // so they also apply to later requests made through the same HttpClient.
            var defaultRequestHeaders = HttpClient.DefaultRequestHeaders;
            if (!defaultRequestHeaders.Accept.Any(m => m.MediaType == "application/json"))
            {
                defaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json"));
            }
            defaultRequestHeaders.Authorization = new AuthenticationHeaderValue("bearer", accessToken);

            try
            {
                // Dispose the response so its content stream and connection are released promptly.
                using (HttpResponseMessage response = await HttpClient.GetAsync(webApiUrl))
                {
                    if (response.IsSuccessStatusCode)
                    {
                        string json = await response.Content.ReadAsStringAsync();
                        JObject result = JsonConvert.DeserializeObject(json) as JObject;
                        Console.ForegroundColor = ConsoleColor.Gray;
                        processResult(result);
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Red;
                        Console.WriteLine($"Failed to call the Web Api: {response.StatusCode}");
                        string content = await response.Content.ReadAsStringAsync();

                        // Note that if you got response.Code == 403 and response.content.code == "Authorization_RequestDenied"
                        // this is because the tenant admin has not granted consent for the application to call the Web API
                        Console.WriteLine($"Content: {content}");
                    }
                }
            }
            finally
            {
                // Restore the console colour even when the request or the callback throws.
                Console.ResetColor();
            }
        }
    }
}
--------------------------------------------------------------------------------
/AzureSearch.SharePointOnline.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 16
4 | VisualStudioVersion = 16.0.29009.5
5 | MinimumVisualStudioVersion = 15.0.26124.0
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AzureSearch.SharePointOnline.Connector", "AzureSearch.SharepointOnline.Connector\AzureSearch.SharePointOnline.Connector.csproj", "{2B647F4C-CACD-4864-A886-553039C4E811}"
7 | EndProject
8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AzureSearch.SharepointOnline.Connector.CustomSkills", "AzureSearch.SharepointOnline.Connector.CustomSkills\AzureSearch.SharepointOnline.Connector.CustomSkills.csproj", "{B8BAD7A9-8835-45E6-84DF-8C0DAA9298C1}"
9 | EndProject
10 | Global
11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
12 | Debug|Any CPU = Debug|Any CPU
13 | Debug|x64 = Debug|x64
14 | Debug|x86 = Debug|x86
15 | Release|Any CPU = Release|Any CPU
16 | Release|x64 = Release|x64
17 | Release|x86 = Release|x86
18 | EndGlobalSection
19 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
20 | {2B647F4C-CACD-4864-A886-553039C4E811}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
21 | {2B647F4C-CACD-4864-A886-553039C4E811}.Debug|Any CPU.Build.0 = Debug|Any CPU
22 | {2B647F4C-CACD-4864-A886-553039C4E811}.Debug|x64.ActiveCfg = Debug|Any CPU
23 | {2B647F4C-CACD-4864-A886-553039C4E811}.Debug|x64.Build.0 = Debug|Any CPU
24 | {2B647F4C-CACD-4864-A886-553039C4E811}.Debug|x86.ActiveCfg = Debug|Any CPU
25 |
{2B647F4C-CACD-4864-A886-553039C4E811}.Debug|x86.Build.0 = Debug|Any CPU 26 | {2B647F4C-CACD-4864-A886-553039C4E811}.Release|Any CPU.ActiveCfg = Release|Any CPU 27 | {2B647F4C-CACD-4864-A886-553039C4E811}.Release|Any CPU.Build.0 = Release|Any CPU 28 | {2B647F4C-CACD-4864-A886-553039C4E811}.Release|x64.ActiveCfg = Release|Any CPU 29 | {2B647F4C-CACD-4864-A886-553039C4E811}.Release|x64.Build.0 = Release|Any CPU 30 | {2B647F4C-CACD-4864-A886-553039C4E811}.Release|x86.ActiveCfg = Release|Any CPU 31 | {2B647F4C-CACD-4864-A886-553039C4E811}.Release|x86.Build.0 = Release|Any CPU 32 | {B8BAD7A9-8835-45E6-84DF-8C0DAA9298C1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 33 | {B8BAD7A9-8835-45E6-84DF-8C0DAA9298C1}.Debug|Any CPU.Build.0 = Debug|Any CPU 34 | {B8BAD7A9-8835-45E6-84DF-8C0DAA9298C1}.Debug|x64.ActiveCfg = Debug|Any CPU 35 | {B8BAD7A9-8835-45E6-84DF-8C0DAA9298C1}.Debug|x64.Build.0 = Debug|Any CPU 36 | {B8BAD7A9-8835-45E6-84DF-8C0DAA9298C1}.Debug|x86.ActiveCfg = Debug|Any CPU 37 | {B8BAD7A9-8835-45E6-84DF-8C0DAA9298C1}.Debug|x86.Build.0 = Debug|Any CPU 38 | {B8BAD7A9-8835-45E6-84DF-8C0DAA9298C1}.Release|Any CPU.ActiveCfg = Release|Any CPU 39 | {B8BAD7A9-8835-45E6-84DF-8C0DAA9298C1}.Release|Any CPU.Build.0 = Release|Any CPU 40 | {B8BAD7A9-8835-45E6-84DF-8C0DAA9298C1}.Release|x64.ActiveCfg = Release|Any CPU 41 | {B8BAD7A9-8835-45E6-84DF-8C0DAA9298C1}.Release|x64.Build.0 = Release|Any CPU 42 | {B8BAD7A9-8835-45E6-84DF-8C0DAA9298C1}.Release|x86.ActiveCfg = Release|Any CPU 43 | {B8BAD7A9-8835-45E6-84DF-8C0DAA9298C1}.Release|x86.Build.0 = Release|Any CPU 44 | EndGlobalSection 45 | GlobalSection(SolutionProperties) = preSolution 46 | HideSolutionNode = FALSE 47 | EndGlobalSection 48 | GlobalSection(ExtensibilityGlobals) = postSolution 49 | SolutionGuid = {F78006A2-FBAE-4241-86DF-6E341777EFA1} 50 | EndGlobalSection 51 | EndGlobal 52 | -------------------------------------------------------------------------------- 
/AzureSearch.SharepointOnline.Connector/SearchDefinitions/blobIndexer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "[IndexerName]", 3 | "dataSourceName": "[DataSourceName]", 4 | "targetIndexName": "[IndexName]", 5 | "skillsetName": "[SkillSetName]", 6 | "fieldMappings": [ 7 | { 8 | "sourceFieldName": "metadata_storage_path", 9 | "targetFieldName": "id", 10 | "mappingFunction": { "name": "base64Encode" } 11 | }, 12 | { 13 | "sourceFieldName": "metadata_storage_path", 14 | "targetFieldName": "blobUri" 15 | }, 16 | { 17 | "sourceFieldName": "metadata_storage_name", 18 | "targetFieldName": "metadata_storage_name" 19 | }, 20 | { 21 | "sourceFieldName": "metadata_storage_sas_token", 22 | "targetFieldName": "metadata_storage_sas_token" 23 | }, 24 | { 25 | "sourceFieldName": "metadataurl", 26 | "targetFieldName": "metadataurl" 27 | } 28 | 29 | ], 30 | "outputFieldMappings": [ 31 | { 32 | "sourceFieldName": "/document/fullText", 33 | "targetFieldName": "fullText" 34 | }, 35 | { 36 | "sourceFieldName": "/document/languageCode", 37 | "targetFieldName": "languageCode" 38 | }, 39 | { 40 | "sourceFieldName": "/document/fullText/pages/*/keyPhrases/*", 41 | "targetFieldName": "keyPhrases" 42 | }, 43 | { 44 | "sourceFieldName": "/document/fullText/pages/*/organizations/*", 45 | "targetFieldName": "organizations" 46 | }, 47 | { 48 | "sourceFieldName": "/document/fullText/pages/*/locations/*", 49 | "targetFieldName": "locations" 50 | }, 51 | { 52 | "sourceFieldName": "/document/fullText/pages/*/people/*", 53 | "targetFieldName": "people" 54 | }, 55 | { 56 | "sourceFieldName": "/document/fullText/pages/*/dateTimes/*", 57 | "targetFieldName": "dateTimes" 58 | }, 59 | { 60 | "sourceFieldName": "/document/fullText/pages/*/typelessEntities/*/name", 61 | "targetFieldName": "typelessEntities" 62 | }, 63 | { 64 | "sourceFieldName": "/document/normalized_images/*/imageDescriptions/captions/*/text", 65 | "targetFieldName": 
"imageDescriptions" 66 | }, 67 | { 68 | "sourceFieldName": "/document/normalized_images/*/imageCategories/*/name", 69 | "targetFieldName": "imageCategories" 70 | }, 71 | { 72 | "sourceFieldName": "/document/normalized_images/*/imageTags/*/name", 73 | "targetFieldName": "imageTags" 74 | }, 75 | { 76 | "sourceFieldName": "/document/CreatedAuthorDisplayName", 77 | "targetFieldName": "CreatedAuthorDisplayName" 78 | }, 79 | { 80 | "sourceFieldName": "/document/SPWebUrl", 81 | "targetFieldName": "SPWebUrl" 82 | }, 83 | { 84 | "sourceFieldName": "/document/LinkFilename", 85 | "targetFieldName": "LinkFilename" 86 | }, 87 | { 88 | "sourceFieldName": "/document/ContentType", 89 | "targetFieldName": "ContentType" 90 | } 91 | ], 92 | "parameters": { 93 | "batchSize": 1, 94 | "maxFailedItems": -1, 95 | "maxFailedItemsPerBatch": -1, 96 | "configuration": { 97 | "dataToExtract": "contentAndMetadata", 98 | "imageAction": "generateNormalizedImages", 99 | "excludedFileNameExtensions": ".json,.js", 100 | "failOnUnsupportedContentType": false, 101 | "indexStorageMetadataOnlyForOversizedDocuments": true, 102 | "failOnUnprocessableDocument": false 103 | } 104 | } 105 | } -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector/SearchDefinitions/blobIndex.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "type": "Edm.String", 6 | "searchable": false, 7 | "filterable": false, 8 | "retrievable": true, 9 | "sortable": false, 10 | "facetable": false, 11 | "key": true 12 | }, 13 | { 14 | "name": "blobUri", 15 | "type": "Edm.String", 16 | "searchable": true, 17 | "filterable": false, 18 | "retrievable": true, 19 | "sortable": true, 20 | "facetable": false 21 | }, 22 | { 23 | "name": "fullText", 24 | "type": "Edm.String", 25 | "searchable": true, 26 | "filterable": false, 27 | "retrievable": true, 28 | "sortable": false, 29 | "facetable": false 
30 | }, 31 | { 32 | "name": "languageCode", 33 | "type": "Edm.String", 34 | "searchable": true, 35 | "filterable": true, 36 | "retrievable": true, 37 | "sortable": false, 38 | "facetable": true 39 | }, 40 | { 41 | "name": "keyPhrases", 42 | "type": "Collection(Edm.String)", 43 | "searchable": true, 44 | "filterable": false, 45 | "retrievable": true, 46 | "sortable": false, 47 | "facetable": false, 48 | "synonymMaps": [ 49 | "[SynonymMapName]" 50 | ] 51 | }, 52 | { 53 | "name": "organizations", 54 | "type": "Collection(Edm.String)", 55 | "searchable": true, 56 | "filterable": true, 57 | "retrievable": true, 58 | "sortable": false, 59 | "facetable": false 60 | }, 61 | { 62 | "name": "locations", 63 | "type": "Collection(Edm.String)", 64 | "searchable": true, 65 | "filterable": true, 66 | "retrievable": true, 67 | "sortable": false, 68 | "facetable": true 69 | }, 70 | { 71 | "name": "SPWebUrl", 72 | "type": "Edm.String", 73 | "searchable": true, 74 | "sortable": false, 75 | "filterable": false, 76 | "facetable": false, 77 | "retrievable": true 78 | }, 79 | { 80 | "name": "ContentType", 81 | "type": "Edm.String", 82 | "searchable": true, 83 | "sortable": false, 84 | "filterable": true, 85 | "facetable": true, 86 | "retrievable": true 87 | }, 88 | { 89 | "name": "CreatedAuthorDisplayName", 90 | "type": "Edm.String", 91 | "searchable": true, 92 | "sortable": true, 93 | "filterable": true, 94 | "facetable": true, 95 | "retrievable": true 96 | }, 97 | { 98 | "name": "LinkFilename", 99 | "type": "Edm.String", 100 | "searchable": true, 101 | "sortable": true, 102 | "filterable": true, 103 | "facetable": false, 104 | "retrievable": true 105 | }, 106 | { 107 | "name": "people", 108 | "type": "Collection(Edm.String)", 109 | "searchable": true, 110 | "filterable": true, 111 | "retrievable": true, 112 | "sortable": false, 113 | "facetable": false 114 | }, 115 | { 116 | "name": "dateTimes", 117 | "type": "Collection(Edm.String)", 118 | "searchable": false, 119 | "filterable": 
true, 120 | "retrievable": true, 121 | "sortable": false, 122 | "facetable": false 123 | }, 124 | { 125 | "name": "typelessEntities", 126 | "type": "Collection(Edm.String)", 127 | "searchable": false, 128 | "filterable": false, 129 | "retrievable": true, 130 | "sortable": false, 131 | "facetable": false 132 | }, 133 | { 134 | "name": "imageDescriptions", 135 | "type": "Collection(Edm.String)", 136 | "searchable": true, 137 | "filterable": false, 138 | "retrievable": true, 139 | "sortable": false, 140 | "facetable": false 141 | }, 142 | { 143 | "name": "imageCategories", 144 | "type": "Collection(Edm.String)", 145 | "searchable": true, 146 | "filterable": false, 147 | "retrievable": true, 148 | "sortable": false, 149 | "facetable": false 150 | }, 151 | { 152 | "name": "imageTags", 153 | "type": "Collection(Edm.String)", 154 | "searchable": true, 155 | "filterable": false, 156 | "retrievable": true, 157 | "sortable": false, 158 | "facetable": false 159 | }, 160 | { 161 | "name": "ocrPrintedText", 162 | "type": "Edm.String", 163 | "searchable": true, 164 | "sortable": true, 165 | "filterable": true, 166 | "facetable": false, 167 | "retrievable": true 168 | } 169 | ], 170 | "corsOptions": { 171 | "allowedOrigins": [ "*" ] 172 | }, 173 | "suggesters": [ 174 | { 175 | "name": "sg", 176 | "searchMode": "analyzingInfixMatching", 177 | "sourceFields": [ "keyPhrases", "organizations", "locations", "people" ] 178 | } 179 | ] 180 | } 181 | -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector/Helpers/AzureTableStorage.cs: -------------------------------------------------------------------------------- 1 | //THIS CODE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. 
using System;
using System.Collections.Generic;
using System.Text;
using Microsoft.Azure;
using Microsoft.WindowsAzure.Storage;
using Microsoft.WindowsAzure.Storage.Table;
using System.Threading.Tasks;
using WindowsAzure.ChronoTableStorage;

namespace AzureSearch.SharePointOnline.Connector.Helpers
{
    /// <summary>
    /// Table row that records one delta token for a document library.
    /// The partition key is the document library URL; the row key is a chronological key
    /// generated from the creation time.
    /// </summary>
    public class IndexCrawlEntity : TableEntity
    {
        public string DeltaToken { get; set; }
        public string UtcTime { get; set; }
        public IndexCrawlEntity() { }
        public IndexCrawlEntity(string documentLibraryUrl, string deltaToken)
        {
            DateTime timeNow = DateTime.UtcNow;
            this.PartitionKey = documentLibraryUrl;
            this.DeltaToken = deltaToken;
            this.RowKey = WindowsAzure.ChronoTableStorage.RowKey.CreateChronological(timeNow);
            // Invariant culture keeps the stored timestamp independent of the host's calendar settings.
            this.UtcTime = timeNow.ToString("yyyy-MM-dd HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture);
        }
    }


    /// <summary>
    /// Table row that maps a SharePoint item id (partition key) to the document library URL it
    /// belongs to. The row key is the Base64 encoding of that URL.
    /// </summary>
    public class SpoItem : TableEntity
    {
        public string DocumentLibraryUrl { get; set; }
        public string UtcTime { get; set; }
        public SpoItem() { }
        public SpoItem(string itemId, string documentLibraryUrl)
        {
            DateTime timeNow = DateTime.UtcNow;
            this.PartitionKey = itemId;
            this.DocumentLibraryUrl = documentLibraryUrl;
            // NOTE(review): standard Base64 output can contain '/', which Azure Table storage does not
            // accept in key values — confirm whether a URL-safe encoding is needed here.
            this.RowKey = Base64Encode(documentLibraryUrl);
            this.UtcTime = timeNow.ToString("yyyy-MM-dd HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture);
        }

        /// <summary>Returns the Base64 encoding of the UTF-8 bytes of <paramref name="plainText"/>.</summary>
        public static string Base64Encode(string plainText)
        {
            var plainTextBytes = System.Text.Encoding.UTF8.GetBytes(plainText);
            return System.Convert.ToBase64String(plainTextBytes);
        }
    }

    /// <summary>
    /// Thin wrapper around one Azure storage table used to persist crawl state
    /// (<see cref="IndexCrawlEntity"/>) and item-to-library mappings (<see cref="SpoItem"/>).
    /// </summary>
    class AzureTableStorage
    {
        public static string TableStorageConnectionString { get; set; }
        private CloudTableClient TableClient { get; set; }
        private CloudStorageAccount StorageAccount { get; set; }
        private CloudTable AzureSearchTable { get; set; }
        public string[] DeltaTokens { get; set; }

        /// <summary>
        /// Parses the connection string and obtains a reference to <paramref name="tableName"/>.
        /// Note that the connection string is also stored in the static
        /// <see cref="TableStorageConnectionString"/> property.
        /// </summary>
        public AzureTableStorage(string tableStorageConnectionString, string tableName)
        {
            TableStorageConnectionString = tableStorageConnectionString;
            StorageAccount = CloudStorageAccount.Parse(TableStorageConnectionString);
            TableClient = StorageAccount.CreateCloudTableClient();
            AzureSearchTable = TableClient.GetTableReference(tableName);
        }

        /// <summary>
        /// Looks up the delta token stored for a document library.
        /// </summary>
        /// <param name="documentLibraryUrl">Partition key of the <see cref="IndexCrawlEntity"/> rows to read.</param>
        /// <returns>The delta token found, or an empty string when the partition has no rows.</returns>
        public Task<string> GetEntitiesInPartion(string documentLibraryUrl)
        {
            return QueryPartitionAsync<IndexCrawlEntity>(documentLibraryUrl, entity => entity.DeltaToken);
        }

        /// <summary>
        /// Looks up the document library URL stored for a SharePoint item.
        /// </summary>
        /// <param name="itemId">Partition key of the <see cref="SpoItem"/> rows to read.</param>
        /// <returns>The document library URL found, or an empty string when the partition has no rows.</returns>
        public Task<string> GetSpoItemEntitiesInPartion(string itemId)
        {
            return QueryPartitionAsync<SpoItem>(itemId, entity => entity.DocumentLibraryUrl);
        }

        /// <summary>
        /// Inserts or replaces a crawl-state row. Await the returned task to be sure the write has
        /// completed and to observe any storage failure.
        /// </summary>
        public async Task InsertEntity(IndexCrawlEntity searchInfoEntity)
        {
            TableOperation insertOperation = TableOperation.InsertOrReplace(searchInfoEntity);
            await AzureSearchTable.ExecuteAsync(insertOperation);
        }

        /// <summary>
        /// Inserts or replaces an item-mapping row. Await the returned task to be sure the write has
        /// completed and to observe any storage failure.
        /// </summary>
        public async Task InsertSpoItemEntity(SpoItem spoItemEntity)
        {
            TableOperation insertOperation = TableOperation.InsertOrReplace(spoItemEntity);
            await AzureSearchTable.ExecuteAsync(insertOperation);
        }

        /// <summary>
        /// Runs a segmented query (take count 1) over every row whose PartitionKey equals
        /// <paramref name="partitionKey"/> and projects a string from the result.
        /// The loop keeps following continuation tokens, so when more than one segment comes back
        /// the returned value is taken from the first row of the last non-empty segment.
        /// </summary>
        private async Task<string> QueryPartitionAsync<T>(string partitionKey, Func<T, string> selector)
            where T : ITableEntity, new()
        {
            string filter = TableQuery.GenerateFilterCondition("PartitionKey", QueryComparisons.Equal, partitionKey);
            TableQuery<T> query = new TableQuery<T>().Where(filter).Take(1);

            string value = "";
            TableContinuationToken token = null;
            do
            {
                TableQuerySegment<T> resultSegment = await AzureSearchTable.ExecuteQuerySegmentedAsync(query, token);
                token = resultSegment.ContinuationToken;

                var results = resultSegment.Results;
                if (results.Count > 0)
                {
                    value = selector(results[0]);
                }
            } while (token != null);

            return value;
        }
    }
}
--------------------------------------------------------------------------------
/AzureSearch.SharepointOnline.Connector.CustomSkills/Services/SharePointMetadataService.cs:
--------------------------------------------------------------------------------
using AzureSearch.SharepointOnline.Connector.CustomSkills.Config;
using BishopBlobCustomSkill.Config;
using Microsoft.Azure.Storage;
using Microsoft.Azure.Storage.Blob;
using Microsoft.Extensions.Options;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Security.Cryptography.X509Certificates;
using System.Threading.Tasks;

namespace BishopBlobCustomSkill.Services
{
    /// <summary>
    /// Reads SharePoint metadata JSON blobs from blob storage and maps their fields to
    /// output field names according to a JSON mapping file.
    /// </summary>
    // NOTE(review): the generic type arguments in this class were missing from the extracted source
    // and have been restored from usage, parameter names and the Config folder listing —
    // confirm against ISharePointMetadataService and the Config classes.
    public class SharePointMetadataService : ISharePointMetadataService
    {
        private CloudStorageAccount storageAccount;
        private CloudBlobClient blobClient;
        private string endpoint;
        private readonly JObject metadataMapping;
        private IDictionary<string, string> metadataToFieldMapping = new Dictionary<string, string>();

        /// <summary>
        /// Creates the blob client from the configured connection string and loads the mapping file.
        /// The mapping file path is taken from <paramref name="environmentOptions"/>; when that is
        /// null or blank, the path from <paramref name="appSettingsEnvironmentOptions"/> is used.
        /// </summary>
        public SharePointMetadataService(IOptions<ConnectionStringsConfig> configOption,
            IOptions<AppSettingsEnvironmentConfig> appSettingsEnvironmentOptions,
            IOptions<EnvironmentConfig> environmentOptions)
        {
            storageAccount = CloudStorageAccount.Parse(configOption.Value.MetadataStorageConnectionString);
            blobClient = storageAccount.CreateCloudBlobClient();

            // Read mapping file and extract field mappings from metadata to output fields
            var mappingFilePath = environmentOptions.Value.MappingFile;
            if (string.IsNullOrWhiteSpace(mappingFilePath))
            {
                mappingFilePath = appSettingsEnvironmentOptions.Value.MappingFile;
            }

            metadataMapping = JObject.Parse(File.ReadAllText(mappingFilePath));
            metadataToFieldMapping = MappingToDictionary(metadataMapping);
        }

        /// <summary>
        /// Converts the "outputMapping" array of the mapping JSON into a dictionary keyed by
        /// "metadataFieldName" with "outputFieldName" as the value.
        /// </summary>
        private IDictionary<string, string> MappingToDictionary(JObject mappingJson)
        {
            var mapping = mappingJson["outputMapping"];

            var d = new Dictionary<string, string>();

            foreach (JObject m in mapping)
            {
                d.Add(m["metadataFieldName"].ToString(), m["outputFieldName"].ToString());
            }

            return d;
        }

        /// <summary>
        /// Downloads the metadata blob at <paramref name="metadataUri"/> and copies a fixed set of
        /// its properties into a <see cref="SharePointFileMetadata"/>.
        /// </summary>
        public async Task<SharePointFileMetadata> GetMetadata(Uri metadataUri)
        {
            CloudBlockBlob cbb = new CloudBlockBlob(metadataUri, blobClient);
            using (var ms = new MemoryStream())
            {
                await cbb.DownloadToStreamAsync(ms);
                var x = DeserializeFromStream(ms);
                var output = new SharePointFileMetadata();
                output.CreatedAuthorDisplayName = x.createdAuthorDisplayName;
                output.SPWebUrl = x.SPWebUrl;
                output.DocumentType = x.Documenttype;
                output.Region = JArrayToStringCollection(x.Region);
                output.Country = JArrayToStringCollection(x.Country);
                output.AustraliaState = JArrayToStringCollection(x.AustraliaState_x0028_ifapplicable_x0029_);
                output.Asset = JArrayToStringCollection(x.Asset);
                output.LinkFilename = x.LinkFilename;
                return output;
            }
        }

        /// <summary>
        /// Downloads the metadata blob at <paramref name="metadataUri"/> and returns its top-level
        /// JSON properties as a dictionary. Download and parse errors propagate to the caller.
        /// </summary>
        public async Task<IDictionary<string, object>> GetMetadataAsDictionary(Uri metadataUri)
        {
            CloudBlockBlob cbb = new CloudBlockBlob(metadataUri, blobClient);
            using (var ms = new MemoryStream())
            {
                await cbb.DownloadToStreamAsync(ms);
                return DeserializeDictionaryFromStream(ms);
            }
        }

        /// <summary>
        /// Builds the output dictionary: for every configured mapping whose metadata field is present
        /// in <paramref name="metadata"/>, the value is stored under the mapped output field name.
        /// </summary>
        public Dictionary<string, object> MapMetadataToOutput(IDictionary<string, object> metadata)
        {
            var outputDictionary = new Dictionary<string, object>();

            // Key, value pair is: (meta data field name, output field name)
            foreach (var outputMapping in metadataToFieldMapping)
            {
                if (metadata.TryGetValue(outputMapping.Key, out var value))
                {
                    outputDictionary.Add(outputMapping.Value, value);
                }
            }
            return outputDictionary;
        }

        /// <summary>
        /// Converts a JSON array to a list of strings. Returns an empty list when the array is null
        /// or cannot be converted (best effort: conversion errors are deliberately not propagated).
        /// </summary>
        private static List<string> JArrayToStringCollection(JArray arry)
        {
            if (arry == null)
            {
                return new List<string>();
            }

            try
            {
                return arry.ToObject<List<string>>();
            }
            catch (Exception)
            {
                return new List<string>();
            }
        }

        /// <summary>
        /// Rewinds <paramref name="stream"/> and deserializes its JSON content.
        /// The stream is closed when the reader is disposed.
        /// </summary>
        public static dynamic DeserializeFromStream(Stream stream)
        {
            stream.Position = 0;
            var serializer = new JsonSerializer();

            using (var sr = new StreamReader(stream))
            using (var jsonTextReader = new JsonTextReader(sr))
            {
                return serializer.Deserialize(jsonTextReader);
            }
        }

        /// <summary>
        /// Rewinds <paramref name="stream"/>, reads it fully as text and deserializes the JSON object
        /// into a dictionary. The stream is closed when the reader is disposed.
        /// </summary>
        public static IDictionary<string, object> DeserializeDictionaryFromStream(Stream stream)
        {
            stream.Position = 0;

            using (var sr = new StreamReader(stream))
            {
                var jsonString = sr.ReadToEnd();
                return JsonConvert.DeserializeObject<IDictionary<string, object>>(jsonString);
            }
        }
    }
}
--------------------------------------------------------------------------------
/AzureSearch.SharepointOnline.Connector/SearchDefinitions/blobSkillset.json:
--------------------------------------------------------------------------------
1 | {
2 | "description": "Extract OCR, detect language, extract entities, key-phrases, etc., with merge and splits",
3 | "skills": [
4 | {
5 | "@odata.type": "#Microsoft.Skills.Vision.ImageAnalysisSkill",
6 | "description": "Extract visual features from images",
7 | "defaultLanguageCode": "en",
8 | "visualFeatures": [ "Tags", "Description", "Categories" ],
9 | "context": "/document/normalized_images/*",
10 | "inputs": [
11 | {
12 | "name": "image",
13 | "source": "/document/normalized_images/*"
14 | }
15 | ],
16 | "outputs": [
17 | {
18 | "name": "tags",
19 | "targetName": "imageTags"
20 | },
21 | {
22 | "name": "description",
23 | "targetName": "imageDescriptions"
24 | },
25 | {
26 | "name": "categories",
27 | "targetName": "imageCategories"
28 | }
29 | ]
30 | },
31 | {
32 | "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
33 | "description": "Extract text (plain and structured) from image.",
34 | "textExtractionAlgorithm": "printed",
35 | "defaultLanguageCode": "en",
36 | "detectOrientation": true,
37 | "context": "/document/normalized_images/*",
38 | "inputs": [
39 | {
40 | "name": "image",
41 | "source": "/document/normalized_images/*"
42 | }
43 | ],
44 | "outputs": [
45 | {
46 | "name": "text",
47 | "targetName": "ocrPrintedText"
48 | },
49 | {
50 | "name": "layoutText",
51 | "targetName": "ocrPrintedLayoutText"
52 | }
53 | ]
54 | },
55 | {
56 | "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
57 | "description": "Merge text content with printed text of each image inserted at the right location in the content field",
58 | "context":
"/document", 59 | "inputs": [ 60 | { 61 | "name": "text", 62 | "source": "/document/content" 63 | }, 64 | { 65 | "name": "itemsToInsert", 66 | "source": "/document/normalized_images/*/ocrPrintedText" 67 | }, 68 | { 69 | "name": "offsets", 70 | "source": "/document/normalized_images/*/contentOffset" 71 | } 72 | ], 73 | "outputs": [ 74 | { 75 | "name": "mergedText", 76 | "targetName": "fullText" 77 | }, 78 | { 79 | "name": "mergedOffsets", 80 | "targetName": "fullTextOffsets" 81 | } 82 | ] 83 | }, 84 | { 85 | "@odata.type": "#Microsoft.Skills.Text.LanguageDetectionSkill", 86 | "description": "Detect language on text.", 87 | "context": "/document", 88 | "inputs": [ 89 | { 90 | "name": "text", 91 | "source": "/document/fullText" 92 | } 93 | ], 94 | "outputs": [ 95 | { 96 | "name": "languageCode", 97 | "targetName": "languageCode" 98 | } 99 | ] 100 | }, 101 | { 102 | "@odata.type": "#Microsoft.Skills.Text.SplitSkill", 103 | "description": "Split text into pages for subsequent skill processing.", 104 | "defaultLanguageCode": "en", 105 | "textSplitMode": "pages", 106 | "maximumPageLength": 4000, 107 | "context": "/document/fullText", 108 | "inputs": [ 109 | { 110 | "name": "text", 111 | "source": "/document/fullText" 112 | }, 113 | { 114 | "name": "languageCode", 115 | "source": "/document/languageCode" 116 | } 117 | ], 118 | "outputs": [ 119 | { 120 | "name": "textItems", 121 | "targetName": "pages" 122 | } 123 | ] 124 | }, 125 | { 126 | "@odata.type": "#Microsoft.Skills.Text.EntityRecognitionSkill", 127 | "description": "Extract entities from pages.", 128 | "categories": [ "Organization", "Location", "Person", "DateTime" ], 129 | "defaultLanguageCode": "en", 130 | "includeTypelessEntities": true, 131 | "context": "/document/fullText/pages/*", 132 | "inputs": [ 133 | { 134 | "name": "text", 135 | "source": "/document/fullText/pages/*" 136 | }, 137 | { 138 | "name": "languageCode", 139 | "source": "/document/languageCode" 140 | } 141 | ], 142 | "outputs": [ 143 | { 144 | 
"name": "organizations", 145 | "targetName": "organizations" 146 | }, 147 | { 148 | "name": "locations", 149 | "targetName": "locations" 150 | }, 151 | { 152 | "name": "persons", 153 | "targetName": "people" 154 | }, 155 | { 156 | "name": "dateTimes", 157 | "targetName": "dateTimes" 158 | }, 159 | { 160 | "name": "namedEntities", 161 | "targetName": "namedEntities" 162 | }, 163 | { 164 | "name": "entities", 165 | "targetName": "typelessEntities" 166 | } 167 | ] 168 | }, 169 | { 170 | "@odata.type": "#Microsoft.Skills.Text.KeyPhraseExtractionSkill", 171 | "description": "Extract key-phrases from pages.", 172 | "defaultLanguageCode": "en", 173 | "context": "/document/fullText/pages/*", 174 | "inputs": [ 175 | { 176 | "name": "text", 177 | "source": "/document/fullText/pages/*" 178 | }, 179 | { 180 | "name": "languageCode", 181 | "source": "/document/languageCode" 182 | } 183 | ], 184 | "outputs": [ 185 | { 186 | "name": "keyPhrases", 187 | "targetName": "keyPhrases" 188 | } 189 | ] 190 | }, 191 | { 192 | "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill", 193 | "description": "Our SharePoint Metadata mapping custom skill", 194 | "uri": "[CustomSpoMetadataSkillUri]", 195 | "batchSize": 1, 196 | "context": "/document", 197 | "httpHeaders": { 198 | "SPOMetadataMapper-Api-Key": "[SPOMetadataMapper-Api-Key]" 199 | }, 200 | "inputs": [ 201 | { 202 | "name": "docpath", 203 | "source": "/document/blobUri" 204 | }, 205 | { 206 | "name": "sastoken", 207 | "source": "/document/metadata_storage_sas_token" 208 | }, 209 | { 210 | "name": "metadataurl", 211 | "source": "/document/metadataurl" 212 | } 213 | 214 | ], 215 | "outputs": [ 216 | { 217 | "name": "tags", 218 | "targetName": "tags" 219 | }, 220 | { 221 | "name": "acls", 222 | "targetName": "acls" 223 | }, 224 | { 225 | "name": "createdAuthorDisplayName", 226 | "targetName": "CreatedAuthorDisplayName" 227 | }, 228 | { 229 | "name": "SPWebUrl", 230 | "targetName": "SPWebUrl" 231 | }, 232 | { 233 | "name": "LinkFilename", 
234 | "targetName": "LinkFilename" 235 | }, 236 | { 237 | "name": "ContentType", 238 | "targetName": "ContentType" 239 | } 240 | ] 241 | } 242 | ], 243 | "cognitiveServices": { 244 | "@odata.type": "#Microsoft.Azure.Search.CognitiveServicesByKey", 245 | "description": "[CognitiveServicesAccount]", 246 | "key": "[CognitiveServicesKey]" 247 | } 248 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | 
*.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 | *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # JustCode is a .NET coding add-in 131 | .JustCode 132 | 133 | # TeamCity is a build add-in 134 | _TeamCity* 135 | 136 | # DotCover is a Code Coverage Tool 137 | *.dotCover 138 | 139 | # AxoCover is a Code Coverage Tool 140 | .axoCover/* 141 | !.axoCover/settings.json 142 | 143 | # Visual Studio code coverage results 144 | *.coverage 145 | *.coveragexml 146 | 147 | # NCrunch 148 | _NCrunch_* 149 | .*crunch*.local.xml 150 | nCrunchTemp_* 151 | 152 | # MightyMoose 153 | *.mm.* 154 | AutoTest.Net/ 155 | 156 | # Web workbench (sass) 157 | .sass-cache/ 158 | 159 | # Installshield output folder 160 | [Ee]xpress/ 161 | 162 | # DocProject is a documentation generator add-in 163 | DocProject/buildhelp/ 164 | DocProject/Help/*.HxT 165 | DocProject/Help/*.HxC 166 | DocProject/Help/*.hhc 167 | DocProject/Help/*.hhk 168 | DocProject/Help/*.hhp 169 | DocProject/Help/Html2 170 | DocProject/Help/html 171 | 172 | # Click-Once directory 173 | publish/ 174 | 175 | # Publish Web Output 176 | *.[Pp]ublish.xml 177 | *.azurePubxml 178 | # Note: Comment the next line if you want to checkin your web deploy settings, 179 | # but database 
connection strings (with potential passwords) will be unencrypted 180 | *.pubxml 181 | *.publishproj 182 | 183 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 184 | # checkin your Azure Web App publish settings, but sensitive information contained 185 | # in these scripts will be unencrypted 186 | PublishScripts/ 187 | 188 | # NuGet Packages 189 | *.nupkg 190 | # NuGet Symbol Packages 191 | *.snupkg 192 | # The packages folder can be ignored because of Package Restore 193 | **/[Pp]ackages/* 194 | # except build/, which is used as an MSBuild target. 195 | !**/[Pp]ackages/build/ 196 | # Uncomment if necessary however generally it will be regenerated when needed 197 | #!**/[Pp]ackages/repositories.config 198 | # NuGet v3's project.json files produces more ignorable files 199 | *.nuget.props 200 | *.nuget.targets 201 | 202 | # Microsoft Azure Build Output 203 | csx/ 204 | *.build.csdef 205 | 206 | # Microsoft Azure Emulator 207 | ecf/ 208 | rcf/ 209 | 210 | # Windows Store app package directories and files 211 | AppPackages/ 212 | BundleArtifacts/ 213 | Package.StoreAssociation.xml 214 | _pkginfo.txt 215 | *.appx 216 | *.appxbundle 217 | *.appxupload 218 | 219 | # Visual Studio cache files 220 | # files ending in .cache can be ignored 221 | *.[Cc]ache 222 | # but keep track of directories ending in .cache 223 | !?*.[Cc]ache/ 224 | 225 | # Others 226 | ClientBin/ 227 | ~$* 228 | *~ 229 | *.dbmdl 230 | *.dbproj.schemaview 231 | *.jfm 232 | *.pfx 233 | *.publishsettings 234 | orleans.codegen.cs 235 | 236 | # Including strong name files can present a security risk 237 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 238 | #*.snk 239 | 240 | # Since there are multiple workflows, uncomment next line to ignore bower_components 241 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 242 | #bower_components/ 243 | 244 | # RIA/Silverlight projects 245 | Generated_Code/ 246 | 247 | # Backup & report 
files from converting an old project file 248 | # to a newer Visual Studio version. Backup files are not needed, 249 | # because we have git ;-) 250 | _UpgradeReport_Files/ 251 | Backup*/ 252 | UpgradeLog*.XML 253 | UpgradeLog*.htm 254 | ServiceFabricBackup/ 255 | *.rptproj.bak 256 | 257 | # SQL Server files 258 | *.mdf 259 | *.ldf 260 | *.ndf 261 | 262 | # Business Intelligence projects 263 | *.rdl.data 264 | *.bim.layout 265 | *.bim_*.settings 266 | *.rptproj.rsuser 267 | *- [Bb]ackup.rdl 268 | *- [Bb]ackup ([0-9]).rdl 269 | *- [Bb]ackup ([0-9][0-9]).rdl 270 | 271 | # Microsoft Fakes 272 | FakesAssemblies/ 273 | 274 | # GhostDoc plugin setting file 275 | *.GhostDoc.xml 276 | 277 | # Node.js Tools for Visual Studio 278 | .ntvs_analysis.dat 279 | node_modules/ 280 | 281 | # Visual Studio 6 build log 282 | *.plg 283 | 284 | # Visual Studio 6 workspace options file 285 | *.opt 286 | 287 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 288 | *.vbw 289 | 290 | # Visual Studio LightSwitch build output 291 | **/*.HTMLClient/GeneratedArtifacts 292 | **/*.DesktopClient/GeneratedArtifacts 293 | **/*.DesktopClient/ModelManifest.xml 294 | **/*.Server/GeneratedArtifacts 295 | **/*.Server/ModelManifest.xml 296 | _Pvt_Extensions 297 | 298 | # Paket dependency manager 299 | .paket/paket.exe 300 | paket-files/ 301 | 302 | # FAKE - F# Make 303 | .fake/ 304 | 305 | # CodeRush personal settings 306 | .cr/personal 307 | 308 | # Python Tools for Visual Studio (PTVS) 309 | __pycache__/ 310 | *.pyc 311 | 312 | # Cake - Uncomment if you are using it 313 | # tools/** 314 | # !tools/packages.config 315 | 316 | # Tabs Studio 317 | *.tss 318 | 319 | # Telerik's JustMock configuration file 320 | *.jmconfig 321 | 322 | # BizTalk build output 323 | *.btp.cs 324 | *.btm.cs 325 | *.odx.cs 326 | *.xsd.cs 327 | 328 | # OpenCover UI analysis results 329 | OpenCover/ 330 | 331 | # Azure Stream Analytics local run output 332 | ASALocalRun/ 333 | 334 | # MSBuild 
Binary and Structured Log 335 | *.binlog 336 | 337 | # NVidia Nsight GPU debugger configuration file 338 | *.nvuser 339 | 340 | # MFractors (Xamarin productivity tool) working folder 341 | .mfractor/ 342 | 343 | # Local History for Visual Studio 344 | .localhistory/ 345 | 346 | # BeatPulse healthcheck temp database 347 | healthchecksdb 348 | 349 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 350 | MigrationBackup/ 351 | 352 | # Ionide (cross platform F# VS Code tools) working folder 353 | .ionide/ -------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector/Helpers/AzureBLOBStorage.cs: --------------------------------------------------------------------------------
//THIS CODE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.

using Microsoft.Graph;
using Microsoft.WindowsAzure.Storage.Blob;
using System;
using System.Collections.Generic;
using System.IO;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using System.Threading;

namespace AzureSearch.SharePointOnline.Connector.Helpers
{
    /// <summary>
    /// Static helpers that fetch files through the Graph client's HTTP provider and write them
    /// to Azure Blob Storage (or a local folder), plus blob/container create and delete helpers.
    /// </summary>
    class AzureBLOBStorage
    {
        /// <summary>Number of files the download helpers in this class have written so far.</summary>
        public static int DownloadFileCount;

        // Maximum number of retries and the pause between them for CreateAzureBLOBContainer.
        private static int _retryCount = 5;
        private static TimeSpan _delay = TimeSpan.FromSeconds(15);

        // Maximum number of retries DownloadSPOFile performs after the first failed attempt.
        private static int spoDownloadErrorRetryCount = 5;

        /// <summary>
        /// Downloads <paramref name="downloadUrl"/> and writes it to <c>C:\Temp\&lt;fileName&gt;</c>.
        /// </summary>
        /// <param name="graphClient">Graph client whose HTTP provider sends the request.</param>
        /// <param name="downloadUrl">Download URL; must be a <see cref="string"/> at runtime.</param>
        /// <param name="fileName">File name appended to the local target folder.</param>
        static async Task DownloadFileLocal(GraphServiceClient graphClient, object downloadUrl, string fileName)
        {
            // System.IO.File is fully qualified because Microsoft.Graph also declares a File type.
            using (FileStream fileStream = System.IO.File.Create((@"C:\Temp\" + fileName)))
            using (var request = new HttpRequestMessage(HttpMethod.Get, (string)downloadUrl))
            using (HttpResponseMessage response = await graphClient.HttpProvider.SendAsync(request))
            using (var responseStream = await response.Content.ReadAsStreamAsync())
            {
                await responseStream.CopyToAsync(fileStream);
                Console.WriteLine("file {0} written to BLOB", fileName);
                DownloadFileCount++;
            }
        }

        /// <summary>
        /// Downloads a file and uploads it to <paramref name="container"/> as <paramref name="fileName"/>,
        /// tagging the blob with a <c>Metadataurl</c> metadata entry.
        /// </summary>
        /// <param name="graphClient">Graph client whose HTTP provider sends the request.</param>
        /// <param name="downloadUrl">Download URL; must be a <see cref="string"/> at runtime.</param>
        /// <param name="fileName">Name of the target block blob.</param>
        /// <param name="container">Container that receives the blob.</param>
        /// <param name="storageUploadUri">Value stored in the blob's <c>Metadataurl</c> metadata.</param>
        public static async Task DownloadFileToAzureBLOB(GraphServiceClient graphClient, object downloadUrl, string fileName, CloudBlobContainer container, string storageUploadUri)
        {
            var blockBlob = container.GetBlockBlobReference(fileName);
            await DownloadSPOFile(graphClient, downloadUrl, fileName, blockBlob, storageUploadUri);
        }

        /// <summary>
        /// Downloads <paramref name="downloadUrl"/> into <paramref name="blockBlob"/>, retrying up to
        /// <c>spoDownloadErrorRetryCount</c> times when an attempt throws. Like the original
        /// implementation this never throws: after the last retry the failure is only logged.
        /// </summary>
        static async Task DownloadSPOFile(GraphServiceClient graphClient, object downloadUrl, string fileName, CloudBlockBlob blockBlob, string storageUploadUri)
        {
            // The attempt counter is local so that concurrent or nested downloads cannot
            // corrupt each other's retry budget (the previous version shared a static counter).
            for (int attempt = 0; attempt <= spoDownloadErrorRetryCount; attempt++)
            {
                try
                {
                    // A request message cannot be re-sent, so a fresh one is built per attempt.
                    using (var request = new HttpRequestMessage(HttpMethod.Get, (string)downloadUrl))
                    using (HttpResponseMessage response = await graphClient.HttpProvider.SendAsync(request))
                    {
                        if (!response.IsSuccessStatusCode)
                        {
                            // Previously an empty else-branch: the file was skipped without a trace.
                            Console.WriteLine("Download of file {0} returned status {1}; file skipped", fileName, (int)response.StatusCode);
                            return;
                        }

                        using (var responseStream = await response.Content.ReadAsStreamAsync())
                        {
                            // Indexer instead of Add: on a retry the key is already present and
                            // Add would throw ArgumentException, burning every remaining retry.
                            blockBlob.Metadata["Metadataurl"] = storageUploadUri;

                            await blockBlob.UploadFromStreamAsync(responseStream);
                            Console.WriteLine("file {0} written to Azure BLOB", fileName);
                            DownloadFileCount++;
                        }
                    }

                    return;
                }
                catch (Exception e)
                {
                    if (attempt >= spoDownloadErrorRetryCount)
                    {
                        Console.WriteLine("Giving up downloading file {0}: {1}", downloadUrl, e.Message);
                        return;
                    }

                    Console.WriteLine("Retry count [{0}] downloading file {1}", attempt + 1, downloadUrl);
                }
            }
        }

        /// <summary>
        /// Downloads a file and uploads it to <paramref name="container"/> without any metadata.
        /// Exceptions from the download or the upload propagate to the caller.
        /// </summary>
        static async Task DownloadFileToAzureBLOB(GraphServiceClient graphClient, object downloadUrl, string fileName, CloudBlobContainer container)
        {
            var blockBlob = container.GetBlockBlobReference(fileName);

            using (var request = new HttpRequestMessage(HttpMethod.Get, (string)downloadUrl))
            using (HttpResponseMessage response = await graphClient.HttpProvider.SendAsync(request))
            using (var responseStream = await response.Content.ReadAsStreamAsync())
            {
                await blockBlob.UploadFromStreamAsync(responseStream);
                Console.WriteLine("file {0} written to Azure BLOB", fileName);
                DownloadFileCount++;
            }
        }

        /// <summary>
        /// Downloads a file and uploads it to <paramref name="container"/>, copying every entry of
        /// <paramref name="metadata"/> onto the blob. Failures are logged and swallowed.
        /// </summary>
        /// <param name="metadata">
        /// Key/value pairs stored as blob metadata (values are converted with <c>ToString()</c>).
        /// NOTE(review): the generic arguments were lost in the extracted source; restored as
        /// <c>string</c>/<c>object</c> because the values are stringified - confirm against callers.
        /// </param>
        public static async Task DownloadFileToAzureBLOB(GraphServiceClient graphClient, object downloadUrl, string fileName, CloudBlobContainer container, IDictionary<string, object> metadata)
        {
            var blockBlob = container.GetBlockBlobReference(fileName);

            try
            {
                using (var request = new HttpRequestMessage(HttpMethod.Get, (string)downloadUrl))
                using (HttpResponseMessage response = await graphClient.HttpProvider.SendAsync(request))
                using (var responseStream = await response.Content.ReadAsStreamAsync())
                {
                    if (metadata != null)
                    {
                        foreach (var meta in metadata)
                        {
                            // A null value used to raise NullReferenceException here and abort
                            // the whole download; such entries are now skipped instead.
                            if (meta.Value == null)
                            {
                                continue;
                            }

                            blockBlob.Metadata[meta.Key] = meta.Value.ToString();
                        }
                    }

                    await blockBlob.UploadFromStreamAsync(responseStream);
                    Console.WriteLine("file {0} written to Azure BLOB", fileName);
                    DownloadFileCount++;
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("Error Downloading File: " + e.Message);
            }
        }

        /// <summary>Uploads <paramref name="contents"/> as a block blob named <paramref name="fileName"/>.</summary>
        /// <returns>The primary URI of the uploaded blob.</returns>
        public static async Task<string> UploadFileToAzureBLOB(Stream contents, string fileName, CloudBlobContainer container)
        {
            var blockBlob = container.GetBlockBlobReference(fileName);
            await blockBlob.UploadFromStreamAsync(contents);
            Console.WriteLine("file {0} written to Azure BLOB", fileName);
            return blockBlob.StorageUri.PrimaryUri.ToString();
        }

        /// <summary>Deletes the block blob named <paramref name="fileName"/> from <paramref name="container"/>.</summary>
        /// <returns>The primary URI of the deleted blob.</returns>
        public static async Task<string> DeleteFileFromAzureBLOB(string fileName, CloudBlobContainer container)
        {
            var blockBlob = container.GetBlockBlobReference(fileName);
            Console.WriteLine("Removing fileName [" + fileName + "]");
            await blockBlob.DeleteAsync();
            Console.WriteLine("file {0} deleted from Azure BLOB", fileName);
            return blockBlob.StorageUri.PrimaryUri.ToString();
        }

        /// <summary>
        /// Creates the container if it does not exist, waiting <c>_delay</c> and retrying up to
        /// <c>_retryCount</c> times while the service rejects the request (for example because a
        /// container of the same name is still being deleted).
        /// </summary>
        /// <returns>A reference to the container.</returns>
        /// <remarks>
        /// The previous version retried by unbounded recursion (the limit was only checked after
        /// the recursive call returned) and silently swallowed every other exception. Once the
        /// retries are exhausted, or for an exception with a different HResult, the exception now
        /// propagates to the caller.
        /// </remarks>
        public static async Task<CloudBlobContainer> CreateAzureBLOBContainer(CloudBlobClient storageClient, string containerName)
        {
            Console.WriteLine("Creating container [" + containerName + "]");
            var container = storageClient.GetContainerReference(containerName);

            for (int attempt = 0; ; attempt++)
            {
                try
                {
                    await container.CreateIfNotExistsAsync();
                    return container;
                }
                // -2146233088 is the HResult the original code treated as "container is being deleted".
                catch (Exception err) when (err.HResult.Equals(-2146233088) && attempt < _retryCount)
                {
                    Console.WriteLine("The specified container is being deleted. Will try again");
                    await Task.Delay(_delay);
                }
            }
        }

        /// <summary>Deletes <paramref name="container"/> from the storage account.</summary>
        public static async Task DeleteContainerFromAzureBLOB(CloudBlobContainer container)
        {
            // Log the container's name (not the object's type name) and do so before deleting.
            Console.WriteLine("Removing container [" + container.Name + "]");
            await container.DeleteAsync();
        }
    }
}
-------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector.CustomSkills/Controllers/CustomSkillsController.cs: -------------------------------------------------------------------------------- 1 | //THIS CODE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using AzureSearch.SharepointOnline.Connector.CustomSkills.Config;
using BishopBlobCustomSkill.Fields;
using BishopBlobCustomSkill.Services;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Extensions.Options;
using Newtonsoft.Json;

// NOTE(review): generic type arguments were stripped from the extracted source (e.g. "List Values",
// "IOptions environmentOptions", "Task> Merge..."). They are restored below from how each member is
// used; the ones marked "confirm" could not be derived with certainty and must be checked.
namespace BishopBlobCustomSkill.Controllers
{
    /// <summary>
    /// Class structured to deserialize input format: https://docs.microsoft.com/en-us/azure/search/cognitive-search-custom-skill-interface
    /// </summary>
    class CustomSkillApiRequest
    {
        // Element type presumed to be Fields.InputRecord (RecordId + Data.Metadataurl are read) - confirm.
        public List<InputRecord> Values { get; set; }
    }

    /// <summary>Response envelope of the v2 endpoint: one <see cref="OutputRecord"/> per input record.</summary>
    class WebApiResponse
    {
        public List<OutputRecord> Values { get; set; }
    }

    /// <summary>Response envelope of the dictionary-based endpoint.</summary>
    class CustomSkillApiResponse
    {
        public List<CustomSkillOutputRecord> Values { get; set; }
    }

    /// <summary>One output record whose data is a free-form field dictionary.</summary>
    class CustomSkillOutputRecord
    {
        public class OutputRecordMessage
        {
            public string Message { get; set; }
        }

        public string RecordId { get; set; }

        // Receives the result of ISharePointMetadataService.MapMetadataToOutput; key/value types
        // presumed string/object - confirm against the service interface.
        public Dictionary<string, object> Data { get; set; }
        public List<OutputRecordMessage> Errors { get; set; }
        public List<OutputRecordMessage> Warnings { get; set; }
    }

    /// <summary>
    /// Custom Web API skill endpoints that look up SharePoint metadata for each record of a
    /// skill request and return it in the custom-skill response format.
    /// </summary>
    [Route("api/[controller]")]
    [ApiController]
    public class CustomSkillsController : ControllerBase
    {
        private const string ApiKeyHeader = "SPOMetadataMapper-Api-Key";

        private readonly string ApiKey;
        private readonly ISharePointMetadataService svc;

        // Options type presumed to be Config.EnvironmentConfig (only .Value.ApiKey is read) - confirm.
        public CustomSkillsController(ISharePointMetadataService svc, IOptions<EnvironmentConfig> environmentOptions)
        {
            this.svc = svc;
            this.ApiKey = environmentOptions.Value.ApiKey;
        }

        /// <summary>
        /// Reads the skill request from the body and returns, per record, a fixed set of
        /// SharePoint metadata fields (SPWebUrl, CreatedAuthorDisplayName, LinkFilename).
        /// </summary>
        /// <returns>
        /// 200 with a <see cref="WebApiResponse"/>, or 400 when the body does not match the schema.
        /// NOTE(review): the original generic return type could not be recovered; IActionResult
        /// yields the same HTTP responses - confirm. Unlike MergeSharePointMetadata this endpoint
        /// performs no API-key check (unchanged from the original).
        /// </returns>
        [HttpGet]
        [HttpPost]
        [Route("MergeSharePointMetadatav2")]
        public async Task<IActionResult> MergeSharePointMetadatav2()
        {
            System.Diagnostics.Trace.WriteLine("Starting call");

            var response = new WebApiResponse()
            {
                Values = new List<OutputRecord>()
            };

            var data = await ReadRequestAsync();

            // Do some schema validation
            if (data == null)
            {
                return new BadRequestObjectResult("The request schema does not match expected schema.");
            }
            if (data.Values == null)
            {
                return new BadRequestObjectResult("The request schema does not match expected schema. Could not find values array.");
            }

            // Calculate the response for each value.
            foreach (var record in data.Values)
            {
                if (record == null || record.RecordId == null)
                {
                    continue;
                }

                OutputRecord responseRecord = new OutputRecord
                {
                    RecordId = record.RecordId,
                };

                try
                {
                    TraceMetadataUrl(record.Data.Metadataurl);
                    responseRecord.Data = new OutputRecord.OutputRecordData();
                    var metadata = await svc.GetMetadata(new Uri(record.Data.Metadataurl));

                    responseRecord.Data.ACLS = "";
                    responseRecord.Data.SPWebUrl = metadata.SPWebUrl;
                    responseRecord.Data.CreatedAuthorDisplayName = metadata.CreatedAuthorDisplayName;
                    responseRecord.Data.LinkFilename = metadata.LinkFilename;
                }
                catch (Exception e)
                {
                    // Report the failure on this record only; the remaining records are still processed.
                    TraceFailure(e);
                    responseRecord.Errors = new List<OutputRecord.OutputRecordMessage>
                    {
                        new OutputRecord.OutputRecordMessage { Message = DescribeFailure(e) }
                    };
                }
                finally
                {
                    response.Values.Add(responseRecord);
                }
            }

            return new OkObjectResult(response);
        }

        /// <summary>
        /// Validates the API-key header, reads the skill request from the body and returns, per
        /// record, the SharePoint metadata mapped through the metadata service.
        /// </summary>
        /// <returns>
        /// 200 with a <see cref="CustomSkillApiResponse"/>; 400 when the API key is wrong or the
        /// body does not match the schema.
        /// </returns>
        [HttpGet]
        [HttpPost]
        [Route("MergeSharePointMetadata")]
        public async Task<IActionResult> MergeSharePointMetadata()
        {
            System.Diagnostics.Trace.WriteLine("Starting call");

            // Check API Key.
            // NOTE(review): when no key is configured (ApiKey is null) a request without the
            // header is accepted, exactly as in the original comparison - confirm this is intended.
            var requestApiKey = Request.Headers[ApiKeyHeader].FirstOrDefault();
            if (!string.Equals(requestApiKey, ApiKey, StringComparison.Ordinal))
            {
                return new BadRequestObjectResult("Invalid API key for custom skill");
            }

            var response = new CustomSkillApiResponse()
            {
                Values = new List<CustomSkillOutputRecord>()
            };

            var data = await ReadRequestAsync();

            // Do some schema validation
            if (data == null)
            {
                return new BadRequestObjectResult("The request schema does not match expected schema.");
            }
            if (data.Values == null)
            {
                return new BadRequestObjectResult("The request schema does not match expected schema. Could not find values array.");
            }

            // Calculate the response for each value.
            foreach (var record in data.Values)
            {
                if (record == null || record.RecordId == null)
                {
                    continue;
                }

                CustomSkillOutputRecord responseRecord = new CustomSkillOutputRecord
                {
                    RecordId = record.RecordId,
                };

                try
                {
                    TraceMetadataUrl(record.Data.Metadataurl);
                    var metadata = await svc.GetMetadataAsDictionary(new Uri(record.Data.Metadataurl));
                    responseRecord.Data = svc.MapMetadataToOutput(metadata);
                }
                catch (Exception e)
                {
                    // Report the failure on this record only; the remaining records are still processed.
                    TraceFailure(e);
                    responseRecord.Errors = new List<CustomSkillOutputRecord.OutputRecordMessage>
                    {
                        new CustomSkillOutputRecord.OutputRecordMessage { Message = DescribeFailure(e) }
                    };
                }
                finally
                {
                    response.Values.Add(responseRecord);
                }
            }

            return new OkObjectResult(response);
        }

        // Reads and deserializes the request body. The read is asynchronous because ASP.NET Core
        // 3.0+ rejects synchronous reads of Request.Body unless AllowSynchronousIO is enabled.
        private async Task<CustomSkillApiRequest> ReadRequestAsync()
        {
            using (var reader = new StreamReader(Request.Body))
            {
                string requestBody = await reader.ReadToEndAsync();
                return JsonConvert.DeserializeObject<CustomSkillApiRequest>(requestBody);
            }
        }

        // Trace.WriteLine(string, string) treats its second argument as a category, not as a
        // format argument, so the message is formatted before it is written.
        private static void TraceMetadataUrl(string metadataUrl)
        {
            System.Diagnostics.Trace.WriteLine(string.Format("Record Metadata URL Details: {0}", metadataUrl));
        }

        // Logs a per-record failure including the exception type and stack trace.
        private static void TraceFailure(Exception e)
        {
            System.Diagnostics.Trace.TraceError("Something [error] bad happened {0}", e);
        }

        // Prefers the inner exception's message (as the original did) but falls back to the outer
        // one: dereferencing a null InnerException used to throw from inside the catch block and
        // turn a per-record error into a failed request.
        private static string DescribeFailure(Exception e)
        {
            return (e.InnerException ?? e).Message;
        }
    }
}
-------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector/Helpers/SearchServiceHelper.cs: -------------------------------------------------------------------------------- 1 | //THIS CODE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.

using Microsoft.Azure.Search;
using Microsoft.Azure.Search.Models;
using System;
using System.IO;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;

namespace AzureSearch.SharePointOnline.Connector.Helpers
{
    /// <summary>
    /// Thin wrapper around <see cref="SearchServiceClient"/> that creates and deletes the data
    /// sources, synonym maps, indexes, skillsets and indexers used by the connector. Resources
    /// defined by JSON files are sent with raw REST calls against the preview API version.
    /// </summary>
    public class SearchServiceHelper
    {
        // REST API version used by every raw (JSON-definition) request in this class.
        private const string ApiVersion = "2017-11-11-Preview";

        private readonly SearchServiceClient client;

        /// <summary>Creates the client and registers the admin key for raw REST calls.</summary>
        public SearchServiceHelper(string searchServiceName, string searchServiceAdminKey)
        {
            client = new SearchServiceClient(searchServiceName, new SearchCredentials(searchServiceAdminKey));
            // The raw HttpClient calls below do not go through the SDK pipeline, so the key is
            // also added as a default header.
            client.HttpClient.DefaultRequestHeaders.Add("api-key", searchServiceAdminKey);
        }

        /// <summary>Creates or updates an "azureblob" data source pointing at the given container.</summary>
        public async Task CreateOrUpdateBlobDataSourceAsync(
            string dataSourceName,
            string storageAccountName,
            string storageAccountKey,
            string storageContainerName)
        {
            Console.WriteLine($"Creating '{dataSourceName}' blob data source...");
            await client.DataSources.CreateOrUpdateAsync(new DataSource()
            {
                Name = dataSourceName,
                Type = "azureblob",
                Credentials = new DataSourceCredentials($"DefaultEndpointsProtocol=https;AccountName={storageAccountName};AccountKey={storageAccountKey};"),
                Container = new DataContainer(storageContainerName) // In query param you can specify an optional virtual directory name
            });
        }

        /// <summary>Creates or updates a Cosmos DB data source with change detection enabled.</summary>
        public async Task CreateOrUpdateCosmosDBDataSourceAsync(
            string dataSourceName,
            string cosmosDBConnectionString,
            string cosmosDbDatabaseName,
            string cosmosDBContainer)
        {
            Console.WriteLine($"Creating '{dataSourceName}' CosmosDB data source...");
            await client.DataSources.CreateOrUpdateAsync(DataSource.DocumentDb(
                name: dataSourceName,
                documentDbConnectionString: $"{cosmosDBConnectionString};Database={cosmosDbDatabaseName}",
                collectionName: cosmosDBContainer,
                useChangeDetection: true
            ));
        }

        /// <summary>Deletes the named data source.</summary>
        public async Task DeleteDataSourceAsync(string dataSourceName)
        {
            Console.WriteLine($"Deleting '{dataSourceName}' data source...");
            await client.DataSources.DeleteAsync(dataSourceName);
        }

        /// <summary>
        /// Creates the synonym map from a JSON definition file after replacing the
        /// <c>[SynonymMapName]</c> placeholder. Throws when the service rejects the request.
        /// </summary>
        public async Task CreateSynonymsMapFromJsonDefinitionAsync(string synonymMapName, string synonymMapDefinitionPath)
        {
            Console.WriteLine($"Creating '{synonymMapName}' synonym map with '{synonymMapDefinitionPath}'...");
            var json = File.ReadAllText(synonymMapDefinitionPath)
                .Replace("[SynonymMapName]", synonymMapName);
            await PutDefinitionAsync(BuildResourceUri("synonymmaps", synonymMapName), json);
        }

        /// <summary>Deletes the named synonym map.</summary>
        public async Task DeleteSynonymMapAsync(string synonymMapName)
        {
            Console.WriteLine($"Deleting '{synonymMapName}' synonym map...");
            await client.SynonymMaps.DeleteAsync(synonymMapName);
        }

        /// <summary>
        /// Creates the index from a JSON definition file after replacing the
        /// <c>[SynonymMapName]</c> placeholder. As before, a failed request does not throw; the
        /// failure is now written to the console instead of being discarded silently.
        /// </summary>
        public async Task CreateIndexFromJsonDefinitionAsync(string indexName, string indexDefinitionPath, string synonymMapName)
        {
            Console.WriteLine($"Creating '{indexName}' index with '{indexDefinitionPath}'...");
            var json = File.ReadAllText(indexDefinitionPath)
                .Replace("[SynonymMapName]", synonymMapName);

            try
            {
                await PutDefinitionAsync(BuildResourceUri("indexes", indexName), json);
            }
            catch (Exception ex)
            {
                // Kept non-fatal for callers that relied on the old swallow-everything behaviour.
                Console.WriteLine($"...Failed to create '{indexName}' index: {ex.Message}");
            }
        }

        /// <summary>Deletes the named index.</summary>
        public async Task DeleteIndexAsync(string indexName)
        {
            Console.WriteLine($"Deleting '{indexName}' index...");
            await client.Indexes.DeleteAsync(indexName);
        }

        /// <summary>
        /// Creates the skillset from a JSON definition file after filling in the Cognitive
        /// Services account/key and the custom SharePoint metadata skill URI and API key.
        /// Throws when the service rejects the request.
        /// </summary>
        public async Task CreateSkillsetFromJsonDefinitionAsync(string skillsetName, string skillsetDefinitionPath, string cognitiveKey, string cognitiveAccount, string customSpoMetadataSkillUri, string spoMetadataMapperApiKey)
        {
            Console.WriteLine($"Creating '{skillsetName}' skillset with '{skillsetDefinitionPath}'...");
            var json = File.ReadAllText(skillsetDefinitionPath)
                .Replace("[CognitiveServicesAccount]", cognitiveAccount)
                .Replace("[CognitiveServicesKey]", cognitiveKey)
                .Replace("[CustomSpoMetadataSkillUri]", customSpoMetadataSkillUri)
                .Replace("[SPOMetadataMapper-Api-Key]", spoMetadataMapperApiKey);
            await PutDefinitionAsync(BuildResourceUri("skillsets", skillsetName), json);
        }

        /// <summary>Deletes the named skillset; a skillset that does not exist is not an error.</summary>
        public async Task DeleteSkillsetAsync(string skillsetName)
        {
            Console.WriteLine($"Deleting '{skillsetName}' skillset...");
            using (var response = await client.HttpClient.DeleteAsync(BuildResourceUri("skillsets", skillsetName)))
            {
                // Check the status code directly rather than matching "404 (Not Found)" in the
                // exception message text.
                if (response.StatusCode == HttpStatusCode.NotFound)
                {
                    return;
                }

                response.EnsureSuccessStatusCode();
            }
        }

        /// <summary>
        /// Creates the indexer from a JSON definition file after filling in the indexer, data
        /// source, index and skillset names. Throws when the service rejects the request.
        /// </summary>
        public async Task CreateIndexerFromJsonDefinitionAsync(string indexerName, string indexerDefinitionPath, string dataSourceName, string indexName, string skillsetName)
        {
            Console.WriteLine($"Creating '{indexerName}' indexer with '{indexerDefinitionPath}'...");
            var json = File.ReadAllText(indexerDefinitionPath)
                .Replace("[IndexerName]", indexerName)
                .Replace("[DataSourceName]", dataSourceName)
                .Replace("[IndexName]", indexName)
                .Replace("[SkillSetName]", skillsetName);
            await PutDefinitionAsync(BuildResourceUri("indexers", indexerName), json);
        }

        /// <summary>Creates a plain indexer (no skillset) through the SDK.</summary>
        public async Task CreateIndexerAsync(string indexerName, string dataSourceName, string indexName)
        {
            Console.WriteLine($"Creating '{indexerName}' indexer...");
            await client.Indexers.CreateAsync(new Indexer(
                name: indexerName,
                dataSourceName: dataSourceName,
                targetIndexName: indexName
            ));
        }

        /// <summary>Deletes the named indexer.</summary>
        public async Task DeleteIndexerAsync(string indexerName)
        {
            Console.WriteLine($"Deleting '{indexerName}' indexer...");
            await client.Indexers.DeleteAsync(indexerName);
        }

        /// <summary>
        /// Polls the indexer status every <paramref name="delaySecs"/> seconds until the current
        /// execution is no longer in progress, then prints the outcome with all warnings and errors.
        /// </summary>
        /// <param name="indexerName">Indexer to wait for.</param>
        /// <param name="delaySecs">Seconds to wait before each status request.</param>
        public async Task WaitForIndexerToFinishAsync(string indexerName, int delaySecs = 60)
        {
            IndexerExecutionInfo info;

            do
            {
                Console.WriteLine($"   Waiting {delaySecs} seconds...");
                await Task.Delay(TimeSpan.FromSeconds(delaySecs));
                Console.WriteLine($"   Getting indexer status...");
                info = await client.Indexers.GetStatusAsync(indexerName);
                Console.WriteLine($"   ...Indexer status: {info.Status}, Indexer Execution Status: {info.LastResult?.Status}.");
            } while (
                info.Status == IndexerStatus.Running
                && (info.LastResult == null || info.LastResult.Status == IndexerExecutionStatus.InProgress));

            // LastResult is null when the loop ended without any recorded execution (e.g. the
            // indexer is in an error state), so every access below is null-guarded.
            var lastResult = info.LastResult;

            if (info.Status == IndexerStatus.Running && lastResult?.Status == IndexerExecutionStatus.Success)
            {
                Console.WriteLine($"...Indexer '{indexerName}' created successfully.");
            }
            else
            {
                Console.WriteLine($"...Failed to create '{indexerName}' indexer.");
                Console.WriteLine($"   Error: '{lastResult?.ErrorMessage}'");
            }

            if (lastResult?.Warnings != null)
            {
                foreach (var warning in lastResult.Warnings)
                {
                    Console.WriteLine("===========================================================================");
                    Console.WriteLine($"   Warning for '{warning.Key}': '{warning.Message}'");
                }
            }

            if (lastResult?.Errors != null)
            {
                foreach (var error in lastResult.Errors)
                {
                    Console.WriteLine("===========================================================================");
                    Console.WriteLine($"   Error for '{error.Key}': '{error.ErrorMessage}'");
                }
            }

            Console.WriteLine("===========================================================================");
        }

        // Builds the REST URI of a named resource in the given collection ("indexes", "skillsets", ...).
        private string BuildResourceUri(string collection, string name)
        {
            return $"https://{client.SearchServiceName}.{client.SearchDnsSuffix}/{collection}/{name}?api-version={ApiVersion}";
        }

        // PUTs a JSON definition to the service and throws HttpRequestException on a non-success status.
        private async Task PutDefinitionAsync(string uri, string json)
        {
            using (var content = new StringContent(json, Encoding.UTF8, "application/json"))
            using (var response = await client.HttpClient.PutAsync(uri, content))
            {
                response.EnsureSuccessStatusCode();
            }
        }
    }
}
-------------------------------------------------------------------------------- /AzureSearch.SharepointOnline.Connector/Helpers/SharePointOnlineHelper.cs: -------------------------------------------------------------------------------- 1 | //THIS CODE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
using Microsoft.Extensions.Configuration;
using Microsoft.Graph;
using Microsoft.Identity.Client;
using Microsoft.WindowsAzure.Storage.Blob;
using Newtonsoft.Json;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using System.Configuration;
using AzureSearch.SharePointOnline.Connector.Helpers;

namespace AzureSearch.SharePointConnector.Helpers
{
    /// <summary>
    /// Walks SharePoint Online document libraries through Microsoft Graph and copies every
    /// file, together with its list-item field values, into an Azure Blob container.
    /// </summary>
    class SharePointOnlineHelper
    {
        private static GraphServiceClient _graphServiceClient;
        private static String _clientSecret;
        private static String _clientId;
        private static String _tenantId;
        private static String _redirectUrl;
        private static String _authority;
        private static String _spoHostName;
        private static bool _getAcls;

        // Shared client for the (work in progress) SharePoint REST calls; created on first use.
        private static HttpClient _httpClient;

        /// <summary>"True" to write metadata to a side-car .json blob; anything else stores it as blob metadata.</summary>
        public static string metadataJSONStore { get; set; }

        /// <summary>Field names removed from the list-item metadata before it is stored.</summary>
        public static string[] metadataFieldsToIgnore { get; set; }

        public static bool acls { get; set; }

        /// <summary>Table that maps drive item ids to the url they were stored under.</summary>
        public static AzureTableStorage azTableStorage { get; set; }

        /// <summary>
        /// Reads the AAD / SPO settings from configuration, caches them in the static fields
        /// and builds a client-credential authentication provider for Microsoft Graph.
        /// </summary>
        private static IAuthenticationProvider CreateAuthorizationProvider(IConfigurationRoot config)
        {
            _clientId = config["ConnectionStrings:AADDetails:applicationId"];
            _clientSecret = config["ConnectionStrings:AADDetails:applicationSecret"];
            _tenantId = config["ConnectionStrings:AADDetails:tenantId"];
            _redirectUrl = config["ConnectionStrings:AADDetails:redirectUri"];
            _authority = $"https://login.microsoftonline.com/{_tenantId}/v2.0";
            _spoHostName = config["ConnectionStrings:SPODetails:spoHostName"];

            // The ".default" scope makes the application use the permissions defined in the
            // app registration rather than dynamic scopes.
            var scopes = new[] { "https://graph.microsoft.com/.default" };

            var cca = ConfidentialClientApplicationBuilder.Create(_clientId)
                .WithAuthority(_authority)
                .WithRedirectUri(_redirectUrl)
                .WithClientSecret(_clientSecret)
                .Build();

            return new MsalAuthenticationProvider(cca, scopes);
        }

        /// <summary>
        /// Serialises the metadata dictionary to JSON and returns it as a readable stream
        /// positioned at the start.
        /// </summary>
        static Stream GenerateJsonMetadataFile(IDictionary<string, object> metadata)
        {
            string json = JsonConvert.SerializeObject(metadata);

            // UTF-8 keeps non-ASCII field values intact (ASCII would replace them with '?').
            return new MemoryStream(Encoding.UTF8.GetBytes(json));
        }

        /// <summary>Creates (and caches) a Graph client authenticated with the configured app registration.</summary>
        public static GraphServiceClient GetAuthenticatedGraphClient(IConfigurationRoot config)
        {
            var authenticationProvider = CreateAuthorizationProvider(config);

            _graphServiceClient = new GraphServiceClient(authenticationProvider);
            return _graphServiceClient;
        }

        /// <summary>
        /// Returns all children of <paramref name="folderName"/> in the given drive, following
        /// paging links. Failures are logged and whatever was collected so far is returned, so a
        /// single unreadable folder does not abort the crawl.
        /// </summary>
        public static async Task<List<DriveItem>> GetFolderContents(GraphServiceClient graphClient, string folderName, string driveId)
        {
            var folderContents = new List<DriveItem>();

            try
            {
                var page = await graphClient
                    .Drives[driveId]
                    .Root
                    .ItemWithPath(folderName)
                    .Children
                    .Request()
                    .GetAsync();

                folderContents.AddRange(page);

                while (page.NextPageRequest != null)
                {
                    page = await page.NextPageRequest.GetAsync();
                    folderContents.AddRange(page);
                }
            }
            catch (Exception e)
            {
                // Best effort: report the problem instead of hiding it, then carry on.
                Console.WriteLine("Unable to read folder [{0}]: {1}", folderName, e.Message);
            }

            return folderContents;
        }

        /// <summary>
        /// Builds the drive-relative path of a folder item from its parent reference, whose path
        /// is split on ':' (the drive-relative part is expected after the first colon).
        /// </summary>
        private static string BuildFolderPath(DriveItem folder)
        {
            var parentPath = folder.ParentReference?.Path;
            if (parentPath == null)
            {
                return folder.Name;
            }

            var separatorIndex = parentPath.IndexOf(':');
            if (separatorIndex < 0)
            {
                return folder.Name;
            }

            var relativeParent = parentPath.Substring(separatorIndex + 1);
            return relativeParent.Length >= 1
                ? String.Format("{0}/{1}", relativeParent, folder.Name)
                : folder.Name;
        }

        /// <summary>
        /// Removes the blob and its side-car .json blob that were stored for a deleted item,
        /// using the id-to-url mapping kept in table storage.
        /// </summary>
        private static async Task RemoveDeletedItemAsync(string itemId, CloudBlobContainer container)
        {
            Console.WriteLine("Deleted Item detected");

            var spoItemUrl = await azTableStorage.GetSpoItemEntitiesInPartion(itemId);
            if (spoItemUrl == null)
            {
                // The item was never crawled, so there is nothing to clean up.
                Console.WriteLine("No stored url found for deleted item [{0}]", itemId);
                return;
            }

            // Clean up the stored file so it is not indexed again...
            await AzureBLOBStorage.DeleteFileFromAzureBLOB(spoItemUrl, container);
            // ...and the json metadata file that accompanies it.
            await AzureBLOBStorage.DeleteFileFromAzureBLOB($"{spoItemUrl}.json", container);
        }

        /// <summary>
        /// ACL extraction for security trimming - still work in progress. For every permission
        /// granted to a principal without an object id (treated as a SharePoint group) the group
        /// is looked up through the SharePoint REST API and through Graph.
        /// NOTE(review): the lookup results are not consumed yet.
        /// </summary>
        private static async Task InspectItemPermissionsAsync(GraphServiceClient graphClient, string driveId, string itemId, string itemWebUrl)
        {
            var permissions = await graphClient.Drives[driveId].Items[itemId].Permissions.Request().GetAsync();

            foreach (var permission in permissions)
            {
                // Permissions that are not granted to a user/group principal have no GrantedTo.User.
                var grantee = permission.GrantedTo?.User;
                if (grantee == null || grantee.Id != null || string.IsNullOrEmpty(grantee.DisplayName))
                {
                    continue;
                }

                // No id present: treated as a SharePoint group.
                var groupName = grantee.DisplayName;
                var scopes = new[] { _spoHostName + "/.default" };

                var clientApplication = ConfidentialClientApplicationBuilder.Create(_clientId)
                    .WithAuthority(_authority)
                    .WithClientSecret(_clientSecret)
                    .WithClientId(_clientId)
                    .WithTenantId(_tenantId)
                    .Build();

                var tokenResult = await clientApplication.AcquireTokenForClient(scopes).ExecuteAsync();

                if (_httpClient == null)
                {
                    _httpClient = new HttpClient();
                }

                // NOTE(review): itemWebUrl is the file url, not the site url - confirm the endpoint.
                var groupUsersUrl = String.Format("{0}/_api/Web/SiteGroups/GetByName('{1}')/users", itemWebUrl, groupName);
                using (var request = new HttpRequestMessage(HttpMethod.Get, groupUsersUrl))
                {
                    // Per-request headers keep the shared client free of per-call state.
                    request.Headers.Add("Authorization", "Bearer " + tokenResult.AccessToken);
                    request.Headers.Add("Accept", "application/json");

                    using (await _httpClient.SendAsync(request))
                    {
                    }
                }

                // Single quotes must be doubled inside an OData string literal.
                var escapedGroupName = groupName.Replace("'", "''");
                await graphClient.Groups
                    .Request()
                    .Filter($"startswith(displayName, '{escapedGroupName}')")
                    .Select("id, displayName")
                    .GetAsync();
            }
        }

        /// <summary>
        /// Downloads one file and its list-item metadata into blob storage and records the
        /// item id / url pair in table storage.
        /// </summary>
        private static async Task CrawlFileAsync(GraphServiceClient graphClient, DriveItem item, string driveId, CloudBlobContainer container, bool getAcls)
        {
            var driveItemInfo = await graphClient.Drives[driveId].Items[item.Id].Request().GetAsync();

            var spWebUrl = driveItemInfo.WebUrl;
            // CreatedBy.User is absent when the item was not created by a user identity.
            var createdAuthorDisplayName = driveItemInfo.CreatedBy?.User?.DisplayName ?? string.Empty;
            var jsonMetadataFileName = String.Format("{0}.json", spWebUrl);

            if (getAcls)
            {
                await InspectItemPermissionsAsync(graphClient, driveId, item.Id, spWebUrl);
            }

            var fields = await graphClient.Drives[driveId].Items[item.Id].ListItem.Fields.Request().GetAsync();

            // Generate the metadata content that is uploaded alongside the file.
            IDictionary<string, object> metadataFields = fields.AdditionalData ?? new Dictionary<string, object>();

            if (metadataFieldsToIgnore != null)
            {
                foreach (var metadataFieldToIgnore in metadataFieldsToIgnore)
                {
                    // Remove is a no-op when the field is not present.
                    metadataFields.Remove(metadataFieldToIgnore);
                }
            }

            // Indexer assignment so an existing field with the same name cannot throw.
            metadataFields["SPWebUrl"] = spWebUrl;
            metadataFields["createdAuthorDisplayName"] = createdAuthorDisplayName;

            // Get the download URL. This URL is preauthenticated and has a short TTL.
            object downloadUrl = null;
            if (driveItemInfo.AdditionalData == null
                || !driveItemInfo.AdditionalData.TryGetValue("@microsoft.graph.downloadUrl", out downloadUrl)
                || downloadUrl == null)
            {
                Console.WriteLine("No download url returned for {0}; skipping file", spWebUrl);
                return;
            }

            Console.WriteLine("located file {0}, full url [{1}]", spWebUrl, downloadUrl.ToString());

            if (string.Equals(metadataJSONStore, "True", StringComparison.OrdinalIgnoreCase))
            {
                // External JSON file approach: upload the metadata first, then the file itself.
                using (var metadataJson = GenerateJsonMetadataFile(metadataFields))
                {
                    var uploadUri = await AzureBLOBStorage.UploadFileToAzureBLOB(metadataJson, jsonMetadataFileName, container);
                    await AzureBLOBStorage.DownloadFileToAzureBLOB(graphClient, downloadUrl, spWebUrl, container, uploadUri);
                }
            }
            else
            {
                // BLOB metadata approach.
                await AzureBLOBStorage.DownloadFileToAzureBLOB(graphClient, downloadUrl, spWebUrl, container, metadataFields);
            }

            // Persist the itemId and url so a later delete can be mapped back to the blob.
            // NOTE(review): if InsertSpoItemEntity returns a Task it should be awaited - confirm.
            SpoItem spoItemEntity = new SpoItem(item.Id, spWebUrl);
            azTableStorage.InsertSpoItemEntity(spoItemEntity);
        }

        /// <summary>
        /// Processes a list of drive items: folders are expanded recursively, deleted files are
        /// removed from blob storage, and every other file is copied to blob storage.
        /// </summary>
        /// <param name="graphClient">Authenticated Graph client.</param>
        /// <param name="docLibItems">Items to process (one page or the content of a folder).</param>
        /// <param name="driveId">Id of the drive the items belong to.</param>
        /// <param name="container">Target blob container.</param>
        /// <param name="getAcls">When true, permissions are inspected for each file.</param>
        public static async Task GetSpoDocumentItems(GraphServiceClient graphClient, List<DriveItem> docLibItems, string driveId, CloudBlobContainer container, bool getAcls)
        {
            if (docLibItems == null)
            {
                return;
            }

            foreach (var item in docLibItems)
            {
                if (item.Folder != null)
                {
                    var folderItems = await GetFolderContents(graphClient, BuildFolderPath(item), driveId);
                    if (folderItems.Count > 0)
                    {
                        // Pass the caller's flag down (the static _getAcls field is never assigned).
                        await GetSpoDocumentItems(graphClient, folderItems, driveId, container, getAcls);
                    }
                }

                if (item.File == null)
                {
                    continue;
                }

                if (item.Deleted != null)
                {
                    // Delta queries flag removed items with the deleted facet. Only this item is
                    // skipped; the remaining items of the page still have to be processed.
                    await RemoveDeletedItemAsync(item.Id, container);
                    continue;
                }

                await CrawlFileAsync(graphClient, item, driveId, container, getAcls);
            }
} // end of SharePointOnlineHelper.GetSpoDocumentItems
} // end of class SharePointOnlineHelper
} // end of namespace AzureSearch.SharePointConnector.Helpers

--------------------------------------------------------------------------------
/AzureSearch.SharepointOnline.Connector/Program.cs:
--------------------------------------------------------------------------------
//THIS CODE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.

using Microsoft.Extensions.Configuration;
using Microsoft.Graph;
using Microsoft.Identity.Client;
using Microsoft.WindowsAzure.Storage;
using Microsoft.WindowsAzure.Storage.Blob;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Threading.Tasks;
using AzureSearch.SharePointConnector.Helpers;
using AzureSearch.SharePointOnline.Connector.Helpers;
using Microsoft.ApplicationInsights.Extensibility;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.ApplicationInsights.Channel;
using Microsoft.Extensions.Logging;

namespace AzureSearch.SharePointConnector
{
    /// <summary>
    /// Console entry point: copies SharePoint Online document library content into blob
    /// storage (full or incremental crawl) and, after a full crawl, rebuilds the search index.
    /// </summary>
    class Program
    {
        private static GraphServiceClient _graphServiceClient;
        private static HttpClient _httpClient;
        public static SearchServiceHelper searchServiceHelper;
        public CloudBlobContainer container;

        private static bool IncrementalCrawl { get; set; }
        public static string BlobContainerName { get; set; }
        public static string StorageAccountName { get; set; }
        public static string StorageAccountKey { get; set; }
        public static string StorageConnectionString { get; set; }
        public static string StorageTableName { get; set; }
        public static string SpoItemStorageTableName { get; set; }
        public static string SPOHostName { get; set; }
        public static string SiteUrl { get; set; }
        public static string MetadataJSONStore { get; set; }
        public static bool IncludeAcls { get; set; }
        public static string[] MetadataFieldsToIgnore { get; set; }
        public static string[] DocLibsToIgnore { get; set; }
        public static string SearchServiceName { get; set; }
        public static string SearchServiceAdminKey { get; set; }
        public static string SearchServiceIndexName { get; set; }
        public static string SearchServiceBlobDataSourceName { get; set; }
        public static string SearchServiceBlobSynonymMapName { get; set; }
        public static string SearchServiceBlobSkillsetName { get; set; }
        public static string SearchServiceBlobIndexerName { get; set; }
        public static string CognitiveAccount { get; set; }
        public static string CognitiveKey { get; set; }
        public static string CustomSpoMetadataSkillUri { get; set; }
        public static string SPOMetadataMapperApiKey { get; set; }
        public static string AppInsightsApiKey { get; set; }

        public static string DefinitionsPath = "SearchDefinitions";

        /// <summary>
        /// Runs a crawl. Recognised arguments: -fullcrawl, -incrementalcrawl, -includeacls.
        /// Without a crawl argument a full crawl is performed.
        /// </summary>
        static async Task Main(string[] args)
        {
            // Load appsettings.json
            var config = LoadAppSettings();
            if (null == config)
            {
                Console.WriteLine("Missing or invalid appsettings.json file. Please see README.md for configuration instructions.");
                return;
            }
            SetGlobalConfig(config);

            // TODO: wire up Application Insights logging; AppInsightsApiKey is loaded from
            // configuration but not used yet.

            searchServiceHelper = new SearchServiceHelper(SearchServiceName, SearchServiceAdminKey);

            // Start stopwatch for timing telemetry
            var timeStart = DateTime.Now;
            var sw = Stopwatch.StartNew();

            // Storage
            var storageAccount = CloudStorageAccount.Parse(StorageConnectionString);
            var storageClient = storageAccount.CreateCloudBlobClient();

            AzureTableStorage azTableStorage = new AzureTableStorage(StorageConnectionString, StorageTableName);
            AzureTableStorage azTableStorageSpoItems = new AzureTableStorage(StorageConnectionString, SpoItemStorageTableName);

            CloudBlobContainer container = await AzureBLOBStorage.CreateAzureBLOBContainer(storageClient, BlobContainerName);

            foreach (var arg in args)
            {
                if (string.Equals(arg, "-incrementalcrawl", StringComparison.OrdinalIgnoreCase))
                {
                    IncrementalCrawl = true;
                    Console.WriteLine("Search Crawl mode set to Incremental");
                }
                else if (string.Equals(arg, "-fullcrawl", StringComparison.OrdinalIgnoreCase))
                {
                    IncrementalCrawl = false;
                    Console.WriteLine("Search Crawl mode set to Full");

                    // A full crawl starts from an empty container.
                    await AzureBLOBStorage.DeleteContainerFromAzureBLOB(container);
                    container = await AzureBLOBStorage.CreateAzureBLOBContainer(storageClient, BlobContainerName);
                }
                else if (string.Equals(arg, "-includeacls", StringComparison.OrdinalIgnoreCase))
                {
                    IncludeAcls = true;
                    Console.WriteLine("ACL extraction enabled");
                }
            }

            SharePointOnlineHelper.metadataFieldsToIgnore = MetadataFieldsToIgnore;
            SharePointOnlineHelper.metadataJSONStore = MetadataJSONStore;
            SharePointOnlineHelper.acls = IncludeAcls;
            SharePointOnlineHelper.azTableStorage = azTableStorageSpoItems;

            foreach (var metadataFieldToIgnore in MetadataFieldsToIgnore)
            {
                Console.WriteLine("Removing key [{0}] from metadata fields to extract", metadataFieldToIgnore);
            }

            // Query using Graph SDK (preferred when possible)
            GraphServiceClient graphClient = SharePointOnlineHelper.GetAuthenticatedGraphClient(config);
            Site targetSite = await graphClient.Sites.GetByPath(SiteUrl, SPOHostName).Request().GetAsync();

            // Awaited (instead of blocking on .Result) and paged so no library is missed.
            var drivesPage = await graphClient.Sites[targetSite.Id].Drives.Request().GetAsync();
            var drives = new List<Drive>(drivesPage.CurrentPage);
            while (drivesPage.NextPageRequest != null)
            {
                drivesPage = await drivesPage.NextPageRequest.GetAsync();
                drives.AddRange(drivesPage.CurrentPage);
            }

            foreach (var drive in drives)
            {
                var driveName = drive.Name;
                bool excludedDocLib = Array.Exists(DocLibsToIgnore, element => element == driveName);

                if (excludedDocLib)
                {
                    Console.WriteLine("Skipping [{0}] as its an excluded docLib", driveName);
                    continue;
                }
                Console.WriteLine("Fetching items from drive [{0}]", driveName);

                if (IncrementalCrawl)
                {
                    await CrawlDriveIncrementalAsync(graphClient, drive, container, azTableStorage);
                }
                else
                {
                    await CrawlDriveFullAsync(graphClient, drive.Id, container);
                }
            }

            if (!IncrementalCrawl)
            {
                // All documents have been fetched into BLOB storage by now, so index them.
                // Warning: this performs an entire search index rebuild - search results are
                // impacted while this phase is running.
                await IndexDocumentsAsync();
            }

            sw.Stop();
            TimeSpan elapsedTime = sw.Elapsed;
            var timeEnd = DateTime.Now;

            Console.WriteLine("Fetched total of {0} documents during crawl", AzureBLOBStorage.DownloadFileCount);
            Console.WriteLine("Crawl Start time: {0}", timeStart);
            Console.WriteLine("Crawl Completed time: {0}", timeEnd);
            Console.WriteLine("Total crawl duration time: {0}", elapsedTime);
        }

        /// <summary>
        /// Full crawl of one drive: collects every page of root children first and then
        /// processes the complete list exactly once.
        /// </summary>
        private static async Task CrawlDriveFullAsync(GraphServiceClient graphClient, string driveId, CloudBlobContainer container)
        {
            var driveContents = new List<DriveItem>();

            var page = await graphClient
                .Drives[driveId]
                .Root
                .Children
                .Request()
                .GetAsync();
            driveContents.AddRange(page.CurrentPage);

            while (page.NextPageRequest != null)
            {
                page = await page.NextPageRequest.GetAsync();
                driveContents.AddRange(page.CurrentPage);
            }

            await SharePointOnlineHelper.GetSpoDocumentItems(graphClient, driveContents, driveId, container, IncludeAcls);
        }

        /// <summary>
        /// Incremental crawl of one drive: requests the changes since the last stored delta
        /// token, processes them and stores the new token for the next run.
        /// </summary>
        private static async Task CrawlDriveIncrementalAsync(GraphServiceClient graphClient, Drive drive, CloudBlobContainer container, AzureTableStorage azTableStorage)
        {
            var driveId = drive.Id;
            var driveContents = new List<DriveItem>();

            // The drive url is Base64 encoded to remove special characters before it is used as key.
            // NOTE(review): Base64 output can contain '/' and '+'; confirm the table helper accepts them.
            byte[] driveUrlBytes = System.Text.Encoding.UTF8.GetBytes(drive.WebUrl);
            var driveUrlEscaped = Convert.ToBase64String(driveUrlBytes);

            // Last known delta token; when it is null all items of the drive are returned.
            var lastDeltaToken = await azTableStorage.GetEntitiesInPartion(driveUrlEscaped);

            var page = await graphClient
                .Drives[driveId]
                .Root
                .Delta(lastDeltaToken)
                .Request()
                .GetAsync();
            driveContents.AddRange(page.CurrentPage);

            // Advance the page on every iteration; the delta link is only on the last page.
            while (page.NextPageRequest != null)
            {
                page = await page.NextPageRequest.GetAsync();
                driveContents.AddRange(page.CurrentPage);
            }

            await SharePointOnlineHelper.GetSpoDocumentItems(graphClient, driveContents, driveId, container, IncludeAcls);

            object deltaLink = null;
            if (page.AdditionalData != null)
            {
                page.AdditionalData.TryGetValue("@odata.deltaLink", out deltaLink);
            }

            var token = ExtractDeltaToken(deltaLink?.ToString());
            if (token == null)
            {
                Console.WriteLine("No delta token returned for drive [{0}]; next incremental crawl will start from the previous token", drive.Name);
                return;
            }

            // Persist the token so the next incremental crawl continues from this point.
            // NOTE(review): if InsertEntity returns a Task it should be awaited - confirm.
            IndexCrawlEntity indexCrawlEntity = new IndexCrawlEntity(driveUrlEscaped, token);
            azTableStorage.InsertEntity(indexCrawlEntity);
        }

        /// <summary>
        /// Extracts the token value from a delta link. Handles both the function form
        /// <c>delta(token='abc')</c> and the query form <c>delta?token=abc</c>.
        /// Returns null when no token can be found.
        /// </summary>
        private static string ExtractDeltaToken(string deltaLink)
        {
            const string marker = "token=";

            if (string.IsNullOrEmpty(deltaLink))
            {
                return null;
            }

            var markerIndex = deltaLink.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
            if (markerIndex < 0)
            {
                return null;
            }

            var token = deltaLink.Substring(markerIndex + marker.Length);

            // Query form: drop any following query parameters.
            var nextParameter = token.IndexOf('&');
            if (nextParameter >= 0)
            {
                token = token.Substring(0, nextParameter);
            }

            // Function form: drop the closing parenthesis and the surrounding quotes.
            token = token.TrimEnd(')').Trim('\'');

            return token.Length == 0 ? null : token;
        }

        /// <summary>Copies the values from configuration into the static settings of this class.</summary>
        private static void SetGlobalConfig(IConfigurationRoot config)
        {
            StorageAccountName = config["ConnectionStrings:StorageDetails:storageAccountName"];
            StorageAccountKey = config["ConnectionStrings:StorageDetails:storageAccountKey"];
            BlobContainerName = config["ConnectionStrings:StorageDetails:storageBlobContainerName"];
            StorageConnectionString = ($"DefaultEndpointsProtocol=https;AccountName={StorageAccountName};AccountKey={StorageAccountKey};");

            SPOHostName = config["ConnectionStrings:SPODetails:SPOHostName"];
            SiteUrl = config["ConnectionStrings:SPODetails:siteUrl"];
            MetadataFieldsToIgnore = config.GetSection("ConnectionStrings:SPODetails:metadataFieldsToIgnore").GetChildren().Select(c => c.Value).ToArray();
            DocLibsToIgnore = config.GetSection("ConnectionStrings:SPODetails:docLibExclusions").GetChildren().Select(c => c.Value).ToArray();
            MetadataJSONStore = config["ConnectionStrings:SPODetails:metadataJSONStore"];

            StorageTableName = config["ConnectionStrings:StorageDetails:storageTableName"];
            SpoItemStorageTableName = config["ConnectionStrings:StorageDetails:spoItemStorageTableName"];

            SearchServiceName = config["ConnectionStrings:SearchDetails:name"];
            SearchServiceAdminKey = config["ConnectionStrings:SearchDetails:adminKey"];
            SearchServiceIndexName = config["ConnectionStrings:SearchDetails:indexName"];
            SearchServiceBlobDataSourceName = config["ConnectionStrings:SearchDetails:blobDataSourceName"];
            SearchServiceBlobSynonymMapName = config["ConnectionStrings:SearchDetails:blobSynonymMapName"];
            SearchServiceBlobSkillsetName = config["ConnectionStrings:SearchDetails:blobSkillsetName"];
            SearchServiceBlobIndexerName = config["ConnectionStrings:SearchDetails:blobIndexerName"];
            CognitiveAccount = config["ConnectionStrings:SearchDetails:cognitiveAccount"];
            CognitiveKey = config["ConnectionStrings:SearchDetails:cognitiveKey"];
            CustomSpoMetadataSkillUri = config["ConnectionStrings:SearchDetails:customSpoMetadataSkillUri"];
            SPOMetadataMapperApiKey = config["ConnectionStrings:SearchDetails:SPOMetadataMapper-Api-Key"];
            AppInsightsApiKey = config["Logging:key"];
        }

        /// <summary>Builds a client-credential authentication provider for Microsoft Graph.</summary>
        private static IAuthenticationProvider CreateAuthorizationProvider(IConfigurationRoot config)
        {
            // The key must not contain a space after the colon, otherwise the lookup returns null.
            var clientId = config["ConnectionStrings:AADDetails:applicationId"];
            var clientSecret = config["ConnectionStrings:AADDetails:applicationSecret"];
            var redirectUri = config["ConnectionStrings:AADDetails:redirectUri"];
            var authority = $"https://login.microsoftonline.com/{config["ConnectionStrings:AADDetails:tenantId"]}/v2.0";

            // The ".default" scope makes the application use the permissions defined in the
            // app registration rather than dynamic scopes.
            var scopes = new[] { "https://graph.microsoft.com/.default" };

            var cca = ConfidentialClientApplicationBuilder.Create(clientId)
                .WithAuthority(authority)
                .WithRedirectUri(redirectUri)
                .WithClientSecret(clientSecret)
                .Build();
            return new MsalAuthenticationProvider(cca, scopes);
        }

        /// <summary>
        /// Loads appSettings.json from the application directory. Returns null when the file is
        /// missing or one of the required settings is empty.
        /// </summary>
        private static IConfigurationRoot LoadAppSettings()
        {
            string[] requiredKeys =
            {
                "ConnectionStrings:AADDetails:applicationId",
                "ConnectionStrings:AADDetails:applicationSecret",
                "ConnectionStrings:AADDetails:redirectUri",
                "ConnectionStrings:AADDetails:tenantId",
                "ConnectionStrings:AADDetails:domain",
                "ConnectionStrings:StorageDetails:storageBlobContainerName",
                "ConnectionStrings:SPODetails:SPOHostName",
                "ConnectionStrings:SPODetails:siteUrl",
                "ConnectionStrings:SPODetails:metadataJSONStore"
            };

            try
            {
                var config = new ConfigurationBuilder()
                    .SetBasePath(AppContext.BaseDirectory)
                    .AddJsonFile("appSettings.json", optional: false, reloadOnChange: true)
                    .Build();

                // Validate required settings
                if (requiredKeys.Any(key => string.IsNullOrEmpty(config[key])))
                {
                    return null;
                }

                return config;
            }
            catch (System.IO.FileNotFoundException)
            {
                return null;
            }
        }

        /// <summary>
        /// Recreates the data source, synonym map, index, skillset and indexer from the JSON
        /// definitions and waits for the indexer run to finish.
        /// </summary>
        private static async Task IndexDocumentsAsync()
        {
            var synonymMapDefinitionPath = Path.Combine(DefinitionsPath, "blobSynonymMap.json");
            var indexDefinitionPath = Path.Combine(DefinitionsPath, "blobIndex.json");
            var skillsetDefinitionPath = Path.Combine(DefinitionsPath, "blobSkillset.json");
            var indexerDefinitionPath = Path.Combine(DefinitionsPath, "blobIndexer.json");

            await searchServiceHelper.CreateOrUpdateBlobDataSourceAsync(SearchServiceBlobDataSourceName, StorageAccountName, StorageAccountKey, BlobContainerName);

            await searchServiceHelper.DeleteSynonymMapAsync(SearchServiceBlobSynonymMapName);
            await searchServiceHelper.CreateSynonymsMapFromJsonDefinitionAsync(SearchServiceBlobSynonymMapName, synonymMapDefinitionPath);

            await searchServiceHelper.DeleteIndexAsync(SearchServiceIndexName);
            await searchServiceHelper.CreateIndexFromJsonDefinitionAsync(SearchServiceIndexName, indexDefinitionPath, SearchServiceBlobSynonymMapName);

            await searchServiceHelper.DeleteSkillsetAsync(SearchServiceBlobSkillsetName);
            await searchServiceHelper.CreateSkillsetFromJsonDefinitionAsync(SearchServiceBlobSkillsetName, skillsetDefinitionPath, CognitiveKey, CognitiveAccount, CustomSpoMetadataSkillUri, SPOMetadataMapperApiKey);

            await searchServiceHelper.DeleteIndexerAsync(SearchServiceBlobIndexerName);
            await searchServiceHelper.CreateIndexerFromJsonDefinitionAsync(SearchServiceBlobIndexerName, indexerDefinitionPath, SearchServiceBlobDataSourceName, SearchServiceIndexName, SearchServiceBlobSkillsetName);

            await searchServiceHelper.WaitForIndexerToFinishAsync(SearchServiceBlobIndexerName);
        }
    }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | The motivation behind this project is to simplify the ingestion of SharePoint Online Document library content into Azure Cognitive Search.
3 |
4 | Azure Cognitive Search brings in a number of benefits over standard SharePoint Online Search allowing you to have more control over how you further refine and enrich content in search through:
5 |
6 | - Deep knowledge mining of your content with pre-built AI-based [Azure Cognitive Services](https://docs.microsoft.com/en-us/azure/search/cognitive-search-predefined-skills)
7 | - [Synonyms](https://docs.microsoft.com/en-us/azure/search/search-synonyms) for query expansion over a search index
8 |
9 | - More control over Type Aheads
10 | - Modeling of relational Data
11 | - Multi Language indexes
12 |
13 | ### Azure Cognitive Search
14 | Cognitive search is an AI feature in Azure Cognitive Search, used to extract text from images, blobs, and other unstructured data sources - enriching the content to make it more searchable in an Azure Cognitive Search index. Extraction and enrichment are implemented through cognitive skills attached to an indexing pipeline. AI enrichments are supported in the following ways:
15 | Natural language processing skills include entity recognition, language detection, key phrase extraction, text manipulation, and sentiment detection.
With these skills, unstructured text can assume new forms, mapped as searchable and filterable fields in an index. 16 | 17 | Image processing skills include Optical Character Recognition (OCR) and identification of visual features, such as facial detection, image interpretation, image recognition (famous people and landmarks) or attributes like colors or image orientation. You can create text-representations of image content, searchable using all the query capabilities of Azure Cognitive Search. 18 | 19 | In addition to the out of the box pre built Azure Cognitive skills you can further enrich the index ingestion pipeline by building your own custom skills in form of custom Web APIs. 20 | 21 | ### Azure Cognitive Search indexer: 22 | 23 | An [indexer](https://docs.microsoft.com/en-us/azure/search/search-indexer-overview) in Azure Cognitive Search is a crawler that extracts searchable data and metadata from an external Azure data source and populates an index based on field-to-field mappings between the index and your data source. This approach is sometimes referred to as a 'pull model' because the service pulls data in without you having to write any code that adds data to an index. 24 | 25 | # SharePoint Online Document Library Connector 26 | Azure Cognitive Search Indexers are based on data source types or platforms, with individual indexers for: 27 | - **SQL Server on Azure** 28 | - **Cosmos DB** 29 | - **Azure Table Storage and Blob Storage** 30 | 31 | At the moment there is no out of the box Azure Cognitive Search indexer for SharePoint Online content; hence the reason why we are publishing this project. 
32 | 33 | ## High Level Architecture Overview 34 | 35 | ![AzureSearch.SharePointOnline High Level Architecture](https://raw.githubusercontent.com/anevjes/AzureSearch.SharePointOnline/master/Diagrams/PNG/HighLevelComponentArchitecture.png) 36 | 37 | The solution comprises the following Azure resources: 38 | 39 | - **Azure Cognitive Search** 40 | - **Azure General Purpose Storage Account** - we leverage both BLOBs and Tables 41 | - **Azure Web Apps** - for hosting the custom SharePoint Online Metadata Merging Web API, used for merging SharePoint Field contents with the associated document inside the Azure Cognitive Search index 42 | - **Application Insights** for logging 43 | 44 | ## Software Components: 45 | 46 | **AzureSearch.SharePointOnline.Connector:** 47 | 48 | Stand-alone DotNetCore console app responsible for: 49 | 50 | - Initializing Azure Cognitive Search index / indexer / skill-set mapping / synonym mapping against the Azure Cognitive Search instance. 51 | 52 | - Downloading SharePoint Online Document Library (Documents) and associated Fields - via Microsoft Graph SDK. Graph SDK handles the [Throttling](https://docs.microsoft.com/en-us/graph/throttling) retry attempts back to SharePoint Online for us as it respects the Retry-After HTTP response headers internally. 53 | 54 | - Dropping the discovered Documents and their associated metadata into Azure Blob storage. Container paths in blob storage follow the same naming convention as the urls. Each discovered document is accompanied by an additional .json file which includes all the discovered metadata.
55 | 56 | _Example:_ 57 | 58 | HelloWorld.pdf
59 | HelloWorld.pdf.json 60 | 61 | Where HelloWorld.pdf is the raw PDF file from the SharePoint document library and HelloWorld.pdf.json contains the SharePoint Field / SPWeb URL location metadata. 62 | 63 | Sample of contents from HelloWorld.pdf.json file:
64 | ```json 65 | { 66 | "Classification": "HBI", 67 | "ContentType": "Document", 68 | "Created": "2019-09-19T05:08:38Z", 69 | "AuthorLookupId": "12", 70 | "Modified": "2019-09-19T05:12:40Z", 71 | "EditorLookupId": "12", 72 | "SPWebUrl": "https://somespourl.sharepoint.com/DemoDocs/HelloWorld.pdf", 73 | "LinkFilename": "HelloWorld.pdf", 74 | "FileSizeDisplay": "1473403", 75 | "_DisplayName": "", 76 | "createdAuthorDisplayName": "MOD Administrator" 77 | } 78 | ``` 79 | 80 | - Incremental Crawling (partially working)- As part of this we leverage two Azure Storage Tables: 81 | - spoIncrementalCrawlerTokens - Used for keeping track of all the Microsoft Graph API Delta tokens per Document Library. This makes our crawling more efficient as we only re-crawl the changes 82 | - Unfortunately as part of delta changes we do not get the URL of the deleted items. Microsoft Graph Delta query only returns ItemID for deleted items - so for us to keep track of the mapping between ItemIds and the URLs we write all the itemIds and their associated URLs inside spoItems Table. This way when it comes to removing deleted SharePoint Document Library items from azure index we can achieve this through this mapping. 83 | - 84 | Usage example: 85 | 86 | 87 | ``` Usage example 88 | AzureSearch.SharePointConnector.exe [-fullcrawl | -incrementalcrawl] 89 | ``` 90 | 91 | **AzureSearch.SharepointOnline.Connector.CustomSkills:** 92 | 93 | Enriches the Azure Search Index with SharePoint metadata. This component is invoked as part of the Azure Cognitive Search indexer custom skill pipeline. You can read more details on the configuration around this component in the Getting Started Area. 94 | 95 | ## Current Functionality 96 | 97 | At the moment the following functionality is available: 98 | 99 | - Crawling of SharePoint online Document Libraries through Graph API. 
100 | - Association of SharePoint Document Library Fields to the documents - providing a single complete Azure search index with all the related data. 101 | - Crawl Type: **Full Crawl** 102 | - Automatic creation of Azure Cognitive Search Index Schema based on the fields you need and BLOB indexer. 103 | 104 | ## Planned Functionality 105 | 106 | Please note that some of the dev work on the below functionality has already commenced as you will see in the source code. 107 | 108 | - Incremental crawling through the use of [Graph API delta Tokens](https://docs.microsoft.com/en-us/graph/delta-query-overview) and leveraging [Azure Table storage](https://docs.microsoft.com/en-us/azure/storage/tables/table-storage-overview) for tracking of the last known change tokens during crawl time for more efficient crawling 109 | - SharePoint Online ACL / Azure Cognitive Search Security Trimming 110 | 111 | Cognitive 112 | 113 | # Getting Started 114 | 115 | ## 1. Create Azure Resources 116 | 117 | - Create brand new **Azure Resource Group** 118 | - Inside the resource group create a new: 119 | - **General purpose v2 Storage Account** 120 | - Create new BLOB container named **spocontent** and make sure it's Access Level is set to Private 121 | 122 | - Create Storage Table named: 123 | - **spoIncrementalCrawlerTokens** 124 | - Create Storage Table named: 125 | - **spoItems** 126 | 127 | Extract the **Connection String** for this account as you will need it to update the following files in later steps: 128 | - AzureSearch.SharePointOnline.Connector/appSettings.json 129 | - AzureSearch.SharepointOnline.Connector.CustomSkills/appSettings.json 130 | 131 | - **Azure Search Service** - Select a plan based on your scale needs. Make sure to extract the **Search Admin Key** as we will need this key to be updated inside: 132 | - AzureSearch.SharePointOnline.Connector/appSettings.json file. 
133 | 134 | - **Azure Cognitive Services** - Make sure to extract the Cognitive services Keys once setup; as we will need this key to be updated inside: 135 | - AzureSearch.SharePointOnline.Connector/appSettings.json file. 136 | - **Azure App Service Plan / Web App** - Select a plan based on your estimated crawl freshness requirements. We will use this to host the **AzureSearch.SharepointOnline.Connector.CustomSkills WebAPI** required during search indexing stages. 137 | 138 | - **Azure Application Insights** - We will leverage Application Insights for logging purposes. Make sure to extract the App Insights Key once setup; as we will need this key to be updated inside: 139 | - AzureSearch.SharePointOnline.Connector/appSettings.json file. 140 | 141 | 142 | ## 2. Configure Azure Active Directory App Registration: 143 | Calls to Graph API - for SharePoint online crawling will be performed by an AAD app registration. We require an AAD App registration to be created in your SharePoint Online AAD tenant. This will need to be carried out by your AAD *Global Admin* account as we will need them to grant consent. 144 | 145 | - Login into Azure Portal 146 | - Switch Directory to your SharePoint Online Directory 147 | - Locate your **Azure Active Directory** Resource and open the AAD blade 148 | - Select **App registrations** from AAD blade 149 | - Click on **+New registration** 150 | - Name: _AzureSearch.SharePointOnline.Crawler_ 151 | - Supported account types: _Accounts in this organizational directory only (only - Single tenant)_ 152 | - Redirect URI: _set it to whatever you like - needs to be **https** if you set this value_ 153 | - Click **Create** 154 | 155 | - You will now be presented with a summary screen for your newly created AAD app registration. Make sure to capture the following information from this screen as you will shortly need to provide these details inside the AzureSearch.SharePointOnline.Connector/appSettings.json file. 
156 | 157 | - IMAGE OF AAD APP REG Summary screen here: 158 | 159 | - In the App registration Summary screen select **Certificates & secrets** 160 | - Under Client secrets - select **+New client secret** 161 | - Add description / duration as per your standards. 162 | - Make sure to copy the generated secret as we will be using the value inside the AzureSearch.SharePointOnline.Connector/appSettings.json file. 163 | 164 | IMAGE-CLIENT SECRET 165 | 166 | **API Permissions for Microsoft Graph** 167 | 168 | Now we need to grant Graph API permissions to the newly created AAD app registration. 169 | - Click on **API Permissions** 170 | - Click on **+Add a permission** 171 | - Select **Microsoft Graph** 172 | - Grant the following permissions: 173 | - https://graph.microsoft.com/Group.Read.All 174 | - https://graph.microsoft.com/Sites.Read.All 175 | - Now Admin Consent the permissions for your SPO tenant by clicking the **Grant admin consent for 'Your tenant'** 176 | IMAGE API PERMISSIONS 177 | 178 | ## 2. Define the Azure Search Index, Indexer, SkillSet and SynonymMap Definitions 179 | 180 | ### Index Definition 181 | During this phase we will need to consider all the SharePoint Fields that we would like to be indexed and exposed via Azure Cognitive Search. We have provided a sample Azure Cognitive Search index definition for you which includes some basic SharePoint Fields as part of the index. 182 | 183 | Add Azure Cognitive Search [Fields](https://docs.microsoft.com/en-us/azure/search/search-what-is-an-index) as you require them but make sure to always keep **SPWebUrl** and **blobUri** as the code has dependencies on these two fields. 
If you want to you can also add custom [Suggesters](https://docs.microsoft.com/en-us/azure/search/search-what-is-an-index#suggesters), [scoring profiles](https://docs.microsoft.com/en-us/azure/search/search-what-is-an-index#scoring-profiles) and [analyzers](https://docs.microsoft.com/en-us/azure/search/search-what-is-an-index#analyzers) as part of your index definition. 184 | 185 | **Important Note:** Take note of the index Field names from this file as you will use them as config across the following files: 186 | 187 | AzureSearch.SharePointOnline.Connector/SearchDefinitions/blobIndexer.json 188 | 189 | AzureSearch.SharepointOnline.Connector.CustomSkills/Mapping/metadatatoindexmapping.json 190 | 191 | 192 | - Open **AzureSearch.SharePointOnline.Connector/SearchDefinitions/blobIndex.json** 193 | 194 | Sample Index Definition below: 195 | 196 | ```json 197 | { 198 | "fields": [ 199 | { 200 | "name": "id", 201 | "type": "Edm.String", 202 | "searchable": false, 203 | "filterable": false, 204 | "retrievable": true, 205 | "sortable": false, 206 | "facetable": false, 207 | "key": true 208 | }, 209 | { 210 | "name": "blobUri", 211 | "type": "Edm.String", 212 | "searchable": true, 213 | "filterable": false, 214 | "retrievable": true, 215 | "sortable": true, 216 | "facetable": false 217 | }, 218 | { 219 | "name": "fullText", 220 | "type": "Edm.String", 221 | "searchable": true, 222 | "filterable": false, 223 | "retrievable": true, 224 | "sortable": false, 225 | "facetable": false 226 | }, 227 | { 228 | "name": "languageCode", 229 | "type": "Edm.String", 230 | "searchable": true, 231 | "filterable": true, 232 | "retrievable": true, 233 | "sortable": false, 234 | "facetable": true 235 | }, 236 | { 237 | "name": "keyPhrases", 238 | "type": "Collection(Edm.String)", 239 | "searchable": true, 240 | "filterable": false, 241 | "retrievable": true, 242 | "sortable": false, 243 | "facetable": false, 244 | "synonymMaps": [ 245 | "[SynonymMapName]" 246 | ] 247 | }, 248 | { 249 | 
"name": "organizations", 250 | "type": "Collection(Edm.String)", 251 | "searchable": true, 252 | "filterable": true, 253 | "retrievable": true, 254 | "sortable": false, 255 | "facetable": false 256 | }, 257 | { 258 | "name": "locations", 259 | "type": "Collection(Edm.String)", 260 | "searchable": true, 261 | "filterable": true, 262 | "retrievable": true, 263 | "sortable": false, 264 | "facetable": true 265 | }, 266 | { 267 | "name": "SPWebUrl", 268 | "type": "Edm.String", 269 | "searchable": true, 270 | "sortable": false, 271 | "filterable": false, 272 | "facetable": false, 273 | "retrievable": true 274 | }, 275 | { 276 | "name": "ContentType", 277 | "type": "Edm.String", 278 | "searchable": true, 279 | "sortable": false, 280 | "filterable": true, 281 | "facetable": true, 282 | "retrievable": true 283 | }, 284 | { 285 | "name": "CreatedAuthorDisplayName", 286 | "type": "Edm.String", 287 | "searchable": true, 288 | "sortable": true, 289 | "filterable": true, 290 | "facetable": true, 291 | "retrievable": true 292 | }, 293 | { 294 | "name": "LinkFilename", 295 | "type": "Edm.String", 296 | "searchable": true, 297 | "sortable": true, 298 | "filterable": true, 299 | "facetable": false, 300 | "retrievable": true 301 | }, 302 | { 303 | "name": "people", 304 | "type": "Collection(Edm.String)", 305 | "searchable": true, 306 | "filterable": true, 307 | "retrievable": true, 308 | "sortable": false, 309 | "facetable": false 310 | }, 311 | { 312 | "name": "dateTimes", 313 | "type": "Collection(Edm.String)", 314 | "searchable": false, 315 | "filterable": true, 316 | "retrievable": true, 317 | "sortable": false, 318 | "facetable": false 319 | }, 320 | { 321 | "name": "typelessEntities", 322 | "type": "Collection(Edm.String)", 323 | "searchable": false, 324 | "filterable": false, 325 | "retrievable": true, 326 | "sortable": false, 327 | "facetable": false 328 | }, 329 | { 330 | "name": "imageDescriptions", 331 | "type": "Collection(Edm.String)", 332 | "searchable": true, 333 | 
"filterable": false, 334 | "retrievable": true, 335 | "sortable": false, 336 | "facetable": false 337 | }, 338 | { 339 | "name": "imageCategories", 340 | "type": "Collection(Edm.String)", 341 | "searchable": true, 342 | "filterable": false, 343 | "retrievable": true, 344 | "sortable": false, 345 | "facetable": false 346 | }, 347 | { 348 | "name": "imageTags", 349 | "type": "Collection(Edm.String)", 350 | "searchable": true, 351 | "filterable": false, 352 | "retrievable": true, 353 | "sortable": false, 354 | "facetable": false 355 | }, 356 | { 357 | "name": "ocrPrintedText", 358 | "type": "Edm.String", 359 | "searchable": true, 360 | "sortable": true, 361 | "filterable": true, 362 | "facetable": false, 363 | "retrievable": true 364 | } 365 | ], 366 | "corsOptions": { 367 | "allowedOrigins": [ "*" ] 368 | }, 369 | "suggesters": [ 370 | { 371 | "name": "sg", 372 | "searchMode": "analyzingInfixMatching", 373 | "sourceFields": [ "keyPhrases", "organizations", "locations", "people" ] 374 | } 375 | ] 376 | } 377 | ``` 378 | ## Indexer Definition 379 | 380 | During this stage you will need to map index fields from the index definition defined in previous step to the Azure BLOB indexer. This is performed via fieldMappings. 381 | 382 | **Important:** 383 | Do not modify any of the below fields as we inject these values for you: 384 | ```json 385 | { 386 | "name": "[IndexerName]", 387 | "dataSourceName": "[DataSourceName]", 388 | "targetIndexName": "[IndexName]", 389 | "skillsetName": "[SkillSetName]", 390 | .. 
391 | } 392 | ``` 393 | 394 | - Open up **AzureSearch.SharePointOnline.Connector/SearchDefinitions/blobIndexer.json** file 395 | 396 | 397 | Sample Indexer Definition: 398 | 399 | ```json 400 | { 401 | "name": "[IndexerName]", 402 | "dataSourceName": "[DataSourceName]", 403 | "targetIndexName": "[IndexName]", 404 | "skillsetName": "[SkillSetName]", 405 | "fieldMappings": [ 406 | { 407 | "sourceFieldName": "metadata_storage_path", 408 | "targetFieldName": "id", 409 | "mappingFunction": { "name": "base64Encode" } 410 | }, 411 | { 412 | "sourceFieldName": "metadata_storage_path", 413 | "targetFieldName": "blobUri" 414 | }, 415 | { 416 | "sourceFieldName": "metadata_storage_name", 417 | "targetFieldName": "metadata_storage_name" 418 | }, 419 | { 420 | "sourceFieldName": "metadata_storage_sas_token", 421 | "targetFieldName": "metadata_storage_sas_token" 422 | }, 423 | { 424 | "sourceFieldName": "metadataurl", 425 | "targetFieldName": "metadataurl" 426 | } 427 | 428 | ], 429 | "outputFieldMappings": [ 430 | { 431 | "sourceFieldName": "/document/fullText", 432 | "targetFieldName": "fullText" 433 | }, 434 | { 435 | "sourceFieldName": "/document/languageCode", 436 | "targetFieldName": "languageCode" 437 | }, 438 | { 439 | "sourceFieldName": "/document/fullText/pages/*/keyPhrases/*", 440 | "targetFieldName": "keyPhrases" 441 | }, 442 | { 443 | "sourceFieldName": "/document/fullText/pages/*/organizations/*", 444 | "targetFieldName": "organizations" 445 | }, 446 | { 447 | "sourceFieldName": "/document/fullText/pages/*/locations/*", 448 | "targetFieldName": "locations" 449 | }, 450 | { 451 | "sourceFieldName": "/document/fullText/pages/*/people/*", 452 | "targetFieldName": "people" 453 | }, 454 | { 455 | "sourceFieldName": "/document/fullText/pages/*/dateTimes/*", 456 | "targetFieldName": "dateTimes" 457 | }, 458 | { 459 | "sourceFieldName": "/document/fullText/pages/*/typelessEntities/*/name", 460 | "targetFieldName": "typelessEntities" 461 | }, 462 | { 463 | 
"sourceFieldName": "/document/normalized_images/*/imageDescriptions/captions/*/text", 464 | "targetFieldName": "imageDescriptions" 465 | }, 466 | { 467 | "sourceFieldName": "/document/normalized_images/*/imageCategories/*/name", 468 | "targetFieldName": "imageCategories" 469 | }, 470 | { 471 | "sourceFieldName": "/document/normalized_images/*/imageTags/*/name", 472 | "targetFieldName": "imageTags" 473 | }, 474 | { 475 | "sourceFieldName": "/document/CreatedAuthorDisplayName", 476 | "targetFieldName": "CreatedAuthorDisplayName" 477 | }, 478 | { 479 | "sourceFieldName": "/document/SPWebUrl", 480 | "targetFieldName": "SPWebUrl" 481 | }, 482 | { 483 | "sourceFieldName": "/document/LinkFilename", 484 | "targetFieldName": "LinkFilename" 485 | }, 486 | { 487 | "sourceFieldName": "/document/ContentType", 488 | "targetFieldName": "ContentType" 489 | } 490 | ], 491 | "parameters": { 492 | "batchSize": 1, 493 | "maxFailedItems": -1, 494 | "maxFailedItemsPerBatch": -1, 495 | "configuration": { 496 | "dataToExtract": "contentAndMetadata", 497 | "imageAction": "generateNormalizedImages", 498 | "excludedFileNameExtensions": ".json,.js", 499 | "failOnUnsupportedContentType": false, 500 | "indexStorageMetadataOnlyForOversizedDocuments": true, 501 | "failOnUnprocessableDocument": false 502 | } 503 | } 504 | } 505 | ``` 506 | 507 | ## Skillset Definition 508 | 509 | This is the area where we define the different skills that will run through as part of the indexer content enrichment pipeline. 510 | 511 | Here we have a number out of the box Azure Cognitive Search skills such as OCR detection and you will notice this is where we have out Custom WebAPI skill-set defined. This is pointing back to our AzureSearch.SharepointOnline.Connector.CustomSkills WebAPI. 
512 | 513 | This custom skill provides us with the ability to extract information from two different datasets (raw SharePoint SPFile contents and the associated metadata which we store as an associated SPFile.json) and enrich the same Azure Cognitive Search index. 514 | 515 | The AzureSearch.SharepointOnline.Connector.CustomSkills skill needs to know which SharePoint Fields from the metadata.json files that are uploaded to the Azure Storage BLOB container as part of the 'AzureSearch.SharePointOnline.Connector' run need to be written back to which fields in the Azure Search index. 516 | 517 | 518 | - Locate the following section within the: 519 | **AzureSearch.SharePointOnline.Connector/SearchDefinitions/blobSkillset.json file.** 520 | 521 | ```json 522 | { 523 | "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill", 524 | "description": "Our SharePoint Metadata mapping custom skill", 525 | "uri": "[CustomSpoMetadataSkillUri]", 526 | "batchSize": 1, 527 | "context": "/document", 528 | "httpHeaders": { 529 | "SPOMetadataMapper-Api-Key": "[SPOMetadataMapper-Api-Key]" 530 | }, 531 | "inputs": [ 532 | { 533 | "name": "docpath", 534 | "source": "/document/blobUri" 535 | }, 536 | { 537 | "name": "sastoken", 538 | "source": "/document/metadata_storage_sas_token" 539 | }, 540 | { 541 | "name": "metadataurl", 542 | "source": "/document/metadataurl" 543 | } 544 | 545 | ], 546 | "outputs": [ 547 | { 548 | "name": "tags", 549 | "targetName": "tags" 550 | }, 551 | { 552 | "name": "acls", 553 | "targetName": "acls" 554 | }, 555 | { 556 | "name": "createdAuthorDisplayName", 557 | "targetName": "CreatedAuthorDisplayName" 558 | }, 559 | { 560 | "name": "SPWebUrl", 561 | "targetName": "SPWebUrl" 562 | }, 563 | { 564 | "name": "LinkFilename", 565 | "targetName": "LinkFilename" 566 | }, 567 | { 568 | "name": "ContentType", 569 | "targetName": "ContentType" 570 | } 571 | ] 572 | } 573 | ``` 574 | 575 | **Important:**
576 | **inputs** are field names from the Azure Search BLOB indexer - never remove the following: 577 | ```json 578 | "inputs": [ 579 | { 580 | "name": "docpath", 581 | "source": "/document/blobUri" 582 | }, 583 | { 584 | "name": "sastoken", 585 | "source": "/document/metadata_storage_sas_token" 586 | }, 587 | { 588 | "name": "metadataurl", 589 | "source": "/document/metadataurl" 590 | } ] 591 | ``` 592 | 593 | **outputs** map back to the index Field names. 594 | 595 |
596 | 597 | 598 | 599 | ## Synonym Map Definition 600 | 601 | Lastly we have a sample Synonym Map file which you can tweak as per the following [article](https://docs.microsoft.com/en-us/rest/api/searchservice/create-synonym-map). 602 | 603 | We have provided a sample synonym map file for you inside: 604 | AzureSearch.SharePointOnline.Connector/SearchDefinitions/blobSynonymMap.json. 605 | 606 | **Important:** 607 | Do not modify the name attribute of the below definition. 608 | 609 | ```json 610 | { 611 | "name": "[SynonymMapName]", 612 | "format": "solr", 613 | "synonyms": "MS => Microsoft\nAzure,cloud,nube\nvirtual machine,máquina virtual,vm\nDocker,containers,contenedores" 614 | } 615 | ``` 616 | 617 | 618 | ## 3. Deploy / Configure AzureSearch.SharepointOnline.Connector.CustomSkills:
619 | 620 | During Azure Cognitive Search indexing - we call out into a custom Skill; which enriches the crawled SharePoint Document Library Attachment data with raw SharePoint Field data. Before we can run the SPOConnector console app - we have to deploy **AzureSearch.SharepointOnline.Connector.CustomSkills** to a public endpoint. 621 | 622 | - Open up AzureSearch.SharepointOnline.Connector.CustomSkills/appSettings.json 623 | 624 | - Update the following properties within the appSettings.json file: 625 | 626 | - **MetadataStorageConnectionString** - set this to the full connection string to the General Purpose Storage account you set up as part of **step 1** above. 627 | 628 | - **EnvironmentConfig.ApiKey** - We use this key as another layer of authorization for the calls to the custom WebAPI from the indexer. You can set this to any value you like - we generate a GUID and set that value in here. Please make sure to save this value as you will need to supply this value inside **AzureSearch.SharePointOnline.Connector/appSettings.json** file later on. 629 | 630 | - **EnvironmentConfig.MappingFile** - This is the location of the index-to-SharePoint field mapping file. We have set this to be configurable in case you want to easily port the hosting of this WebAPI to a container. This generally points to the physical path of where AzureSearch.SharepointOnline.Connector.CustomSkills project lives **/Mapping/metadatatoindexmapping.json** 631 | 632 | Now that we have completed the changes required inside *AzureSearch.SharepointOnline.Connector.CustomSkills/appSettings.json*, it is time to update one more file inside the AzureSearch.SharepointOnline.Connector.CustomSkills project. 
633 | 634 | 635 | - Open up **AzureSearch.SharepointOnline.Connector.CustomSkills/Mapping/metadatatoindexmapping.json** 636 | 637 | 638 | *Sample:* AzureSearch.SharepointOnline.Connector.CustomSkills/Mapping/metadatatoindexmapping.json 639 | 640 | 641 | 642 | ```json 643 | { 644 | "description": "", 645 | "outputMapping": [ 646 | { 647 | "metadataFieldName": "ContentType", 648 | "outputFieldName": "ContentType" 649 | }, 650 | { 651 | "metadataFieldName": "Created", 652 | "outputFieldName": "Created" 653 | }, 654 | { 655 | "metadataFieldName": "Modified", 656 | "outputFieldName": "Modified" 657 | }, 658 | { 659 | "metadataFieldName": "SPWebUrl", 660 | "outputFieldName": "SPWebUrl" 661 | }, 662 | { 663 | "metadataFieldName": "LinkFilename", 664 | "outputFieldName": "LinkFilename" 665 | }, 666 | { 667 | "metadataFieldName": "createdAuthorDisplayName", 668 | "outputFieldName": "createdAuthorDisplayName" 669 | } 670 | ] 671 | } 672 | ``` 673 | **Important:** 674 | You will see a file that looks as per the above structure. This is a mapping file which describes which SharePoint FieldName you want mapped to what Azure Cognitive Search Index Field name. 675 | 676 | **metadataFieldName** is the name of the SharePoint Field that is stored as part of the metadata json file during crawling. 677 | 678 | **outputFieldName** is the name of the Azure Cognitive Search index that you want to map the MetadataFieldName to. Make sure that the outputFieldName is a valid field inside Azure Cognitive Search Index definition located in: 679 | 680 | AzureSearch.SharePointOnline.Connector/SearchDefinitions/blobIndexer.json 681 | 682 | and 683 | 684 | AzureSearch.SharePointOnline.Connector/SearchDefinitions/blobIndex.json 685 | 686 | 687 | - Now you're ready to deploy the AzureSearch.SharepointOnline.Connector.CustomSkills WebAPI to the Azure WebApp you built as part of Step 1 above. 
688 | 689 | - Grab the endpoint URL of the deployed WebAPi as you will need to place this URL inside AzureSearch.SharePointOnline.Connector/appSettings.json as per the below next set of steps. 690 | 691 | 692 | 693 | ## AzureSearch.SharePointOnline.Connector Configuration Settings 694 | 695 | Within the AzureSearch.SharePointOnline.Connector project you will need to modify the following configuration files: 696 | 697 | IMPORTANT: The below sample currently uses local config files. Please note that best practice for storing secrets is to keep them outside of configuration files. We will update this code to use Azure KeyVault - it is highly recommended that if you use this solution that you decouple the secrets from the below config files and make use of Azure KeyVault. A good sample on how to do this is [here](https://github.com/Azure-Samples/azure-search-knowledge-mining/blob/master/workshops/Appendix/KeyVault.md). 698 | 699 | *AzureSearch.SharePointOnline.Connector/appSettings.json* 700 | 701 | ```json 702 | { 703 | "ConnectionStrings": { 704 | "AADDetails": { 705 | "applicationId": "", 706 | "applicationSecret": "", 707 | "tenantId": "", 708 | "redirectUri": "https://microsoft.com", 709 | "domain": "sometenant.onmicrosoft.com" 710 | }, 711 | "SearchDetails": { 712 | "name": "YOUR_SEARCH_NAME", 713 | "adminKey": "", 714 | "indexName": "demo-index", 715 | "blobDataSourceName": "blob-datasource", 716 | "blobSynonymMapName": "blob-synonymmap", 717 | "blobSkillsetName": "demo-skillset", 718 | "blobIndexerName": "demo-indexer", 719 | "cognitiveAccount": "/subscriptions/REPLACE_SUBSCRIPTION_GUID/resourceGroups/REPLACE_RESOURCE_GROUP_NAME/providers/Microsoft.CognitiveServices/accounts/REPLACE_COGNITIVESERVICE_NAME/", 720 | "cognitiveKey": "", 721 | "customSpoMetadataSkillUri": "https://REPLACE_CUSTOM_SPOMETADATAHOST.azurewebsites.net/api/customskills/MergeSharePointMetadata", 722 | "SPOMetadataMapper-Api-Key": "THISNEEDSTOMATCH_GUID_FROM_SPOMETADATA_WEBAPI_APPSETTINGS" 
723 | }, 724 | "StorageDetails": { 725 | "storageAccountName": "YOUR_SEARCH_NAME", 726 | "storageAccountKey": "", 727 | "storageBlobContainerName": "spocontent", 728 | "storageTableName": "spoIncrementalCrawlerTokens", 729 | "spoItemStorageTableName": "spoItems" 730 | }, 731 | "SPODetails": { 732 | "spoHostName": "somespohost.sharepoint.com", 733 | "siteUrl": "/", 734 | "metadataJSONStore": true, 735 | "metadataFieldsToIgnore": [ 736 | "@odata.context", 737 | "@odata.id", 738 | "FileLeafRef", 739 | "@odata.etag", 740 | "LinkFilenameNoMenu", 741 | "DocIcon", 742 | "FolderChildCount", 743 | "_UIVersionString", 744 | "ParentVersionStringLookupId", 745 | "ParentLeafNameLookupId", 746 | "responseHeaders", 747 | "statusCode", 748 | "_ComplianceFlags", 749 | "_ComplianceTag", 750 | "_ComplianceTagWrittenTime", 751 | "_ComplianceTagUserId", 752 | "_CommentCount", 753 | "_LikeCount", 754 | "ItemChildCount", 755 | "Edit", 756 | "_CheckinComment" 757 | ], 758 | "docLibExclusions": [ 759 | ] 760 | } 761 | }, 762 | "Logging": { 763 | "key": "APPINSIGHTS_KEY", 764 | "LogLevel": { 765 | "Default": "Warning" 766 | } 767 | } 768 | } 769 | ``` 770 | 771 | 772 | Most of the properties in the above JSON file are self explanatory. 773 | 774 | For the **AADDetails** section fill in the details from your newly generated AAD App Registration you performed as part of step 2 above. 775 | 776 | 777 | For **SearchDetails** fill in the details from your newly stood up Azure Cognitive Search instance and the cogntive Service. 
778 | 779 | Make sure to paste the same value you had generated and entered into **'AzureSearch.SharepointOnline.Connector.CustomSkills/appsettings.json' EnvironmentConfig.ApiKey** back into 780 | 781 | * SearchDetails.SPOMetadataMapper-Api-Key 782 | 783 | and 784 | 785 | Make sure that you place the public URI path to the AzureSearch.SharepointOnline.Connector.CustomSkills WebAPI inside: 786 | 787 | * SearchDetails.customSpoMetadataSkillUri property 788 | 789 | inside *AzureSearch.SharePointOnline.Connector/appSettings.json* 790 | 791 | 792 | 793 | # Run 794 | 795 | You now have all the components and config setup. 796 | 797 | You can now build AzureSearch.SharePointOnline.Connector Project and run a crawl against your SPOSite/s. 798 | 799 | ``` Usage example 800 | AzureSearch.SharePointConnector.exe [-fullcrawl | -incrementalcrawl] 801 | ``` 802 | 803 | TODO: Add in a GIF Animation of a working AzureSearch.SharePointOnline.Connector Console App as a demo. 804 | 805 | # Contribute 806 | 807 | * Keep Master branch deployable; create new branches for new features / bug fixes and merge them into Master via Pull Requests when they’re completed. 808 | * Raise feature requests / bugs as issues. --------------------------------------------------------------------------------