14 | using System.Data.SqlClient;
15 | using Azure.Identity;
16 | using Azure.Core;
17 | using System.Data;
18 |
19 | var DefaultAzureCredentialOptions = new DefaultAzureCredentialOptions
20 | {
21 | ExcludeAzureCliCredential = true,
22 | ExcludeManagedIdentityCredential = true,
23 | ExcludeSharedTokenCacheCredential = true,
24 | ExcludeVisualStudioCredential = false,
25 | ExcludeAzurePowerShellCredential = true,
26 | ExcludeEnvironmentCredential = true,
27 | ExcludeVisualStudioCodeCredential = true,
28 | ExcludeInteractiveBrowserCredential = true
29 | };
30 |
31 | var accessToken = new DefaultAzureCredential(DefaultAzureCredentialOptions).GetToken(new TokenRequestContext(new string[] { "https://database.windows.net//.default" }));
32 | var sqlServer = "fkm4vwf6l6zebg4lqrhbtdcmsq-absyvg6llsuutcc3wwyid37nou.datawarehouse.pbidedicated.windows.net";
33 | var sqlDatabase = "";
34 | var connectionString = $"Server={sqlServer};Database={sqlDatabase}";
35 |
36 | //Set AAD Access Token, Open Connection, Run Queries and Disconnect
37 | using var con = new SqlConnection(connectionString);
38 | con.AccessToken = accessToken.Token;
39 | con.Open();
40 | using var cmd = new SqlCommand();
41 | cmd.Connection = con;
42 | cmd.CommandType = CommandType.Text;
43 | cmd.CommandText = "SELECT @@Version";
44 | var res =cmd.ExecuteScalar();
45 | con.Close();
46 |
47 | Console.WriteLine(res);
48 |
49 |
50 |
--------------------------------------------------------------------------------
/07_SharePoint/README.md:
--------------------------------------------------------------------------------
1 | ## 07_SharePoint
2 | Example Fabric Notebook with SharePoint integration using an AAD Service Principal and the Graph API.
3 | This allows Notebooks to seamlessly download files and folders from SharePoint.
4 |
5 | What you will need
6 | - ClientID and Secret for the Service Principal (details below). Our sample assumes that the Secret is in Keyvault.
7 | - TenantID for AAD where the app registration has been added.
8 | - A Sharepoint Site, Library, optional Folder and some sample files
9 | - A Fabric Workspace with Notebooks
10 |
11 | **Pre-Requisite**
12 | You will need to create the service principal and assign Sharepoint permissions to the service principal for the target site.
13 | This process is very well documented in the blog here
14 | https://sposcripts.com/download-files-from-sharepoint-using-graph/
15 |
16 | Sample Code below to download files, including wildcard support
17 |
18 | ConnectionSettings = '{"library": "Unittest", "tenant_id":"xxxxxxxx-xxxx--xxxx-xxxxxxxxxxxx","app_client_id":"app-fabricdw-dev-clientid","app_client_secret":"app-fabricdw-dev-clientsecret","keyvault":"kv-fabric-dev","sharepoint_url":"prodata365.sharepoint.com","site":"Fabric"}'
19 | SourceSettings = '{}'
20 | SourceDirectory = 'tst'
21 | TargetDirectory = 'unittest/AW/tst'
22 | SourceObject = '*.xlsx'
23 | TargetFileName = ''
24 | TargetSettings = ''
25 |
26 | from builtin.sharepoint import Sharepoint,AuthToken
27 | import pandas
28 | import json
29 | from os.path import join
30 | from pathlib import Path
31 |
32 | SourceSettings = SourceSettings or '{}'
33 | ConnectionSettings = ConnectionSettings or '{}'
34 | source_connection_options = json.loads(ConnectionSettings)
35 | source_options = json.loads(SourceSettings)
36 |
37 | auth_token = AuthToken(**source_connection_options)
38 | sharepoint = Sharepoint(auth_token, folder=SourceDirectory, file=SourceObject, **source_options, **source_connection_options)
39 |
40 | files = sharepoint.get_file_bytes()
41 |
42 | for file_name, file_bytes in files.items():
43 | Path(join("/lakehouse/default/Files/",TargetDirectory)).mkdir(parents=True, exist_ok=True)
44 |
45 | with open(join("/lakehouse/default/Files/",TargetDirectory,file_name), "wb") as f:
46 | f.write(file_bytes.getbuffer())
47 |
48 |
49 |
--------------------------------------------------------------------------------
/03_CopyBlobOneLake/README.md:
--------------------------------------------------------------------------------
1 | ## 03_CopyBlobOneLake
2 | Example of using Azure.Identity with HttpClient to copy a blob on OneLake.
3 | Currently tested mechanisms of authentication are AzurePowerShellCredential; VisualStudioCredential only works so far on older versions of Visual Studio components in VS 2022.
4 |
5 |
6 | Some Warnings:
7 | 1. You need to ensure that ExcludeManagedIdentityCredential is set to True if you are not using Managed Identity.
8 | This avoids timeouts as Azure.Identity always tries Managed Identity first.
9 |
10 | 2. If doing frequent connections you need to consider caching the AccessToken. By default it is valid for an hour, but re-calling the
11 | TokenRequest on each connection request can be a second or so of wasted time.
12 |
13 | 3. Make sure to change the default details in sourceUrl and sinkUrl as these are Workspace specific.
14 |
15 | 4. We are tracking a bug in the Visual Studio DLLs whereby VisualStudioCredential does not work for generating tokens on the latest version, but you can switch to AzureCliCredential and this does work.
16 | We are triaging this and will work with Microsoft Support to confirm if this is a bug, or if VisualStudioCredential has lost some support.
17 |
18 |
19 |
20 | using Azure.Identity;
21 | using Azure.Core;
22 | using System.Net.Http.Headers;
23 |
24 | DefaultAzureCredentialOptions DefaultAzureCredentialOptions = new()
25 | {
26 | ExcludeAzureCliCredential = false,
27 | ExcludeVisualStudioCredential = false,
28 | ExcludeAzurePowerShellCredential = false
29 | };
30 | var defaultAzureCredential= new DefaultAzureCredential(DefaultAzureCredentialOptions);
31 | string bearerToken = defaultAzureCredential.GetToken(new TokenRequestContext(new[] { "https://storage.azure.com/" })).Token;
32 |
33 | HttpClient client = new();
34 |
35 | string rootUrl = "https://onelake.blob.fabric.microsoft.com/";
36 | // CHANGE THESE
37 | string sourceUrl = $"{rootUrl}FabricDW [Dev]/FabricLH.Lakehouse/Files/unittest/AdventureWorks/erp/Account.csv";
38 | string sinkUrl = $"{rootUrl}FabricDW [Dev]/FabricLH.Lakehouse/Files/landing/csv/Account.csv";
39 |
40 | using (var request = new HttpRequestMessage(HttpMethod.Put, $"{sinkUrl}"))
41 | {
42 | request.Headers.Add("X-Ms-Copy-Source", $"{sourceUrl}");
43 | request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", bearerToken);
44 |
45 | var response = client.SendAsync(request);
46 |
47 | response.Wait();
48 | Console.WriteLine(response.Result);
49 | }
50 |
51 |
--------------------------------------------------------------------------------
/04_PauseFabricCapacity/README.md:
--------------------------------------------------------------------------------
1 | ## 04_PauseFabricCapacity
2 | Example of using Azure.Identity with HttpClient to pause (suspend) a Fabric capacity.
3 | Currently tested mechanisms of authentication are AzurePowerShellCredential; VisualStudioCredential only works so far on older versions of Visual Studio components in VS 2022.
4 |
5 |
6 | Some Warnings:
7 | 1. You need to ensure that ExcludeManagedIdentityCredential is set to True if you are not using Managed Identity.
8 | This avoids timeouts as Azure.Identity always tries Managed Identity first.
9 |
10 | 2. If doing frequent connections you need to consider caching the AccessToken. By default it is valid for an hour, but re-calling the
11 | TokenRequest on each connection request can be a second or so of wasted time.
12 |
13 | 3. Make sure to change the SubscriptionId, ResourceGroupName and CapacityName to relevant values.
14 |
15 | 4. We are tracking a bug in the Visual Studio DLLs whereby VisualStudioCredential does not work for generating tokens on the latest version, but you can switch to AzureCliCredential and this does work.
16 | We are triaging this and will work with Microsoft Support to confirm if this is a bug, or if VisualStudioCredential has lost some support.
17 |
18 |
19 |
20 | using Azure.Core;
21 | using Azure.Identity;
22 | using System.Net.Http.Headers;
23 |
24 |
25 | DefaultAzureCredentialOptions DefaultAzureCredentialOptions = new()
26 | {
27 | ExcludeAzureCliCredential = false,
28 | ExcludeVisualStudioCredential = false,
29 | ExcludeAzurePowerShellCredential = false
30 | };
31 | // Fill in specific information here:
32 | string SubscriptionId = "";
33 | string ResourceGroupName = "";
34 | string CapacityName = "";
35 |
36 | string CapacityUrl = $"https://management.azure.com/subscriptions/{SubscriptionId}/resourceGroups/{ResourceGroupName}/providers/Microsoft.Fabric/capacities/{CapacityName}";
37 |
38 | var defaultAzureCredential = new DefaultAzureCredential(DefaultAzureCredentialOptions);
39 | string bearerToken = defaultAzureCredential.GetToken(new TokenRequestContext(new[] { "https://management.azure.com" })).Token;
40 |
41 | HttpClient client = new();
42 |
43 | using (var request = new HttpRequestMessage(HttpMethod.Post, $"{CapacityUrl}/suspend?api-version=2022-07-01-preview"))
44 | {
45 | request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", bearerToken);
46 |
47 | var response = client.SendAsync(request);
48 |
49 | response.Wait();
50 | Console.WriteLine(response.Result);
51 | }
52 |
53 |
--------------------------------------------------------------------------------
/05_ResumeFabricCapacity/README.md:
--------------------------------------------------------------------------------
1 | ## 05_ResumeFabricCapacity
2 | Example of using Azure.Identity with HttpClient to resume a Fabric capacity.
3 | Currently tested mechanisms of authentication are AzurePowerShellCredential; VisualStudioCredential only works so far on older versions of Visual Studio components in VS 2022.
4 |
5 |
6 | Some Warnings:
7 | 1. You need to ensure that ExcludeManagedIdentityCredential is set to True if you are not using Managed Identity.
8 | This avoids timeouts as Azure.Identity always tries Managed Identity first.
9 |
10 | 2. If doing frequent connections you need to consider caching the AccessToken. By default it is valid for an hour, but re-calling the
11 | TokenRequest on each connection request can be a second or so of wasted time.
12 |
13 | 3. Make sure to change the SubscriptionId, ResourceGroupName and CapacityName to relevant values.
14 |
15 | 4. We are tracking a bug in the Visual Studio DLLs whereby VisualStudioCredential does not work for generating tokens on the latest version, but you can switch to AzureCliCredential and this does work.
16 | We are triaging this and will work with Microsoft Support to confirm if this is a bug, or if VisualStudioCredential has lost some support.
17 |
18 |
19 |
20 | using Azure.Core;
21 | using Azure.Identity;
22 | using System.Net.Http.Headers;
23 |
24 |
25 | DefaultAzureCredentialOptions DefaultAzureCredentialOptions = new()
26 | {
27 | ExcludeAzureCliCredential = false,
28 | ExcludeVisualStudioCredential = false,
29 | ExcludeAzurePowerShellCredential = false
30 | };
31 | // Fill in specific information here:
32 | string SubscriptionId = "";
33 | string ResourceGroupName = "";
34 | string CapacityName = "";
35 |
36 | string CapacityUrl = $"https://management.azure.com/subscriptions/{SubscriptionId}/resourceGroups/{ResourceGroupName}/providers/Microsoft.Fabric/capacities/{CapacityName}";
37 |
38 | var defaultAzureCredential = new DefaultAzureCredential(DefaultAzureCredentialOptions);
39 | string bearerToken = defaultAzureCredential.GetToken(new TokenRequestContext(new[] { "https://management.azure.com" })).Token;
40 |
41 | HttpClient client = new();
42 |
43 | using (var request = new HttpRequestMessage(HttpMethod.Post, $"{CapacityUrl}/resume?api-version=2022-07-01-preview"))
44 | {
45 | request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", bearerToken);
46 |
47 | var response = client.SendAsync(request);
48 |
49 | response.Wait();
50 | Console.WriteLine(response.Result);
51 | }
52 |
53 |
--------------------------------------------------------------------------------
/02_PipelineExecute/README.md:
--------------------------------------------------------------------------------
1 | ## 02_PipelineExecute
2 | Example of using Azure.Identity with HttpClient to execute a Fabric Pipeline.
3 | Currently tested mechanisms of authentication are AzurePowerShellCredential; VisualStudioCredential only works so far on older versions of Visual Studio components in VS 2022.
4 |
5 | The root of the url (wabi-north-europe-redirect) may need to be changed depending on your region
6 |
7 | Some Warnings:
8 | 1. You need to ensure that ExcludeManagedIdentityCredential is set to True if you are not using Managed Identity.
9 | This avoids timeouts as Azure.Identity always tries Managed Identity first.
10 |
11 | 2. If doing frequent connections you need to consider caching the AccessToken. By default it is valid for an hour, but re-calling the
12 | TokenRequest on each connection request can be a second or so of wasted time.
13 |
14 | 3. We are tracking a bug in the Visual Studio DLLs whereby VisualStudioCredential does not work for generating tokens on the latest version, but you can switch to AzureCliCredential and this does work.
15 | We are triaging this and will work with Microsoft Support to confirm if this is a bug, or if VisualStudioCredential has lost some support.
16 |
17 |
18 | using Azure.Identity;
19 | using Azure.Core;
20 | using System.Net.Http.Headers;
21 |
22 | var DefaultAzureCredentialOptions = new DefaultAzureCredentialOptions
23 | {
24 | ExcludeAzureCliCredential = true,
25 | ExcludeManagedIdentityCredential = true,
26 | ExcludeSharedTokenCacheCredential = true,
27 | ExcludeVisualStudioCredential = false,
28 | ExcludeAzurePowerShellCredential = false,
29 | ExcludeEnvironmentCredential = true,
30 | ExcludeVisualStudioCodeCredential = true,
31 | ExcludeInteractiveBrowserCredential = true
32 | };
33 |
34 | var accessToken = new DefaultAzureCredential(DefaultAzureCredentialOptions).GetToken(new TokenRequestContext(new[] { "https://analysis.windows.net/powerbi/api/.default" }));
35 | string Token = accessToken.Token.ToString();
36 |
37 | // constructs pipeline url
38 | string pipelineId = "0987f3e1-4f93-46f9-b43b-c53dbbc13c33";
39 | string pipelineUrl = $"https://wabi-north-europe-redirect.analysis.windows.net/metadata/artifacts/{pipelineId}/jobs/Pipeline";
40 |
41 | HttpClient client = new();
42 | using (var request = new HttpRequestMessage(HttpMethod.Post, pipelineUrl))
43 | {
44 | // attaches headers
45 | request.Headers.Add("Accept", "application/json, text/plain, */*");
46 | request.Headers.Add("Accept-Encoding", "gzip, deflate, br");
47 | request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", Token);
48 |
49 | var response = client.SendAsync(request);
50 |
51 | response.Wait();
52 | Console.WriteLine(response.Result.ToString());
53 | }
54 |
55 |
56 |
--------------------------------------------------------------------------------
/11_RestartMirror/Restart-Meta.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"markdown","source":["### Stop and Start Fabric Mirror\n","use this to fix any replication/mirror issues on Meta SQLDB
\n","Documentation:
\n","https://learn.microsoft.com/en-us/fabric/database/sql/start-stop-mirroring-api?tabs=5dot1"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"3887a878-1bc3-4f99-8038-2edf7ea0d89f"},{"cell_type":"code","source":["### STOP MIRROR ###\n","import sempy.fabric as fabric\n","import requests\n","\n","database_name =\"Meta\"\n","workspace_id= fabric.get_workspace_id()\n","items = fabric.FabricRestClient().get(f\"/v1/workspaces/{workspace_id}/SQLDatabases\").json()[\"value\"]\n","database=next((endpoint for endpoint in items if endpoint[\"displayName\"] == database_name))\n","database_id=database[\"id\"]\n","\n","url = f\"v1/workspaces/{workspace_id}/sqlDatabases/{database_id}/stopMirroring\"\n","r = fabric.FabricRestClient().post(url)\n","r.raise_for_status\n","\n","print (f\"Stop Command Sent to {database_name}\")\n","\n"],"outputs":[{"output_type":"display_data","data":{"application/vnd.jupyter.statement-meta+json":{"session_id":"ee2bf30e-3b6f-4be4-b4de-d68ce5294ae9","normalized_state":"finished","queued_time":"2025-12-11T18:55:08.1572122Z","session_start_time":null,"execution_start_time":"2025-12-11T18:55:08.1582251Z","execution_finish_time":"2025-12-11T18:55:09.5744834Z","parent_msg_id":"7d18495e-00b1-4843-8f50-190956bb78bf"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Stop Command Sent to Meta\n"]}],"execution_count":17,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"67210334-1b8e-43aa-baac-c72fdacf5b0b"},{"cell_type":"code","source":["### START MIRROR ###\n","import sempy.fabric as fabric\n","import requests\n","\n","database_name =\"Meta\"\n","workspace_id= fabric.get_workspace_id()\n","items = fabric.FabricRestClient().get(f\"/v1/workspaces/{workspace_id}/SQLDatabases\").json()[\"value\"]\n","database=next((endpoint for endpoint in items if endpoint[\"displayName\"] == 
database_name))\n","database_id=database[\"id\"]\n","\n","url = f\"v1/workspaces/{workspace_id}/sqlDatabases/{database_id}/startMirroring\"\n","r = fabric.FabricRestClient().post(url)\n","r.raise_for_status\n","\n","print (f\"Start Command Sent to {database_name}\")\n","\n"],"outputs":[{"output_type":"display_data","data":{"application/vnd.jupyter.statement-meta+json":{"session_id":"ee2bf30e-3b6f-4be4-b4de-d68ce5294ae9","normalized_state":"finished","queued_time":"2025-12-11T18:56:34.7472151Z","session_start_time":null,"execution_start_time":"2025-12-11T18:56:34.7482046Z","execution_finish_time":"2025-12-11T18:56:36.1537854Z","parent_msg_id":"004bfc60-9b04-4308-952b-7dad4ce5f80d"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Start Command Sent to Meta\n"]}],"execution_count":19,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"07a8724f-b7f5-48db-b7df-2b7d54e95552"}],"metadata":{"kernel_info":{"name":"jupyter","jupyter_kernel_name":"python3.11"},"kernelspec":{"name":"jupyter","display_name":"Jupyter"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"jupyter_python","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"nteract-front-end@1.0.0"},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}},"dependencies":{}},"nbformat":4,"nbformat_minor":5}
--------------------------------------------------------------------------------
/08_Struct/TestStructType7GB.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"code","source":["\n","import os\n","from pyspark.sql.functions import lit, input_file_name, expr\n","from pyspark.sql.types import StructType, StructField, LongType, IntegerType, ByteType, DateType, TimestampType, BooleanType, DecimalType, StringType, ShortType\n","\n","\n","\n","file_path = \"Files/Test/TestCSV4.csv\"\n","\n","table_schema = StructType([\\\n"," StructField(\"GUESTCHECKLINEITEMID\", LongType(), True),\\\n"," StructField(\"ORGANIZATIONID\", ByteType(), True),\\\n"," StructField(\"LOCATIONID\", IntegerType(), True),\\\n"," StructField(\"REVENUECENTERID\", LongType(), True),\\\n"," StructField(\"ORDERTYPEID\", LongType(), True),\\\n"," StructField(\"BUSINESSDATE\", DateType(), True),\\\n"," StructField(\"FIXEDPERIOD\", ByteType(), True),\\\n"," StructField(\"TRANSDATETIME\", TimestampType(), True),\\\n"," StructField(\"POSTRANSREF\", LongType(), True),\\\n"," StructField(\"SERVICEROUNDNUM\", ByteType(), True),\\\n"," StructField(\"LINENUM\", ByteType(), True),\\\n"," StructField(\"SEATNUM\", ByteType(), True),\\\n"," StructField(\"DETAILTYPE\", ByteType(), True),\\\n"," StructField(\"RECORDID\", ByteType(), True),\\\n"," StructField(\"PRICELEVEL\", ByteType(), True),\\\n"," StructField(\"UWSID\", IntegerType(), True),\\\n"," StructField(\"CHECKEMPLOYEEID\", LongType(), True),\\\n"," StructField(\"TRANSEMPLOYEEID\", LongType(), True),\\\n"," StructField(\"MANAGEREMPLOYEEID\", LongType(), True),\\\n"," StructField(\"STATUS\", StringType(), True),\\\n"," StructField(\"BINARYSTATUS\", BooleanType(), True),\\\n"," StructField(\"VOIDFLAG\", BooleanType(), True),\\\n"," StructField(\"GENFLAG1\", BooleanType(), True),\\\n"," StructField(\"REASONCODE\", ByteType(), True),\\\n"," StructField(\"LINECOUNT\", ByteType(), True),\\\n"," StructField(\"LINETOTAL\", DecimalType(), True),\\\n"," StructField(\"REPORTLINECOUNT\", ByteType(), True),\\\n"," StructField(\"REPORTLINETOTAL\", DecimalType(), True),\\\n"," 
StructField(\"REFERENCEINFO\", StringType(), True),\\\n"," StructField(\"MOVEFLAG\", BooleanType(), True),\\\n"," StructField(\"DONOTSHOW\", BooleanType(), True),\\\n"," StructField(\"DAYPARTID\", IntegerType(), True),\\\n"," StructField(\"PRICEOVRDEFLAG\", BooleanType(), True),\\\n"," StructField(\"TAXEXEMPTFLAG\", BooleanType(), True),\\\n"," StructField(\"ERRORCORRECTFLAG\", BooleanType(), True),\\\n"," StructField(\"REASONCODEID\", LongType(), True),\\\n"," StructField(\"TAX1TOTAL\", DecimalType(), True),\\\n"," StructField(\"MAJORGROUPID\", LongType(), True),\\\n"," StructField(\"FAMILYGROUPID\", LongType(), True),\\\n"," StructField(\"DTLID\", ByteType(), True),\\\n"," StructField(\"ACTIVETAXES\", StringType(), True),\\\n"," StructField(\"ADJUSTDATETIME\", TimestampType(), True),\\\n"," StructField(\"TAX1POSREF\", StringType(), True)\\\n"," ])\n","\n","\n","df = spark.read.format(\"csv\").schema(table_schema).load(file_path)\n","df.write.mode(\"overwrite\").format(\"delta\").saveAsTable(\"testcsv4ss\")\n"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"session_id":"152d1132-70fb-4f54-b05e-9462b3537040","statement_id":4,"state":"finished","livy_statement_state":"available","queued_time":"2024-01-28T15:12:58.2536048Z","session_start_time":null,"execution_start_time":"2024-01-28T15:12:58.8168711Z","execution_finish_time":"2024-01-28T15:19:08.811508Z","parent_msg_id":"e2513c7e-0b5b-4001-8952-67215a7e2a72"},"text/plain":"StatementMeta(, 152d1132-70fb-4f54-b05e-9462b3537040, 4, Finished, Available)"},"metadata":{}}],"execution_count":2,"metadata":{"advisor":{"adviceMetadata":"{\"artifactId\":\"207fa3db-593e-4ee6-874c-d702125d3793\",\"activityId\":\"152d1132-70fb-4f54-b05e-9462b3537040\",\"applicationId\":\"application_1706453980995_0001\",\"jobGroupId\":\"4\",\"advices\":{\"info\":1}}"},"cellStatus":"{\"Bob 
Duffy.\":{\"queued_time\":\"2024-01-28T15:12:58.2536048Z\",\"session_start_time\":null,\"execution_start_time\":\"2024-01-28T15:12:58.8168711Z\",\"execution_finish_time\":\"2024-01-28T15:19:08.811508Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}"},"id":"9e2c44d9-2a30-469d-9d59-aa608bdc6130"}],"metadata":{"language_info":{"name":"python"},"microsoft":{"language":"python","ms_spell_check":{"ms_spell_check_language":"en"}},"widgets":{},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"kernel_info":{"name":"synapse_pyspark"},"nteract":{"version":"nteract-front-end@1.0.0"},"notebook_environment":{},"synapse_widget":{"version":"0.1","state":{}},"save_output":true,"spark_compute":{"compute_id":"/trident/default","session_options":{"enableDebugMode":false,"conf":{}}},"trident":{"lakehouse":{"known_lakehouses":[{"id":"19785e4d-5572-4ced-bfab-f26e7c5de3ce"}],"default_lakehouse":"19785e4d-5572-4ced-bfab-f26e7c5de3ce","default_lakehouse_name":"FabricLH","default_lakehouse_workspace_id":"9b8a6500-5ccb-49a9-885b-b5b081efed75"}}},"nbformat":4,"nbformat_minor":5}
--------------------------------------------------------------------------------
/07_SharePoint/Ingest-SP.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"code","execution_count":1,"id":"37237e3e-00b3-4d8c-9cb4-f2c0cfc4676a","metadata":{"cellStatus":"{\"Bob Duffy.\":{\"queued_time\":\"2023-11-14T11:23:50.8769602Z\",\"session_start_time\":null,\"execution_start_time\":\"2023-11-14T11:24:00.2570934Z\",\"execution_finish_time\":\"2023-11-14T11:24:02.9397984Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}},"tags":["parameters"]},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-11-14T11:24:02.9397984Z","execution_start_time":"2023-11-14T11:24:00.2570934Z","livy_statement_state":"available","parent_msg_id":"e42342a8-18a8-4ea3-8a90-1804d9cf7be9","queued_time":"2023-11-14T11:23:50.8769602Z","session_id":"6e3f0897-7362-45ce-8035-76fa2de6c546","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":3},"text/plain":["StatementMeta(, 6e3f0897-7362-45ce-8035-76fa2de6c546, 3, Finished, Available)"]},"metadata":{},"output_type":"display_data"}],"source":["ConnectionSettings = '{\"library\": \"Unittest\", \"tenant_id\":\"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx\",\"app_client_id\":\"app-fabricdw-dev-clientid\",\"app_client_secret\":\"app-fabricdw-dev-clientsecret\",\"keyvault\":\"kv-fabric-dev\",\"sharepoint_url\":\"prodata365.sharepoint.com\",\"site\":\"Fabric\"}'\n","SourceSettings = '{}'\n","SourceDirectory = 'tst'\n","TargetDirectory = 'unittest/AW/tst'\n","SourceObject = '*.xlsx'\n","TargetFileName = ''\n","TargetSettings = ''"]},{"cell_type":"code","execution_count":2,"id":"7afd78f6-20ac-4207-bf58-27d67caea2f4","metadata":{"cellStatus":"{\"Bob 
Duffy.\":{\"queued_time\":\"2023-11-14T11:23:50.9218891Z\",\"session_start_time\":null,\"execution_start_time\":\"2023-11-14T11:24:03.6553854Z\",\"execution_finish_time\":\"2023-11-14T11:24:06.6519076Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-11-14T11:24:06.6519076Z","execution_start_time":"2023-11-14T11:24:03.6553854Z","livy_statement_state":"available","parent_msg_id":"5b9475e7-c090-4780-ae08-26736733adff","queued_time":"2023-11-14T11:23:50.9218891Z","session_id":"6e3f0897-7362-45ce-8035-76fa2de6c546","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":4},"text/plain":["StatementMeta(, 6e3f0897-7362-45ce-8035-76fa2de6c546, 4, Finished, Available)"]},"metadata":{},"output_type":"display_data"}],"source":["from builtin.sharepoint import Sharepoint,AuthToken\n","import pandas\n","import json\n","from os.path import join\n","from pathlib import Path"]},{"cell_type":"code","execution_count":3,"id":"206c97e3-4b90-473a-88b2-47f3c3bb6f38","metadata":{"cellStatus":"{\"Bob 
Duffy.\":{\"queued_time\":\"2023-11-14T11:23:50.9837361Z\",\"session_start_time\":null,\"execution_start_time\":\"2023-11-14T11:24:07.3145173Z\",\"execution_finish_time\":\"2023-11-14T11:24:16.5076178Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}"},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-11-14T11:24:16.5076178Z","execution_start_time":"2023-11-14T11:24:07.3145173Z","livy_statement_state":"available","parent_msg_id":"a8944e64-938f-4f15-a2da-309e92f0555b","queued_time":"2023-11-14T11:23:50.9837361Z","session_id":"6e3f0897-7362-45ce-8035-76fa2de6c546","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":5},"text/plain":["StatementMeta(, 6e3f0897-7362-45ce-8035-76fa2de6c546, 5, Finished, Available)"]},"metadata":{},"output_type":"display_data"}],"source":["SourceSettings = SourceSettings or '{}'\n","ConnectionSettings = ConnectionSettings or '{}'\n","source_connection_options = json.loads(ConnectionSettings)\n","source_options = json.loads(SourceSettings)\n","\n","auth_token = AuthToken(**source_connection_options)\n","sharepoint = Sharepoint(auth_token, folder=SourceDirectory, file=SourceObject, **source_options, **source_connection_options)\n","\n","files = sharepoint.get_file_bytes()\n","\n","for file_name, file_bytes in files.items():\n"," Path(join(\"/lakehouse/default/Files/\",TargetDirectory)).mkdir(parents=True, exist_ok=True)\n","\n"," with open(join(\"/lakehouse/default/Files/\",TargetDirectory,file_name), \"wb\") as f:\n"," f.write(file_bytes.getbuffer())"]}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"display_name":"Synapse 
PySpark","language":"Python","name":"synapse_pyspark"},"language_info":{"name":"python"},"microsoft":{"host":{},"language":"python","ms_spell_check":{"ms_spell_check_language":"en"}},"notebook_environment":{},"nteract":{"version":"nteract-front-end@1.0.0"},"save_output":true,"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{},"enableDebugMode":false}},"synapse_widget":{"state":{},"version":"0.1"},"trident":{"lakehouse":{"default_lakehouse":"19785e4d-5572-4ced-bfab-f26e7c5de3ce","default_lakehouse_name":"FabricLH","default_lakehouse_workspace_id":"9b8a6500-5ccb-49a9-885b-b5b081efed75","known_lakehouses":[{"id":"19785e4d-5572-4ced-bfab-f26e7c5de3ce"}]}},"widgets":{}},"nbformat":4,"nbformat_minor":5}
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.rsuser
8 | *.suo
9 | *.user
10 | *.userosscache
11 | *.sln.docstates
12 |
13 | # User-specific files (MonoDevelop/Xamarin Studio)
14 | *.userprefs
15 |
16 | # Mono auto generated files
17 | mono_crash.*
18 |
19 | # Build results
20 | [Dd]ebug/
21 | [Dd]ebugPublic/
22 | [Rr]elease/
23 | [Rr]eleases/
24 | x64/
25 | x86/
26 | [Ww][Ii][Nn]32/
27 | [Aa][Rr][Mm]/
28 | [Aa][Rr][Mm]64/
29 | bld/
30 | [Bb]in/
31 | [Oo]bj/
32 | [Ll]og/
33 | [Ll]ogs/
34 |
35 | # Visual Studio 2015/2017 cache/options directory
36 | .vs/
37 | # Uncomment if you have tasks that create the project's static files in wwwroot
38 | #wwwroot/
39 |
40 | # Visual Studio 2017 auto generated files
41 | Generated\ Files/
42 |
43 | # MSTest test Results
44 | [Tt]est[Rr]esult*/
45 | [Bb]uild[Ll]og.*
46 |
47 | # NUnit
48 | *.VisualState.xml
49 | TestResult.xml
50 | nunit-*.xml
51 |
52 | # Build Results of an ATL Project
53 | [Dd]ebugPS/
54 | [Rr]eleasePS/
55 | dlldata.c
56 |
57 | # Benchmark Results
58 | BenchmarkDotNet.Artifacts/
59 |
60 | # .NET Core
61 | project.lock.json
62 | project.fragment.lock.json
63 | artifacts/
64 |
65 | # ASP.NET Scaffolding
66 | ScaffoldingReadMe.txt
67 |
68 | # StyleCop
69 | StyleCopReport.xml
70 |
71 | # Files built by Visual Studio
72 | *_i.c
73 | *_p.c
74 | *_h.h
75 | *.ilk
76 | *.meta
77 | *.obj
78 | *.iobj
79 | *.pch
80 | *.pdb
81 | *.ipdb
82 | *.pgc
83 | *.pgd
84 | *.rsp
85 | *.sbr
86 | *.tlb
87 | *.tli
88 | *.tlh
89 | *.tmp
90 | *.tmp_proj
91 | *_wpftmp.csproj
92 | *.log
93 | *.tlog
94 | *.vspscc
95 | *.vssscc
96 | .builds
97 | *.pidb
98 | *.svclog
99 | *.scc
100 |
101 | # Chutzpah Test files
102 | _Chutzpah*
103 |
104 | # Visual C++ cache files
105 | ipch/
106 | *.aps
107 | *.ncb
108 | *.opendb
109 | *.opensdf
110 | *.sdf
111 | *.cachefile
112 | *.VC.db
113 | *.VC.VC.opendb
114 |
115 | # Visual Studio profiler
116 | *.psess
117 | *.vsp
118 | *.vspx
119 | *.sap
120 |
121 | # Visual Studio Trace Files
122 | *.e2e
123 |
124 | # TFS 2012 Local Workspace
125 | $tf/
126 |
127 | # Guidance Automation Toolkit
128 | *.gpState
129 |
130 | # ReSharper is a .NET coding add-in
131 | _ReSharper*/
132 | *.[Rr]e[Ss]harper
133 | *.DotSettings.user
134 |
135 | # TeamCity is a build add-in
136 | _TeamCity*
137 |
138 | # DotCover is a Code Coverage Tool
139 | *.dotCover
140 |
141 | # AxoCover is a Code Coverage Tool
142 | .axoCover/*
143 | !.axoCover/settings.json
144 |
145 | # Coverlet is a free, cross platform Code Coverage Tool
146 | coverage*.json
147 | coverage*.xml
148 | coverage*.info
149 |
150 | # Visual Studio code coverage results
151 | *.coverage
152 | *.coveragexml
153 |
154 | # NCrunch
155 | _NCrunch_*
156 | .*crunch*.local.xml
157 | nCrunchTemp_*
158 |
159 | # MightyMoose
160 | *.mm.*
161 | AutoTest.Net/
162 |
163 | # Web workbench (sass)
164 | .sass-cache/
165 |
166 | # Installshield output folder
167 | [Ee]xpress/
168 |
169 | # DocProject is a documentation generator add-in
170 | DocProject/buildhelp/
171 | DocProject/Help/*.HxT
172 | DocProject/Help/*.HxC
173 | DocProject/Help/*.hhc
174 | DocProject/Help/*.hhk
175 | DocProject/Help/*.hhp
176 | DocProject/Help/Html2
177 | DocProject/Help/html
178 |
179 | # Click-Once directory
180 | publish/
181 |
182 | # Publish Web Output
183 | *.[Pp]ublish.xml
184 | *.azurePubxml
185 | # Note: Comment the next line if you want to checkin your web deploy settings,
186 | # but database connection strings (with potential passwords) will be unencrypted
187 | *.pubxml
188 | *.publishproj
189 |
190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
191 | # checkin your Azure Web App publish settings, but sensitive information contained
192 | # in these scripts will be unencrypted
193 | PublishScripts/
194 |
195 | # NuGet Packages
196 | *.nupkg
197 | # NuGet Symbol Packages
198 | *.snupkg
199 | # The packages folder can be ignored because of Package Restore
200 | **/[Pp]ackages/*
201 | # except build/, which is used as an MSBuild target.
202 | !**/[Pp]ackages/build/
203 | # Uncomment if necessary however generally it will be regenerated when needed
204 | #!**/[Pp]ackages/repositories.config
205 | # NuGet v3's project.json files produces more ignorable files
206 | *.nuget.props
207 | *.nuget.targets
208 |
209 | # Microsoft Azure Build Output
210 | csx/
211 | *.build.csdef
212 |
213 | # Microsoft Azure Emulator
214 | ecf/
215 | rcf/
216 |
217 | # Windows Store app package directories and files
218 | AppPackages/
219 | BundleArtifacts/
220 | Package.StoreAssociation.xml
221 | _pkginfo.txt
222 | *.appx
223 | *.appxbundle
224 | *.appxupload
225 |
226 | # Visual Studio cache files
227 | # files ending in .cache can be ignored
228 | *.[Cc]ache
229 | # but keep track of directories ending in .cache
230 | !?*.[Cc]ache/
231 |
232 | # Others
233 | ClientBin/
234 | ~$*
235 | *~
236 | *.dbmdl
237 | *.dbproj.schemaview
238 | *.jfm
239 | *.pfx
240 | *.publishsettings
241 | orleans.codegen.cs
242 |
243 | # Including strong name files can present a security risk
244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
245 | #*.snk
246 |
247 | # Since there are multiple workflows, uncomment next line to ignore bower_components
248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
249 | #bower_components/
250 |
251 | # RIA/Silverlight projects
252 | Generated_Code/
253 |
254 | # Backup & report files from converting an old project file
255 | # to a newer Visual Studio version. Backup files are not needed,
256 | # because we have git ;-)
257 | _UpgradeReport_Files/
258 | Backup*/
259 | UpgradeLog*.XML
260 | UpgradeLog*.htm
261 | ServiceFabricBackup/
262 | *.rptproj.bak
263 |
264 | # SQL Server files
265 | *.mdf
266 | *.ldf
267 | *.ndf
268 |
269 | # Business Intelligence projects
270 | *.rdl.data
271 | *.bim.layout
272 | *.bim_*.settings
273 | *.rptproj.rsuser
274 | *- [Bb]ackup.rdl
275 | *- [Bb]ackup ([0-9]).rdl
276 | *- [Bb]ackup ([0-9][0-9]).rdl
277 |
278 | # Microsoft Fakes
279 | FakesAssemblies/
280 |
281 | # GhostDoc plugin setting file
282 | *.GhostDoc.xml
283 |
284 | # Node.js Tools for Visual Studio
285 | .ntvs_analysis.dat
286 | node_modules/
287 |
288 | # Visual Studio 6 build log
289 | *.plg
290 |
291 | # Visual Studio 6 workspace options file
292 | *.opt
293 |
294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
295 | *.vbw
296 |
297 | # Visual Studio 6 auto-generated project file (contains which files were open etc.)
298 | *.vbp
299 |
300 | # Visual Studio 6 workspace and project file (working project files containing files to include in project)
301 | *.dsw
302 | *.dsp
303 |
304 | # Visual Studio 6 technical files
305 | *.ncb
306 | *.aps
307 |
308 | # Visual Studio LightSwitch build output
309 | **/*.HTMLClient/GeneratedArtifacts
310 | **/*.DesktopClient/GeneratedArtifacts
311 | **/*.DesktopClient/ModelManifest.xml
312 | **/*.Server/GeneratedArtifacts
313 | **/*.Server/ModelManifest.xml
314 | _Pvt_Extensions
315 |
316 | # Paket dependency manager
317 | .paket/paket.exe
318 | paket-files/
319 |
320 | # FAKE - F# Make
321 | .fake/
322 |
323 | # CodeRush personal settings
324 | .cr/personal
325 |
326 | # Python Tools for Visual Studio (PTVS)
327 | __pycache__/
328 | *.pyc
329 |
330 | # Cake - Uncomment if you are using it
331 | # tools/**
332 | # !tools/packages.config
333 |
334 | # Tabs Studio
335 | *.tss
336 |
337 | # Telerik's JustMock configuration file
338 | *.jmconfig
339 |
340 | # BizTalk build output
341 | *.btp.cs
342 | *.btm.cs
343 | *.odx.cs
344 | *.xsd.cs
345 |
346 | # OpenCover UI analysis results
347 | OpenCover/
348 |
349 | # Azure Stream Analytics local run output
350 | ASALocalRun/
351 |
352 | # MSBuild Binary and Structured Log
353 | *.binlog
354 |
355 | # NVidia Nsight GPU debugger configuration file
356 | *.nvuser
357 |
358 | # MFractors (Xamarin productivity tool) working folder
359 | .mfractor/
360 |
361 | # Local History for Visual Studio
362 | .localhistory/
363 |
364 | # Visual Studio History (VSHistory) files
365 | .vshistory/
366 |
367 | # BeatPulse healthcheck temp database
368 | healthchecksdb
369 |
370 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
371 | MigrationBackup/
372 |
373 | # Ionide (cross platform F# VS Code tools) working folder
374 | .ionide/
375 |
376 | # Fody - auto-generated XML schema
377 | FodyWeavers.xsd
378 |
379 | # VS Code files for those working on multiple tools
380 | .vscode/*
381 | !.vscode/settings.json
382 | !.vscode/tasks.json
383 | !.vscode/launch.json
384 | !.vscode/extensions.json
385 | *.code-workspace
386 |
387 | # Local History for Visual Studio Code
388 | .history/
389 |
390 | # Windows Installer files from build outputs
391 | *.cab
392 | *.msi
393 | *.msix
394 | *.msm
395 | *.msp
396 |
397 | # JetBrains Rider
398 | *.sln.iml
399 |
--------------------------------------------------------------------------------
/07_SharePoint/builtin/sharepoint.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | import requests
3 | import pandas
4 | import fnmatch
5 | from pandas import ExcelFile, DataFrame
6 | try:
7 | from notebookutils import mssparkutils
8 | USE_MSSPARKUTILS = True
9 | except ModuleNotFoundError:
10 | USE_MSSPARKUTILS = False
11 | # while this can be in the except statement - importing modules in an except
12 | # confuses linters -_-
13 | if not USE_MSSPARKUTILS:
14 | from azure.keyvault.secrets import SecretClient
15 | from azure.identity import DefaultAzureCredential
class AuthToken:
    """Class to retrieve token from tenant id, client id, secret and scope.

    The client id/secret may be passed directly, or as Key Vault secret
    names when ``keyvault`` is supplied (resolved via ``mssparkutils``
    inside a Fabric/Synapse notebook, otherwise via the Azure SDK).

    :param str token_url: https://login.microsoftonline.com/
    :param str tenant_id:
    :param str app_client_id: The id or keyvault secret name (if keyvault URL provided).
    :param str app_client_secret: The secret or keyvault secret name (if keyvault URL provided).
    :param str scope: https://graph.microsoft.com/
    :param str keyvault: https://{key-vault-name}.vault.azure.net/ or {key-vault-name}.
    """

    # Seconds before the token request is abandoned. BUGFIX: the original
    # posted with no timeout and could hang forever; the sibling
    # pbi_refresh module already applies the same guard.
    REQUEST_TIMEOUT = 10

    def __init__(
        self,
        tenant_id,
        app_client_id,
        app_client_secret,
        scope="https://graph.microsoft.com/",
        keyvault: str | None = None,
        token_url: str | None = "https://login.microsoftonline.com/",
        **args
    ):
        self.access_token = None
        # Build the Azure AD token endpoint for this tenant.
        if token_url is not None:
            self.token_url = f"{token_url}{tenant_id}/oauth2/token"
        else:
            self.token_url = (
                f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
            )
        if keyvault:
            # A bare vault name is expanded to the full vault URL.
            if not keyvault.startswith("https://"):
                keyvault = f"https://{keyvault}.vault.azure.net/"
            if USE_MSSPARKUTILS:
                # Fabric/Synapse notebook runtime helper.
                app_client_id = mssparkutils.credentials.getSecret(
                    keyvault, app_client_id
                )
                app_client_secret = mssparkutils.credentials.getSecret(
                    keyvault, app_client_secret
                )
            else:
                # Anywhere else: Azure SDK with default credential chain.
                secret_client = SecretClient(
                    vault_url=keyvault, credential=DefaultAzureCredential()
                )
                app_client_id = secret_client.get_secret(app_client_id).value
                app_client_secret = secret_client.get_secret(app_client_secret).value

        self.set_token(app_client_id, app_client_secret, scope)

    def set_token(self, client_id, client_secret, scope):
        """Sets the classes token value.

        :param str client_id:
        :param str client_secret:
        :param str scope:
        :raises requests.HTTPError: when AAD rejects the request.
        :return: None
        """
        data = {
            "grant_type": "client_credentials",
            "client_id": client_id,
            "client_secret": client_secret,
            "resource": scope,
        }

        # Client-credentials grant against the AAD v1 token endpoint.
        response = requests.post(
            self.token_url, data=data, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()

        token_data = response.json()
        self.access_token = token_data["access_token"]
class Sharepoint():
    """Downloads files from a SharePoint document library via Microsoft Graph.

    Defaults for site/library/folder/file may be provided at construction
    time; every method also accepts per-call overrides.

    :param AuthToken auth_token: Holder whose ``access_token`` is sent as
        the Bearer credential on every Graph request.
    :param str sharepoint_url: Host name, e.g. ``contoso.sharepoint.com``.
    :param str site: SharePoint site name.
    :param str library: Document library (Graph "drive") name.
    :param str folder: Folder path inside the library.
    :param str file: File name; ``fnmatch`` wildcards are honoured.
    """

    # Seconds before any HTTP call is abandoned. BUGFIX: the original made
    # every request with no timeout and could hang a notebook forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, auth_token : AuthToken, sharepoint_url = None, site=None, library=None, folder=None, file=None,**args):
        self.sharepoint_url = sharepoint_url
        self.site = site
        self.library = library
        self.folder = folder
        self.file = file
        # Bearer header reused on every Graph call.
        self.headers = {"Authorization": f"Bearer {auth_token.access_token}"}

    def get_site_id_by_name(self, sharepoint_url= None, site_name = None):
        """Resolve a site name to its Graph site id.

        :raises ValueError: when the host or site name is missing.
        :return: Graph site id string.
        """
        sharepoint_url = sharepoint_url or self.sharepoint_url
        site_name = site_name or self.site
        if not sharepoint_url:
            raise ValueError("sharepoint_url cannot be None or blank.")
        if not site_name:
            raise ValueError("site_name cannot be None or blank.")

        url = f"https://graph.microsoft.com/v1.0/sites/{sharepoint_url}:/sites/{site_name}?$select=id"

        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            **self.headers
        }

        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        return response.json()["id"]

    def get_drive_id_by_name(self, site_id, library_name=None):
        """Resolve a document-library name to its Graph drive id.

        :raises ValueError: when an argument is missing.
        :raises Exception: when no drive carries the requested name.
        """
        library_name = library_name or self.library
        if (not site_id):
            raise ValueError("site_id cannot be None or blank.")
        if (not library_name):
            raise ValueError("library_name cannot be None or blank.")

        url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives/"

        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            **self.headers
        }

        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        for drive in response.json()["value"]:
            if drive["name"] == library_name:
                return drive["id"]
        raise Exception("Drive name was not found.")

    def get_folder_id_by_name(self,site_id, drive_id, folder_name=None):
        """Resolve a folder path within a drive to its Graph item id.

        :raises ValueError: when an argument is missing.
        """
        folder_name = folder_name or self.folder
        if (not site_id):
            raise ValueError("site_id cannot be None or blank.")
        if (not drive_id):
            raise ValueError("drive_id cannot be None or blank.")
        if (not folder_name):
            raise ValueError("folder_name cannot be None or blank.")
        # BUGFIX: scheme was http:// — Graph requires https, matching the
        # other methods of this class.
        url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives/{drive_id}/items/root:/{folder_name}"

        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            **self.headers
        }

        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        return response.json()["id"]

    def get_file_url_by_name(self, site_id, drive_id, folder_id, file_name=None):
        """Map matching file names to their short-lived download URLs.

        ``file_name`` may contain fnmatch wildcards; the folder's children
        are listed once and filtered locally.

        :raises ValueError: when any argument is missing.
        :return: ``{file name: download URL}``; empty when the folder is 404.
        """
        file_name = file_name or self.file
        if not site_id or not drive_id or not folder_id or not file_name:
            raise ValueError("site_id, drive_id, folder_id, and file_name cannot be None or blank.")

        # BUGFIX: scheme was http:// — Graph requires https.
        url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives/{drive_id}/items/{folder_id}/children"
        headers = {
            **self.headers
        }

        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)

        if response.status_code == 404:
            print(f"File not found: {file_name}")
            return {}  # Return an empty dictionary if no matching files are found
        response.raise_for_status()

        files_returned = {}
        for file in response.json()["value"]:
            if fnmatch.fnmatch(file["name"], file_name):
                files_returned[file["name"]] = file["@microsoft.graph.downloadUrl"]

        return files_returned

    def get_file_bytes(self, sharepoint_url:str | None =None, site_name:str | None=None,\
        library_name:str | None=None, folder_name:str | None=None, file_name:str | None=None) -> dict[str,BytesIO]:
        """Download every matching file into memory.

        :return: ``{file name: BytesIO}``; 404'd files are skipped with a
            console message rather than aborting the batch.
        """
        sharepoint_url = sharepoint_url or self.sharepoint_url
        site_name = site_name or self.site
        library_name = library_name or self.library
        folder_name = folder_name or self.folder
        file_name = file_name or self.file

        # Walk the Graph hierarchy: site -> drive -> folder -> files.
        site_id = self.get_site_id_by_name(sharepoint_url, site_name)
        drive_id = self.get_drive_id_by_name(site_id, library_name)
        folder_id = self.get_folder_id_by_name(site_id, drive_id, folder_name)
        file_urls = self.get_file_url_by_name(site_id, drive_id, folder_id, file_name)

        file_return = {}
        for name, url in file_urls.items():
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code == 404:
                print(f"File not found: {name}")
                continue  # Skip this file if it's not found
            response.raise_for_status()

            file_return[name] = BytesIO(response.content)
        return file_return

    def get_excel_file(self, sharepoint_url:str | None =None, site_name:str | None=None,\
        library_name:str | None=None, folder_name:str | None=None, file_name: str | None = None) -> ExcelFile:
        """Download one Excel workbook and wrap it in ``pandas.ExcelFile``.

        :raises Exception: when the name is blank or contains a wildcard
            (``*`` or ``%``) — exactly one workbook must be addressed.
        """
        sharepoint_url = sharepoint_url or self.sharepoint_url
        site_name = site_name or self.site
        library_name = library_name or self.library
        folder_name = folder_name or self.folder
        file_name = file_name or self.file
        if not file_name: raise Exception("Filename cannot be none.")
        if '*' in file_name or "%" in file_name:
            raise Exception("Wildcard name not supported for excel files.")
        file = self.get_file_bytes(sharepoint_url,site_name,library_name,folder_name, file_name)

        return ExcelFile(file[file_name])
def df_from_excel(excel_file : ExcelFile, sheet_name):
    """Yield ``(sheet name, DataFrame)`` pairs from an open ExcelFile.

    ``sheet_name`` selects what is read:
      * falsy (``None``/``""``) -> the workbook's first sheet only
      * ``"*"``                 -> every sheet in the workbook
      * ``"a,b,c"``             -> each comma-separated sheet name
    """
    if not sheet_name:
        # No selection supplied: read just the first sheet.
        first_sheet = excel_file.sheet_names[0]
        yield (first_sheet, pandas.read_excel(excel_file))
        return
    # Wildcard reads everything; otherwise honour the comma-separated list.
    selected = excel_file.sheet_names if sheet_name == "*" else sheet_name.split(",")
    for sheet in selected:
        yield (sheet, pandas.read_excel(excel_file, sheet_name=sheet))
228 |
if __name__ == "__main__":
    # Manual smoke test: pulls files from a SharePoint folder and lands them
    # under the default Fabric lakehouse Files area.
    import pandas
    import json
    from os.path import join
    from pathlib import Path

    # Environment parameters
    # NOTE(review): this JSON embeds a real-looking tenant id, Key Vault name
    # and SharePoint host. Consider moving these to pipeline parameters or
    # environment variables rather than committing them — confirm with owners.
    SourceConnectionSettings='{"tenant_id":"d8ca992a-5fbe-40b2-9b8b-844e198c4c94","app_client_id":"app-fabricdw-dev-clientid", "app_client_secret":"app-fabricdw-dev-clientsecret","keyvault":"kv-fabric-dev" ,"sharepoint_url":"prodata365.sharepoint.com","site" : "Fabric"}'
    # Source Settings
    SourceSettings = '{"library":"Unittest","sharepoint_url":"prodata365.sharepoint.com","site":"Fabric"}'
    # Pipeline Parameters
    SourceDirectory = "EmptyFolder"  # folder within the library
    SourceObject = "*"  # fnmatch pattern: download everything in the folder
    TargetDirectory = "landing/erp"  # relative path under the lakehouse Files root
    TargetFileName = ""  # unused here; original names are kept on landing

    source_connection_options = json.loads(SourceConnectionSettings)

    source_options = json.loads(SourceSettings)

    # Authenticate (secrets resolved via Key Vault) and bind the defaults.
    auth_token = AuthToken(**source_connection_options)
    sharepoint = Sharepoint(auth_token, folder=SourceDirectory, file=SourceObject, **source_options)

    # Download all matching files into memory as {name: BytesIO}.
    files = sharepoint.get_file_bytes()

    for file_name, file_bytes in files.items():
        # Ensure the landing folder exists before writing each file.
        Path(join("/lakehouse/default/Files/",TargetDirectory)).mkdir(parents=True, exist_ok=True)

        with open(join("/lakehouse/default/Files/",TargetDirectory,file_name), "wb") as f:
            f.write(file_bytes.getbuffer())
259 |
--------------------------------------------------------------------------------
/06_RefreshPowerBIDataset/pbi_refresh.py:
--------------------------------------------------------------------------------
1 | """Tools used to get workspace, dataset names and refresh datasets.
2 |
3 | Power BI tools to refresh a dataset using client secret authentication.
4 |
5 | Sample use (client id and secret values directly):
6 | tenant_id = "xxxxxxxx-5fbe-40b2-xxxx-xxxx198c4c94" # replace with your azure tenant id
7 | app_client_id = "XXXXXXXX-b37a-41ed-xxxx-xxxx558e66b3"
8 | app_client_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
9 | app_scope = "https://analysis.windows.net/powerbi/api"
10 |     auth_token = AuthToken(tenant_id, app_client_id, app_client_secret, app_scope)
11 |
12 | workspace_name = "FabricDWUnitTests" # choose whichever workspace is applicable
13 | pbi_refresh = PowerBIRefresh(workspace_name, auth_token)
14 |
15 | dataset_name = "SampleDataset"
16 | pbi_refresh.refresh(dataset_name)
17 |
18 | Sample use (keyvault):
19 | tenant_id = "xxxxxxxx-5fbe-40b2-xxxx-xxxx198c4c94" # replace with your azure tenant id
20 | app_client_id_secretname = "fabricDW-app-client-id"
21 | app_client_secret_secretname = "fabricDW-app-client-secret"
22 | auth_token = AuthToken(tenant_id, app_client_id_secretname, app_client_secret_secretname)
23 |
24 | workspace_name = "FabricDWUnitTests" # choose whichever workspace is applicable
25 | pbi_refresh = PowerBIRefresh(workspace_name, auth_token)
26 |
27 | dataset_name = "SampleDataset"
28 | pbi_refresh.refresh(dataset_name)
29 | """
30 | import time
31 | from os.path import join
32 | import requests
33 |
34 | try:
35 | from notebookutils import mssparkutils
36 | USE_MSSPARKUTILS = True
37 | except ModuleNotFoundError:
38 | USE_MSSPARKUTILS = False
39 | # while this can be in the except statement - importing modules in an except
40 | # confuses linters -_-
41 | if not USE_MSSPARKUTILS:
42 | from azure.keyvault.secrets import SecretClient
43 | from azure.identity import DefaultAzureCredential
44 | READ_STATUS_TIMER = 5
45 | REST_TIMEOUT = 10
46 |
47 |
class AuthToken:
    """Fetches an AAD access token via the client-credentials grant.

    The client id/secret may be given directly, or as Key Vault secret
    names when ``keyvault`` is supplied (resolved through ``mssparkutils``
    inside a Fabric notebook, otherwise through the Azure SDK).

    :param str token_url: https://login.microsoftonline.com/
    :param str tenant_id:
    :param str app_client_id: The id or keyvault secret name (if keyvault URL provided).
    :param str app_client_secret: The secret or keyvault secret name (if keyvault URL provided).
    :param str scope: https://analysis.windows.net/powerbi/api
    :param str keyvault_url: https://{key-vault-name}.vault.azure.net/ or {key-vault-name}.
    """

    def __init__(
        self,
        tenant_id,
        app_client_id,
        app_client_secret,
        scope="https://analysis.windows.net/powerbi/api",
        keyvault: str or None = None,
        token_url: str or None = "https://login.microsoftonline.com/",
    ):
        self.access_token = None
        # Default the AAD endpoint when the caller passed None explicitly.
        endpoint = (
            token_url if token_url is not None else "https://login.microsoftonline.com/"
        )
        self.token_url = f"{endpoint}{tenant_id}/oauth2/token"

        if keyvault:
            # A bare vault name is expanded to the full vault URL.
            if not keyvault.startswith("https://"):
                keyvault = f"https://{keyvault}.vault.azure.net/"
            app_client_id, app_client_secret = self._resolve_secrets(
                keyvault, app_client_id, app_client_secret
            )

        self.set_token(app_client_id, app_client_secret, scope)

    @staticmethod
    def _resolve_secrets(vault_url, id_name, secret_name):
        """Resolve the client id and secret values from Key Vault."""
        if USE_MSSPARKUTILS:
            # Fabric/Synapse notebook runtime helper.
            return (
                mssparkutils.credentials.getSecret(vault_url, id_name),
                mssparkutils.credentials.getSecret(vault_url, secret_name),
            )
        # Anywhere else: Azure SDK with the default credential chain.
        client = SecretClient(vault_url=vault_url, credential=DefaultAzureCredential())
        return (
            client.get_secret(id_name).value,
            client.get_secret(secret_name).value,
        )

    def set_token(self, client_id, client_secret, scope):
        """Sets the classes token value.
        :param str client_id:
        :param str client_secret:
        :param str scope:
        :return: None
        """
        payload = {
            "grant_type": "client_credentials",
            "client_id": client_id,
            "client_secret": client_secret,
            "resource": scope,
        }

        # Send POST request to obtain access token.
        response = requests.post(self.token_url, data=payload, timeout=REST_TIMEOUT)

        response.raise_for_status()

        self.access_token = response.json()["access_token"]
116 |
117 |
class PowerBIRefresh:
    """Class of tools to handle Power BI dataset refreshing.

    :param workspace_name: Workspace name, or a list (only the first is used).
    :type workspace_name: str or list
    :param auth_token: An AuthToken instance or a raw bearer-token string.
    :type auth_token: AuthToken or str
    :param base_url: REST root, default ``https://api.powerbi.com/v1.0/myorg/``.
    :type base_url: str or None
    """

    def __init__(
        self,
        workspace_name,
        auth_token: AuthToken or str,
        base_url: str or None = "https://api.powerbi.com/v1.0/myorg/",
    ):
        # Accept either a ready-made bearer string or an AuthToken holder.
        if isinstance(auth_token, str):
            self.headers = {"Authorization": f"Bearer {auth_token}"}
        elif isinstance(auth_token, AuthToken):
            self.headers = {"Authorization": f"Bearer {auth_token.access_token}"}

        self.base_url = base_url

        # Resolve the default workspace id up front (lists use first entry).
        if isinstance(workspace_name, str):
            self.workspace_id = self.get_workspace_id(workspace_name)
        elif isinstance(workspace_name, list):
            self.workspace_id = self.get_workspace_id(workspace_name[0])

    def _url(self, path) -> str:
        """Join ``path`` onto ``base_url`` with exactly one separator.

        BUGFIX: the original used ``os.path.join``, which emits backslashes
        on Windows and is not URL-safe.
        """
        return f"{self.base_url.rstrip('/')}/{path}"

    def get_workspace_id(self, workspace_name) -> str:
        """Returns a workspace id from its display name.

        :param str workspace_name:
        :raises WorkspaceNameNotFoundException:
        :return: Id of workspace.
        :rtype: str
        """
        response = requests.get(
            self._url("groups"), headers=self.headers, timeout=REST_TIMEOUT
        )
        response.raise_for_status()

        for workspace in response.json()["value"]:
            if workspace["name"] == workspace_name:
                self.workspace_id = workspace["id"]
                return self.workspace_id

        raise WorkspaceNameNotFoundException(workspace_name)

    def get_dataset_ids(self, dataset_names, workspace_id=None) -> list:
        """Returns the ids of all refreshable datasets whose names are listed.

        BUGFIX: the original returned from inside the outer loop, so only
        the first dataset reported by the service was ever inspected.

        :param list(str) dataset_names:
        :param workspace_id:
        :type workspace_id: str or None
        :raises DatasetNameNotFoundException: when no listed name matches a
            refreshable dataset.
        :return: list of dataset ids
        :rtype: list
        """
        workspace_id = self.workspace_id if workspace_id is None else workspace_id
        response = requests.get(
            self._url(f"groups/{workspace_id}/datasets"),
            headers=self.headers,
            timeout=REST_TIMEOUT,
        )
        response.raise_for_status()

        dataset_ids = [
            dataset["id"]
            for dataset in response.json()["value"]
            if dataset["name"] in dataset_names and dataset["isRefreshable"] is True
        ]
        if not dataset_ids:
            raise DatasetNameNotFoundException(dataset_names)
        return dataset_ids

    def get_dataset_name(self, dataset_id, workspace_id=None) -> str:
        """Returns a dataset name from its id.

        BUGFIX: the original compared ids with an ``if ...: pass`` no-op and
        returned the first refreshable dataset regardless of id.

        :param str dataset_id:
        :param workspace_id:
        :type workspace_id: str or None
        :raises DatasetNameNotFoundException:
        :return: dataset name
        :rtype: str
        """
        workspace_id = self.workspace_id if workspace_id is None else workspace_id
        response = requests.get(
            self._url(f"groups/{workspace_id}/datasets"),
            headers=self.headers,
            timeout=REST_TIMEOUT,
        )
        response.raise_for_status()

        for dataset in response.json()["value"]:
            if dataset["id"] == dataset_id and dataset["isRefreshable"] is True:
                return dataset["name"]
        raise DatasetNameNotFoundException(dataset_id)

    def refresh_dataset(self, dataset_id, workspace_id=None):
        """Triggers a refresh of one dataset and polls until it finishes.

        :param str dataset_id:
        :param workspace_id:
        :type workspace_id: str or None
        :raises DatasetRefreshFailedException: refresh rejected or failed.
        :raises FailedToGetStatusException: status polling errored too often.
        :return: None
        :rtype: None
        """
        workspace_id = self.workspace_id if workspace_id is None else workspace_id
        response = requests.post(
            self._url(f"groups/{workspace_id}/datasets/{dataset_id}/refreshes"),
            headers=self.headers,
            timeout=REST_TIMEOUT,
        )

        if not response.ok:
            print(
                f"Failed to trigger dataset"
                f"{self.get_dataset_name(dataset_id, workspace_id)} refresh."
            )
            print("Response status code:", response.status_code)
            print("Response content:", response.content)
            response.raise_for_status()
            # Defensive: only reachable if raise_for_status did not raise.
            raise DatasetRefreshFailedException(self, workspace_id, dataset_id)

        print(
            f"Dataset {self.get_dataset_name(dataset_id, workspace_id)} refresh has been triggered successfully."
        )

        error_counter = 0
        error_limit = 5
        while True:
            # BUGFIX: the original only slept inside the error handler, so a
            # healthy in-progress refresh was polled in a tight loop.
            time.sleep(READ_STATUS_TIMER)
            try:
                status = self.get_dataset_refresh_status(dataset_id, workspace_id)
            except requests.HTTPError:
                error_counter += 1
                # BUGFIX: the original's post-loop check used ``>`` against a
                # counter capped at the limit, so it could never fire.
                if error_counter >= error_limit:
                    raise FailedToGetStatusException(
                        workspace_id, dataset_id, error_counter
                    )
                continue

            error_counter = 0  # a successful read resets the error budget
            if status == "Failed":
                raise DatasetRefreshFailedException(self, workspace_id, dataset_id)
            if status == "Completed":
                break

    def get_dataset_refresh_status(self, dataset_id, workspace_id=None) -> str:
        """Gets the refresh status of a dataset by its dataset id.

        :param str dataset_id:
        :param workspace_id:
        :type workspace_id: str or None
        :return: The status of dataset refresh (current or previous).
        :rtype: str
        """
        workspace_id = self.workspace_id if workspace_id is None else workspace_id
        response = requests.get(
            self._url(f"groups/{workspace_id}/datasets/{dataset_id}/refreshes?$top=1"),
            headers=self.headers,
            timeout=REST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()["value"][0]["status"]

    def refresh(
        self,
        dataset_names,
        workspace_names=None,
    ):
        """Invokes refresh of PowerBI Dataset, can be a list of workspaces and datasets or just one.

        :param workspace_names: List or comma separated string of workspace names.
        :type workspace_names: str or list(str) or None
        :param dataset_names: List or comma separated string of dataset names.
        :type dataset_names: str or list(str)
        :raises DatasetNameBlankException: when ``dataset_names`` is None.
        :return: None.
        :rtype: None
        """
        if isinstance(workspace_names, list):
            workspace_list = workspace_names
        elif workspace_names is None:
            # Fall back to the workspace resolved at construction time.
            workspace_list = [self.workspace_id]
        else:
            workspace_list = workspace_names.split(",")

        if dataset_names is None:
            raise DatasetNameBlankException()
        if isinstance(dataset_names, list):
            dataset_list = dataset_names
        else:
            dataset_list = dataset_names.split(",")

        for workspace_name in workspace_list:
            # The default entry is already an id; anything else is a name.
            workspace_id = (
                self.workspace_id
                if workspace_name == self.workspace_id
                else self.get_workspace_id(workspace_name)
            )
            for dataset_id in self.get_dataset_ids(dataset_list, workspace_id):
                self.refresh_dataset(dataset_id, workspace_id)
343 |
344 |
class WorkspaceNameNotFoundException(Exception):
    """Raised when no Power BI workspace matches the requested name."""

    def __init__(self, workspace_name):
        # Format the message and hand it straight to the Exception base.
        super().__init__(f"workspace {workspace_name} Not Found")
351 |
352 |
class DatasetNameNotFoundException(Exception):
    """Raised when no dataset matches the requested name (or id)."""

    def __init__(self, dataset_name):
        # Format the message and hand it straight to the Exception base.
        super().__init__(f"Dataset Name {dataset_name} Not Found")
359 |
360 |
class DatasetNameBlankException(Exception):
    """Raised when a dataset name was required but not supplied."""

    # The message is fixed, so no arguments are accepted.
    def __init__(self):
        super().__init__("Dataset Name cannot be blank")
367 |
368 |
class DatasetRefreshFailedException(Exception):
    """Refresh of dataset was not successful.

    :param pbi_refr_tools: Object exposing ``get_dataset_name(dataset_id)``
        (normally the PowerBIRefresh instance that triggered the refresh).
    :param str workspace_id:
    :param str dataset_id:
    """

    def __init__(self, pbi_refr_tools, workspace_id, dataset_id):
        # NOTE(review): workspace name is not resolved here to avoid another
        # REST round-trip while already handling a failure.
        workspace_name = ""
        # BUGFIX: the original called the nonexistent method
        # ``get_pbi_dataset_name`` (AttributeError at raise time) and omitted
        # the space between the two f-string fragments.
        message = (
            f"Dataset {pbi_refr_tools.get_dataset_name(dataset_id)} ({dataset_id}) "
            + f"in workspace {workspace_name} ({workspace_id}) failed to refresh."
        )
        super().__init__(message)
379 |
380 |
class FailedToGetStatusException(Exception):
    """Raised when refresh-status polling errored too many times in a row."""

    def __init__(self, workspace, dataset, retries):
        # Format the message and hand it straight to the Exception base.
        super().__init__(
            f"Dataset {dataset} in {workspace} failed to get status, after {retries} retries."
        )
387 |
388 |
if __name__ == "__main__":
    # Manual smoke test: fill in the placeholders below, then run the module
    # directly to trigger one dataset refresh.
    TENANT_ID = (
        "xxxxxxxx-5fbe-xxxx-xxxx-xxxxxxxxxxxxx"  # replace with your azure tenant id
    )
    APP_CLIENT_ID = ""  # app registration client id, or Key Vault secret name
    APP_CLIENT_SECRET = ""  # client secret, or Key Vault secret name
    # When keyvault is non-empty, the id/secret above are treated as secret
    # names and resolved from that vault.
    AUTH_TOKEN = AuthToken(
        TENANT_ID, APP_CLIENT_ID, APP_CLIENT_SECRET, keyvault=""
    )

    WORKSPACE_NAME = ""  # choose whichever workspace is applicable
    PBI_REFRESH = PowerBIRefresh(WORKSPACE_NAME, AUTH_TOKEN)
    DATASET_NAME = ""  # dataset display name (or comma-separated list)
    PBI_REFRESH.refresh(DATASET_NAME)
403 |
--------------------------------------------------------------------------------