├── .gitignore
├── ARMTemplate
│   ├── azuredeploy.json
│   ├── azuredeploy.parameters.json
│   └── metadata.json
├── Exercise00-Setup
│   ├── README.20230512.001.old
│   └── README.md
├── Exercise01-Claims
│   └── README.md
├── Exercise02-Observations
│   └── README.md
├── Exercise03-Patients
│   └── README.md
├── Images
│   ├── Forking.gif
│   ├── IngestingGif.gif
│   ├── KQLgif01.gif
│   ├── KQLgif02.gif
│   ├── KQLgif03.gif
│   ├── KQLgif04.gif
│   ├── KQLgif05.gif
│   ├── KQLgif06.gif
│   ├── KQLgif07.gif
│   ├── KQLgif08.gif
│   ├── KQLgif09.gif
│   └── deploytoazure.svg
├── LICENSE
├── README.md
├── SECURITY.md
├── Troubleshooting
│   └── Readme.md
├── artifacts
│   ├── Observation Table Creation.kql
│   ├── Observations Analytics w KQL.kql
│   ├── credential
│   │   └── WorkspaceSystemIdentity.json
│   ├── dataflow
│   │   └── PatientJSON_Flatten_large.json
│   ├── dataset
│   │   ├── ClaimDiagnosisParquetLarge.json
│   │   ├── ClaimDiagnosisSQL.json
│   │   ├── ClaimInsurance.json
│   │   ├── ClaimInsuranceParquetLarge.json
│   │   ├── ClaimProcedureParquetLarge.json
│   │   ├── ClaimProcedureSQL.json
│   │   ├── ObservationMain_LargeParquet.json
│   │   ├── Observation_SQLDS.json
│   │   ├── PatientAddressParquetLarge.json
│   │   ├── PatientAddressSQL.json
│   │   ├── PatientExtensionParquetLarge.json
│   │   ├── PatientIdentifierParquetLarge.json
│   │   ├── PatientIdentifierSQLLarge.json
│   │   ├── PatientRawParquetLarge.json
│   │   ├── Sink_DataPrep_Curated_DS.json
│   │   ├── Sink_DataPrep_DS.json
│   │   ├── Sink_DataPrep_Processed_DS.json
│   │   ├── Source_DataPrep_Curated_DS.json
│   │   ├── Source_DataPrep_DS.json
│   │   └── Source_DataPrep_Processed_DS.json
│   ├── integrationRuntime
│   │   └── AutoResolveIntegrationRuntime.json
│   ├── linkedService
│   │   ├── Source_Dataset_LS.json
│   │   ├── StorageLS.json
│   │   └── SynapseDedicatedPoolLS.json
│   ├── notebook
│   │   ├── ClaimParquetFlatten_Large.json
│   │   ├── Claim_Ingestion_NDJSON2Parquet.json
│   │   ├── Lake Database And Table Creation.json
│   │   ├── ObservationParquetFlatten_Large.json
│   │   ├── Observation_Ingestion_NDJSON2Parquet.json
│   │   └── Patient_Ingestion_NDJSON2Parquet.json
│   ├── pipeline
│   │   ├── Copy_Data_Source_To_Raw_PL.json
│   │   ├── FHIR_Pipeline4Claim_Spark_OC.json
│   │   ├── FHIR_Pipeline4Observation_Spark_OC.json
│   │   └── FHIR_Pipeline4Patient_DataFlow_OC.json
│   ├── publish_config.json
│   └── sqlscript
│       ├── JSON_exploration_w_Serverless_Demo_OC.json
│       ├── Spark DB Exploration Scripts.json
│       └── Table Row Count.json
└── mybigdata
    ├── credential
    │   └── WorkspaceSystemIdentity.json
    ├── integrationRuntime
    │   └── AutoResolveIntegrationRuntime.json
    ├── linkedService
    │   ├── mybigdatademows-WorkspaceDefaultSqlServer.json
    │   └── mybigdatademows-WorkspaceDefaultStorage.json
    └── publish_config.json
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.rsuser
8 | *.suo
9 | *.user
10 | *.userosscache
11 | *.sln.docstates
12 |
13 | # User-specific files (MonoDevelop/Xamarin Studio)
14 | *.userprefs
15 |
16 | # Mono auto generated files
17 | mono_crash.*
18 |
19 | # Build results
20 | [Dd]ebug/
21 | [Dd]ebugPublic/
22 | [Rr]elease/
23 | [Rr]eleases/
24 | x64/
25 | x86/
26 | [Aa][Rr][Mm]/
27 | [Aa][Rr][Mm]64/
28 | bld/
29 | [Bb]in/
30 | [Oo]bj/
31 | [Ll]og/
32 | [Ll]ogs/
33 |
34 | # Visual Studio 2015/2017 cache/options directory
35 | .vs/
36 | # Uncomment if you have tasks that create the project's static files in wwwroot
37 | #wwwroot/
38 |
39 | # Visual Studio 2017 auto generated files
40 | Generated\ Files/
41 |
42 | # MSTest test Results
43 | [Tt]est[Rr]esult*/
44 | [Bb]uild[Ll]og.*
45 |
46 | # NUnit
47 | *.VisualState.xml
48 | TestResult.xml
49 | nunit-*.xml
50 |
51 | # Build Results of an ATL Project
52 | [Dd]ebugPS/
53 | [Rr]eleasePS/
54 | dlldata.c
55 |
56 | # Benchmark Results
57 | BenchmarkDotNet.Artifacts/
58 |
59 | # .NET Core
60 | project.lock.json
61 | project.fragment.lock.json
62 | artifacts/
63 |
64 | # StyleCop
65 | StyleCopReport.xml
66 |
67 | # Files built by Visual Studio
68 | *_i.c
69 | *_p.c
70 | *_h.h
71 | *.ilk
72 | *.meta
73 | *.obj
74 | *.iobj
75 | *.pch
76 | *.pdb
77 | *.ipdb
78 | *.pgc
79 | *.pgd
80 | *.rsp
81 | *.sbr
82 | *.tlb
83 | *.tli
84 | *.tlh
85 | *.tmp
86 | *.tmp_proj
87 | *_wpftmp.csproj
88 | *.log
89 | *.vspscc
90 | *.vssscc
91 | .builds
92 | *.pidb
93 | *.svclog
94 | *.scc
95 |
96 | # Chutzpah Test files
97 | _Chutzpah*
98 |
99 | # Visual C++ cache files
100 | ipch/
101 | *.aps
102 | *.ncb
103 | *.opendb
104 | *.opensdf
105 | *.sdf
106 | *.cachefile
107 | *.VC.db
108 | *.VC.VC.opendb
109 |
110 | # Visual Studio profiler
111 | *.psess
112 | *.vsp
113 | *.vspx
114 | *.sap
115 |
116 | # Visual Studio Trace Files
117 | *.e2e
118 |
119 | # TFS 2012 Local Workspace
120 | $tf/
121 |
122 | # Guidance Automation Toolkit
123 | *.gpState
124 |
125 | # ReSharper is a .NET coding add-in
126 | _ReSharper*/
127 | *.[Rr]e[Ss]harper
128 | *.DotSettings.user
129 |
130 | # TeamCity is a build add-in
131 | _TeamCity*
132 |
133 | # DotCover is a Code Coverage Tool
134 | *.dotCover
135 |
136 | # AxoCover is a Code Coverage Tool
137 | .axoCover/*
138 | !.axoCover/settings.json
139 |
140 | # Visual Studio code coverage results
141 | *.coverage
142 | *.coveragexml
143 |
144 | # NCrunch
145 | _NCrunch_*
146 | .*crunch*.local.xml
147 | nCrunchTemp_*
148 |
149 | # MightyMoose
150 | *.mm.*
151 | AutoTest.Net/
152 |
153 | # Web workbench (sass)
154 | .sass-cache/
155 |
156 | # Installshield output folder
157 | [Ee]xpress/
158 |
159 | # DocProject is a documentation generator add-in
160 | DocProject/buildhelp/
161 | DocProject/Help/*.HxT
162 | DocProject/Help/*.HxC
163 | DocProject/Help/*.hhc
164 | DocProject/Help/*.hhk
165 | DocProject/Help/*.hhp
166 | DocProject/Help/Html2
167 | DocProject/Help/html
168 |
169 | # Click-Once directory
170 | publish/
171 |
172 | # Publish Web Output
173 | *.[Pp]ublish.xml
174 | *.azurePubxml
175 | # Note: Comment the next line if you want to checkin your web deploy settings,
176 | # but database connection strings (with potential passwords) will be unencrypted
177 | *.pubxml
178 | *.publishproj
179 |
180 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
181 | # checkin your Azure Web App publish settings, but sensitive information contained
182 | # in these scripts will be unencrypted
183 | PublishScripts/
184 |
185 | # NuGet Packages
186 | *.nupkg
187 | # NuGet Symbol Packages
188 | *.snupkg
189 | # The packages folder can be ignored because of Package Restore
190 | **/[Pp]ackages/*
191 | # except build/, which is used as an MSBuild target.
192 | !**/[Pp]ackages/build/
193 | # Uncomment if necessary however generally it will be regenerated when needed
194 | #!**/[Pp]ackages/repositories.config
195 | # NuGet v3's project.json files produces more ignorable files
196 | *.nuget.props
197 | *.nuget.targets
198 |
199 | # Microsoft Azure Build Output
200 | csx/
201 | *.build.csdef
202 |
203 | # Microsoft Azure Emulator
204 | ecf/
205 | rcf/
206 |
207 | # Windows Store app package directories and files
208 | AppPackages/
209 | BundleArtifacts/
210 | Package.StoreAssociation.xml
211 | _pkginfo.txt
212 | *.appx
213 | *.appxbundle
214 | *.appxupload
215 |
216 | # Visual Studio cache files
217 | # files ending in .cache can be ignored
218 | *.[Cc]ache
219 | # but keep track of directories ending in .cache
220 | !?*.[Cc]ache/
221 |
222 | # Others
223 | ClientBin/
224 | ~$*
225 | *~
226 | *.dbmdl
227 | *.dbproj.schemaview
228 | *.jfm
229 | *.pfx
230 | *.publishsettings
231 | orleans.codegen.cs
232 |
233 | # Including strong name files can present a security risk
234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
235 | #*.snk
236 |
237 | # Since there are multiple workflows, uncomment next line to ignore bower_components
238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
239 | #bower_components/
240 |
241 | # RIA/Silverlight projects
242 | Generated_Code/
243 |
244 | # Backup & report files from converting an old project file
245 | # to a newer Visual Studio version. Backup files are not needed,
246 | # because we have git ;-)
247 | _UpgradeReport_Files/
248 | Backup*/
249 | UpgradeLog*.XML
250 | UpgradeLog*.htm
251 | ServiceFabricBackup/
252 | *.rptproj.bak
253 |
254 | # SQL Server files
255 | *.mdf
256 | *.ldf
257 | *.ndf
258 |
259 | # Business Intelligence projects
260 | *.rdl.data
261 | *.bim.layout
262 | *.bim_*.settings
263 | *.rptproj.rsuser
264 | *- [Bb]ackup.rdl
265 | *- [Bb]ackup ([0-9]).rdl
266 | *- [Bb]ackup ([0-9][0-9]).rdl
267 |
268 | # Microsoft Fakes
269 | FakesAssemblies/
270 |
271 | # GhostDoc plugin setting file
272 | *.GhostDoc.xml
273 |
274 | # Node.js Tools for Visual Studio
275 | .ntvs_analysis.dat
276 | node_modules/
277 |
278 | # Visual Studio 6 build log
279 | *.plg
280 |
281 | # Visual Studio 6 workspace options file
282 | *.opt
283 |
284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
285 | *.vbw
286 |
287 | # Visual Studio LightSwitch build output
288 | **/*.HTMLClient/GeneratedArtifacts
289 | **/*.DesktopClient/GeneratedArtifacts
290 | **/*.DesktopClient/ModelManifest.xml
291 | **/*.Server/GeneratedArtifacts
292 | **/*.Server/ModelManifest.xml
293 | _Pvt_Extensions
294 |
295 | # Paket dependency manager
296 | .paket/paket.exe
297 | paket-files/
298 |
299 | # FAKE - F# Make
300 | .fake/
301 |
302 | # CodeRush personal settings
303 | .cr/personal
304 |
305 | # Python Tools for Visual Studio (PTVS)
306 | __pycache__/
307 | *.pyc
308 |
309 | # Cake - Uncomment if you are using it
310 | # tools/**
311 | # !tools/packages.config
312 |
313 | # Tabs Studio
314 | *.tss
315 |
316 | # Telerik's JustMock configuration file
317 | *.jmconfig
318 |
319 | # BizTalk build output
320 | *.btp.cs
321 | *.btm.cs
322 | *.odx.cs
323 | *.xsd.cs
324 |
325 | # OpenCover UI analysis results
326 | OpenCover/
327 |
328 | # Azure Stream Analytics local run output
329 | ASALocalRun/
330 |
331 | # MSBuild Binary and Structured Log
332 | *.binlog
333 |
334 | # NVidia Nsight GPU debugger configuration file
335 | *.nvuser
336 |
337 | # MFractors (Xamarin productivity tool) working folder
338 | .mfractor/
339 |
340 | # Local History for Visual Studio
341 | .localhistory/
342 |
343 | # BeatPulse healthcheck temp database
344 | healthchecksdb
345 |
346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
347 | MigrationBackup/
348 |
349 | # Ionide (cross platform F# VS Code tools) working folder
350 | .ionide/
--------------------------------------------------------------------------------
/ARMTemplate/azuredeploy.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
3 | "contentVersion": "1.0.0.0",
4 | "parameters": {
5 | "allowAllConnections": {
6 | "type": "string",
7 | "allowedValues": [
8 | "true",
9 | "false"
10 | ],
11 | "defaultValue": "true"
12 | },
13 | "StorageAcccountName": {
14 | "type": "string",
15 | "defaultValue": "",
16 | "metadata": {
17 | "description": "Name of the StorageAccount"
18 | }
19 | },
20 | "StorageContainerName": {
21 | "type": "string",
22 | "defaultValue": "",
23 | "metadata": {
24 | "description": "Name of the Container in Storage Account"
25 | }
26 | },
27 | "WorkspaceName": {
28 | "type": "string",
29 | "defaultValue": "",
30 | "metadata": {
31 | "description": "Name of the Synapse Workspace"
32 | }
33 | },
34 | "ManagedResourceGroupName": {
35 | "type": "string",
36 | "defaultValue": "",
37 | "metadata": {
38 | "description": "Name of the Managed Resource Group for Synapse"
39 | }
40 | },
41 | "SqlPoolName": {
42 | "type": "string",
43 | "defaultValue": "",
44 | "metadata": {
45 | "description": "Name of the dedicated sql pool"
46 | }
47 | },
48 | "SparkPoolName": {
49 | "type": "string",
50 | "defaultValue": "",
51 | "maxLength": 15,
52 | "metadata": {
53 | "description": "Name of the synapse spark pool"
54 | }
55 | },
56 | "sparkDeployment": {
57 | "type": "string",
58 | "defaultValue": "true",
59 | "allowedValues": [
60 | "true",
61 | "false"
62 | ],
63 | "metadata": {
64 | "description": "'True' deploys an Apache Spark pool as well as a SQL pool. 'False' does not deploy an Apache Spark pool."
65 | }
66 | },
67 | "sparkNodeSize": {
68 | "type": "string",
69 | "defaultValue": "Medium",
70 | "allowedValues": [
71 | "Small",
72 | "Medium",
73 | "Large"
74 | ],
75 | "metadata": {
76 | "description": "This parameter will determine the node size if SparkDeployment is true"
77 | }
78 | },
79 | "sqlAdministratorLogin": {
80 | "type": "string",
81 | "metadata": {
82 | "description": "The username of the SQL Administrator"
83 | }
84 | },
85 | "sqlAdministratorLoginPassword": {
86 | "type": "securestring",
87 | "metadata": {
88 | "description": "The password for the SQL Administrator"
89 | }
90 | },
91 | "sku": {
92 | "type": "string",
93 | "defaultValue": "DW1000c",
94 | "allowedValues": [
95 | "DW100c",
96 | "DW200c",
97 | "DW300c",
98 | "DW400c",
99 | "DW500c",
100 | "DW1000c",
101 | "DW1500c",
102 | "DW2000c",
103 | "DW2500c",
104 | "DW3000c"
105 | ],
106 | "metadata": {
107 | "description": "Select the SKU of the SQL pool."
108 | }
109 | },
110 | "metadataSync": {
111 | "type": "bool",
112 | "defaultValue": false,
113 | "metadata": {
114 | "description": "Choose whether you want to synchronise metadata."
115 | }
116 | },
117 | "githubUsername": {
118 | "type": "string",
119 | "defaultValue": "",
120 | "metadata": {
121 | "description": "Username of your github account hosting synapse workspace resources"
122 | }
123 | }
124 | },
125 | "variables": {
126 | "_artifactsLocation": "[deployment().properties.templatelink.uri]",
127 | "location": "[resourceGroup().location]",
128 | "deploymentType": "synapseworkspacedeployment",
129 | "dlsName": "[toLower(parameters('StorageAcccountName'))]",
130 | "dlsFsName": "[toLower(parameters('StorageContainerName'))]",
131 | "sqlPoolName": "[toLower(parameters('SqlPoolName'))]",
132 | "workspaceName": "[toLower(parameters('WorkspaceName'))]",
133 | "sparkPoolName": "[toLower(parameters('SparkPoolName'))]",
134 | "storageAccountId": "[resourceId('Microsoft.Storage/storageAccounts', variables('dlsName'))]"
135 | },
136 | "resources": [
137 | {
138 | "type": "Microsoft.Storage/storageAccounts",
139 | "apiVersion": "2019-06-01",
140 | "name": "[variables('dlsName')]",
141 | "location": "[variables('location')]",
142 | "sku": {
143 | "name": "Standard_LRS"
144 | },
145 | "kind": "StorageV2",
146 | "properties": {
147 | "accessTier": "Hot",
148 | "supportsHttpsTrafficOnly": true,
149 | "isHnsEnabled": true
150 | },
151 | "resources": [
152 | {
153 | "name": "[concat('default/', variables('dlsFsName'))]",
154 | "type": "blobServices/containers",
155 | "apiVersion": "2019-06-01",
156 | "dependsOn": [
157 | "[variables('dlsName')]"
158 | ],
159 | "properties": {
160 | "publicAccess": "None"
161 | }
162 | }
163 | ]
164 | },
165 | {
166 | "type": "Microsoft.Synapse/workspaces",
167 | "apiVersion": "2021-06-01-preview",
168 | "name": "[variables('workspaceName')]",
169 | "location": "[variables('location')]",
170 | "identity": {
171 | "type": "SystemAssigned"
172 | },
173 | "dependsOn": [
174 | "[variables('dlsName')]",
175 | "[variables('dlsFsName')]"
176 | ],
177 | "properties": {
178 | "defaultDataLakeStorage": {
179 | "accountUrl": "[reference(variables('dlsName')).primaryEndpoints.dfs]",
180 | "filesystem": "[variables('dlsFsName')]"
181 | },
182 | "sqlAdministratorLogin": "[parameters('sqlAdministratorLogin')]",
183 | "sqlAdministratorLoginPassword": "[parameters('sqlAdministratorLoginPassword')]",
184 | "publicNetworkAccess": "Enabled",
185 | "managedResourceGroupName": "[parameters('ManagedResourceGroupName')]",
186 | "workspaceRepositoryConfiguration": {
187 | "type": "WorkspaceGitHubConfiguration",
188 | "hostName": "https://github.com",
189 | "accountName": "[parameters('githubUsername')]",
190 | "repositoryName": "AzureSynapseEndToEndDemo",
191 | "rootFolder": "/artifacts",
192 | "collaborationBranch": "main"
193 | }
194 | },
195 | "resources": [
196 | {
197 | "condition": "[equals(parameters('allowAllConnections'),'true')]",
198 | "type": "firewallrules",
199 | "apiVersion": "2019-06-01-preview",
200 | "name": "allowAll",
201 | "location": "[variables('location')]",
202 | "dependsOn": [ "[variables('workspaceName')]" ],
203 | "properties": {
204 | "startIpAddress": "0.0.0.0",
205 | "endIpAddress": "255.255.255.255"
206 | }
207 | },
208 | {
209 | "type": "firewallrules",
210 | "apiVersion": "2019-06-01-preview",
211 | "name": "AllowAllWindowsAzureIps",
212 | "location": "[variables('location')]",
213 | "dependsOn": [ "[variables('workspaceName')]" ],
214 | "properties": {
215 | "startIpAddress": "0.0.0.0",
216 | "endIpAddress": "0.0.0.0"
217 | }
218 | },
219 | {
220 | "type": "managedIdentitySqlControlSettings",
221 | "apiVersion": "2019-06-01-preview",
222 | "name": "default",
223 | "location": "[variables('location')]",
224 | "dependsOn": [ "[variables('workspaceName')]" ],
225 | "properties": {
226 | "grantSqlControlToManagedIdentity": {
227 | "desiredState": "Enabled"
228 | }
229 | }
230 | }
231 | ]
232 | },
233 | {
234 | "type": "Microsoft.Synapse/workspaces/sqlPools",
235 | "apiVersion": "2019-06-01-preview",
236 | "name": "[concat(variables('workspaceName'), '/', variables('sqlPoolName'))]",
237 | "location": "[variables('location')]",
238 | "sku": {
239 | "name": "[parameters('sku')]"
240 | },
241 | "dependsOn": [
242 | "[variables('workspaceName')]"
243 | ],
244 | "properties": {
245 | "createMode": "Default",
246 | "collation": "SQL_Latin1_General_CP1_CI_AS"
247 | },
248 | "resources": [
249 | {
250 | "condition": "[parameters('metadataSync')]",
251 | "type": "metadataSync",
252 | "apiVersion": "2019-06-01-preview",
253 | "name": "config",
254 | "location": "[variables('location')]",
255 | "dependsOn": [
256 | "[variables('sqlPoolName')]"
257 | ],
258 | "properties": {
259 | "Enabled": "[parameters('metadataSync')]"
260 | }
261 | }
262 | ]
263 | },
264 | {
265 | "condition": "[equals(parameters('sparkDeployment'),'true')]",
266 | "type": "Microsoft.Synapse/workspaces/bigDataPools",
267 | "apiVersion": "2019-06-01-preview",
268 | "name": "[concat(variables('workspaceName'), '/', variables('sparkPoolName'))]",
269 | "location": "[variables('location')]",
270 | "dependsOn": [
271 | "[variables('workspaceName')]"
272 | ],
273 | "properties": {
274 | "nodeCount": 5,
275 | "nodeSizeFamily": "MemoryOptimized",
276 | "nodeSize": "[parameters('sparkNodeSize')]",
277 | "autoScale": {
278 | "enabled": true,
279 | "minNodeCount": 3,
280 | "maxNodeCount": 5
281 | },
282 | "autoPause": {
283 | "enabled": true,
284 | "delayInMinutes": 15
285 | },
286 | "sparkVersion": "3.1"
287 | }
288 | },
289 | {
290 | "scope": "[concat('Microsoft.Storage/storageAccounts/', variables('dlsName'))]",
291 | "type": "Microsoft.Authorization/roleAssignments",
292 | "apiVersion": "2020-04-01-preview",
293 | "name": "[guid(uniqueString(variables('dlsName')))]",
294 | "location": "[variables('location')]",
295 | "dependsOn": [
296 | "[variables('workspaceName')]"
297 | ],
298 | "properties": {
299 | "roleDefinitionId": "[resourceId('Microsoft.Authorization/roleDefinitions', 'ba92f5b4-2d11-453d-a403-e96b0029c9fe')]",
300 | "principalId": "[reference(resourceId('Microsoft.Synapse/workspaces', variables('workspaceName')), '2019-06-01-preview', 'Full').identity.principalId]",
301 | "principalType": "ServicePrincipal"
302 | }
303 | }
304 | ]
305 | }
306 |
--------------------------------------------------------------------------------
/ARMTemplate/azuredeploy.parameters.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#",
3 | "contentVersion": "1.0.0.0",
4 | "parameters": {
5 | "sqlAdministratorLogin": {
6 | "value": "sqladmin"
7 | },
8 | "sqlAdministratorLoginPassword": {
9 | "value": "Temp12345"
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/ARMTemplate/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://aka.ms/azure-quickstart-templates-metadata-schema#",
3 | "itemDisplayName": "Azure Synapse End-End deployment",
4 | "description": "This template creates a Azure Synapse Workspace, including SQL Pools and optional Apache Spark Pools",
5 | "summary": "Azure Synapse Workspace",
6 | "validationType": "Manual",
7 | "githubUsername": "microsoft",
8 | "dateUpdated": "2020-09-10",
9 | "type": "QuickStart",
10 | "environments": [
11 | "AzureCloud"
12 | ]
13 | }
14 |
15 |
--------------------------------------------------------------------------------
/Exercise00-Setup/README.20230512.001.old:
--------------------------------------------------------------------------------
1 | ## Deploy Azure Synapse Demo in Your Azure Environment
2 |
3 | ### Pre-requisites to Deploy Synapse end-to-end Demo
4 |
5 | * You must have a github account
6 | * You must have an active azure subscription
7 |
8 | ### Deployment Steps
9 | Please follow the below steps to successfully deploy a Synapse workspace and its artifacts on your Azure subscription
10 |
11 | ### [](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fmicrosoft%2FAzureSynapseEndToEndDemo%2Fmain%2FARMTemplate%2Fazuredeploy.json)
12 |
13 | * **Deploy to Azure** button takes you to the https://ms.portal.azure.com/#create/Microsoft.Template webpage. Please provide subscription, resource group, region, storage account name, storage container name, workspace name, dedicated sql pool name, spark pool name, spark pool node size, sql administration username/password, sku (dedicated sql pool Data Warehouse Units), and github username parameter values.
14 |
15 | >:exclamation::point_right:**It's incredibly important that you write down all the values in the above step. Many will need to be supplied later as parameters.**
16 |
17 | >*Note: The github username should be the target github account. Example: If https://github.com/microsoft/AzureSynapseEndToEndDemo is the github project url, then "microsoft" is github account name.*
18 |
19 | * Click on the **Review + Create** button to trigger deployment validation. If deployment validation is successful, the single click deployment will deploy a Synapse Workspace, Dedicated SQL Pool, and Spark Pool. This deployment also enables git configuration so all the required artifacts for the end-to-end demo are committed to your user github project. This completes the Azure Synapse end-to-end code deployment step.
20 |
21 | >*Note: If deployment is incomplete, please look at the resource group activity log and find the latest deployment errors for more information*
22 |
23 | ### Demo Setup
24 |
25 | First you will need to fill in a parameter before you can complete the exercises. We need to provide the linked service to your storage account with the storage account name you chose during deployment.
26 |
27 | 
28 |
29 |
30 | Once you click on the linked service name it will open a panel where we can make changes and provide the correct parameter for our storage account.
31 |
32 | 
33 |
34 |
35 | Now that the parameter is complete you'll need to copy the demo data from our Microsoft repository to your data lake. The data used in these exercises is synthetic health care data generated from [Synthea](https://synthea.mitre.org/) using their [Data Generator](https://github.com/synthetichealth/synthea/wiki/Basic-Setup-and-Running) and is all open source. Alternatively, you could generate the data yourself and copy it to your lake. To begin the copy you need to open the Data Prep Pipeline.
36 |
37 | 
38 |
39 |
40 | Once you have the pipeline open, you can execute it by clicking debug. When you click debug a flyout panel will open on the right side asking for two runtime parameters. First is the name of the storage account you chose during deployment and the second has the default value of '1tb'. You have a choice between two data source sizes and can choose either '1tb' or '30tb'. If you stick with the '1tb' default you can always go back later, run the pipeline again choosing '30tb' and copy that to your data lake as well.
41 |
42 | 
43 |
44 | 
45 |
46 |
47 | ## Congratulations on completing setup. You are now ready to move to [Exercise 01 - Claims](https://github.com/microsoft/AzureSynapseEndToEndDemo/blob/main/Exercise01-Claims/README.md)
48 |
--------------------------------------------------------------------------------
/Exercise00-Setup/README.md:
--------------------------------------------------------------------------------
1 | ## Deploy Azure Synapse Demo in Your Azure Environment
2 |
3 | ### Prerequisites to Deploy the Synapse End-to-End Demo
4 |
5 | * You must have a GitHub account
6 | * You must have an active Azure subscription
7 |
8 | ### Deployment Steps
9 | Follow the steps below to deploy a Synapse workspace and its artifacts in your Azure subscription.
10 |
11 | * Fork the microsoft/AzureSynapseEndToEndDemo project to your GitHub account. Make sure to check "Copy the main branch only".
12 |
13 | 
14 |
15 | * Once you have forked the AzureSynapseEndToEndDemo project to your GitHub account, click the **Deploy to Azure** button to start the deployment
16 |
17 | [](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fmicrosoft%2FAzureSynapseEndToEndDemo%2Fmain%2FARMTemplate%2Fazuredeploy.json)
18 |
19 | * The **Deploy to Azure** button takes you to the https://ms.portal.azure.com/#create/Microsoft.Template webpage. Provide the subscription, resource group, region, storage account name, storage container name, workspace name, dedicated SQL pool name, Spark pool name, Spark pool node size, SQL administrator username/password, SKU (dedicated SQL pool Data Warehouse Units), and GitHub username parameter values.
20 |
21 | >:exclamation::point_right:**It's incredibly important that you write down all the values in the above step. Many will need to be supplied later as parameters.**
22 |
23 | >*Note: The GitHub username should be the target GitHub account where you forked the project. Example: If https://github.com/JohnDoe/AzureSynapseEndToEndDemo is the GitHub project URL, then "JohnDoe" is the GitHub account name.*
24 |
25 | * Click the **Review + Create** button to trigger deployment validation. If validation succeeds, the one-click deployment creates a Synapse Workspace, Dedicated SQL Pool, and Spark Pool. The deployment also enables Git configuration so that all artifacts required for the end-to-end demo are committed to your GitHub fork. This completes the Azure Synapse end-to-end code deployment step.
26 |
27 | >*Note: If the deployment is incomplete, review the resource group activity log and check the latest deployment errors for more information.*
28 |
29 | ### Demo Setup
30 |
31 | Before you can complete the exercises, you need to fill in one parameter: the linked service to your storage account must be updated with the storage account name you chose during deployment.
32 |
33 | 
34 |
35 |
36 | Clicking the linked service name opens a panel where you can make changes and provide the correct parameter for your storage account.
37 |
38 | 
39 |
40 |
41 | Now that the parameter is set, you'll need to copy the demo data from the Microsoft repository to your data lake. The data used in these exercises is open-source synthetic health care data generated with [Synthea](https://synthea.mitre.org/) using their [Data Generator](https://github.com/synthetichealth/synthea/wiki/Basic-Setup-and-Running). Alternatively, you could generate the data yourself and copy it to your lake. To begin the copy, open the Data Prep Pipeline.
42 |
43 | 
44 |
45 |
46 | Once you have the pipeline open, execute it by clicking **Debug**. A flyout panel opens on the right side asking for two runtime parameters: the first is the name of the storage account you chose during deployment, and the second defaults to '1tb'. You can choose between two data source sizes, '1tb' or '30tb'. If you stick with the '1tb' default, you can always run the pipeline again later with '30tb' and copy that data to your data lake as well.
47 |
48 | 
49 |
50 | 
51 |
52 |
53 | ## Congratulations on completing setup. You are now ready to move to [Exercise 01 - Claims](/Exercise01-Claims/README.md)
54 |
--------------------------------------------------------------------------------
/Exercise01-Claims/README.md:
--------------------------------------------------------------------------------
1 | # Objective
2 | * This pipeline takes the FHIR-format JSON data from our "raw" ADLS container and converts it to Parquet. Since Parquet is a compressed columnar file format, this makes it much faster to import and work with the data. We store the Parquet output in our "processed" container in ADLS under a folder called "claim".
3 | 
4 |
5 |
6 | * We plan to eventually load this data into Dedicated SQL Pool across 3 tables representing Diagnosis, Insurance, and Procedures. We need to extract the data needed for each table, clean it, and write it back to ADLS. The second activity in our pipeline handles all of this in a single Synapse Spark Notebook.
7 | 
8 |
9 |
10 | * Now that the data is prepared and cleaned we are ready to load it into our Dedicated Pool, but we need to create the schema and tables first. We have a script activity that will run against our Dedicated Pool to create these artifacts for us.
11 |
12 | >*Note: Make sure your Dedicated Pool is running prior to executing this pipeline. You can see this in the SQL Pools tab under the Manage Hub.*
13 |
14 | 
15 |
16 |
17 | * We are now all set up, with data ready to go and tables to load it into, and we'll use a Copy Activity for each table to load them in parallel.
18 | 
19 |
20 |
21 | * There is one last thing in our pipeline. We have some data engineers who will need to explore the data in the lake to understand how they can enable new work streams for the business. They are currently skilling up in PySpark, but until then we need to give them the ability to explore the data through T-SQL. We have created a Notebook activity that creates the metadata for SQL tables on top of the data in our data lake. You'll be able to play around with some exploration scripts in a later activity.
22 | 
23 |
24 |
25 |
26 | # STEP 1: Parameter Setup
27 | Prior to running the claims pipeline (FHIR_Pipeline4Claim_Spark_OC) you will need to set the pipeline parameters to use the artifact names you chose during deployment. Go to the integrate hub, expand the claims folder, and select the pipeline to open it.
28 |
29 | 
30 |
31 |
32 | Once the pipeline opens you will need to click somewhere on the canvas (open space or background) to see the pipeline level parameters. This means that NONE of the activities should be highlighted or selected. Now select the Parameters tab in the bottom pane to view the pipeline level parameters.
33 |
34 | 
35 |
36 | Change the default value for each of the following five parameters to what you chose during deployment:
37 | * StorageName - This is the name of your Synapse workspace ADLS account
38 | * DatabaseName - This is the name of your database in Synapse Dedicated SQL Pool
39 | * ServerName - This is the name of your Synapse Dedicated SQL Pool
40 | * SparkPoolName - This is the name of your Synapse Spark Pool
41 | * DatasetSize - This is either "1tb" or "30tb" depending on which size dataset you want to use
42 |
43 | # STEP 2: Execute Pipeline
44 | * You need to hit the debug button to kick off the pipeline run.
45 | >*Note: Make sure your Dedicated Pool is running prior to executing this pipeline. You can see this in the SQL Pools tab under the Manage Hub.*
46 |
47 | 
48 |
49 |
50 | ## Congratulations on completing Exercise 01. You are now ready to move to [Exercise 02 - Observations](/Exercise02-Observations/README.md)
51 |
--------------------------------------------------------------------------------
/Exercise02-Observations/README.md:
--------------------------------------------------------------------------------
1 | # Objective 1: Dedicated SQL Pool
2 | * This pipeline takes the FHIR-format JSON data from our "raw" ADLS container and converts it to Parquet. Since Parquet is a compressed columnar file format, this makes it much faster to import and work with the data. We store the Parquet output in our "processed" container in ADLS under a folder called "Observation".
3 | 
4 |
5 |
6 | * We plan to eventually load this data into Dedicated SQL Pool in a table called [fhir].[ObservationMain]. We need to extract the data needed for the table, clean it, and write it back to ADLS. The second activity in our pipeline handles all of this in a single Synapse Spark Notebook.
7 | 
8 |
9 |
10 | * Now that the data is prepared and cleaned we are ready to load it into our Dedicated Pool, but we need to create the tables first. We have a script activity that will run against our Dedicated Pool to create these artifacts for us.
11 |
12 | >*Note: Make sure your Dedicated Pool is running prior to executing this pipeline. You can see this in the SQL Pools tab under the Manage Hub.*
13 |
14 | 
15 |
16 |
17 | * We are now all set up, with data ready to go and a table to load it into, and we'll use a Copy Activity to perform the load.
18 | 
19 |
20 |
21 | # STEP 1: Parameter Setup
22 | Prior to running the observations pipeline (FHIR_Pipeline4Observation_Spark_OC) you will need to set the pipeline parameters to use the artifact names you chose during deployment. Go to the Integrate hub, expand the observation folder, and select the pipeline to open it.
23 |
24 | 
25 |
26 |
27 | Once the pipeline opens you will need to click somewhere on the canvas (open space or background) to see the pipeline level parameters. This means that NONE of the activities should be highlighted or selected. Now select the Parameters tab in the bottom pane to view the pipeline level parameters.
28 |
29 | 
30 |
31 |
32 | Change the default value for each of the following five parameters to what you chose during deployment:
33 | * StorageName - This is the name of your Synapse workspace ADLS account
34 | * DatabaseName - This is the name of your database in Synapse Dedicated SQL Pool
35 | * ServerName - This is the name of your Synapse Dedicated SQL Pool
36 | * SparkPoolName - This is the name of your Synapse Spark Pool
37 | * DatasetSize - This is either "1tb" or "30tb" depending on which size dataset you want to use
38 |
39 | # STEP 2: Execute Pipeline
40 | * You need to hit the debug button to kick off the pipeline run.
41 | >*Note: Make sure your Dedicated Pool is running prior to executing this pipeline. You can see this in the SQL Pools tab under the Manage Hub.*
42 |
43 | 
44 |
45 | # Objective 2: ADX Pool
46 | * Objective 2 focuses on the steps to ingest, analyze and visualize the Observations data (time-series data) utilizing the Data Explorer Pool on Azure Synapse Analytics.
47 | * More specifically, we will be ingesting the data within the "Observation_Main" folder that is in the *curated* container within our Azure Data Lake Storage Gen2. This "Observation_Main" folder contains the bulk of the Observations data, already cleaned and prepped and in Parquet format.
48 |
49 | # STEP 1: Create a Data Explorer Pool
50 |
51 | 1. In Synapse studio, on the left-side pane, select **Manage > Data Explorer pools**
52 | 2. Select **New**, and then enter the following details on the **Basics** tab:
53 | | Setting | Value | Description |
54 | |:------|:------|:------
55 | | Data Explorer Pool Name | **adxpoolmedicaldata** | This is the name the Data Explorer pool will have |
56 | | Workload | **Compute Optimized** | This workload provides a higher CPU to SSD storage ratio. |
57 | | Node Size | **Small(4 cores)** | Set this to the smallest size to reduce costs for this quickstart |
58 | 3. Select **Review + Create > Create.** Your data explorer will start the provisioning process. Once it is complete move on to the next step.
59 |
60 | 
61 |
62 | # STEP 2: Create a Data Explorer Database
63 |
64 | 1. In Synapse Studio, on the left-side pane, Select **Data**.
65 | 2. Select + (Add new resource) > **Data Explorer Database** and paste the following information:
66 | | Setting | Value | Description |
67 | |:------|:------|:------
68 | | Data Explorer Pool Name | **adxpoolmedicaldata** | The name of the Data Explorer pool to use |
69 | | Name | **ObservationData** | This database name must be unique within the cluster. |
70 | | Default retention period | **365** | The time span (in days) for which it's guaranteed that the data is kept available to query. The time span is measured from the time that data is ingested. |
71 | |Default cache period | **31** | The time span (in days) for which to keep frequently queried data available in SSD storage or RAM, rather than in longer-term storage
72 | 3. Select **Create** to create the database. Creation typically takes less than a minute.
73 |
74 | 
75 |
76 | # STEP 3: Ingesting Data
77 |
78 | 1. In Synapse studio, on the left-side pane, select **Data**
79 |
80 | 2. Right-click ADX database *ObservationData* and click on **Open in Azure Data Explorer**. This opens the Azure Data Explorer web UI.
81 |
82 | 3. Once in the web UI click on the **Data** tab on the left. This opens the ADX "One-Click UI", where you can quickly ingest data, create database tables, and automatically map the table schema.
83 |
84 | 4. Click on **Ingest data**, and then enter the following details:
85 |
86 | | Setting | Value | Description |
87 | |:------|:------|:------
88 | | Cluster | **adxpoolmedicaldata** | Enter name of Data Explorer pool created or use the *Add connection* button to add Connection URI |
89 | | Database | **ObservationData** | Enter name of database created |
90 | | New Table | **ObservationCurated** | Enter the name for the table that will hold the observations data |
91 |
92 | 5. Select **Next**, and then enter the following information for **Source**:
93 |
94 | | Setting | Value | Description |
95 | |:------|:------|:------
96 | | Source Type | **ADLS Gen2 Container** | Choose your source type
97 | | Ingestion Type | **One-time + Continuous** | Choose the type of ingestion you would like to perform (select the info icon for more details)
98 | | Select source | **Select Container** | This allows you to select the container from your Azure Subscription
99 | | Storage subscription | *NA* | Enter your Azure subscription name
100 | | Storage account | **storagemedicaldata** | Enter your storage account name
101 | | Container | **curated** | Enter the *curated* container name as this where the data resides
102 | | Sample Size | *NA* | Leave blank
103 | | Folder path | **fhir/1tb/Observation_main** | Find this path under Directory Properties for the *Observation_main* folder
104 | | File extension | **.parquet** | This is the format of the data
105 |
106 | 6. Select **Next: Schema**. This page displays the schema and a partial data preview of the **ObservationCurated** table that will be created.
107 | * On the left-hand menu you will see *Compression Type*, *Data format*, *Nested Levels*, and *Mapping name*; leave these configurations as displayed for this demo.
108 | * Change the data type of the **Observation_id**, **encounter_id_reference**, and **patient_id_reference** columns from string to **guid**. Do this by right-clicking each of these columns and clicking the **Change Data Type** button.
109 | * Similarly, change the data type of the **issued** and **effectiveDateTime** columns from string to **datetime**.
110 | * Click the caret in the top right-hand corner to open the *Command viewer*. Here you can view the KQL code that is running in the background, such as the Create Table command. This command creates the table, with all of the data types, where the data will be stored: *ObservationCurated*.
111 |
112 | 7. Select **Next: Start Ingestion** to begin the ingestion process for the data. It is complete once all the files display a green checkmark; this should take approximately 10 minutes. Click **Close** to complete.
113 | * Note: You can also ingest the data using the pipeline named *ObservationsData_ToSDXPool*, which uses a Copy activity to bring the data into ADX. However, you must manually create a table in your ADX database prior to copying the data. Under KQL scripts you can find the *Observation Table Creation* script to create the table (an illustrative sketch of such a command appears at the end of this step). After the table has been successfully created with the correct data types for the columns, you can run the pipeline with your respective parameters.
114 |
115 | 
116 |
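The *Observation Table Creation* KQL script in the repository's `artifacts` folder holds the authoritative table definition. Purely as an illustration of what such a command looks like, a minimal sketch might be the following; the guid and datetime columns follow the data types set in step 6, while the last two columns are hypothetical placeholders:

```kusto
// Illustrative sketch only -- see the "Observation Table Creation" KQL script for the real definition
.create table ObservationCurated (
    Observation_id: guid,
    encounter_id_reference: guid,
    patient_id_reference: guid,
    issued: datetime,
    effectiveDateTime: datetime,
    code_text: string,     // hypothetical descriptive column
    value_quantity: real   // hypothetical measurement column
)
```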
117 | # STEP 4: Analyze & Visualize Data using KQL
118 | 1. In Synapse Studio, on the left-side pane, select **Develop**.
119 | 2. Under the 'Notebooks' dropdown on the left side of the screen, click the KQL notebook named **'Observations Analytics w KQL'**.
120 | 3. Once in the notebook, ensure you are connected to your ADX pool **'adxpoolmedicaldata'** and database **'ObservationData'**, and then run each of the sections (a-i) of the script separately and observe the results:
121 |
122 | *a.* Get a quick preview of the **'ObservationCurated'** table.
123 |
124 | 
125 |
126 | *b.* Counts the number of Observations in the **'ObservationCurated'** table.
127 |
128 | 
129 |
130 | *c.* Summarizes the minimum and maximum of the *'issued'* column (date issued) in the **'ObservationCurated'** table.
131 |
132 | 
133 |
134 | *d.* Summarizes the count of records in the **'ObservationCurated'** table by grouping them into daily intervals based on the *'issued'* column.
135 |
136 | 
137 |
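The exact queries for sections *a*-*d* live in the 'Observations Analytics w KQL' script; a minimal sketch of the pattern they follow, using the table and column names from the steps above, looks like this:

```kusto
// a/b: preview the table and count the rows
ObservationCurated
| take 10

ObservationCurated
| count

// c: earliest and latest issued dates
ObservationCurated
| summarize MinIssued = min(issued), MaxIssued = max(issued)

// d: number of observations per day
ObservationCurated
| summarize ObservationCount = count() by bin(issued, 1d)
```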
138 | *e.* Visualizes the timeseries chart for the **'ObservationCurated'** table based on issued date. More specifically, it filters the table to select records with issued datetime between July 15th, 1910 and June 20th, 2021 (the min and max issued dates found in step c), then counts the number of observations for every 30-day interval within that time range, and finally visualizes the results as a time chart.
139 |
140 | 
141 |
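A sketch of the kind of query section *e* describes, assuming the date boundaries reported in step *c*:

```kusto
// 30-day bins across the full issued date range, rendered as a time chart
ObservationCurated
| make-series ObservationCount = count()
    on issued from datetime(1910-07-15) to datetime(2021-06-20) step 30d
| render timechart
```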
142 | *f.* Now we are trimming the dataset to analyze daily observations during a relatively normal time period (8 years between 2011 and 2019). Ultimately, we visualize the timechart again and it shows the pattern of observations day by day for 8 years.
143 |
144 | 
145 |
146 | *g.* We are now identifying anomalies between these 8 years (2011 and 2019) using the timeseries chart developed in the previous step. More specifically, it uses the *"series_decompose_anomalies"* function to identify anomalies in the observations count data with a threshold of 1.5. Then, it visualizes the anomalies as an anomaly chart titled "Anomalies for daily medical observations during 8 years". Anomalies can be seen as red dots on the chart.
147 |
148 | 
149 |
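The anomaly detection in sections *f* and *g* follows a pattern like the sketch below. The 1.5 threshold comes from the description above; the exact 2011-2019 boundary dates are assumptions, so check the script for the precise filter:

```kusto
// Daily series for the 2011-2019 window, decomposed to flag anomalies, shown as an anomaly chart
ObservationCurated
| where issued between (datetime(2011-01-01) .. datetime(2019-01-01))
| make-series ObservationCount = count() on issued from datetime(2011-01-01) to datetime(2019-01-01) step 1d
| extend anomalies = series_decompose_anomalies(ObservationCount, 1.5)
| render anomalychart with (anomalycolumns=anomalies, title="Anomalies for daily medical observations during 8 years")
```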
150 | *h.* Now we are listing the anomalies in a table. More specifically,
151 | it uses the *"series_decompose_anomalies"* function to identify anomalies in the observations count data and extends the table with an *'anomalies'* column. The *"mv-expand"* operator is used to expand the table into separate rows for each observations count and its corresponding anomaly value and issued datetime. The code then filters the table to only include rows where the anomaly value is not equal to zero.
152 |
153 | 
154 |
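A sketch of the expansion described in *h*, which turns the series back into one row per day and keeps only the flagged anomalies (same assumed date window as above):

```kusto
// Expand the series to one row per day and keep only the anomalous days
ObservationCurated
| where issued between (datetime(2011-01-01) .. datetime(2019-01-01))
| make-series ObservationCount = count() on issued from datetime(2011-01-01) to datetime(2019-01-01) step 1d
| extend anomalies = series_decompose_anomalies(ObservationCount, 1.5)
| mv-expand issued to typeof(datetime), ObservationCount to typeof(long), anomalies to typeof(real)
| where anomalies != 0
```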
155 | *i.* Finally, we are using this query to separate the anomaly properties. More specifically, it uses the *"series_decompose_anomalies"* function to decompose the observations data into anomalies, score, and baseline values. The table is expanded into separate rows for each issued datetime, observations count, score, baseline, and anomaly (where the anomalies column is set to null if the anomaly value is 0). This query can later be used to visualize the timeseries chart with anomalies in Power BI (reference FHSI_Dual_Dims v2).
156 |
157 | 
158 |
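And a sketch of the three-output form used in *i*, which also surfaces the score and baseline so the result can be charted downstream, for example in Power BI (again with assumed boundary dates):

```kusto
// Decompose into anomaly flags, scores, and baseline, then expand to row form
ObservationCurated
| where issued between (datetime(2011-01-01) .. datetime(2019-01-01))
| make-series ObservationCount = count() on issued from datetime(2011-01-01) to datetime(2019-01-01) step 1d
| extend (anomalies, score, baseline) = series_decompose_anomalies(ObservationCount, 1.5)
| mv-expand issued to typeof(datetime), ObservationCount to typeof(long),
            anomalies to typeof(real), score to typeof(real), baseline to typeof(real)
| extend anomalies = iff(anomalies == 0, real(null), anomalies)
```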
159 | # Summarization
160 |
161 | - Overall, we were able to see how Synapse Studio is an incredibly powerful tool that brings immense value to analytical workloads in Azure. It offers a comprehensive suite of analytical components, including Pipelines, Spark, SQL, Data Explorer, and Power BI, all within a single Azure Resource. This integrated approach enhances efficiency and delivers exceptional benefits to users.
162 | - More specifically, as we saw here Azure Data Explorer (ADX) is a valuable component offered by Synapse Studio. ADX proved to be a fast and highly scalable analytics service optimized for querying and analyzing large volumes of diverse data in real-time. In this case, we saw that it is particularly well-suited for working with time series data due to its efficient storage and querying capabilities. Additionally, its integration with Synapse Studio allows users to perform ad-hoc data exploration and gain instant insights from massive datasets, facilitating rapid decision-making.
163 | - By providing these purpose-built analytical components within a single Azure Resource, Synapse Studio eliminates the complexity and overhead associated with managing multiple tools and integrations. It offers a cohesive environment for end-to-end data analytics, catering to diverse workload needs efficiently and effectively.
164 |
165 |
166 |
167 |
168 | ## Congratulations on completing Exercise 02. You are now ready to move to [Exercise 03 - Patients](/Exercise03-Patients/README.md)
169 |
--------------------------------------------------------------------------------
/Exercise03-Patients/README.md:
--------------------------------------------------------------------------------
1 | # Objective
2 | * This pipeline takes the FHIR-format JSON data from our "raw" ADLS container and converts it to Parquet. Since Parquet is a compressed columnar file format, this makes it much faster to import and work with the data. We store the Parquet output in our "processed" container in ADLS under a folder called "Patient".
3 | 
4 |
5 |
6 | * We plan to eventually load this data into Dedicated SQL Pool across 2 tables representing Patient Addresses and Patient Identifiers. We need to extract the data needed for each table, clean it, and write it back to ADLS. The second activity in our pipeline handles all of this inside a Data Flow activity. This could have been done in a Spark notebook as in the previous two exercises, but this will let you compare the two methods.
7 | 
8 |
9 |
10 | * Now that the data is prepared and cleaned we are ready to load it into our Dedicated Pool, but we need to create the tables first. We have a script activity that will run against our Dedicated Pool to create these artifacts for us.
11 |
12 | >*Note: Make sure your Dedicated Pool is running prior to executing this pipeline. You can see this in the SQL Pools tab under the Manage Hub.*
13 |
14 | 
15 |
16 |
17 | * We are now all set up, with data ready to go and a table to load it into, and we'll use a Copy Activity to perform the load.
18 | 
19 |
20 |
21 | # STEP 1: Parameter Setup
22 | Prior to running the Patient pipeline (FHIR_Pipeline4Patient_DataFlow_OC) you will need to set the pipeline parameters to use the artifact names you chose during deployment. Go to the integrate hub, expand the patient folder, and select the pipeline to open it.
23 |
24 | 
25 |
26 |
27 | Once the pipeline opens you will need to click somewhere on the canvas (open space or background) to see the pipeline level parameters. This means that NONE of the activities should be highlighted or selected. Now select the Parameters tab in the bottom pane to view the pipeline level parameters.
28 |
29 | 
30 |
31 |
32 | Change the default value for each of the following five parameters to what you chose during deployment:
33 | * StorageName - This is the name of your Synapse workspace ADLS account
34 | * DatabaseName - This is the name of your database in Synapse Dedicated SQL Pool
35 | * ServerName - This is the name of your Synapse Dedicated SQL Pool
36 | * SparkPoolName - This is the name of your Synapse Spark Pool
37 | * DatasetSize - This is either "1tb" or "30tb" depending on which size dataset you want to use
38 |
39 | # STEP 2: Execute Pipeline
40 | * Since this pipeline has a data flow, we'll kick it off a bit differently than in the previous exercises. Turn on "Data Flow Debug", click the drop-down arrow next to Debug, and select the last option, "Use Activity Runtime".
41 | >*Note: Make sure your Dedicated Pool is running prior to executing this pipeline. You can see this in the SQL Pools tab under the Manage Hub.*
42 |
43 | 
44 |
45 |
46 | ## Congratulations on completing Exercise 03.
47 |
--------------------------------------------------------------------------------
/Images/Forking.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/Forking.gif
--------------------------------------------------------------------------------
/Images/IngestingGif.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/IngestingGif.gif
--------------------------------------------------------------------------------
/Images/KQLgif01.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif01.gif
--------------------------------------------------------------------------------
/Images/KQLgif02.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif02.gif
--------------------------------------------------------------------------------
/Images/KQLgif03.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif03.gif
--------------------------------------------------------------------------------
/Images/KQLgif04.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif04.gif
--------------------------------------------------------------------------------
/Images/KQLgif05.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif05.gif
--------------------------------------------------------------------------------
/Images/KQLgif06.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif06.gif
--------------------------------------------------------------------------------
/Images/KQLgif07.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif07.gif
--------------------------------------------------------------------------------
/Images/KQLgif08.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif08.gif
--------------------------------------------------------------------------------
/Images/KQLgif09.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif09.gif
--------------------------------------------------------------------------------
/Images/deploytoazure.svg:
--------------------------------------------------------------------------------
1 |
2 |
51 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Azure Synapse End-to-End Demo
2 |
3 | This repository provides one-click infrastructure and artifact deployment for Azure Synapse Analytics to get you started with Big Data Analytics on a
4 | large Health Care sample dataset. You will learn how to ingest, process, and serve large volumes of data using various components of Synapse.
5 |
6 | ## Reference Architecture
7 | 
8 |
9 | ## CONTENTS
10 | * [Exercise 00 - Setup](Exercise00-Setup/README.md)
11 | * [Exercise 01 - Claims](Exercise01-Claims/README.md)
12 | * [Exercise 02 - Observations](Exercise02-Observations/README.md)
13 | * [Exercise 03 - Patients](Exercise03-Patients/README.md)
14 | * [Troubleshooting](Troubleshooting/Readme.md)
15 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/Troubleshooting/Readme.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Jump To Issue
4 | 1. [Missing Pipelines or Data](#missing-pipelines-or-data)
5 |
6 |
7 |
8 | ----
9 |
10 | # Missing Pipelines or Data
11 | After you deploy the solution and open Synapse Studio, you notice that there are no pipelines or data. You might also see a notification that "Publishing in workspace mode is disabled".
12 |
13 | 
14 |
15 |
16 | # Resolution - Missing Pipelines or Data
17 | Not all artifacts can be deployed if Azure Synapse is not registered in your GitHub account. You need to disconnect GitHub from your Synapse workspace and reconnect it to force registration. Once that is complete, you will need to redeploy the solution.
18 |
19 | First you need to switch from Synapse Live mode to GitHub.
20 | 
21 |
22 | Next you need to head to the **_Manage Hub_** and select **_GitHub Configuration_** in the navigation pane.
23 | 
24 |
25 | Now you need to disconnect your workspace from your GitHub account.
26 | 
27 |
28 | Once you have successfully disconnected GitHub from your workspace it should look like this.
29 | 
30 |
31 | Now that you have disconnected, it's time to reconnect so that Azure Synapse can be properly registered with your GitHub account. Hit the configure button and select GitHub as your repository type.
32 | 
33 |
34 | Enter your GitHub repository Owner Name and select **_Continue_**.
35 |
36 | 
37 |
38 | Now you will get a pop-up asking you to authorize Azure Synapse with your GitHub account. Select **_Authorize Azure Synapse_**.
39 | 
40 |
41 | Next you need to grab the URL for your repository to finish the configuration. You can get this by clicking the **_<>Code_** button toward the top of your repository and then selecting the **_Local_** tab. Now you can just click the copy button next to your HTTPS URL.
42 | 
43 |
44 | Now go back to your repository configuration and make sure you have the following things set and hit **_Apply_** when done:
45 | - "**_Use Repository Link_**" should be selected
46 | - Paste your URL in the **_Git Repository Link_** field
47 | - Use "**_main_**" for the **_Collaboration Branch_** field
48 | - Choose a Publish Branch like **_workspace_publish_**
49 | - #### You MUST set the **_Root Folder_** field to **_/artifacts_**, including the leading slash
50 | - #### The **_Import Existing Resources_** field MUST BE UNCHECKED
51 | 
52 |
53 | If you get an error about not having permissions to import the repository, you need to **_UNCHECK_** the import setting.
54 | 
55 |
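If you would rather script this configuration (or simply verify what the final state should look like), the same settings map to the `workspaceRepositoryConfiguration` property of the `Microsoft.Synapse/workspaces` ARM resource. The snippet below is only a rough sketch under that assumption: the owner and repository values are placeholders, and you should confirm the exact field names against the ARM template reference for your API version before using it.

```json
{
  "workspaceRepositoryConfiguration": {
    "type": "WorkspaceGitHubConfiguration",
    "accountName": "<your-github-owner>",
    "repositoryName": "<your-forked-repo>",
    "collaborationBranch": "main",
    "rootFolder": "/artifacts"
  }
}
```

However you apply it, the end result should match the settings listed earlier, in particular the **_/artifacts_** root folder and the **_main_** collaboration branch.
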
56 | [Back to Issue List](#jump-to-issue)
57 |
58 | ---
--------------------------------------------------------------------------------
/artifacts/Observation Table Creation.kql:
--------------------------------------------------------------------------------
1 | // Create Table
2 | .create table ObservationsTable (Observation_id: guid, resourceType: string, issued: datetime, status: string, patient_id_reference: guid, encounter_id_reference: guid, effectiveDateTime: datetime, valueQuantity_code: string, valueQuantity_system: string, valueQuantity_unit: string, valueQuantity_value: real, valueString: string)
--------------------------------------------------------------------------------
/artifacts/Observations Analytics w KQL.kql:
--------------------------------------------------------------------------------
1 | // a. Quick preview of data
2 | ObservationCurated
3 | | take 100
4 |
5 | // b. Total number of observations
6 | ObservationCurated
7 | | count
8 |
9 | // c. Observations min and max date
10 | ObservationCurated
11 | | summarize min(issued), max(issued)
12 |
13 | // d. Summarizes the count of observations by grouping them into daily intervals based on the issued date.
14 | ObservationCurated
15 | | summarize count() by bin(issued, 1d)
16 |
17 | // e. Visualize timeseries chart
18 | ObservationCurated
19 | | where issued between (datetime(1910-07-15T12:49:47.219Z)..datetime(2021-06-20T11:41:23.934Z))
20 | | make-series observationscount=count() on issued from datetime(1910-07-15T12:49:47.219Z) to datetime(2021-06-20T11:41:23.934Z) step 30d
21 | | render timechart
22 |
23 | // f. Trim the dataset to analyze daily observations during a relatively normal time period (roughly 8 years, between 2011 and 2019)
24 | ObservationCurated
25 | | where issued between (datetime(2011-08-05 00:00:00.0)..datetime(2019-01-01 00:00:00.0))
26 | | make-series observationscount=count() on issued from datetime(2011-08-05 00:00:00.0) to datetime(2019-01-01 00:00:00.0) step 1d
27 | | render timechart
28 |
29 | // g. Identify anomalies across this same period (2011 to 2019) using the time series chart developed in the previous step.
30 | ObservationCurated
31 | | where issued between (datetime(2011-08-05 00:00:00.0)..datetime(2019-01-01 00:00:00.0))
32 | | make-series observationscount=count() on issued from datetime(2011-08-05 00:00:00.0) to datetime(2019-01-01 00:00:00.0) step 1d
33 | | extend anomalies = series_decompose_anomalies(observationscount, 1.5)
34 | | render anomalychart with(anomalycolumns=anomalies, title='Anomalies for daily medical observations during 8 years')
35 |
36 | // h. List Anomalies
37 | ObservationCurated
38 | | make-series observationscount=count() on issued from datetime(2011-08-05 00:00:00.0) to datetime(2019-01-01 00:00:00.0) step 1d
39 | | extend anomalies = series_decompose_anomalies(observationscount, 1.5)
40 | | mv-expand observationscount, anomalies, issued
41 | | where toint(anomalies) <> 0
42 | | sort by todatetime(issued)
43 |
44 | // i. Separate the anomaly properties and use this query to create a Power BI report
45 | ObservationCurated
46 | | make-series observationscount=count() on issued from datetime(2011-08-05 00:00:00.0) to datetime(2019-01-01 00:00:00.0) step 1d
47 | | extend (anomalies, score, baseline) = series_decompose_anomalies(observationscount, 1.5)
48 | | mv-expand anomalies, issued, observationscount, score, baseline
49 | | project anomalies = iff(toint(anomalies) == 0, int(null),toint(anomalies)), issued, observationscount, score, baseline
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/artifacts/credential/WorkspaceSystemIdentity.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "WorkspaceSystemIdentity",
3 | "properties": {
4 | "type": "ManagedIdentity"
5 | }
6 | }
--------------------------------------------------------------------------------
/artifacts/dataflow/PatientJSON_Flatten_large.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "PatientJSON_Flatten_large",
3 | "properties": {
4 | "folder": {
5 | "name": "Patient"
6 | },
7 | "type": "MappingDataFlow",
8 | "typeProperties": {
9 | "sources": [
10 | {
11 | "dataset": {
12 | "referenceName": "PatientRawParquetLarge",
13 | "type": "DatasetReference"
14 | },
15 | "name": "PatientNDJSON"
16 | }
17 | ],
18 | "sinks": [
19 | {
20 | "dataset": {
21 | "referenceName": "PatientIdentifierParquetLarge",
22 | "type": "DatasetReference"
23 | },
24 | "name": "sinkPatientIdentifier"
25 | },
26 | {
27 | "dataset": {
28 | "referenceName": "PatientExtensionParquetLarge",
29 | "type": "DatasetReference"
30 | },
31 | "name": "sinkPatientExtension"
32 | },
33 | {
34 | "dataset": {
35 | "referenceName": "PatientAddressParquetLarge",
36 | "type": "DatasetReference"
37 | },
38 | "name": "sinkPatientAddress"
39 | }
40 | ],
41 | "transformations": [
42 | {
43 | "name": "PatientIdentifierFlatten"
44 | },
45 | {
46 | "name": "PatientExtensionFlatten"
47 | },
48 | {
49 | "name": "PatientAddressFlatten"
50 | }
51 | ],
52 | "scriptLines": [
53 | "source(output(",
54 | " address as (city as string, country as string, extension as (extension as (url as string, valueDecimal as double)[], url as string)[], line as string[], postalCode as string, state as string)[],",
55 | " birthDate as string,",
56 | " communication as (language as (coding as (code as string, display as string, system as string)[], text as string))[],",
57 | " deceasedDateTime as string,",
58 | " extension as (url as string, valueAddress as (city as string, country as string, state as string), valueDecimal as double, valueString as string)[],",
59 | " gender as string,",
60 | " id as string,",
61 | " identifier as (system as string, type as (coding as (code as string, display as string, system as string)[], text as string), value as string)[],",
62 | " maritalStatus as (coding as (code as string, display as string, system as string)[], text as string),",
63 | " multipleBirthBoolean as boolean,",
64 | " multipleBirthInteger as long,",
65 | " name as (family as string, given as string[], prefix as string[], suffix as string[], use as string)[],",
66 | " resourceType as string,",
67 | " telecom as (system as string, use as string, value as string)[],",
68 | " text as (div as string, status as string)",
69 | " ),",
70 | " allowSchemaDrift: true,",
71 | " validateSchema: false,",
72 | " ignoreNoFilesFound: false,",
73 | " format: 'parquet') ~> PatientNDJSON",
74 | "PatientNDJSON foldDown(unroll(identifier.type.coding),",
75 | " mapColumn(",
76 | " patient_id = id,",
77 | " birthDate,",
78 | " deceasedDateTime,",
79 | " gender,",
80 | " text = maritalStatus.text,",
81 | " multipleBirthBoolean,",
82 | " multipleBirthInteger,",
83 | " resourceType,",
84 | " div = text.div,",
85 | " status = text.status,",
86 | " {identifier.system} = identifier.system,",
87 | " {identifier.type.coding.code} = identifier.type.coding.code,",
88 | " {identifier.type.coding.display} = identifier.type.coding.display,",
89 | " {identifier.type.coding.system} = identifier.type.coding.system,",
90 | " {identifier.type.text} = identifier.type.text,",
91 | " {identifier.value} = identifier.value",
92 | " ),",
93 | " skipDuplicateMapInputs: false,",
94 | " skipDuplicateMapOutputs: false) ~> PatientIdentifierFlatten",
95 | "PatientNDJSON foldDown(unroll(extension),",
96 | " mapColumn(",
97 | " patient_id = id,",
98 | " url = extension.url,",
99 | " {extension.valueAddress.city} = extension.valueAddress.city,",
100 | " {extension.valueAddress.country} = extension.valueAddress.country,",
101 | " {extension.valueAddress.state} = extension.valueAddress.state,",
102 | " {extension.valueDecimal} = extension.valueDecimal,",
103 | " {extension.valueString} = extension.valueString",
104 | " ),",
105 | " skipDuplicateMapInputs: false,",
106 | " skipDuplicateMapOutputs: false) ~> PatientExtensionFlatten",
107 | "PatientNDJSON foldDown(unroll(address.extension.extension),",
108 | " mapColumn(",
109 | " id,",
110 | " {address.city} = address.city,",
111 | " {address.country} = address.country,",
112 | " {address.extension.extension.url} = address.extension.extension.url,",
113 | " {address.extension.extension.valueDecimal} = address.extension.extension.valueDecimal,",
114 | " {address.extension.url} = address.extension.url,",
115 | " {address.postalCode} = address.postalCode,",
116 | " {address.state} = address.state",
117 | " ),",
118 | " skipDuplicateMapInputs: true,",
119 | " skipDuplicateMapOutputs: false) ~> PatientAddressFlatten",
120 | "PatientIdentifierFlatten sink(allowSchemaDrift: true,",
121 | " validateSchema: false,",
122 | " format: 'parquet',",
123 | " truncate: true,",
124 | " umask: 0022,",
125 | " preCommands: [],",
126 | " postCommands: [],",
127 | " skipDuplicateMapInputs: true,",
128 | " skipDuplicateMapOutputs: true) ~> sinkPatientIdentifier",
129 | "PatientExtensionFlatten sink(allowSchemaDrift: true,",
130 | " validateSchema: false,",
131 | " format: 'parquet',",
132 | " truncate: true,",
133 | " umask: 0022,",
134 | " preCommands: [],",
135 | " postCommands: [],",
136 | " skipDuplicateMapInputs: true,",
137 | " skipDuplicateMapOutputs: true) ~> sinkPatientExtension",
138 | "PatientAddressFlatten sink(allowSchemaDrift: true,",
139 | " validateSchema: false,",
140 | " format: 'parquet',",
141 | " truncate: true,",
142 | " umask: 0022,",
143 | " preCommands: [],",
144 | " postCommands: [],",
145 | " skipDuplicateMapInputs: true,",
146 | " skipDuplicateMapOutputs: true) ~> sinkPatientAddress"
147 | ]
148 | }
149 | }
150 | }
--------------------------------------------------------------------------------
/artifacts/dataset/ClaimDiagnosisParquetLarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ClaimDiagnosisParquetLarge",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "StorageLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageName": {
9 | "value": "@dataset().StorageName",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "parameters": {
15 | "StorageName": {
16 | "type": "string"
17 | },
18 | "FolderPath": {
19 | "type": "string"
20 | }
21 | },
22 | "annotations": [],
23 | "type": "Parquet",
24 | "typeProperties": {
25 | "location": {
26 | "type": "AzureBlobFSLocation",
27 | "folderPath": {
28 | "value": "@dataset().FolderPath",
29 | "type": "Expression"
30 | },
31 | "fileSystem": "curated"
32 | },
33 | "compressionCodec": "snappy"
34 | },
35 | "schema": [
36 | {
37 | "name": "id",
38 | "type": "UTF8"
39 | },
40 | {
41 | "name": "resourceType",
42 | "type": "UTF8"
43 | },
44 | {
45 | "name": "status",
46 | "type": "UTF8"
47 | },
48 | {
49 | "name": "billablePeriod_end",
50 | "type": "UTF8"
51 | },
52 | {
53 | "name": "billablePeriod_start",
54 | "type": "UTF8"
55 | },
56 | {
57 | "name": "created",
58 | "type": "UTF8"
59 | },
60 | {
61 | "name": "patient_display",
62 | "type": "UTF8"
63 | },
64 | {
65 | "name": "patient_reference",
66 | "type": "UTF8"
67 | },
68 | {
69 | "name": "prescription_reference",
70 | "type": "UTF8"
71 | },
72 | {
73 | "name": "provider_display",
74 | "type": "UTF8"
75 | },
76 | {
77 | "name": "provider_reference",
78 | "type": "UTF8"
79 | },
80 | {
81 | "name": "total_currency",
82 | "type": "UTF8"
83 | },
84 | {
85 | "name": "total_value",
86 | "type": "DOUBLE"
87 | },
88 | {
89 | "name": "use",
90 | "type": "UTF8"
91 | },
92 | {
93 | "name": "display",
94 | "type": "UTF8"
95 | },
96 | {
97 | "name": "focal",
98 | "type": "BOOLEAN"
99 | },
100 | {
101 | "name": "sequence",
102 | "type": "INT64"
103 | }
104 | ]
105 | }
106 | }
--------------------------------------------------------------------------------
/artifacts/dataset/ClaimDiagnosisSQL.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ClaimDiagnosisSQL",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "SynapseDedicatedPoolLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "DatabaseName": {
9 | "value": "@dataset().DatabaseName",
10 | "type": "Expression"
11 | },
12 | "ServerName": {
13 | "value": "@dataset().ServerName",
14 | "type": "Expression"
15 | }
16 | }
17 | },
18 | "parameters": {
19 | "DatabaseName": {
20 | "type": "string"
21 | },
22 | "ServerName": {
23 | "type": "string"
24 | }
25 | },
26 | "annotations": [],
27 | "type": "AzureSqlDWTable",
28 | "schema": [],
29 | "typeProperties": {
30 | "schema": "fhir",
31 | "table": "ClaimDiagnosis"
32 | }
33 | }
34 | }
--------------------------------------------------------------------------------
/artifacts/dataset/ClaimInsurance.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ClaimInsurance",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "SynapseDedicatedPoolLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "DatabaseName": {
9 | "value": "@dataset().DatabaseName",
10 | "type": "Expression"
11 | },
12 | "ServerName": {
13 | "value": "@dataset().ServerName",
14 | "type": "Expression"
15 | }
16 | }
17 | },
18 | "parameters": {
19 | "DatabaseName": {
20 | "type": "string"
21 | },
22 | "ServerName": {
23 | "type": "string"
24 | }
25 | },
26 | "annotations": [],
27 | "type": "AzureSqlDWTable",
28 | "schema": [],
29 | "typeProperties": {
30 | "schema": "fhir",
31 | "table": "ClaimInsurance"
32 | }
33 | }
34 | }
--------------------------------------------------------------------------------
/artifacts/dataset/ClaimInsuranceParquetLarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ClaimInsuranceParquetLarge",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "StorageLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageName": {
9 | "value": "@dataset().StorageName",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "parameters": {
15 | "StorageName": {
16 | "type": "string"
17 | },
18 | "FolderPath": {
19 | "type": "string"
20 | }
21 | },
22 | "annotations": [],
23 | "type": "Parquet",
24 | "typeProperties": {
25 | "location": {
26 | "type": "AzureBlobFSLocation",
27 | "fileName": "*",
28 | "folderPath": {
29 | "value": "@dataset().FolderPath",
30 | "type": "Expression"
31 | },
32 | "fileSystem": "curated"
33 | },
34 | "compressionCodec": "snappy"
35 | },
36 | "schema": [
37 | {
38 | "name": "id",
39 | "type": "UTF8"
40 | },
41 | {
42 | "name": "resourceType",
43 | "type": "UTF8"
44 | },
45 | {
46 | "name": "status",
47 | "type": "UTF8"
48 | },
49 | {
50 | "name": "billablePeriod_end",
51 | "type": "UTF8"
52 | },
53 | {
54 | "name": "billablePeriod_start",
55 | "type": "UTF8"
56 | },
57 | {
58 | "name": "created",
59 | "type": "UTF8"
60 | },
61 | {
62 | "name": "patient_display",
63 | "type": "UTF8"
64 | },
65 | {
66 | "name": "patient_reference",
67 | "type": "UTF8"
68 | },
69 | {
70 | "name": "prescription_reference",
71 | "type": "UTF8"
72 | },
73 | {
74 | "name": "provider_display",
75 | "type": "UTF8"
76 | },
77 | {
78 | "name": "provider_reference",
79 | "type": "UTF8"
80 | },
81 | {
82 | "name": "total_currency",
83 | "type": "UTF8"
84 | },
85 | {
86 | "name": "total_value",
87 | "type": "DOUBLE"
88 | },
89 | {
90 | "name": "use",
91 | "type": "UTF8"
92 | },
93 | {
94 | "name": "display",
95 | "type": "UTF8"
96 | },
97 | {
98 | "name": "focal",
99 | "type": "BOOLEAN"
100 | },
101 | {
102 | "name": "sequence",
103 | "type": "INT64"
104 | }
105 | ]
106 | }
107 | }
--------------------------------------------------------------------------------
/artifacts/dataset/ClaimProcedureParquetLarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ClaimProcedureParquetLarge",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "StorageLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageName": {
9 | "value": "@dataset().StorageName",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "parameters": {
15 | "StorageName": {
16 | "type": "string"
17 | },
18 | "FolderPath": {
19 | "type": "string"
20 | }
21 | },
22 | "annotations": [],
23 | "type": "Parquet",
24 | "typeProperties": {
25 | "location": {
26 | "type": "AzureBlobFSLocation",
27 | "fileName": "*",
28 | "folderPath": {
29 | "value": "@dataset().FolderPath",
30 | "type": "Expression"
31 | },
32 | "fileSystem": "curated"
33 | },
34 | "compressionCodec": "snappy"
35 | },
36 | "schema": [
37 | {
38 | "name": "id",
39 | "type": "UTF8"
40 | },
41 | {
42 | "name": "resourceType",
43 | "type": "UTF8"
44 | },
45 | {
46 | "name": "status",
47 | "type": "UTF8"
48 | },
49 | {
50 | "name": "billablePeriod_end",
51 | "type": "UTF8"
52 | },
53 | {
54 | "name": "billablePeriod_start",
55 | "type": "UTF8"
56 | },
57 | {
58 | "name": "created",
59 | "type": "UTF8"
60 | },
61 | {
62 | "name": "patient_display",
63 | "type": "UTF8"
64 | },
65 | {
66 | "name": "patient_reference",
67 | "type": "UTF8"
68 | },
69 | {
70 | "name": "prescription_reference",
71 | "type": "UTF8"
72 | },
73 | {
74 | "name": "provider_display",
75 | "type": "UTF8"
76 | },
77 | {
78 | "name": "provider_reference",
79 | "type": "UTF8"
80 | },
81 | {
82 | "name": "total_currency",
83 | "type": "UTF8"
84 | },
85 | {
86 | "name": "total_value",
87 | "type": "DOUBLE"
88 | },
89 | {
90 | "name": "use",
91 | "type": "UTF8"
92 | },
93 | {
94 | "name": "reference",
95 | "type": "UTF8"
96 | },
97 | {
98 | "name": "sequence",
99 | "type": "INT64"
100 | }
101 | ]
102 | }
103 | }
--------------------------------------------------------------------------------
/artifacts/dataset/ClaimProcedureSQL.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ClaimProcedureSQL",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "SynapseDedicatedPoolLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "DatabaseName": {
9 | "value": "@dataset().DatabaseName",
10 | "type": "Expression"
11 | },
12 | "ServerName": {
13 | "value": "@dataset().ServerName",
14 | "type": "Expression"
15 | }
16 | }
17 | },
18 | "parameters": {
19 | "DatabaseName": {
20 | "type": "string"
21 | },
22 | "ServerName": {
23 | "type": "string"
24 | }
25 | },
26 | "annotations": [],
27 | "type": "AzureSqlDWTable",
28 | "schema": [],
29 | "typeProperties": {
30 | "schema": "fhir",
31 | "table": "ClaimProcedure"
32 | }
33 | }
34 | }
--------------------------------------------------------------------------------
/artifacts/dataset/ObservationMain_LargeParquet.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ObservationMain_LargeParquet",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "StorageLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageName": {
9 | "value": "@dataset().StorageName",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "parameters": {
15 | "StorageName": {
16 | "type": "string"
17 | },
18 | "DatasetSize": {
19 | "type": "string"
20 | }
21 | },
22 | "annotations": [],
23 | "type": "Parquet",
24 | "typeProperties": {
25 | "location": {
26 | "type": "AzureBlobFSLocation",
27 | "fileName": "*",
28 | "folderPath": {
29 | "value": "@dataset().DatasetSize",
30 | "type": "Expression"
31 | },
32 | "fileSystem": "curated"
33 | },
34 | "compressionCodec": "snappy"
35 | },
36 | "schema": []
37 | },
38 | "type": "Microsoft.Synapse/workspaces/datasets"
39 | }
--------------------------------------------------------------------------------
/artifacts/dataset/Observation_SQLDS.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Observation_SQLDS",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "SynapseDedicatedPoolLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "DatabaseName": {
9 | "value": "@dataset().DatabaseName",
10 | "type": "Expression"
11 | },
12 | "ServerName": {
13 | "value": "@dataset().ServerName",
14 | "type": "Expression"
15 | }
16 | }
17 | },
18 | "parameters": {
19 | "DatabaseName": {
20 | "type": "string"
21 | },
22 | "ServerName": {
23 | "type": "string"
24 | }
25 | },
26 | "annotations": [],
27 | "type": "AzureSqlDWTable",
28 | "schema": [],
29 | "typeProperties": {
30 | "schema": "fhir",
31 | "table": "ObservationMain"
32 | }
33 | }
34 | }
--------------------------------------------------------------------------------
/artifacts/dataset/PatientAddressParquetLarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "PatientAddressParquetLarge",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "StorageLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageName": {
9 | "value": "@dataset().StorageName",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "parameters": {
15 | "StorageName": {
16 | "type": "string"
17 | },
18 | "DatasetSize": {
19 | "type": "string"
20 | }
21 | },
22 | "annotations": [],
23 | "type": "Parquet",
24 | "typeProperties": {
25 | "location": {
26 | "type": "AzureBlobFSLocation",
27 | "folderPath": {
28 | "value": "@concat('fhir/',dataset().DatasetSize,'/PatientAddress')",
29 | "type": "Expression"
30 | },
31 | "fileSystem": "curated"
32 | },
33 | "compressionCodec": "snappy"
34 | },
35 | "schema": []
36 | },
37 | "type": "Microsoft.Synapse/workspaces/datasets"
38 | }
--------------------------------------------------------------------------------
/artifacts/dataset/PatientAddressSQL.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "PatientAddressSQL",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "SynapseDedicatedPoolLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "DatabaseName": {
9 | "value": "@dataset().DatabaseName",
10 | "type": "Expression"
11 | },
12 | "ServerName": {
13 | "value": "@dataset().ServerName",
14 | "type": "Expression"
15 | }
16 | }
17 | },
18 | "parameters": {
19 | "DatabaseName": {
20 | "type": "string"
21 | },
22 | "ServerName": {
23 | "type": "string"
24 | }
25 | },
26 | "annotations": [],
27 | "type": "AzureSqlDWTable",
28 | "schema": [],
29 | "typeProperties": {
30 | "schema": "fhir",
31 | "table": "PatientAddress"
32 | }
33 | }
34 | }
--------------------------------------------------------------------------------
/artifacts/dataset/PatientExtensionParquetLarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "PatientExtensionParquetLarge",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "StorageLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageName": {
9 | "value": "@dataset().StorageName",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "parameters": {
15 | "StorageName": {
16 | "type": "string"
17 | },
18 | "DatasetSize": {
19 | "type": "string"
20 | }
21 | },
22 | "annotations": [],
23 | "type": "Parquet",
24 | "typeProperties": {
25 | "location": {
26 | "type": "AzureBlobFSLocation",
27 | "folderPath": {
28 | "value": "@concat('fhir/',dataset().DatasetSize,'/PatientExtension')",
29 | "type": "Expression"
30 | },
31 | "fileSystem": "curated"
32 | },
33 | "compressionCodec": "snappy"
34 | },
35 | "schema": []
36 | },
37 | "type": "Microsoft.Synapse/workspaces/datasets"
38 | }
--------------------------------------------------------------------------------
/artifacts/dataset/PatientIdentifierParquetLarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "PatientIdentifierParquetLarge",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "StorageLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageName": {
9 | "value": "@dataset().StorageName",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "parameters": {
15 | "StorageName": {
16 | "type": "string"
17 | },
18 | "DatasetSize": {
19 | "type": "string"
20 | }
21 | },
22 | "annotations": [],
23 | "type": "Parquet",
24 | "typeProperties": {
25 | "location": {
26 | "type": "AzureBlobFSLocation",
27 | "folderPath": {
28 | "value": "@concat('fhir/',dataset().DatasetSize,'/PatientIdentifier')",
29 | "type": "Expression"
30 | },
31 | "fileSystem": "curated"
32 | },
33 | "compressionCodec": "snappy"
34 | },
35 | "schema": []
36 | },
37 | "type": "Microsoft.Synapse/workspaces/datasets"
38 | }
--------------------------------------------------------------------------------
/artifacts/dataset/PatientIdentifierSQLLarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "PatientIdentifierSQLLarge",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "SynapseDedicatedPoolLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "DatabaseName": {
9 | "value": "@dataset().DatabaseName",
10 | "type": "Expression"
11 | },
12 | "ServerName": {
13 | "value": "@dataset().ServerName",
14 | "type": "Expression"
15 | }
16 | }
17 | },
18 | "parameters": {
19 | "DatabaseName": {
20 | "type": "string"
21 | },
22 | "ServerName": {
23 | "type": "string"
24 | }
25 | },
26 | "annotations": [],
27 | "type": "AzureSqlDWTable",
28 | "schema": [],
29 | "typeProperties": {
30 | "schema": "fhir",
31 | "table": "PatientIdentifier"
32 | }
33 | },
34 | "type": "Microsoft.Synapse/workspaces/datasets"
35 | }
--------------------------------------------------------------------------------
/artifacts/dataset/PatientRawParquetLarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "PatientRawParquetLarge",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "StorageLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageName": {
9 | "value": "@dataset().StorageName",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "parameters": {
15 | "StorageName": {
16 | "type": "string"
17 | },
18 | "DatasetSize": {
19 | "type": "string"
20 | }
21 | },
22 | "annotations": [],
23 | "type": "Parquet",
24 | "typeProperties": {
25 | "location": {
26 | "type": "AzureBlobFSLocation",
27 | "folderPath": {
28 | "value": "@concat('fhir/',dataset().DatasetSize,'/Patient')",
29 | "type": "Expression"
30 | },
31 | "fileSystem": "processed"
32 | },
33 | "compressionCodec": "snappy"
34 | },
35 | "schema": []
36 | }
37 | }
--------------------------------------------------------------------------------
/artifacts/dataset/Sink_DataPrep_Curated_DS.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Sink_DataPrep_Curated_DS",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "StorageLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageName": {
9 | "value": "@dataset().StorageName",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "parameters": {
15 | "StorageName": {
16 | "type": "string"
17 | }
18 | },
19 | "annotations": [],
20 | "type": "Binary",
21 | "typeProperties": {
22 | "location": {
23 | "type": "AzureBlobFSLocation",
24 | "fileSystem": "curated"
25 | }
26 | }
27 | }
28 | }
--------------------------------------------------------------------------------
/artifacts/dataset/Sink_DataPrep_DS.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Sink_DataPrep_DS",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "StorageLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageName": {
9 | "value": "@dataset().StorageName",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "parameters": {
15 | "StorageName": {
16 | "type": "string"
17 | },
18 | "DatasetSize": {
19 | "type": "string"
20 | }
21 | },
22 | "annotations": [],
23 | "type": "Binary",
24 | "typeProperties": {
25 | "location": {
26 | "type": "AzureBlobFSLocation",
27 | "folderPath": {
28 | "value": "@concat('fhir_ndjson/', dataset().DatasetSize)",
29 | "type": "Expression"
30 | },
31 | "fileSystem": "raw"
32 | }
33 | }
34 | }
35 | }
--------------------------------------------------------------------------------
/artifacts/dataset/Sink_DataPrep_Processed_DS.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Sink_DataPrep_Processed_DS",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "StorageLS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageName": {
9 | "value": "@dataset().StorageName",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "parameters": {
15 | "StorageName": {
16 | "type": "string"
17 | }
18 | },
19 | "annotations": [],
20 | "type": "Binary",
21 | "typeProperties": {
22 | "location": {
23 | "type": "AzureBlobFSLocation",
24 | "fileSystem": "processed"
25 | }
26 | }
27 | }
28 | }
--------------------------------------------------------------------------------
/artifacts/dataset/Source_DataPrep_Curated_DS.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Source_DataPrep_Curated_DS",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "Source_Dataset_LS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageWithSaSUrl": {
9 | "value": "@concat('https://medicaldl.blob.core.windows.net/source?sp=rl&st=2023-04-05T05:00:00Z&se=2028-04-05T05:00:00Z&spr=https&sv=2021-12-02&sr=c&sig=t7P4PfO0HqRHAW%2FJQsMH9K3cgf9MguIgSKGdNYoyar4%3D')",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "annotations": [],
15 | "type": "Binary",
16 | "typeProperties": {
17 | "location": {
18 | "type": "AzureBlobStorageLocation",
19 | "fileName": "Create_Curated.txt",
20 | "container": "source"
21 | }
22 | }
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/artifacts/dataset/Source_DataPrep_DS.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Source_DataPrep_DS",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "Source_Dataset_LS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageWithSaSUrl": {
9 |                     "value": "@concat('https://medicaldl.blob.core.windows.net/source?SAS=REDACTED')",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "parameters": {
15 | "DatasetSize": {
16 | "type": "string"
17 | }
18 | },
19 | "annotations": [],
20 | "type": "Binary",
21 | "typeProperties": {
22 | "location": {
23 | "type": "AzureBlobStorageLocation",
24 | "folderPath": {
25 | "value": "@dataset().DatasetSize",
26 | "type": "Expression"
27 | },
28 | "container": "source"
29 | }
30 | }
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/artifacts/dataset/Source_DataPrep_Processed_DS.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Source_DataPrep_Processed_DS",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "Source_Dataset_LS",
6 | "type": "LinkedServiceReference",
7 | "parameters": {
8 | "StorageWithSaSUrl": {
9 | "value": "@concat('https://medicaldl.blob.core.windows.net/source?sp=rl&st=2023-04-05T05:00:00Z&se=2028-04-05T05:00:00Z&spr=https&sv=2021-12-02&sr=c&sig=t7P4PfO0HqRHAW%2FJQsMH9K3cgf9MguIgSKGdNYoyar4%3D')",
10 | "type": "Expression"
11 | }
12 | }
13 | },
14 | "annotations": [],
15 | "type": "Binary",
16 | "typeProperties": {
17 | "location": {
18 | "type": "AzureBlobStorageLocation",
19 | "fileName": "Create_Processed.txt",
20 | "container": "source"
21 | }
22 | }
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/artifacts/integrationRuntime/AutoResolveIntegrationRuntime.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "AutoResolveIntegrationRuntime",
3 | "properties": {
4 | "type": "Managed",
5 | "typeProperties": {
6 | "computeProperties": {
7 | "location": "AutoResolve",
8 | "dataFlowProperties": {
9 | "computeType": "General",
10 | "coreCount": 4,
11 | "timeToLive": 0
12 | }
13 | }
14 | }
15 | }
16 | }
--------------------------------------------------------------------------------
/artifacts/linkedService/Source_Dataset_LS.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Source_Dataset_LS",
3 | "type": "Microsoft.Synapse/workspaces/linkedservices",
4 | "properties": {
5 | "parameters": {
6 | "StorageWithSaSUrl": {
7 | "type": "string"
8 | }
9 | },
10 | "annotations": [],
11 | "type": "AzureBlobStorage",
12 | "typeProperties": {
13 | "sasUri": "@{linkedService().StorageWithSaSUrl}"
14 | },
15 | "connectVia": {
16 | "referenceName": "AutoResolveIntegrationRuntime",
17 | "type": "IntegrationRuntimeReference"
18 | }
19 | }
20 | }
--------------------------------------------------------------------------------
/artifacts/linkedService/StorageLS.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "StorageLS",
3 | "properties": {
4 | "parameters": {
5 | "StorageName": {
6 | "type": "string",
7 | "defaultValue": "synapsee2elake"
8 | }
9 | },
10 | "annotations": [],
11 | "type": "AzureBlobFS",
12 | "typeProperties": {
13 | "url": "@{concat('https://',linkedService().StorageName,'.dfs.core.windows.net')}"
14 | },
15 | "connectVia": {
16 | "referenceName": "AutoResolveIntegrationRuntime",
17 | "type": "IntegrationRuntimeReference"
18 | }
19 | }
20 | }
--------------------------------------------------------------------------------
/artifacts/linkedService/SynapseDedicatedPoolLS.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "SynapseDedicatedPoolLS",
3 | "properties": {
4 | "parameters": {
5 | "DatabaseName": {
6 | "type": "string",
7 | "defaultValue": "healthcare"
8 | },
9 | "ServerName": {
10 | "type": "string",
11 | "defaultValue": "health"
12 | }
13 | },
14 | "annotations": [],
15 | "type": "AzureSqlDW",
16 | "typeProperties": {
17 | "connectionString": "Integrated Security=False;Encrypt=True;Connection Timeout=30;Data Source=\"@{concat(linkedService().ServerName,'.sql.azuresynapse.net')}\";Initial Catalog=@{linkedService().DatabaseName}"
18 | },
19 | "connectVia": {
20 | "referenceName": "AutoResolveIntegrationRuntime",
21 | "type": "IntegrationRuntimeReference"
22 | }
23 | }
24 | }
--------------------------------------------------------------------------------
/artifacts/notebook/ClaimParquetFlatten_Large.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ClaimParquetFlatten_Large",
3 | "properties": {
4 | "folder": {
5 | "name": "Claims"
6 | },
7 | "nbformat": 4,
8 | "nbformat_minor": 2,
9 | "bigDataPool": {
10 | "referenceName": "healthcare",
11 | "type": "BigDataPoolReference"
12 | },
13 | "sessionProperties": {
14 | "driverMemory": "112g",
15 | "driverCores": 16,
16 | "executorMemory": "112g",
17 | "executorCores": 16,
18 | "numExecutors": 4,
19 | "conf": {
20 | "spark.dynamicAllocation.enabled": "false",
21 | "spark.dynamicAllocation.minExecutors": "4",
22 | "spark.dynamicAllocation.maxExecutors": "4",
23 | "spark.autotune.trackingId": "f6bbd8e2-1229-423c-bf5f-0432650ae015"
24 | }
25 | },
26 | "metadata": {
27 | "saveOutput": true,
28 | "enableDebugMode": false,
29 | "kernelspec": {
30 | "name": "synapse_pyspark",
31 | "display_name": "Synapse PySpark"
32 | },
33 | "language_info": {
34 | "name": "python"
35 | },
36 | "a365ComputeOptions": {
37 | "id": "/subscriptions/7e416de3-c506-4776-8270-83fd73c6cc37/resourceGroups/syne2e/providers/Microsoft.Synapse/workspaces/health/bigDataPools/healthcare",
38 | "name": "healthcare",
39 | "type": "Spark",
40 | "endpoint": "https://health.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/healthcare",
41 | "auth": {
42 | "type": "AAD",
43 | "authResource": "https://dev.azuresynapse.net"
44 | },
45 | "sparkVersion": "3.1",
46 | "nodeCount": 3,
47 | "cores": 16,
48 | "memory": 112,
49 | "automaticScaleJobs": false
50 | },
51 | "sessionKeepAliveTimeout": 180
52 | },
53 | "cells": [
54 | {
55 | "cell_type": "code",
56 | "metadata": {
57 | "tags": [
58 | "parameters"
59 | ]
60 | },
61 | "source": [
62 | "StorageName = \"medicaldl\"\r\n",
63 | "DatasetSize = \"1tb\""
64 | ],
65 | "execution_count": 1
66 | },
67 | {
68 | "cell_type": "code",
69 | "metadata": {
70 | "jupyter": {
71 | "source_hidden": false,
72 | "outputs_hidden": false
73 | },
74 | "nteract": {
75 | "transient": {
76 | "deleting": false
77 | }
78 | }
79 | },
80 | "source": [
81 | "curated_location = \"abfss://curated@\" + StorageName + \".dfs.core.windows.net/fhir/\"+ DatasetSize +\"/\"\r\n",
82 | "processed_location = \"abfss://processed@\" + StorageName + \".dfs.core.windows.net/fhir/\"+ DatasetSize +\"/\"\r\n",
83 | "write_mode=\"overwrite\""
84 | ],
85 | "execution_count": null
86 | },
87 | {
88 | "cell_type": "code",
89 | "metadata": {
90 | "jupyter": {
91 | "source_hidden": false,
92 | "outputs_hidden": false
93 | },
94 | "nteract": {
95 | "transient": {
96 | "deleting": false
97 | }
98 | }
99 | },
100 | "source": [
101 | "Claim_df=spark.read.parquet(processed_location+\"Claim/\")"
102 | ],
103 | "execution_count": 2
104 | },
105 | {
106 | "cell_type": "code",
107 | "metadata": {
108 | "jupyter": {
109 | "source_hidden": false,
110 | "outputs_hidden": false
111 | },
112 | "nteract": {
113 | "transient": {
114 | "deleting": false
115 | }
116 | },
117 | "collapsed": false
118 | },
119 | "source": [
120 | "display(Claim_df.limit(10))"
121 | ],
122 | "execution_count": 3
123 | },
124 | {
125 | "cell_type": "code",
126 | "metadata": {
127 | "jupyter": {
128 | "source_hidden": false,
129 | "outputs_hidden": false
130 | },
131 | "nteract": {
132 | "transient": {
133 | "deleting": false
134 | }
135 | }
136 | },
137 | "source": [
138 | "Claim_df.printSchema()"
139 | ],
140 | "execution_count": 4
141 | },
142 | {
143 | "cell_type": "code",
144 | "metadata": {
145 | "jupyter": {
146 | "source_hidden": false,
147 | "outputs_hidden": false
148 | },
149 | "nteract": {
150 | "transient": {
151 | "deleting": false
152 | }
153 | }
154 | },
155 | "source": [
156 | "from pyspark.sql.functions import explode\r\n",
157 | "from pyspark.sql.functions import regexp_replace\r\n",
158 | "\r\n",
159 | "Claim_main_explode_df = Claim_df.select(\r\n",
160 | " \"id\",\"resourceType\",\"status\",\"billablePeriod.end\",\"billablePeriod.start\",\"created\",\"patient.display\",\"patient.reference\",\r\n",
161 | " \"prescription.reference\",\"provider.display\",\"provider.reference\",\"total.currency\",\"total.value\",\"use\")\r\n",
162 | "\r\n",
163 | "#use toDF() to rename the columns\r\n",
164 | "Claim_main_df= Claim_main_explode_df.toDF(*( \r\n",
165 | " \"Claim_id\",\"resourceType\",\"status\",\"billablePeriod_end\",\"billablePeriod_start\",\"created\",\r\n",
166 | " \"patient_display\",\"patient_id_reference\",\r\n",
167 | " \"prescription_reference\",\"provider_display\",\"provider_org_id_reference\",\"total_currency\",\"total_value\",\"use\"))\r\n",
168 | "\r\n",
169 | "Claim_main_df = Claim_main_df.withColumn(\"patient_id_reference\",regexp_replace(\"patient_id_reference\",\"Patient/\",\"\")).withColumn(\r\n",
170 | " \"provider_org_id_reference\",regexp_replace(\"provider_org_id_reference\",\"Organization/\",\"\"))\r\n",
171 | "\r\n",
172 | "\r\n",
173 | "# adding schema optimization\r\n",
174 | "# Arshad"
175 | ],
176 | "execution_count": 5
177 | },
178 | {
179 | "cell_type": "code",
180 | "metadata": {
181 | "jupyter": {
182 | "source_hidden": false,
183 | "outputs_hidden": false
184 | },
185 | "nteract": {
186 | "transient": {
187 | "deleting": false
188 | }
189 | }
190 | },
191 | "source": [
192 | "Claim_main_df.printSchema()"
193 | ],
194 | "execution_count": 6
195 | },
196 | {
197 | "cell_type": "code",
198 | "metadata": {
199 | "jupyter": {
200 | "source_hidden": false,
201 | "outputs_hidden": false
202 | },
203 | "nteract": {
204 | "transient": {
205 | "deleting": false
206 | }
207 | },
208 | "collapsed": false
209 | },
210 | "source": [
211 | "display(Claim_main_df.limit(10))"
212 | ],
213 | "execution_count": 7
214 | },
215 | {
216 | "cell_type": "code",
217 | "metadata": {
218 | "jupyter": {
219 | "source_hidden": false,
220 | "outputs_hidden": false
221 | },
222 | "nteract": {
223 | "transient": {
224 | "deleting": false
225 | }
226 | }
227 | },
228 | "source": [
229 | "Claim_main_df.count()"
230 | ],
231 | "execution_count": 8
232 | },
233 | {
234 | "cell_type": "code",
235 | "metadata": {
236 | "jupyter": {
237 | "source_hidden": false,
238 | "outputs_hidden": false
239 | },
240 | "nteract": {
241 | "transient": {
242 | "deleting": false
243 | }
244 | }
245 | },
246 | "source": [
247 | "Claim_main_df.write.mode(write_mode).parquet(curated_location+\"Claim_main/\")"
248 | ],
249 | "execution_count": 16
250 | },
251 | {
252 | "cell_type": "code",
253 | "metadata": {
254 | "jupyter": {
255 | "source_hidden": false,
256 | "outputs_hidden": false
257 | },
258 | "nteract": {
259 | "transient": {
260 | "deleting": false
261 | }
262 | }
263 | },
264 | "source": [
265 | "from pyspark.sql.functions import explode\r\n",
266 | "\r\n",
267 | "Claim_insurance_explode_df = Claim_df.select(\"id\", explode(Claim_df.insurance))\r\n",
268 | "\r\n",
269 | "#use toDF() to rename the columns\r\n",
270 | "Claim_insurance_df= Claim_insurance_explode_df.toDF(*( \"Claim_id\",\"insurance\"))\r\n",
271 | "\r\n",
272 | "Claim_insurance_df = Claim_insurance_df.select(\r\n",
273 | " \"Claim_id\",\"insurance.coverage.display\",\"insurance.focal\",\"insurance.sequence\"\r\n",
274 | " ).toDF(*( \r\n",
275 | " \"Claim_id\",\"insurance_coverage.display\",\"insurance_focal\",\"insurance_sequence\" ))\r\n",
276 | "\r\n",
277 | "# base_df_explode = base_df.select(explode(base_df.entry))"
278 | ],
279 | "execution_count": 9
280 | },
281 | {
282 | "cell_type": "code",
283 | "metadata": {
284 | "jupyter": {
285 | "source_hidden": false,
286 | "outputs_hidden": false
287 | },
288 | "nteract": {
289 | "transient": {
290 | "deleting": false
291 | }
292 | }
293 | },
294 | "source": [
295 | "Claim_insurance_df.printSchema()"
296 | ],
297 | "execution_count": 10
298 | },
299 | {
300 | "cell_type": "code",
301 | "metadata": {
302 | "jupyter": {
303 | "source_hidden": false,
304 | "outputs_hidden": false
305 | },
306 | "nteract": {
307 | "transient": {
308 | "deleting": false
309 | }
310 | },
311 | "collapsed": false
312 | },
313 | "source": [
314 | "display(Claim_insurance_df.limit(10))"
315 | ],
316 | "execution_count": 11
317 | },
318 | {
319 | "cell_type": "code",
320 | "metadata": {
321 | "jupyter": {
322 | "source_hidden": false,
323 | "outputs_hidden": false
324 | },
325 | "nteract": {
326 | "transient": {
327 | "deleting": false
328 | }
329 | }
330 | },
331 | "source": [
332 | "Claim_insurance_df.count()"
333 | ],
334 | "execution_count": 12
335 | },
336 | {
337 | "cell_type": "code",
338 | "metadata": {
339 | "jupyter": {
340 | "source_hidden": false,
341 | "outputs_hidden": false
342 | },
343 | "nteract": {
344 | "transient": {
345 | "deleting": false
346 | }
347 | }
348 | },
349 | "source": [
350 | "Claim_insurance_df.write.mode(write_mode).parquet(curated_location+\"Claim_insurance/\")"
351 | ],
352 | "execution_count": 17
353 | },
354 | {
355 | "cell_type": "code",
356 | "metadata": {
357 | "jupyter": {
358 | "source_hidden": false,
359 | "outputs_hidden": false
360 | },
361 | "nteract": {
362 | "transient": {
363 | "deleting": false
364 | }
365 | }
366 | },
367 | "source": [
368 | "from pyspark.sql.functions import explode\r\n",
369 | "from pyspark.sql.functions import regexp_replace\r\n",
370 | "\r\n",
371 | "Claim_diagnosis_explode_df = Claim_df.select(\r\n",
372 | " \"id\", explode(Claim_df.diagnosis))\r\n",
373 | "\r\n",
374 | "#use toDF() to rename the columns\r\n",
375 | "Claim_diagnosis_df= Claim_diagnosis_explode_df.toDF(*( \"id\", \"diagnosis\"))\r\n",
376 | "\r\n",
377 | "Claim_diagnosis_df = Claim_diagnosis_df.select(\r\n",
378 | " \"id\",\"diagnosis.diagnosisReference.reference\",\"diagnosis.sequence\"\r\n",
379 | " ).toDF(*( \r\n",
380 | " \"Claim_id\",\"diagnosis_reference\",\"diagnosis_sequence\"))\r\n",
381 | "Claim_diagnosis_df=Claim_diagnosis_df.withColumn(\r\n",
382 | " \"diagnosis_reference\",regexp_replace(\"diagnosis_reference\",\"Condition/\",\"\")) "
383 | ],
384 | "execution_count": 3
385 | },
386 | {
387 | "cell_type": "code",
388 | "metadata": {
389 | "jupyter": {
390 | "source_hidden": false,
391 | "outputs_hidden": false
392 | },
393 | "nteract": {
394 | "transient": {
395 | "deleting": false
396 | }
397 | }
398 | },
399 | "source": [
400 | "Claim_diagnosis_df.printSchema()"
401 | ],
402 | "execution_count": 20
403 | },
404 | {
405 | "cell_type": "code",
406 | "metadata": {
407 | "jupyter": {
408 | "source_hidden": false,
409 | "outputs_hidden": false
410 | },
411 | "nteract": {
412 | "transient": {
413 | "deleting": false
414 | }
415 | }
416 | },
417 | "source": [
418 | "Claim_diagnosis_df.write.mode(write_mode).parquet(curated_location+\"Claim_diagnosis/\")"
419 | ],
420 | "execution_count": 4
421 | },
422 | {
423 | "cell_type": "code",
424 | "metadata": {
425 | "jupyter": {
426 | "source_hidden": false,
427 | "outputs_hidden": false
428 | },
429 | "nteract": {
430 | "transient": {
431 | "deleting": false
432 | }
433 | }
434 | },
435 | "source": [
436 | "from pyspark.sql.functions import explode\r\n",
437 | "from pyspark.sql.functions import regexp_replace\r\n",
438 | "\r\n",
439 | "Claim_procedure_explode_df = Claim_df.select(\"id\",explode(Claim_df.procedure))\r\n",
440 | "\r\n",
441 | "#use toDF() to rename the columns\r\n",
442 | "Claim_procedure_df= Claim_procedure_explode_df.toDF(*( \"id\",\"procedure\"))\r\n",
443 | "\r\n",
444 | "Claim_procedure_df = Claim_procedure_df.select(\r\n",
445 | " \"id\",\"procedure.procedureReference.reference\",\"procedure.sequence\"\r\n",
446 | " ).toDF(*(\r\n",
447 | " \"Claim_id\",\"procedure_reference\",\"procedure_sequence\"))\r\n",
448 | "Claim_procedure_df=Claim_procedure_df.withColumn(\"procedure_reference\",regexp_replace(\"procedure_reference\",\"Procedure/\",\"\"))"
449 | ],
450 | "execution_count": 3
451 | },
452 | {
453 | "cell_type": "code",
454 | "metadata": {
455 | "jupyter": {
456 | "source_hidden": false,
457 | "outputs_hidden": false
458 | },
459 | "nteract": {
460 | "transient": {
461 | "deleting": false
462 | }
463 | },
464 | "collapsed": false
465 | },
466 | "source": [
467 | "display(Claim_procedure_df.limit(10))"
468 | ],
469 | "execution_count": 4
470 | },
471 | {
472 | "cell_type": "code",
473 | "metadata": {
474 | "jupyter": {
475 | "source_hidden": false,
476 | "outputs_hidden": false
477 | },
478 | "nteract": {
479 | "transient": {
480 | "deleting": false
481 | }
482 | }
483 | },
484 | "source": [
485 | "Claim_procedure_df.write.mode(write_mode).parquet(curated_location+\"Claim_procedure/\")"
486 | ],
487 | "execution_count": 5
488 | },
489 | {
490 | "cell_type": "code",
491 | "metadata": {
492 | "jupyter": {
493 | "source_hidden": false,
494 | "outputs_hidden": false
495 | },
496 | "nteract": {
497 | "transient": {
498 | "deleting": false
499 | }
500 | }
501 | },
502 | "source": [
503 | "#Claim_procedure_df.write.format(\"delta\").save(curated_location+\"Claim_procedure_delta/\")\r\n",
504 | "#Claim_diagnosis_df.write.format(\"delta\").save(curated_location+\"Claim_diagnosis_delta/\")\r\n",
505 | "#Claim_insurance_df.write.format(\"delta\").save(curated_location+\"Claim_insurance_delta/\")"
506 | ],
507 | "execution_count": 11
508 | }
509 | ]
510 | }
511 | }
--------------------------------------------------------------------------------
/artifacts/notebook/Claim_Ingestion_NDJSON2Parquet.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Claim_Ingestion_NDJSON2Parquet",
3 | "properties": {
4 | "folder": {
5 | "name": "Claims"
6 | },
7 | "nbformat": 4,
8 | "nbformat_minor": 2,
9 | "bigDataPool": {
10 | "referenceName": "healthcare",
11 | "type": "BigDataPoolReference"
12 | },
13 | "sessionProperties": {
14 | "driverMemory": "112g",
15 | "driverCores": 16,
16 | "executorMemory": "112g",
17 | "executorCores": 16,
18 | "numExecutors": 4,
19 | "conf": {
20 | "spark.dynamicAllocation.enabled": "false",
21 | "spark.dynamicAllocation.minExecutors": "4",
22 | "spark.dynamicAllocation.maxExecutors": "4",
23 | "spark.autotune.trackingId": "c4357b4e-2833-4f00-89f0-e12e26a21fb1"
24 | }
25 | },
26 | "metadata": {
27 | "saveOutput": true,
28 | "enableDebugMode": false,
29 | "kernelspec": {
30 | "name": "synapse_pyspark",
31 | "display_name": "python"
32 | },
33 | "language_info": {
34 | "name": "python"
35 | },
36 | "a365ComputeOptions": {
37 | "id": "/subscriptions/7e416de3-c506-4776-8270-83fd73c6cc37/resourceGroups/syne2e/providers/Microsoft.Synapse/workspaces/health/bigDataPools/healthcare",
38 | "name": "healthcare",
39 | "type": "Spark",
40 | "endpoint": "https://health.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/healthcare",
41 | "auth": {
42 | "type": "AAD",
43 | "authResource": "https://dev.azuresynapse.net"
44 | },
45 | "sparkVersion": "3.1",
46 | "nodeCount": 3,
47 | "cores": 16,
48 | "memory": 112,
49 | "automaticScaleJobs": false
50 | },
51 | "sessionKeepAliveTimeout": 30
52 | },
53 | "cells": [
54 | {
55 | "cell_type": "code",
56 | "metadata": {
57 | "tags": [
58 | "parameters"
59 | ]
60 | },
61 | "source": [
62 | "StorageName = \"medicaldl\"\r\n",
63 | "DatasetSize = \"1tb\""
64 | ],
65 | "execution_count": null
66 | },
67 | {
68 | "cell_type": "code",
69 | "metadata": {
70 | "jupyter": {
71 | "source_hidden": false,
72 | "outputs_hidden": false
73 | },
74 | "nteract": {
75 | "transient": {
76 | "deleting": false
77 | }
78 | }
79 | },
80 | "source": [
81 | "raw_location = \"abfss://raw@\" + StorageName + \".dfs.core.windows.net/fhir_ndjson/\"+ DatasetSize +\"/*/\"\r\n",
82 | "processed_location = \"abfss://processed@\" + StorageName+ \".dfs.core.windows.net/fhir/\"+ DatasetSize +\"/\"\r\n",
83 | "write_mode=\"overwrite\""
84 | ],
85 | "execution_count": null
86 | },
87 | {
88 | "cell_type": "code",
89 | "metadata": {
90 | "jupyter": {
91 | "source_hidden": false,
92 | "outputs_hidden": false
93 | },
94 | "nteract": {
95 | "transient": {
96 | "deleting": false
97 | }
98 | }
99 | },
100 | "source": [
101 | "Claim_df = spark.read.option(\"multiline\", \"false\").json(raw_location+\"Claim.ndjson\")\r\n",
102 | "Claim_df.write.mode(write_mode).parquet(processed_location+\"Claim/\")\r\n",
103 | "#display(Claim_df.limit(10))\r\n",
104 | "#Claim_df.count()"
105 | ],
106 | "execution_count": null
107 | }
108 | ]
109 | }
110 | }
--------------------------------------------------------------------------------
/artifacts/notebook/Lake Database And Table Creation.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Lake Database And Table Creation",
3 | "properties": {
4 | "folder": {
5 | "name": "Exploration"
6 | },
7 | "nbformat": 4,
8 | "nbformat_minor": 2,
9 | "bigDataPool": {
10 | "referenceName": "healthcare",
11 | "type": "BigDataPoolReference"
12 | },
13 | "sessionProperties": {
14 | "driverMemory": "112g",
15 | "driverCores": 16,
16 | "executorMemory": "112g",
17 | "executorCores": 16,
18 | "numExecutors": 2,
19 | "conf": {
20 | "spark.dynamicAllocation.enabled": "false",
21 | "spark.dynamicAllocation.minExecutors": "2",
22 | "spark.dynamicAllocation.maxExecutors": "2",
23 | "spark.autotune.trackingId": "cdcb3e0e-61cf-4e33-8866-6df1ba4e31ee"
24 | }
25 | },
26 | "metadata": {
27 | "saveOutput": true,
28 | "enableDebugMode": false,
29 | "kernelspec": {
30 | "name": "synapse_pyspark",
31 | "display_name": "Synapse PySpark"
32 | },
33 | "language_info": {
34 | "name": "python"
35 | },
36 | "a365ComputeOptions": {
37 | "id": "/subscriptions/7e416de3-c506-4776-8270-83fd73c6cc37/resourceGroups/syne2e/providers/Microsoft.Synapse/workspaces/health/bigDataPools/healthcare",
38 | "name": "healthcare",
39 | "type": "Spark",
40 | "endpoint": "https://health.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/healthcare",
41 | "auth": {
42 | "type": "AAD",
43 | "authResource": "https://dev.azuresynapse.net"
44 | },
45 | "sparkVersion": "3.1",
46 | "nodeCount": 3,
47 | "cores": 16,
48 | "memory": 112,
49 | "automaticScaleJobs": false
50 | },
51 | "sessionKeepAliveTimeout": 30
52 | },
53 | "cells": [
54 | {
55 | "cell_type": "code",
56 | "metadata": {
57 | "jupyter": {
58 | "source_hidden": false,
59 | "outputs_hidden": false
60 | },
61 | "nteract": {
62 | "transient": {
63 | "deleting": false
64 | }
65 | },
66 | "tags": [
67 | "parameters"
68 | ]
69 | },
70 | "source": [
71 | "StorageName = \"medicaldl\"\r\n",
72 | "DatasetSize = \"1tb\""
73 | ],
74 | "execution_count": 36
75 | },
76 | {
77 | "cell_type": "code",
78 | "metadata": {
79 | "microsoft": {
80 | "language": "sparksql"
81 | },
82 | "collapsed": false
83 | },
84 | "source": [
85 | "%%sql\r\n",
86 | "DROP DATABASE IF EXISTS fhirdbexploration CASCADE;\r\n",
87 | "CREATE DATABASE fhirdbexploration;"
88 | ],
89 | "execution_count": 20
90 | },
91 | {
92 | "cell_type": "code",
93 | "metadata": {
94 | "jupyter": {
95 | "source_hidden": false,
96 | "outputs_hidden": false
97 | },
98 | "nteract": {
99 | "transient": {
100 | "deleting": false
101 | }
102 | },
103 | "microsoft": {
104 | "language": "sparksql"
105 | },
106 | "collapsed": false
107 | },
108 | "source": [
109 | "%%sql\r\n",
110 | "USE fhirdbexploration"
111 | ],
112 | "execution_count": 23
113 | },
114 | {
115 | "cell_type": "code",
116 | "metadata": {
117 | "jupyter": {
118 | "source_hidden": false,
119 | "outputs_hidden": false
120 | },
121 | "nteract": {
122 | "transient": {
123 | "deleting": false
124 | }
125 | }
126 | },
127 | "source": [
128 | "curated_location = \"abfss://curated@\" + StorageName + \".dfs.core.windows.net/fhir/\"+ DatasetSize +\"/\""
129 | ],
130 | "execution_count": 33
131 | },
132 | {
133 | "cell_type": "code",
134 | "metadata": {
135 | "jupyter": {
136 | "source_hidden": false,
137 | "outputs_hidden": false
138 | },
139 | "nteract": {
140 | "transient": {
141 | "deleting": false
142 | }
143 | }
144 | },
145 | "source": [
146 | "claimDiagonisLocation = curated_location + \"Claim_diagnosis/\"\r\n",
147 | "spark.conf.set('claimDiagonisLocation',claimDiagonisLocation)\r\n",
148 | "\r\n",
149 | "claimInsuranceLocation = curated_location + \"Claim_insurance/\"\r\n",
150 | "spark.conf.set('claimInsuranceLocation',claimInsuranceLocation)\r\n",
151 | "\r\n",
152 | "claimProcedureLocation = curated_location + \"Claim_procedure/\"\r\n",
153 | "spark.conf.set('claimProcedureLocation',claimProcedureLocation)"
154 | ],
155 | "execution_count": 34
156 | },
157 | {
158 | "cell_type": "code",
159 | "metadata": {
160 | "jupyter": {
161 | "source_hidden": false,
162 | "outputs_hidden": false
163 | },
164 | "nteract": {
165 | "transient": {
166 | "deleting": false
167 | }
168 | },
169 | "microsoft": {
170 | "language": "sparksql"
171 | },
172 | "collapsed": false
173 | },
174 | "source": [
175 | "%%sql\r\n",
176 | "DROP TABLE IF EXISTS fhirdbexploration.claimdiagnosis;\r\n",
177 | "CREATE TABLE fhirdbexploration.claimdiagnosis USING PARQUET LOCATION '${claimDiagonisLocation}';\r\n",
178 | "\r\n",
179 | "DROP TABLE IF EXISTS fhirdbexploration.claiminsurance;\r\n",
180 | "CREATE TABLE fhirdbexploration.claiminsurance USING PARQUET LOCATION '${claimInsuranceLocation}';\r\n",
181 | "\r\n",
182 | "DROP TABLE IF EXISTS fhirdbexploration.claimprocedure;\r\n",
183 | "CREATE TABLE fhirdbexploration.claimprocedure USING PARQUET LOCATION '${claimProcedureLocation}';"
184 | ],
185 | "execution_count": 35
186 | }
187 | ]
188 | }
189 | }
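
Note: this notebook bridges Python and Spark SQL by publishing each Parquet path as a Spark conf entry and then referencing it with `${...}` substitution inside the `%%sql` cells. A short sketch of that pattern (an assumption-laden sketch: it relies on the default `spark.sql.variable.substitute` behavior, so the same substitution should also resolve when the statement is issued through `spark.sql()`; the path literal is illustrative):

    # Sketch of the conf-substitution pattern used by the %%sql cells above.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("lake_db_tables").getOrCreate()

    curated_location = "abfss://curated@medicaldl.dfs.core.windows.net/fhir/1tb/"

    # Publish the path so SQL can see it as ${claimDiagnosisLocation}.
    spark.conf.set("claimDiagnosisLocation", curated_location + "Claim_diagnosis/")

    spark.sql("CREATE DATABASE IF NOT EXISTS fhirdbexploration")
    spark.sql(
        "CREATE TABLE IF NOT EXISTS fhirdbexploration.claimdiagnosis "
        "USING PARQUET LOCATION '${claimDiagnosisLocation}'"
    )

The notebook itself drops and recreates the database and tables on every run, which keeps the lake database in sync with whatever the latest flattening pass produced.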
--------------------------------------------------------------------------------
/artifacts/notebook/ObservationParquetFlatten_Large.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ObservationParquetFlatten_Large",
3 | "properties": {
4 | "folder": {
5 | "name": "Observation"
6 | },
7 | "nbformat": 4,
8 | "nbformat_minor": 2,
9 | "bigDataPool": {
10 | "referenceName": "healthcare",
11 | "type": "BigDataPoolReference"
12 | },
13 | "sessionProperties": {
14 | "driverMemory": "112g",
15 | "driverCores": 16,
16 | "executorMemory": "112g",
17 | "executorCores": 16,
18 | "numExecutors": 4,
19 | "conf": {
20 | "spark.dynamicAllocation.enabled": "false",
21 | "spark.dynamicAllocation.minExecutors": "4",
22 | "spark.dynamicAllocation.maxExecutors": "4",
23 | "spark.autotune.trackingId": "0ff2572c-9472-4bb0-980c-22f1e1f08db5"
24 | }
25 | },
26 | "metadata": {
27 | "saveOutput": true,
28 | "enableDebugMode": false,
29 | "kernelspec": {
30 | "name": "synapse_pyspark",
31 | "display_name": "Synapse PySpark"
32 | },
33 | "language_info": {
34 | "name": "python"
35 | },
36 | "a365ComputeOptions": {
37 | "id": "/subscriptions/7e416de3-c506-4776-8270-83fd73c6cc37/resourceGroups/syne2e/providers/Microsoft.Synapse/workspaces/health/bigDataPools/healthcare",
38 | "name": "healthcare",
39 | "type": "Spark",
40 | "endpoint": "https://health.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/healthcare",
41 | "auth": {
42 | "type": "AAD",
43 | "authResource": "https://dev.azuresynapse.net"
44 | },
45 | "sparkVersion": "3.1",
46 | "nodeCount": 3,
47 | "cores": 16,
48 | "memory": 112,
49 | "automaticScaleJobs": false
50 | },
51 | "sessionKeepAliveTimeout": 30
52 | },
53 | "cells": [
54 | {
55 | "cell_type": "code",
56 | "metadata": {
57 | "jupyter": {
58 | "source_hidden": false,
59 | "outputs_hidden": false
60 | },
61 | "nteract": {
62 | "transient": {
63 | "deleting": false
64 | }
65 | },
66 | "tags": [
67 | "parameters"
68 | ]
69 | },
70 | "source": [
71 | "StorageName = \"medicaldl\"\r\n",
72 | "DatasetSize = \"1tb\""
73 | ],
74 | "execution_count": null
75 | },
76 | {
77 | "cell_type": "code",
78 | "source": [
79 | "curated_location = \"abfss://curated@\"+ StorageName +\".dfs.core.windows.net/fhir/\"+DatasetSize+\"/\"\r\n",
80 | "processed_location = \"abfss://processed@\"+StorageName+\".dfs.core.windows.net/fhir/\"+DatasetSize+\"/\"\r\n",
81 | "write_mode=\"overwrite\""
82 | ],
83 | "execution_count": 1
84 | },
85 | {
86 | "cell_type": "code",
87 | "metadata": {
88 | "jupyter": {
89 | "source_hidden": false,
90 | "outputs_hidden": false
91 | },
92 | "nteract": {
93 | "transient": {
94 | "deleting": false
95 | }
96 | }
97 | },
98 | "source": [
99 | "Observation_df=spark.read.parquet(processed_location+\"Observation/\")"
100 | ],
101 | "execution_count": 2
102 | },
103 | {
104 | "cell_type": "code",
105 | "metadata": {
106 | "jupyter": {
107 | "source_hidden": false,
108 | "outputs_hidden": true
109 | },
110 | "nteract": {
111 | "transient": {
112 | "deleting": false
113 | }
114 | },
115 | "collapsed": false
116 | },
117 | "source": [
118 | "#display(Observation_df.limit(10))"
119 | ],
120 | "execution_count": 3
121 | },
122 | {
123 | "cell_type": "code",
124 | "metadata": {
125 | "jupyter": {
126 | "source_hidden": false,
127 | "outputs_hidden": false
128 | },
129 | "nteract": {
130 | "transient": {
131 | "deleting": false
132 | }
133 | }
134 | },
135 | "source": [
136 | "from pyspark.sql.functions import regexp_replace\r\n",
137 | "\r\n",
138 | "Observation_df=Observation_df.withColumn(\"subject\",regexp_replace(\"subject.reference\",\"Patient/\",\"\")).withColumn(\"encounter\",regexp_replace(\"encounter.reference\",\"Encounter/\",\"\"))\r\n",
139 | "Observation_df=Observation_df.withColumnRenamed(\"id\",\"Observation_id\")\r\n",
140 | "Observation_df=Observation_df.withColumnRenamed(\"subject\",\"patient_id_reference\")\r\n",
141 | "Observation_df=Observation_df.withColumnRenamed(\"encounter\",\"encounter_id_reference\")"
142 | ],
143 | "execution_count": 3
144 | },
145 | {
146 | "cell_type": "code",
147 | "metadata": {
148 | "jupyter": {
149 | "source_hidden": false,
150 | "outputs_hidden": true
151 | },
152 | "nteract": {
153 | "transient": {
154 | "deleting": false
155 | }
156 | }
157 | },
158 | "source": [
159 | "Observation_df.printSchema()"
160 | ],
161 | "execution_count": 6
162 | },
163 | {
164 | "cell_type": "code",
165 | "metadata": {
166 | "jupyter": {
167 | "source_hidden": false,
168 | "outputs_hidden": false
169 | },
170 | "nteract": {
171 | "transient": {
172 | "deleting": false
173 | }
174 | }
175 | },
176 | "source": [
177 | "from pyspark.sql.functions import explode\r\n",
178 | "\r\n",
179 | "Observation_main_df = Observation_df.select(\r\n",
180 | " \"Observation_id\",\"resourceType\",\"issued\",\"status\", \"patient_id_reference\",\"encounter_id_reference\",\"effectiveDateTime\",\r\n",
181 | " \"valueQuantity.code\",\"valueQuantity.system\",\"valueQuantity.unit\",\"valueQuantity.value\",\"valueString\").toDF(*(\r\n",
182 | " \"Observation_id\",\"resourceType\",\"issued\",\"status\",\"patient_id_reference\",\"encounter_id_reference\",\"effectiveDateTime\",\r\n",
183 | " \"valueQuantity_code\",\"valueQuantity_system\",\r\n",
184 | " \"valueQuantity_unit\",\"valueQuantity_value\",\"valueString\"))"
185 | ],
186 | "execution_count": 8
187 | },
188 | {
189 | "cell_type": "code",
190 | "metadata": {
191 | "jupyter": {
192 | "source_hidden": false,
193 | "outputs_hidden": false
194 | },
195 | "nteract": {
196 | "transient": {
197 | "deleting": false
198 | }
199 | }
200 | },
201 | "source": [
202 | "Observation_main_df.write.mode(write_mode).parquet(curated_location+\"Observation_main/\")\r\n",
203 | "#Observation_main_df.write.format(\"delta\").saveAsTable(\"fhirlakedb.Observation_main\")\r\n",
204 | "#Patient_identifier_df.write.format(\"delta\").save(curated_location+\"Condition_clinicalStatus_delta/\")"
205 | ],
206 | "execution_count": 9
207 | },
208 | {
209 | "cell_type": "code",
210 | "metadata": {
211 | "jupyter": {
212 | "source_hidden": false,
213 | "outputs_hidden": false
214 | },
215 | "nteract": {
216 | "transient": {
217 | "deleting": false
218 | }
219 | }
220 | },
221 | "source": [
222 | "from pyspark.sql.functions import explode\r\n",
223 | "\r\n",
224 | "Observation_category_explode_df = Observation_df.select(\r\n",
225 | " \"Observation_id\",\r\n",
226 | " explode(Observation_df.category)).toDF(*(\"Observation_id\",\"category\"))\r\n",
227 | "\r\n",
228 | "Observation_category_explode2_df = Observation_category_explode_df.select(\r\n",
229 | " \"Observation_id\",\r\n",
230 | " explode(Observation_category_explode_df.category.coding)).toDF(*(\"Observation_id\",\"category\")) \r\n",
231 | "\r\n",
232 | "Observation_category_df = Observation_category_explode2_df.select(\"Observation_id\",\"category.*\").toDF(*(\"Observation_id\",\"category_code\",\"category_display\",\"category_system\"))"
233 | ],
234 | "execution_count": 10
235 | },
236 | {
237 | "cell_type": "code",
238 | "metadata": {
239 | "jupyter": {
240 | "source_hidden": false,
241 | "outputs_hidden": false
242 | },
243 | "nteract": {
244 | "transient": {
245 | "deleting": false
246 | }
247 | }
248 | },
249 | "source": [
250 | "Observation_category_df.write.mode(write_mode).parquet(curated_location+\"Observation_category/\")"
251 | ],
252 | "execution_count": 13
253 | },
254 | {
255 | "cell_type": "code",
256 | "metadata": {
257 | "jupyter": {
258 | "source_hidden": false,
259 | "outputs_hidden": false
260 | },
261 | "nteract": {
262 | "transient": {
263 | "deleting": false
264 | }
265 | }
266 | },
267 | "source": [
268 | "from pyspark.sql.functions import explode\r\n",
269 | "\r\n",
270 | "Observation_code_explode_df = Observation_df.select(\r\n",
271 | " \"Observation_id\",\"code.text\",explode(Observation_df.code.coding)).toDF(*(\"Observation_id\",\"text\",\"coding\"))\r\n",
272 | "Observation_code_df = Observation_code_explode_df.select(\"Observation_id\",\"text\",\"coding.*\").toDF(*(\"Observation_id\",\"code_text\",\"coding_code\",\"coding_display\",\"coding_system\")) "
273 | ],
274 | "execution_count": 15
275 | },
276 | {
277 | "cell_type": "code",
278 | "metadata": {
279 | "jupyter": {
280 | "source_hidden": false,
281 | "outputs_hidden": false
282 | },
283 | "nteract": {
284 | "transient": {
285 | "deleting": false
286 | }
287 | }
288 | },
289 | "source": [
290 | "Observation_code_df.write.mode(write_mode).parquet(curated_location+\"Observation_code/\")"
291 | ],
292 | "execution_count": 16
293 | },
294 | {
295 | "cell_type": "code",
296 | "metadata": {
297 | "jupyter": {
298 | "source_hidden": false,
299 | "outputs_hidden": false
300 | },
301 | "nteract": {
302 | "transient": {
303 | "deleting": false
304 | }
305 | }
306 | },
307 | "source": [
308 | "from pyspark.sql.functions import explode\r\n",
309 | "from pyspark.sql.functions import explode_outer\r\n",
310 | "\r\n",
311 | "Observation_component_explode_df = Observation_df.select(\r\n",
312 | " \"Observation_id\",explode(Observation_df.component))\r\n",
313 | "\r\n",
314 | "Observation_component_explode2_df = Observation_component_explode_df.select(\"Observation_id\",explode_outer(Observation_component_explode_df.col.code.coding),\"col.code.text\",\r\n",
315 | " \"col.valueQuantity.code\",\"col.valueQuantity.system\",\"col.valueQuantity.unit\",\"col.valueQuantity.value\").toDF(*(\"Observation_id\",\"component_code\",\"component_text\",\r\n",
316 | " \"component_valueQuantity_code\",\"component_valueQuantity_system\",\"component_valueQuantity_unit\",\"component_valueQuantity_value\")) \r\n",
317 | "\r\n",
318 | "Observation_component_df = Observation_component_explode2_df.select(\"Observation_id\",\"component_code.*\",\"component_text\",\r\n",
319 | " \"component_valueQuantity_code\",\"component_valueQuantity_system\",\"component_valueQuantity_unit\",\"component_valueQuantity_value\").toDF(*(\"Observation_id\",\"component_code\",\r\n",
320 | " \"component_display\",\"component_system\",\"component_text\",\r\n",
321 | " \"component_valueQuantity_code\",\"component_valueQuantity_system\",\"component_valueQuantity_unit\",\"component_valueQuantity_value\"))"
322 | ],
323 | "execution_count": 4
324 | },
325 | {
326 | "cell_type": "code",
327 | "metadata": {
328 | "jupyter": {
329 | "source_hidden": false,
330 | "outputs_hidden": false
331 | },
332 | "nteract": {
333 | "transient": {
334 | "deleting": false
335 | }
336 | }
337 | },
338 | "source": [
339 | "Observation_component_df.write.mode(write_mode).parquet(curated_location+\"Observation_component/\")"
340 | ],
341 | "execution_count": 7
342 | },
343 | {
344 | "cell_type": "code",
345 | "metadata": {
346 | "jupyter": {
347 | "source_hidden": false,
348 | "outputs_hidden": false
349 | },
350 | "nteract": {
351 | "transient": {
352 | "deleting": false
353 | }
354 | }
355 | },
356 | "source": [
357 | "from pyspark.sql.functions import explode\r\n",
358 | "\r\n",
359 | "Observation_valueCodeableConcept_explode_df = Observation_df.select(\r\n",
360 | " \"Observation_id\",explode(Observation_df.valueCodeableConcept.coding),\"valueCodeableConcept.text\").toDF(*(\"Observation_id\",\"coding\",\"valueCodeableConcept_text\"))\r\n",
361 | "\r\n",
362 | "Observation_valueCodeableConcept_df = Observation_valueCodeableConcept_explode_df.select(\"Observation_id\",\"coding.*\",\"valueCodeableConcept_text\").toDF(*(\"Observation_id\",\r\n",
363 | " \"valueCodeableConcept_code\", \"valueCodeableConcept_display\",\"valueCodeableConcept_system\",\"valueCodeableConcept_text\"))"
364 | ],
365 | "execution_count": 22
366 | },
367 | {
368 | "cell_type": "code",
369 | "metadata": {
370 | "jupyter": {
371 | "source_hidden": false,
372 | "outputs_hidden": false
373 | },
374 | "nteract": {
375 | "transient": {
376 | "deleting": false
377 | }
378 | }
379 | },
380 | "source": [
381 | "Observation_valueCodeableConcept_df.write.mode(write_mode).parquet(curated_location+\"Observation_valueCodeableConcept/\")"
382 | ],
383 | "execution_count": 26
384 | }
385 | ]
386 | }
387 | }
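
Note: the flattening notebook above applies one pattern throughout — explode each nested FHIR array (category, code.coding, component, valueCodeableConcept.coding) into its own child table keyed by the observation id, then write each as Parquet into the curated container. A condensed sketch of the category step (a sketch under the assumption that the processed-zone Observation schema matches the notebook's, i.e. category is an array of CodeableConcept structs; paths are illustrative):

    # Sketch of the explode-based flattening used above for Observation.category.
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import explode

    spark = SparkSession.builder.appName("observation_flatten").getOrCreate()

    processed_location = "abfss://processed@medicaldl.dfs.core.windows.net/fhir/1tb/"
    curated_location = "abfss://curated@medicaldl.dfs.core.windows.net/fhir/1tb/"

    observation_df = spark.read.parquet(processed_location + "Observation/")

    category_df = (
        observation_df
        .select("id", explode("category").alias("category"))       # one row per category entry
        .select("id", explode("category.coding").alias("coding"))   # one row per coding entry
        .select("id", "coding.code", "coding.display", "coding.system")
        .toDF("Observation_id", "category_code", "category_display", "category_system")
    )

    category_df.write.mode("overwrite").parquet(curated_location + "Observation_category/")

Each exploded child table is what the downstream Copy activities load into the dedicated SQL pool, which is why they are written as separate folders rather than one wide table.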
--------------------------------------------------------------------------------
/artifacts/notebook/Observation_Ingestion_NDJSON2Parquet.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Observation_Ingestion_NDJSON2Parquet",
3 | "properties": {
4 | "folder": {
5 | "name": "Observation"
6 | },
7 | "nbformat": 4,
8 | "nbformat_minor": 2,
9 | "bigDataPool": {
10 | "referenceName": "healthcare",
11 | "type": "BigDataPoolReference"
12 | },
13 | "sessionProperties": {
14 | "driverMemory": "112g",
15 | "driverCores": 16,
16 | "executorMemory": "112g",
17 | "executorCores": 16,
18 | "numExecutors": 4,
19 | "conf": {
20 | "spark.dynamicAllocation.enabled": "false",
21 | "spark.dynamicAllocation.minExecutors": "4",
22 | "spark.dynamicAllocation.maxExecutors": "4",
23 | "spark.autotune.trackingId": "b3cc26e4-f21d-4a99-b27a-9a7b169b7dd3"
24 | }
25 | },
26 | "metadata": {
27 | "saveOutput": true,
28 | "enableDebugMode": false,
29 | "kernelspec": {
30 | "name": "synapse_pyspark",
31 | "display_name": "Synapse PySpark"
32 | },
33 | "language_info": {
34 | "name": "python"
35 | },
36 | "a365ComputeOptions": {
37 | "id": "/subscriptions/7e416de3-c506-4776-8270-83fd73c6cc37/resourceGroups/syne2e/providers/Microsoft.Synapse/workspaces/health/bigDataPools/healthcare",
38 | "name": "healthcare",
39 | "type": "Spark",
40 | "endpoint": "https://health.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/healthcare",
41 | "auth": {
42 | "type": "AAD",
43 | "authResource": "https://dev.azuresynapse.net"
44 | },
45 | "sparkVersion": "3.1",
46 | "nodeCount": 3,
47 | "cores": 16,
48 | "memory": 112,
49 | "automaticScaleJobs": false
50 | },
51 | "sessionKeepAliveTimeout": 30
52 | },
53 | "cells": [
54 | {
55 | "cell_type": "code",
56 | "metadata": {
57 | "tags": [
58 | "parameters"
59 | ]
60 | },
61 | "source": [
62 | "StorageName = \"medicaldl\"\r\n",
63 | "DatasetSize = \"1tb\""
64 | ],
65 | "execution_count": 2
66 | },
67 | {
68 | "cell_type": "code",
69 | "metadata": {
70 | "jupyter": {
71 | "source_hidden": false,
72 | "outputs_hidden": false
73 | },
74 | "nteract": {
75 | "transient": {
76 | "deleting": false
77 | }
78 | }
79 | },
80 | "source": [
81 | "raw_location = \"abfss://raw@\" + StorageName + \".dfs.core.windows.net/fhir_ndjson/\"+ DatasetSize +\"/*/\"\r\n",
82 | "processed_location = \"abfss://processed@\" + StorageName + \".dfs.core.windows.net/fhir/\"+ DatasetSize +\"/\"\r\n",
83 | "write_mode=\"overwrite\""
84 | ],
85 | "execution_count": null
86 | },
87 | {
88 | "cell_type": "code",
89 | "metadata": {
90 | "jupyter": {
91 | "source_hidden": false,
92 | "outputs_hidden": false
93 | },
94 | "nteract": {
95 | "transient": {
96 | "deleting": false
97 | }
98 | }
99 | },
100 | "source": [
101 | "Observation_df = spark.read.option(\"multiline\", \"false\").json(raw_location+\"Observation.ndjson\")\r\n",
102 | "Observation_df.write.mode(write_mode).parquet(processed_location+\"Observation/\")\r\n",
103 | "#display(Observation_df.limit(10))\r\n",
104 | "#Observation_df.count()"
105 | ],
106 | "execution_count": 3
107 | }
108 | ]
109 | }
110 | }
--------------------------------------------------------------------------------
/artifacts/notebook/Patient_Ingestion_NDJSON2Parquet.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Patient_Ingestion_NDJSON2Parquet",
3 | "properties": {
4 | "folder": {
5 | "name": "Patient"
6 | },
7 | "nbformat": 4,
8 | "nbformat_minor": 2,
9 | "bigDataPool": {
10 | "referenceName": "healthcare",
11 | "type": "BigDataPoolReference"
12 | },
13 | "sessionProperties": {
14 | "driverMemory": "112g",
15 | "driverCores": 16,
16 | "executorMemory": "112g",
17 | "executorCores": 16,
18 | "numExecutors": 2,
19 | "conf": {
20 | "spark.dynamicAllocation.enabled": "false",
21 | "spark.dynamicAllocation.minExecutors": "2",
22 | "spark.dynamicAllocation.maxExecutors": "2",
23 | "spark.autotune.trackingId": "3c6ed713-0024-4eb0-ba62-6f58dc091de4"
24 | }
25 | },
26 | "metadata": {
27 | "saveOutput": true,
28 | "enableDebugMode": false,
29 | "kernelspec": {
30 | "name": "synapse_pyspark",
31 | "display_name": "python"
32 | },
33 | "language_info": {
34 | "name": "python"
35 | },
36 | "a365ComputeOptions": {
37 | "id": "/subscriptions/7e416de3-c506-4776-8270-83fd73c6cc37/resourceGroups/syne2e/providers/Microsoft.Synapse/workspaces/health/bigDataPools/healthcare",
38 | "name": "healthcare",
39 | "type": "Spark",
40 | "endpoint": "https://health.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/healthcare",
41 | "auth": {
42 | "type": "AAD",
43 | "authResource": "https://dev.azuresynapse.net"
44 | },
45 | "sparkVersion": "3.1",
46 | "nodeCount": 3,
47 | "cores": 16,
48 | "memory": 112,
49 | "automaticScaleJobs": false
50 | },
51 | "sessionKeepAliveTimeout": 30
52 | },
53 | "cells": [
54 | {
55 | "cell_type": "code",
56 | "metadata": {
57 | "tags": [
58 | "parameters"
59 | ]
60 | },
61 | "source": [
62 | "StorageName = \"medicaldl\"\r\n",
63 | "DatasetSize = \"1tb\""
64 | ],
65 | "execution_count": null
66 | },
67 | {
68 | "cell_type": "code",
69 | "metadata": {
70 | "jupyter": {
71 | "source_hidden": false,
72 | "outputs_hidden": false
73 | },
74 | "nteract": {
75 | "transient": {
76 | "deleting": false
77 | }
78 | }
79 | },
80 | "source": [
81 | "raw_location = \"abfss://raw@\" + StorageName + \".dfs.core.windows.net/fhir_ndjson/\"+ DatasetSize +\"/*/\"\r\n",
82 | "processed_location = \"abfss://processed@\" + StorageName + \".dfs.core.windows.net/fhir/\"+ DatasetSize +\"/\"\r\n",
83 | "write_mode=\"overwrite\""
84 | ],
85 | "execution_count": null
86 | },
87 | {
88 | "cell_type": "code",
89 | "metadata": {
90 | "jupyter": {
91 | "source_hidden": false,
92 | "outputs_hidden": false
93 | },
94 | "nteract": {
95 | "transient": {
96 | "deleting": false
97 | }
98 | }
99 | },
100 | "source": [
101 | "patient_df = spark.read.option(\"multiline\", \"false\").json(raw_location+\"Patient.ndjson\")\r\n",
102 | "patient_df.write.mode(write_mode).parquet(processed_location+\"Patient/\")\r\n",
103 | "#display(patient_df.limit(10))\r\n",
104 | "#patient_df.count()"
105 | ],
106 | "execution_count": null
107 | }
108 | ]
109 | }
110 | }
--------------------------------------------------------------------------------
/artifacts/pipeline/Copy_Data_Source_To_Raw_PL.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Copy_Data_Source_To_Raw_PL",
3 | "properties": {
4 | "activities": [
5 | {
6 | "name": "Copy Source Data To Raw Container",
7 | "type": "Copy",
8 | "dependsOn": [],
9 | "policy": {
10 | "timeout": "0.12:00:00",
11 | "retry": 0,
12 | "retryIntervalInSeconds": 30,
13 | "secureOutput": false,
14 | "secureInput": false
15 | },
16 | "userProperties": [],
17 | "typeProperties": {
18 | "source": {
19 | "type": "BinarySource",
20 | "storeSettings": {
21 | "type": "AzureBlobStorageReadSettings",
22 | "recursive": true
23 | },
24 | "formatSettings": {
25 | "type": "BinaryReadSettings"
26 | }
27 | },
28 | "sink": {
29 | "type": "BinarySink",
30 | "storeSettings": {
31 | "type": "AzureBlobFSWriteSettings",
32 | "copyBehavior": "PreserveHierarchy"
33 | }
34 | },
35 | "enableStaging": false
36 | },
37 | "inputs": [
38 | {
39 | "referenceName": "Source_DataPrep_DS",
40 | "type": "DatasetReference",
41 | "parameters": {
42 | "DatasetSize": {
43 | "value": "@pipeline().parameters.DatasetSize",
44 | "type": "Expression"
45 | }
46 | }
47 | }
48 | ],
49 | "outputs": [
50 | {
51 | "referenceName": "Sink_DataPrep_DS",
52 | "type": "DatasetReference",
53 | "parameters": {
54 | "StorageName": {
55 | "value": "@pipeline().parameters.TargetStorageName",
56 | "type": "Expression"
57 | },
58 | "DatasetSize": {
59 | "value": "@pipeline().parameters.DatasetSize",
60 | "type": "Expression"
61 | }
62 | }
63 | }
64 | ]
65 | },
66 | {
67 | "name": "Create Curated Container",
68 | "type": "Copy",
69 | "dependsOn": [
70 | {
71 | "activity": "Copy Source Data To Raw Container",
72 | "dependencyConditions": [
73 | "Succeeded"
74 | ]
75 | }
76 | ],
77 | "policy": {
78 | "timeout": "0.12:00:00",
79 | "retry": 0,
80 | "retryIntervalInSeconds": 30,
81 | "secureOutput": false,
82 | "secureInput": false
83 | },
84 | "userProperties": [],
85 | "typeProperties": {
86 | "source": {
87 | "type": "BinarySource",
88 | "storeSettings": {
89 | "type": "AzureBlobStorageReadSettings",
90 | "recursive": true
91 | },
92 | "formatSettings": {
93 | "type": "BinaryReadSettings"
94 | }
95 | },
96 | "sink": {
97 | "type": "BinarySink",
98 | "storeSettings": {
99 | "type": "AzureBlobFSWriteSettings",
100 | "copyBehavior": "PreserveHierarchy"
101 | }
102 | },
103 | "enableStaging": false
104 | },
105 | "inputs": [
106 | {
107 | "referenceName": "Source_DataPrep_Curated_DS",
108 | "type": "DatasetReference"
109 | }
110 | ],
111 | "outputs": [
112 | {
113 | "referenceName": "Sink_DataPrep_Curated_DS",
114 | "type": "DatasetReference",
115 | "parameters": {
116 | "StorageName": {
117 | "value": "@pipeline().parameters.TargetStorageName",
118 | "type": "Expression"
119 | }
120 | }
121 | }
122 | ]
123 | },
124 | {
125 | "name": "Create Processed Container",
126 | "type": "Copy",
127 | "dependsOn": [
128 | {
129 | "activity": "Copy Source Data To Raw Container",
130 | "dependencyConditions": [
131 | "Succeeded"
132 | ]
133 | }
134 | ],
135 | "policy": {
136 | "timeout": "0.12:00:00",
137 | "retry": 0,
138 | "retryIntervalInSeconds": 30,
139 | "secureOutput": false,
140 | "secureInput": false
141 | },
142 | "userProperties": [],
143 | "typeProperties": {
144 | "source": {
145 | "type": "BinarySource",
146 | "storeSettings": {
147 | "type": "AzureBlobStorageReadSettings",
148 | "recursive": true
149 | },
150 | "formatSettings": {
151 | "type": "BinaryReadSettings"
152 | }
153 | },
154 | "sink": {
155 | "type": "BinarySink",
156 | "storeSettings": {
157 | "type": "AzureBlobFSWriteSettings",
158 | "copyBehavior": "PreserveHierarchy"
159 | }
160 | },
161 | "enableStaging": false
162 | },
163 | "inputs": [
164 | {
165 | "referenceName": "Source_DataPrep_Processed_DS",
166 | "type": "DatasetReference"
167 | }
168 | ],
169 | "outputs": [
170 | {
171 | "referenceName": "Sink_DataPrep_Processed_DS",
172 | "type": "DatasetReference",
173 | "parameters": {
174 | "StorageName": {
175 | "value": "@pipeline().parameters.TargetStorageName",
176 | "type": "Expression"
177 | }
178 | }
179 | }
180 | ]
181 | }
182 | ],
183 | "parameters": {
184 | "TargetStorageName": {
185 | "type": "string"
186 | },
187 | "DatasetSize": {
188 | "type": "string",
189 | "defaultValue": "1tb"
190 | }
191 | },
192 | "folder": {
193 | "name": "Data Prep"
194 | },
195 | "annotations": []
196 | }
197 | }
--------------------------------------------------------------------------------
/artifacts/pipeline/FHIR_Pipeline4Claim_Spark_OC.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "FHIR_Pipeline4Claim_Spark_OC",
3 | "properties": {
4 | "activities": [
5 | {
6 | "name": "NDJSON_Ingestion_Claim",
7 | "type": "SynapseNotebook",
8 | "dependsOn": [],
9 | "policy": {
10 | "timeout": "7.00:00:00",
11 | "retry": 0,
12 | "retryIntervalInSeconds": 30,
13 | "secureOutput": false,
14 | "secureInput": false
15 | },
16 | "userProperties": [],
17 | "typeProperties": {
18 | "notebook": {
19 | "referenceName": "Claim_Ingestion_NDJSON2Parquet",
20 | "type": "NotebookReference"
21 | },
22 | "parameters": {
23 | "StorageName": {
24 | "value": {
25 | "value": "@pipeline().parameters.StorageName",
26 | "type": "Expression"
27 | },
28 | "type": "string"
29 | },
30 | "DatasetSize": {
31 | "value": {
32 | "value": "@pipeline().parameters.DatasetSize",
33 | "type": "Expression"
34 | },
35 | "type": "string"
36 | }
37 | },
38 | "snapshot": true,
39 | "sparkPool": {
40 | "referenceName": {
41 | "value": "@pipeline().parameters.SparkPoolName",
42 | "type": "Expression"
43 | },
44 | "type": "BigDataPoolReference"
45 | },
46 | "executorSize": null,
47 | "conf": {
48 | "spark.dynamicAllocation.enabled": null,
49 | "spark.dynamicAllocation.minExecutors": null,
50 | "spark.dynamicAllocation.maxExecutors": null
51 | },
52 | "driverSize": null,
53 | "numExecutors": null
54 | }
55 | },
56 | {
57 | "name": "ClaimParquetFlatten_Large",
58 | "type": "SynapseNotebook",
59 | "dependsOn": [
60 | {
61 | "activity": "NDJSON_Ingestion_Claim",
62 | "dependencyConditions": [
63 | "Succeeded"
64 | ]
65 | }
66 | ],
67 | "policy": {
68 | "timeout": "7.00:00:00",
69 | "retry": 0,
70 | "retryIntervalInSeconds": 30,
71 | "secureOutput": false,
72 | "secureInput": false
73 | },
74 | "userProperties": [],
75 | "typeProperties": {
76 | "notebook": {
77 | "referenceName": "ClaimParquetFlatten_Large",
78 | "type": "NotebookReference"
79 | },
80 | "parameters": {
81 | "StorageName": {
82 | "value": {
83 | "value": "@pipeline().parameters.StorageName",
84 | "type": "Expression"
85 | },
86 | "type": "string"
87 | },
88 | "DatasetSize": {
89 | "value": {
90 | "value": "@pipeline().parameters.DatasetSize",
91 | "type": "Expression"
92 | },
93 | "type": "string"
94 | }
95 | },
96 | "snapshot": true,
97 | "sparkPool": {
98 | "referenceName": {
99 | "value": "@pipeline().parameters.SparkPoolName",
100 | "type": "Expression"
101 | },
102 | "type": "BigDataPoolReference"
103 | },
104 | "conf": {
105 | "spark.dynamicAllocation.enabled": null,
106 | "spark.dynamicAllocation.minExecutors": null,
107 | "spark.dynamicAllocation.maxExecutors": null
108 | },
109 | "numExecutors": null
110 | }
111 | },
112 | {
113 | "name": "ClaimDiagnosis2SQL",
114 | "type": "Copy",
115 | "dependsOn": [
116 | {
117 | "activity": "Create Tables",
118 | "dependencyConditions": [
119 | "Succeeded"
120 | ]
121 | }
122 | ],
123 | "policy": {
124 | "timeout": "7.00:00:00",
125 | "retry": 0,
126 | "retryIntervalInSeconds": 30,
127 | "secureOutput": false,
128 | "secureInput": false
129 | },
130 | "userProperties": [],
131 | "typeProperties": {
132 | "source": {
133 | "type": "ParquetSource",
134 | "storeSettings": {
135 | "type": "AzureBlobFSReadSettings",
136 | "recursive": true,
137 | "wildcardFolderPath": {
138 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/Claim_diagnosis')",
139 | "type": "Expression"
140 | },
141 | "wildcardFileName": "*.parquet",
142 | "enablePartitionDiscovery": false
143 | }
144 | },
145 | "sink": {
146 | "type": "SqlDWSink",
147 | "allowCopyCommand": true,
148 | "tableOption": "autoCreate",
149 | "disableMetricsCollection": false
150 | },
151 | "enableStaging": true,
152 | "stagingSettings": {
153 | "linkedServiceName": {
154 | "referenceName": "StorageLS",
155 | "type": "LinkedServiceReference",
156 | "parameters": {
157 | "StorageName": {
158 | "value": "@pipeline().parameters.StorageName",
159 | "type": "Expression"
160 | }
161 | }
162 | },
163 | "path": "staging"
164 | }
165 | },
166 | "inputs": [
167 | {
168 | "referenceName": "ClaimDiagnosisParquetLarge",
169 | "type": "DatasetReference",
170 | "parameters": {
171 | "StorageName": {
172 | "value": "@pipeline().parameters.StorageName",
173 | "type": "Expression"
174 | },
175 | "FolderPath": {
176 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/Claim_diagnosis')",
177 | "type": "Expression"
178 | }
179 | }
180 | }
181 | ],
182 | "outputs": [
183 | {
184 | "referenceName": "ClaimDiagnosisSQL",
185 | "type": "DatasetReference",
186 | "parameters": {
187 | "DatabaseName": {
188 | "value": "@pipeline().parameters.DatabaseName",
189 | "type": "Expression"
190 | },
191 | "ServerName": {
192 | "value": "@pipeline().parameters.ServerName",
193 | "type": "Expression"
194 | }
195 | }
196 | }
197 | ]
198 | },
199 | {
200 | "name": "ClaimInsurance2SQL",
201 | "type": "Copy",
202 | "dependsOn": [
203 | {
204 | "activity": "Create Tables",
205 | "dependencyConditions": [
206 | "Succeeded"
207 | ]
208 | }
209 | ],
210 | "policy": {
211 | "timeout": "7.00:00:00",
212 | "retry": 0,
213 | "retryIntervalInSeconds": 30,
214 | "secureOutput": false,
215 | "secureInput": false
216 | },
217 | "userProperties": [],
218 | "typeProperties": {
219 | "source": {
220 | "type": "ParquetSource",
221 | "storeSettings": {
222 | "type": "AzureBlobFSReadSettings",
223 | "recursive": true,
224 | "wildcardFolderPath": {
225 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/Claim_insurance')",
226 | "type": "Expression"
227 | },
228 | "wildcardFileName": "*.parquet",
229 | "enablePartitionDiscovery": false
230 | }
231 | },
232 | "sink": {
233 | "type": "SqlDWSink",
234 | "allowCopyCommand": true,
235 | "tableOption": "autoCreate",
236 | "disableMetricsCollection": false
237 | },
238 | "enableStaging": true,
239 | "stagingSettings": {
240 | "linkedServiceName": {
241 | "referenceName": "StorageLS",
242 | "type": "LinkedServiceReference",
243 | "parameters": {
244 | "StorageName": {
245 | "value": "@pipeline().parameters.StorageName",
246 | "type": "Expression"
247 | }
248 | }
249 | },
250 | "path": "staging"
251 | }
252 | },
253 | "inputs": [
254 | {
255 | "referenceName": "ClaimInsuranceParquetLarge",
256 | "type": "DatasetReference",
257 | "parameters": {
258 | "StorageName": {
259 | "value": "@pipeline().parameters.StorageName",
260 | "type": "Expression"
261 | },
262 | "FolderPath": {
263 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/Claim_insurance')",
264 | "type": "Expression"
265 | }
266 | }
267 | }
268 | ],
269 | "outputs": [
270 | {
271 | "referenceName": "ClaimInsurance",
272 | "type": "DatasetReference",
273 | "parameters": {
274 | "DatabaseName": {
275 | "value": "@pipeline().parameters.DatabaseName",
276 | "type": "Expression"
277 | },
278 | "ServerName": {
279 | "value": "@pipeline().parameters.ServerName",
280 | "type": "Expression"
281 | }
282 | }
283 | }
284 | ]
285 | },
286 | {
287 | "name": "ClaimProcedure2SQL",
288 | "type": "Copy",
289 | "dependsOn": [
290 | {
291 | "activity": "Create Tables",
292 | "dependencyConditions": [
293 | "Succeeded"
294 | ]
295 | }
296 | ],
297 | "policy": {
298 | "timeout": "7.00:00:00",
299 | "retry": 0,
300 | "retryIntervalInSeconds": 30,
301 | "secureOutput": false,
302 | "secureInput": false
303 | },
304 | "userProperties": [],
305 | "typeProperties": {
306 | "source": {
307 | "type": "ParquetSource",
308 | "storeSettings": {
309 | "type": "AzureBlobFSReadSettings",
310 | "recursive": true,
311 | "wildcardFolderPath": {
312 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/Claim_procedure')",
313 | "type": "Expression"
314 | },
315 | "wildcardFileName": "*.parquet",
316 | "enablePartitionDiscovery": false
317 | }
318 | },
319 | "sink": {
320 | "type": "SqlDWSink",
321 | "allowCopyCommand": true,
322 | "tableOption": "autoCreate",
323 | "disableMetricsCollection": false
324 | },
325 | "enableStaging": true,
326 | "stagingSettings": {
327 | "linkedServiceName": {
328 | "referenceName": "StorageLS",
329 | "type": "LinkedServiceReference",
330 | "parameters": {
331 | "StorageName": {
332 | "value": "@pipeline().parameters.StorageName",
333 | "type": "Expression"
334 | }
335 | }
336 | },
337 | "path": "staging"
338 | }
339 | },
340 | "inputs": [
341 | {
342 | "referenceName": "ClaimProcedureParquetLarge",
343 | "type": "DatasetReference",
344 | "parameters": {
345 | "StorageName": {
346 | "value": "@pipeline().parameters.StorageName",
347 | "type": "Expression"
348 | },
349 | "FolderPath": {
350 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/Claim_procedure')",
351 | "type": "Expression"
352 | }
353 | }
354 | }
355 | ],
356 | "outputs": [
357 | {
358 | "referenceName": "ClaimProcedureSQL",
359 | "type": "DatasetReference",
360 | "parameters": {
361 | "DatabaseName": {
362 | "value": "@pipeline().parameters.DatabaseName",
363 | "type": "Expression"
364 | },
365 | "ServerName": {
366 | "value": "@pipeline().parameters.ServerName",
367 | "type": "Expression"
368 | }
369 | }
370 | }
371 | ]
372 | },
373 | {
374 | "name": "Create Tables",
375 | "type": "Script",
376 | "dependsOn": [
377 | {
378 | "activity": "ClaimParquetFlatten_Large",
379 | "dependencyConditions": [
380 | "Succeeded"
381 | ]
382 | }
383 | ],
384 | "policy": {
385 | "timeout": "0.12:00:00",
386 | "retry": 0,
387 | "retryIntervalInSeconds": 30,
388 | "secureOutput": false,
389 | "secureInput": false
390 | },
391 | "userProperties": [],
392 | "linkedServiceName": {
393 | "referenceName": "SynapseDedicatedPoolLS",
394 | "type": "LinkedServiceReference",
395 | "parameters": {
396 | "DatabaseName": {
397 | "value": "@pipeline().parameters.DatabaseName",
398 | "type": "Expression"
399 | },
400 | "ServerName": {
401 | "value": "@pipeline().parameters.ServerName",
402 | "type": "Expression"
403 | }
404 | }
405 | },
406 | "typeProperties": {
407 | "scripts": [
408 | {
409 | "type": "Query",
410 | "text": "IF NOT EXISTS (SELECT * FROM sys.schemas WHERE name='fhir')\n\tEXEC('CREATE SCHEMA [fhir]')\n\nIF OBJECT_ID('fhir.ClaimDiagnosis') IS NOT NULL\nBEGIN\n DROP TABLE [fhir].[ClaimDiagnosis]\nEND\n\nCREATE TABLE [fhir].[ClaimDiagnosis]\n( \n\t[Claim_id] [nvarchar](64) NULL,\n\t[diagnosis_reference] [nvarchar](64) NULL,\n\t[diagnosis_sequence] [bigint] NULL\n)\nWITH\n(\n\tDISTRIBUTION = HASH ( [Claim_id] ),\n\tCLUSTERED COLUMNSTORE INDEX\n)\n\nIF OBJECT_ID('fhir.ClaimInsurance') IS NOT NULL\nBEGIN\n DROP TABLE [fhir].[ClaimInsurance]\nEND\n\nCREATE TABLE [fhir].[ClaimInsurance]\n( \n\t[Claim_id] [nvarchar](64) NULL,\n\t[insurance_coverage.display] [nvarchar](64) NULL,\n\t[insurance_focal] [bit] NULL,\n\t[insurance_sequence] [bigint] NULL\n)\nWITH\n(\n\tDISTRIBUTION = HASH ( [Claim_id] ),\n\tCLUSTERED COLUMNSTORE INDEX\n)\n\nIF OBJECT_ID('fhir.ClaimProcedure') IS NOT NULL\nBEGIN\n DROP TABLE [fhir].[ClaimProcedure]\nEND\n\nCREATE TABLE [fhir].[ClaimProcedure]\n( \n\t[Claim_id] [nvarchar](64) NULL,\n\t[procedure_reference] [nvarchar](64) NULL,\n\t[procedure_sequence] [bigint] NULL\n)\nWITH\n(\n\tDISTRIBUTION = HASH ( [Claim_id] ),\n\tCLUSTERED COLUMNSTORE INDEX\n)"
411 | }
412 | ]
413 | }
414 | },
415 | {
416 | "name": "LakeDatabase And Table Creation",
417 | "type": "SynapseNotebook",
418 | "dependsOn": [
419 | {
420 | "activity": "ClaimParquetFlatten_Large",
421 | "dependencyConditions": [
422 | "Succeeded"
423 | ]
424 | }
425 | ],
426 | "policy": {
427 | "timeout": "7.00:00:00",
428 | "retry": 0,
429 | "retryIntervalInSeconds": 30,
430 | "secureOutput": false,
431 | "secureInput": false
432 | },
433 | "userProperties": [],
434 | "typeProperties": {
435 | "notebook": {
436 | "referenceName": "Lake Database And Table Creation",
437 | "type": "NotebookReference"
438 | },
439 | "parameters": {
440 | "StorageName": {
441 | "value": {
442 | "value": "@pipeline().parameters.StorageName",
443 | "type": "Expression"
444 | },
445 | "type": "string"
446 | },
447 | "DatasetSize": {
448 | "value": {
449 | "value": "@pipeline().parameters.DatasetSize",
450 | "type": "Expression"
451 | },
452 | "type": "string"
453 | }
454 | },
455 | "snapshot": true,
456 | "sparkPool": {
457 | "referenceName": {
458 | "value": "@pipeline().parameters.SparkPoolName",
459 | "type": "Expression"
460 | },
461 | "type": "BigDataPoolReference"
462 | },
463 | "conf": {
464 | "spark.dynamicAllocation.enabled": null,
465 | "spark.dynamicAllocation.minExecutors": null,
466 | "spark.dynamicAllocation.maxExecutors": null
467 | },
468 | "numExecutors": null
469 | }
470 | }
471 | ],
472 | "parameters": {
473 | "StorageName": {
474 | "type": "string",
475 | "defaultValue": "synapsee2elake"
476 | },
477 | "DatabaseName": {
478 | "type": "string",
479 | "defaultValue": "synapsee2edw"
480 | },
481 | "ServerName": {
482 | "type": "string",
483 | "defaultValue": "synapsee2e"
484 | },
485 | "SparkPoolName": {
486 | "type": "string",
487 | "defaultValue": "synapsee2espark"
488 | },
489 | "DatasetSize": {
490 | "type": "string",
491 | "defaultValue": "1tb"
492 | }
493 | },
494 | "folder": {
495 | "name": "Claims"
496 | },
497 | "annotations": []
498 | }
499 | }
--------------------------------------------------------------------------------
/artifacts/pipeline/FHIR_Pipeline4Observation_Spark_OC.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "FHIR_Pipeline4Observation_Spark_OC",
3 | "properties": {
4 | "activities": [
5 | {
6 | "name": "NDJSON_Ingestion_Observation",
7 | "type": "SynapseNotebook",
8 | "dependsOn": [],
9 | "policy": {
10 | "timeout": "0.12:00:00",
11 | "retry": 0,
12 | "retryIntervalInSeconds": 30,
13 | "secureOutput": false,
14 | "secureInput": false
15 | },
16 | "userProperties": [],
17 | "typeProperties": {
18 | "notebook": {
19 | "referenceName": "Observation_Ingestion_NDJSON2Parquet",
20 | "type": "NotebookReference"
21 | },
22 | "parameters": {
23 | "StorageName": {
24 | "value": {
25 | "value": "@pipeline().parameters.StorageName",
26 | "type": "Expression"
27 | },
28 | "type": "string"
29 | }
30 | },
31 | "snapshot": true,
32 | "sparkPool": {
33 | "referenceName": {
34 | "value": "@pipeline().parameters.SparkPoolName",
35 | "type": "Expression"
36 | },
37 | "type": "BigDataPoolReference"
38 | },
39 | "executorSize": null,
40 | "conf": {
41 | "spark.dynamicAllocation.enabled": null,
42 | "spark.dynamicAllocation.minExecutors": null,
43 | "spark.dynamicAllocation.maxExecutors": null
44 | },
45 | "driverSize": null,
46 | "numExecutors": null
47 | }
48 | },
49 | {
50 | "name": "ObservationParquetFlatten_Large",
51 | "type": "SynapseNotebook",
52 | "dependsOn": [
53 | {
54 | "activity": "NDJSON_Ingestion_Observation",
55 | "dependencyConditions": [
56 | "Succeeded"
57 | ]
58 | }
59 | ],
60 | "policy": {
61 | "timeout": "0.12:00:00",
62 | "retry": 0,
63 | "retryIntervalInSeconds": 30,
64 | "secureOutput": false,
65 | "secureInput": false
66 | },
67 | "userProperties": [],
68 | "typeProperties": {
69 | "notebook": {
70 | "referenceName": "ObservationParquetFlatten_Large",
71 | "type": "NotebookReference"
72 | },
73 | "parameters": {
74 | "StorageName": {
75 | "value": {
76 | "value": "@pipeline().parameters.StorageName",
77 | "type": "Expression"
78 | },
79 | "type": "string"
80 | }
81 | },
82 | "snapshot": true,
83 | "sparkPool": {
84 | "referenceName": {
85 | "value": "@pipeline().parameters.SparkPoolName",
86 | "type": "Expression"
87 | },
88 | "type": "BigDataPoolReference"
89 | },
90 | "executorSize": null,
91 | "conf": {
92 | "spark.dynamicAllocation.enabled": null,
93 | "spark.dynamicAllocation.minExecutors": null,
94 | "spark.dynamicAllocation.maxExecutors": null
95 | },
96 | "driverSize": null,
97 | "numExecutors": null
98 | }
99 | },
100 | {
101 | "name": "Observation_Parquet_large2SQL",
102 | "type": "Copy",
103 | "dependsOn": [
104 | {
105 | "activity": "Create Tables",
106 | "dependencyConditions": [
107 | "Succeeded"
108 | ]
109 | }
110 | ],
111 | "policy": {
112 | "timeout": "0.12:00:00",
113 | "retry": 0,
114 | "retryIntervalInSeconds": 30,
115 | "secureOutput": false,
116 | "secureInput": false
117 | },
118 | "userProperties": [],
119 | "typeProperties": {
120 | "source": {
121 | "type": "ParquetSource",
122 | "storeSettings": {
123 | "type": "AzureBlobFSReadSettings",
124 | "recursive": true,
125 | "wildcardFolderPath": {
126 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/Observation_main')",
127 | "type": "Expression"
128 | },
129 | "wildcardFileName": "*.parquet",
130 | "enablePartitionDiscovery": false
131 | }
132 | },
133 | "sink": {
134 | "type": "SqlDWSink",
135 | "allowCopyCommand": true,
136 | "tableOption": "autoCreate",
137 | "disableMetricsCollection": false
138 | },
139 | "enableStaging": true,
140 | "stagingSettings": {
141 | "linkedServiceName": {
142 | "referenceName": "StorageLS",
143 | "type": "LinkedServiceReference",
144 | "parameters": {
145 | "StorageName": {
146 | "value": "@pipeline().parameters.StorageName",
147 | "type": "Expression"
148 | }
149 | }
150 | },
151 | "path": "staging"
152 | }
153 | },
154 | "inputs": [
155 | {
156 | "referenceName": "ObservationMain_LargeParquet",
157 | "type": "DatasetReference",
158 | "parameters": {
159 | "StorageName": {
160 | "value": "@pipeline().parameters.StorageName",
161 | "type": "Expression"
162 | },
163 | "DatasetSize": {
164 | "value": "@pipeline().parameters.DatasetSize",
165 | "type": "Expression"
166 | }
167 | }
168 | }
169 | ],
170 | "outputs": [
171 | {
172 | "referenceName": "Observation_SQLDS",
173 | "type": "DatasetReference",
174 | "parameters": {
175 | "DatabaseName": {
176 | "value": "@pipeline().parameters.DatabaseName",
177 | "type": "Expression"
178 | },
179 | "ServerName": {
180 | "value": "@pipeline().parameters.ServerName",
181 | "type": "Expression"
182 | }
183 | }
184 | }
185 | ]
186 | },
187 | {
188 | "name": "Create Tables",
189 | "type": "Script",
190 | "dependsOn": [
191 | {
192 | "activity": "ObservationParquetFlatten_Large",
193 | "dependencyConditions": [
194 | "Succeeded"
195 | ]
196 | }
197 | ],
198 | "policy": {
199 | "timeout": "0.12:00:00",
200 | "retry": 0,
201 | "retryIntervalInSeconds": 30,
202 | "secureOutput": false,
203 | "secureInput": false
204 | },
205 | "userProperties": [],
206 | "linkedServiceName": {
207 | "referenceName": "SynapseDedicatedPoolLS",
208 | "type": "LinkedServiceReference",
209 | "parameters": {
210 | "DatabaseName": {
211 | "value": "@pipeline().parameters.DatabaseName",
212 | "type": "Expression"
213 | },
214 | "ServerName": {
215 | "value": "@pipeline().parameters.ServerName",
216 | "type": "Expression"
217 | }
218 | }
219 | },
220 | "typeProperties": {
221 | "scripts": [
222 | {
223 | "type": "Query",
224 | "text": "IF OBJECT_ID('fhir.ObservationMain') IS NOT NULL\r\nBEGIN\r\n DROP TABLE [fhir].[ObservationMain]\r\nEND\r\n\r\nCREATE TABLE [fhir].[ObservationMain]\r\n( \r\n\t[Observation_id] [varchar](64) NULL,\r\n\t[resourceType] [varchar](100) NULL,\r\n\t[issued] VARCHAR(30) NULL,\r\n\t[status] [varchar](10) NULL,\r\n\t[patient_id_reference] [varchar](64) NULL,\r\n\t[encounter_id_reference] [varchar](64) NULL,\r\n\t[effectiveDateTime] VARCHAR(30) NULL,\r\n\t[valueQuantity_code] [varchar](50) NULL,\r\n\t[valueQuantity_system] [varchar](100) NULL,\r\n\t[valueQuantity_unit] [varchar](50) NULL,\r\n\t[valueQuantity_value] [float] NULL,\r\n\t[valueString] [nvarchar](200) NULL\r\n)\r\nWITH\r\n(\r\n\tDISTRIBUTION = ROUND_ROBIN,\r\n\tHEAP\r\n)"
225 | }
226 | ]
227 | }
228 | }
229 | ],
230 | "parameters": {
231 | "StorageName": {
232 | "type": "string",
233 | "defaultValue": "synapsee2elake"
234 | },
235 | "DatabaseName": {
236 | "type": "string",
237 | "defaultValue": "synapsee2edw"
238 | },
239 | "ServerName": {
240 | "type": "string",
241 | "defaultValue": "synapsee2e"
242 | },
243 | "SparkPoolName": {
244 | "type": "string",
245 | "defaultValue": "synapsee2espark"
246 | },
247 | "DatasetSize": {
248 | "type": "string",
249 | "defaultValue": "1tb"
250 | }
251 | },
252 | "folder": {
253 | "name": "Observation"
254 | },
255 | "annotations": []
256 | }
257 | }
--------------------------------------------------------------------------------
/artifacts/pipeline/FHIR_Pipeline4Patient_DataFlow_OC.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "FHIR_Pipeline4Patient_DataFlow_OC",
3 | "properties": {
4 | "activities": [
5 | {
6 | "name": "NDJSON_Ingestion_Patient",
7 | "type": "SynapseNotebook",
8 | "dependsOn": [],
9 | "policy": {
10 | "timeout": "7.00:00:00",
11 | "retry": 0,
12 | "retryIntervalInSeconds": 30,
13 | "secureOutput": false,
14 | "secureInput": false
15 | },
16 | "userProperties": [],
17 | "typeProperties": {
18 | "notebook": {
19 | "referenceName": "Patient_Ingestion_NDJSON2Parquet",
20 | "type": "NotebookReference"
21 | },
22 | "parameters": {
23 | "StorageName": {
24 | "value": {
25 | "value": "@pipeline().parameters.StorageName",
26 | "type": "Expression"
27 | },
28 | "type": "string"
29 | },
30 | "DatasetSize": {
31 | "value": {
32 | "value": "@pipeline().parameters.DatasetSize",
33 | "type": "Expression"
34 | },
35 | "type": "string"
36 | }
37 | },
38 | "snapshot": true,
39 | "sparkPool": {
40 | "referenceName": {
41 | "value": "@pipeline().parameters.SparkPoolName",
42 | "type": "Expression"
43 | },
44 | "type": "BigDataPoolReference"
45 | },
46 | "conf": {
47 | "spark.dynamicAllocation.enabled": null,
48 | "spark.dynamicAllocation.minExecutors": null,
49 | "spark.dynamicAllocation.maxExecutors": null
50 | },
51 | "numExecutors": null
52 | }
53 | },
54 | {
55 | "name": "PatientParquet2Sink",
56 | "type": "ExecuteDataFlow",
57 | "dependsOn": [
58 | {
59 | "activity": "NDJSON_Ingestion_Patient",
60 | "dependencyConditions": [
61 | "Succeeded"
62 | ]
63 | }
64 | ],
65 | "policy": {
66 | "timeout": "1.00:00:00",
67 | "retry": 0,
68 | "retryIntervalInSeconds": 30,
69 | "secureOutput": false,
70 | "secureInput": false
71 | },
72 | "userProperties": [],
73 | "typeProperties": {
74 | "dataflow": {
75 | "referenceName": "PatientJSON_Flatten_large",
76 | "type": "DataFlowReference",
77 | "datasetParameters": {
78 | "PatientNDJSON": {
79 | "StorageName": {
80 | "value": "@pipeline().parameters.StorageName",
81 | "type": "Expression"
82 | },
83 | "DatasetSize": {
84 | "value": "@pipeline().parameters.DatasetSize",
85 | "type": "Expression"
86 | }
87 | },
88 | "sinkPatientIdentifier": {
89 | "StorageName": {
90 | "value": "@pipeline().parameters.StorageName",
91 | "type": "Expression"
92 | },
93 | "DatasetSize": {
94 | "value": "@pipeline().parameters.DatasetSize",
95 | "type": "Expression"
96 | }
97 | },
98 | "sinkPatientExtension": {
99 | "StorageName": {
100 | "value": "@pipeline().parameters.StorageName",
101 | "type": "Expression"
102 | },
103 | "DatasetSize": {
104 | "value": "@pipeline().parameters.DatasetSize",
105 | "type": "Expression"
106 | }
107 | },
108 | "sinkPatientAddress": {
109 | "StorageName": {
110 | "value": "@pipeline().parameters.StorageName",
111 | "type": "Expression"
112 | },
113 | "DatasetSize": {
114 | "value": "@pipeline().parameters.DatasetSize",
115 | "type": "Expression"
116 | }
117 | }
118 | }
119 | },
120 | "compute": {
121 | "coreCount": 8,
122 | "computeType": "General"
123 | },
124 | "traceLevel": "Fine",
125 | "runConcurrently": true
126 | }
127 | },
128 | {
129 | "name": "PatientAddress_large2SQL",
130 | "type": "Copy",
131 | "dependsOn": [
132 | {
133 | "activity": "Create Tables",
134 | "dependencyConditions": [
135 | "Succeeded"
136 | ]
137 | }
138 | ],
139 | "policy": {
140 | "timeout": "7.00:00:00",
141 | "retry": 0,
142 | "retryIntervalInSeconds": 30,
143 | "secureOutput": false,
144 | "secureInput": false
145 | },
146 | "userProperties": [],
147 | "typeProperties": {
148 | "source": {
149 | "type": "ParquetSource",
150 | "storeSettings": {
151 | "type": "AzureBlobFSReadSettings",
152 | "recursive": true,
153 | "wildcardFolderPath": {
154 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/PatientAddress')",
155 | "type": "Expression"
156 | },
157 | "wildcardFileName": "*.parquet",
158 | "enablePartitionDiscovery": false
159 | }
160 | },
161 | "sink": {
162 | "type": "SqlDWSink",
163 | "allowCopyCommand": true,
164 | "tableOption": "autoCreate",
165 | "disableMetricsCollection": false
166 | },
167 | "enableStaging": true,
168 | "stagingSettings": {
169 | "linkedServiceName": {
170 | "referenceName": "StorageLS",
171 | "type": "LinkedServiceReference",
172 | "parameters": {
173 | "StorageName": {
174 | "value": "@pipeline().parameters.StorageName",
175 | "type": "Expression"
176 | }
177 | }
178 | },
179 | "path": "staging"
180 | },
181 | "translator": {
182 | "type": "TabularTranslator",
183 | "mappings": [
184 | {
185 | "source": {
186 | "name": "id",
187 | "type": "String"
188 | },
189 | "sink": {
190 | "name": "id",
191 | "type": "String"
192 | }
193 | },
194 | {
195 | "source": {
196 | "name": "address.city",
197 | "type": "String"
198 | },
199 | "sink": {
200 | "name": "address.city",
201 | "type": "String"
202 | }
203 | },
204 | {
205 | "source": {
206 | "name": "address.country",
207 | "type": "String"
208 | },
209 | "sink": {
210 | "name": "address.country",
211 | "type": "String"
212 | }
213 | },
214 | {
215 | "source": {
216 | "name": "address.extension.extension.url",
217 | "type": "String"
218 | },
219 | "sink": {
220 | "name": "address.extension.extension.url",
221 | "type": "String"
222 | }
223 | },
224 | {
225 | "source": {
226 | "name": "address.extension.extension.valueDecimal",
227 | "type": "Double"
228 | },
229 | "sink": {
230 | "name": "address.extension.extension.valueDecimal",
231 | "type": "Double"
232 | }
233 | },
234 | {
235 | "source": {
236 | "name": "address.extension.url",
237 | "type": "String"
238 | },
239 | "sink": {
240 | "name": "address.extension.url",
241 | "type": "String"
242 | }
243 | },
244 | {
245 | "source": {
246 | "name": "address.postalCode",
247 | "type": "String"
248 | },
249 | "sink": {
250 | "name": "address.postalCode",
251 | "type": "String"
252 | }
253 | },
254 | {
255 | "source": {
256 | "name": "address.state",
257 | "type": "String"
258 | },
259 | "sink": {
260 | "name": "address.state",
261 | "type": "String"
262 | }
263 | }
264 | ]
265 | }
266 | },
267 | "inputs": [
268 | {
269 | "referenceName": "PatientAddressParquetLarge",
270 | "type": "DatasetReference",
271 | "parameters": {
272 | "StorageName": {
273 | "value": "@pipeline().parameters.StorageName",
274 | "type": "Expression"
275 | },
276 | "DatasetSize": {
277 | "value": "@pipeline().parameters.DatasetSize",
278 | "type": "Expression"
279 | }
280 | }
281 | }
282 | ],
283 | "outputs": [
284 | {
285 | "referenceName": "PatientAddressSQL",
286 | "type": "DatasetReference",
287 | "parameters": {
288 | "DatabaseName": {
289 | "value": "@pipeline().parameters.DatabaseName",
290 | "type": "Expression"
291 | },
292 | "ServerName": {
293 | "value": "@pipeline().parameters.ServerName",
294 | "type": "Expression"
295 | }
296 | }
297 | }
298 | ]
299 | },
300 | {
301 | "name": "PatientIdentifier_large2SQL",
302 | "type": "Copy",
303 | "dependsOn": [
304 | {
305 | "activity": "Create Tables",
306 | "dependencyConditions": [
307 | "Succeeded"
308 | ]
309 | }
310 | ],
311 | "policy": {
312 | "timeout": "7.00:00:00",
313 | "retry": 0,
314 | "retryIntervalInSeconds": 30,
315 | "secureOutput": false,
316 | "secureInput": false
317 | },
318 | "userProperties": [],
319 | "typeProperties": {
320 | "source": {
321 | "type": "ParquetSource",
322 | "storeSettings": {
323 | "type": "AzureBlobFSReadSettings",
324 | "recursive": true,
325 | "wildcardFolderPath": {
326 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/PatientIdentifier')",
327 | "type": "Expression"
328 | },
329 | "wildcardFileName": "*.parquet",
330 | "enablePartitionDiscovery": false
331 | }
332 | },
333 | "sink": {
334 | "type": "SqlDWSink",
335 | "allowCopyCommand": true,
336 | "tableOption": "autoCreate",
337 | "disableMetricsCollection": false
338 | },
339 | "enableStaging": true,
340 | "stagingSettings": {
341 | "linkedServiceName": {
342 | "referenceName": "StorageLS",
343 | "type": "LinkedServiceReference",
344 | "parameters": {
345 | "StorageName": {
346 | "value": "@pipeline().parameters.StorageName",
347 | "type": "Expression"
348 | }
349 | }
350 | },
351 | "path": "staging"
352 | },
353 | "translator": {
354 | "type": "TabularTranslator",
355 | "mappings": [
356 | {
357 | "source": {
358 | "name": "patient_id",
359 | "type": "String"
360 | },
361 | "sink": {
362 | "name": "patient_id",
363 | "type": "String"
364 | }
365 | },
366 | {
367 | "source": {
368 | "name": "birthDate",
369 | "type": "String"
370 | },
371 | "sink": {
372 | "name": "birthDate",
373 | "type": "String"
374 | }
375 | },
376 | {
377 | "source": {
378 | "name": "deceasedDateTime",
379 | "type": "String"
380 | },
381 | "sink": {
382 | "name": "deceasedDateTime",
383 | "type": "String"
384 | }
385 | },
386 | {
387 | "source": {
388 | "name": "gender",
389 | "type": "String"
390 | },
391 | "sink": {
392 | "name": "gender",
393 | "type": "String"
394 | }
395 | },
396 | {
397 | "source": {
398 | "name": "text",
399 | "type": "String"
400 | },
401 | "sink": {
402 | "name": "text",
403 | "type": "String"
404 | }
405 | },
406 | {
407 | "source": {
408 | "name": "multipleBirthBoolean",
409 | "type": "Boolean"
410 | },
411 | "sink": {
412 | "name": "multipleBirthBoolean",
413 | "type": "Boolean"
414 | }
415 | },
416 | {
417 | "source": {
418 | "name": "multipleBirthInteger",
419 | "type": "Int64"
420 | },
421 | "sink": {
422 | "name": "multipleBirthInteger",
423 | "type": "Int64"
424 | }
425 | },
426 | {
427 | "source": {
428 | "name": "resourceType",
429 | "type": "String"
430 | },
431 | "sink": {
432 | "name": "resourceType",
433 | "type": "String"
434 | }
435 | },
436 | {
437 | "source": {
438 | "name": "div",
439 | "type": "String"
440 | },
441 | "sink": {
442 | "name": "div",
443 | "type": "String"
444 | }
445 | },
446 | {
447 | "source": {
448 | "name": "status",
449 | "type": "String"
450 | },
451 | "sink": {
452 | "name": "status",
453 | "type": "String"
454 | }
455 | },
456 | {
457 | "source": {
458 | "name": "identifier.system",
459 | "type": "String"
460 | },
461 | "sink": {
462 | "name": "identifier.system",
463 | "type": "String"
464 | }
465 | },
466 | {
467 | "source": {
468 | "name": "identifier.type.coding.code",
469 | "type": "String"
470 | },
471 | "sink": {
472 | "name": "identifier.type.coding.code",
473 | "type": "String"
474 | }
475 | },
476 | {
477 | "source": {
478 | "name": "identifier.type.coding.display",
479 | "type": "String"
480 | },
481 | "sink": {
482 | "name": "identifier.type.coding.display",
483 | "type": "String"
484 | }
485 | },
486 | {
487 | "source": {
488 | "name": "identifier.type.coding.system",
489 | "type": "String"
490 | },
491 | "sink": {
492 | "name": "identifier.type.coding.system",
493 | "type": "String"
494 | }
495 | },
496 | {
497 | "source": {
498 | "name": "identifier.type.text",
499 | "type": "String"
500 | },
501 | "sink": {
502 | "name": "identifier.type.text",
503 | "type": "String"
504 | }
505 | },
506 | {
507 | "source": {
508 | "name": "identifier.value",
509 | "type": "String"
510 | },
511 | "sink": {
512 | "name": "identifier.value",
513 | "type": "String"
514 | }
515 | }
516 | ]
517 | }
518 | },
519 | "inputs": [
520 | {
521 | "referenceName": "PatientIdentifierParquetLarge",
522 | "type": "DatasetReference",
523 | "parameters": {
524 | "StorageName": {
525 | "value": "@pipeline().parameters.StorageName",
526 | "type": "Expression"
527 | },
528 | "DatasetSize": {
529 | "value": "@pipeline().parameters.DatasetSize",
530 | "type": "Expression"
531 | }
532 | }
533 | }
534 | ],
535 | "outputs": [
536 | {
537 | "referenceName": "PatientIdentifierSQLLarge",
538 | "type": "DatasetReference",
539 | "parameters": {
540 | "DatabaseName": {
541 | "value": "@pipeline().parameters.DatabaseName",
542 | "type": "Expression"
543 | },
544 | "ServerName": {
545 | "value": "@pipeline().parameters.ServerName",
546 | "type": "Expression"
547 | }
548 | }
549 | }
550 | ]
551 | },
552 | {
553 | "name": "Create Tables",
554 | "type": "Script",
555 | "dependsOn": [
556 | {
557 | "activity": "PatientParquet2Sink",
558 | "dependencyConditions": [
559 | "Succeeded"
560 | ]
561 | }
562 | ],
563 | "policy": {
564 | "timeout": "0.12:00:00",
565 | "retry": 0,
566 | "retryIntervalInSeconds": 30,
567 | "secureOutput": false,
568 | "secureInput": false
569 | },
570 | "userProperties": [],
571 | "linkedServiceName": {
572 | "referenceName": "SynapseDedicatedPoolLS",
573 | "type": "LinkedServiceReference",
574 | "parameters": {
575 | "DatabaseName": {
576 | "value": "@pipeline().parameters.DatabaseName",
577 | "type": "Expression"
578 | },
579 | "ServerName": {
580 | "value": "@pipeline().parameters.ServerName",
581 | "type": "Expression"
582 | }
583 | }
584 | },
585 | "typeProperties": {
586 | "scripts": [
587 | {
588 | "type": "Query",
589 | "text": "IF OBJECT_ID('fhir.PatientAddress') IS NOT NULL\r\nBEGIN\r\n DROP TABLE [fhir].[PatientAddress]\r\nEND\r\n\r\n\r\nCREATE TABLE [fhir].[PatientAddress]\r\n( \r\n\t[id] [nvarchar](64) NULL,\r\n\t[address.city] [nvarchar](50) NULL,\r\n\t[address.country] [nvarchar](50) NULL,\r\n\t[address.extension.extension.url] [nvarchar](50) NULL,\r\n\t[address.extension.extension.valueDecimal] [float] NULL,\r\n\t[address.extension.url] [nvarchar](1000) NULL,\r\n\t[address.postalCode] [nvarchar](50) NULL,\r\n\t[address.state] [nvarchar](50) NULL\r\n)\r\nWITH\r\n(\r\n\tDISTRIBUTION = ROUND_ROBIN,\r\n\tHEAP\r\n)\r\n\r\nIF OBJECT_ID('fhir.PatientIdentifier') IS NOT NULL\r\nBEGIN\r\n DROP TABLE [fhir].[PatientIdentifier]\r\nEND\r\n\r\nCREATE TABLE [fhir].[PatientIdentifier]\r\n( \r\n\t[patient_id] [nvarchar](64) NULL,\r\n\t[birthDate] [nvarchar](200) NULL,\r\n\t[deceasedDateTime] [nvarchar](200) NULL,\r\n\t[gender] [nvarchar](50) NULL,\r\n\t[text] [nvarchar](1000) NULL,\r\n\t[multipleBirthBoolean] [bit] NULL,\r\n\t[multipleBirthInteger] [bigint] NULL,\r\n\t[resourceType] [nvarchar](100) NULL,\r\n\t[div] [nvarchar](max) NULL,\r\n\t[status] [nvarchar](500) NULL,\r\n\t[identifier.system] [nvarchar](2000) NULL,\r\n\t[identifier.type.coding.code] [nvarchar](500) NULL,\r\n\t[identifier.type.coding.display] [nvarchar](1000) NULL,\r\n\t[identifier.type.coding.system] [nvarchar](1000) NULL,\r\n\t[identifier.type.text] [nvarchar](1000) NULL,\r\n\t[identifier.value] [nvarchar](640) NULL\r\n)\r\nWITH\r\n(\r\n\tDISTRIBUTION = ROUND_ROBIN,\r\n\tHEAP\r\n)"
590 | }
591 | ]
592 | }
593 | }
594 | ],
595 | "parameters": {
596 | "StorageName": {
597 | "type": "string",
598 | "defaultValue": "synapsee2elake"
599 | },
600 | "DatabaseName": {
601 | "type": "string",
602 | "defaultValue": "synapsee2edw"
603 | },
604 | "ServerName": {
605 | "type": "string",
606 | "defaultValue": "synapsee2e"
607 | },
608 | "SparkPoolName": {
609 | "type": "string",
610 | "defaultValue": "synapsee2espark"
611 | },
612 | "DatasetSize": {
613 | "type": "string",
614 | "defaultValue": "1tb"
615 | }
616 | },
617 | "folder": {
618 | "name": "Patient"
619 | },
620 | "annotations": []
621 | }
622 | }
--------------------------------------------------------------------------------
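The pipeline definition above ends with a staged copy into the dedicated SQL pool (SqlDWSink with the COPY command and an explicit TabularTranslator mapping) plus a "Create Tables" Script activity, and it takes five parameters (StorageName, DatabaseName, ServerName, SparkPoolName, DatasetSize) with the defaults shown. A minimal sketch of triggering such a pipeline from Python, assuming the azure-identity and azure-synapse-artifacts packages and using a placeholder workspace endpoint and pipeline name:

```python
# Minimal sketch: start a run of a pipeline like the one above with its declared parameters.
# Assumptions: `pip install azure-identity azure-synapse-artifacts`; the workspace
# endpoint and the pipeline name are placeholders, not values taken from this repo.
from azure.identity import DefaultAzureCredential
from azure.synapse.artifacts import ArtifactsClient

endpoint = "https://<your-workspace>.dev.azuresynapse.net"  # placeholder
client = ArtifactsClient(credential=DefaultAzureCredential(), endpoint=endpoint)

run = client.pipeline.create_pipeline_run(
    "<patient-ingestion-pipeline>",  # placeholder pipeline name
    parameters={
        # Defaults copied from the pipeline's parameter section above.
        "StorageName": "synapsee2elake",
        "DatabaseName": "synapsee2edw",
        "ServerName": "synapsee2e",
        "SparkPoolName": "synapsee2espark",
        "DatasetSize": "1tb",
    },
)
print("Pipeline run started:", run.run_id)
```

The returned run id can then be polled with client.pipeline_run.get_pipeline_run(run.run_id) to watch for completion.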
/artifacts/publish_config.json:
--------------------------------------------------------------------------------
1 | {"publishBranch":"workspace_publish"}
--------------------------------------------------------------------------------
/artifacts/sqlscript/JSON_exploration_w_Serverless_Demo_OC.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "JSON_exploration_w_Serverless_Demo_OC",
3 | "properties": {
4 | "description": "Json data (FHIR NDJSON) exploration with Serverless",
5 | "folder": {
6 | "name": "Exploration"
7 | },
8 | "content": {
9 |         "query": "SELECT TOP 100 *\nFROM\n    OPENROWSET(\n        BULK 'https://medicaldl.dfs.core.windows.net/raw/fhir_ndjson/1tb/*/Observation.ndjson',\n        FORMAT = 'CSV',\n        FIELDQUOTE = '0x0b',\n        FIELDTERMINATOR ='0x0b'\n    )\n    WITH (\n        line varchar(max)\n    ) AS [result]\n\nSELECT TOP 100 *\nFROM\n    OPENROWSET(\n        BULK 'Observation.ndjson',\n        DATA_SOURCE = 'JSONSource',\n        FORMAT = 'CSV',\n        FIELDQUOTE = '0x0b',\n        FIELDTERMINATOR ='0x0b'\n    )\n    WITH (\n        line varchar(max)\n    ) AS [result]\n\n/* Flatten the NDJSON with JSON_VALUE, JSON_QUERY on CROSS APPLY OPENJSON */\n/* reference: https://diangermishuizen.com/query-json-data-in-sql-server-and-synapse-analytics/ */\n/* Query 1, with only JSON_VALUE, JSON_QUERY */\nSELECT top 100\n    JSON_VALUE(line, '$.resourceType') AS resourceType,\n    JSON_VALUE(line, '$.id') AS id,\n    JSON_VALUE(line, '$.status') AS status,\n    JSON_query(line, '$.category') AS category_string ,\n    JSON_query(line, '$.code') AS code_string\nFROM\n    OPENROWSET(\n        BULK 'Observation.ndjson',\n        DATA_SOURCE = 'JSONSource',\n        FORMAT = 'CSV',\n        -- FIELDQUOTE and FIELDTERMINATOR are set to 0x0b as we do not expect to find it in the file.\n        FIELDQUOTE = '0x0b',\n        FIELDTERMINATOR ='0x0b'\n    )\n    WITH (\n        line varchar(max)\n    ) AS [result]\n\n/* Query 2, add CROSS APPLY OPENJSON to read array */\nSELECT top 100\n    JSON_VALUE(line, '$.resourceType') AS resourceType,\n    JSON_VALUE(line, '$.id') AS id,\n    JSON_VALUE(line, '$.status') AS status,\n    JSON_query(line, '$.valueQuantity') AS valueQuantity_string,\n    valueQuantity_NestedArray_value,\n    valueQuantity_NestedArray_unit ,\n    JSON_query(line, '$.category') AS category_string\nFROM\n    OPENROWSET(\n        BULK 'Observation.ndjson',\n        DATA_SOURCE = 'JSONSource',\n        FORMAT = 'CSV',\n        -- FIELDQUOTE and FIELDTERMINATOR are set to 0x0b as we do not expect to find it in the file.\n        FIELDQUOTE = '0x0b',\n        FIELDTERMINATOR ='0x0b'\n    )\n    WITH (\n        line varchar(max)\n    ) AS [result]\nCROSS APPLY OPENJSON \n    (JSON_QUERY([line], '$.valueQuantity')) /*Note, if you want only the top most record from this array, replace this line with \"(JSON_QUERY([jsonContent], '$.attribute_with_nested_array[0]'))\"*/\nWITH(\n    [valueQuantity_NestedArray_value] varchar(255) '$.value',\n    [valueQuantity_NestedArray_unit] varchar(255) '$.unit'\n) AS [valueQuantity_NestedArray]\n\n\n/* Query 3, multiple CROSS APPLY OPENJSON to read */\nSELECT top 100\n    JSON_VALUE(line, '$.resourceType') AS resourceType,\n    JSON_VALUE(line, '$.id') AS id,\n    JSON_VALUE(line, '$.status') AS status,\n    JSON_query(line, '$.valueQuantity') AS valueQuantity_string,\n    valueQuantity_NestedArray_value,\n    valueQuantity_NestedArray_unit ,\n    JSON_query(line, '$.category') AS category_string,\n    encounter_reference\nFROM\n    OPENROWSET(\n        BULK 'Observation.ndjson',\n        DATA_SOURCE = 'JSONSource',\n        FORMAT = 'CSV',\n        -- FIELDQUOTE and FIELDTERMINATOR are set to 0x0b as we do not expect to find it in the file.\n        FIELDQUOTE = '0x0b',\n        FIELDTERMINATOR ='0x0b'\n    )\n    WITH (\n        line varchar(max)\n    ) AS [result]\nCROSS APPLY OPENJSON \n    (JSON_QUERY([line], '$.valueQuantity')) /*Note, if you want only the top most record from this array, replace this line with \"(JSON_QUERY([jsonContent], '$.attribute_with_nested_array[0]'))\"*/\nWITH(\n    [valueQuantity_NestedArray_value] varchar(255) '$.value',\n    [valueQuantity_NestedArray_unit] varchar(255) '$.unit'\n) AS [valueQuantity_NestedArray]\nCROSS APPLY OPENJSON \n    (JSON_QUERY([line], '$.encounter')) \nWITH(\n    [encounter_reference] varchar(255) '$.reference'\n) AS [encounter_reference_NestedArray]",
10 | "metadata": {
11 | "language": "sql"
12 | },
13 | "currentConnection": {
14 | "databaseName": "FHIRRef",
15 | "poolName": "Built-in"
16 | },
17 | "resultLimit": 5000
18 | },
19 | "type": "SqlQuery"
20 | }
21 | }
--------------------------------------------------------------------------------
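The serverless script above pulls each NDJSON line into a single varchar(max) column (FIELDQUOTE and FIELDTERMINATOR set to 0x0b) and then flattens it with JSON_VALUE, JSON_QUERY and CROSS APPLY OPENJSON. A rough PySpark equivalent of the Query 3 flattening, assuming a Synapse Spark session and treating the storage path from the first query as a placeholder:

```python
# Rough PySpark equivalent of the serverless flattening above (resourceType, id, status,
# valueQuantity.value/unit, encounter.reference). Assumption: run inside a Synapse Spark
# notebook with access to the storage account; the path mirrors the BULK path and is a placeholder.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

path = "abfss://raw@medicaldl.dfs.core.windows.net/fhir_ndjson/1tb/*/Observation.ndjson"

# spark.read.json reads one JSON document per line, which matches the NDJSON layout.
observations = spark.read.json(path)

flat = observations.select(
    col("resourceType"),
    col("id"),
    col("status"),
    col("valueQuantity.value").alias("valueQuantity_value"),
    col("valueQuantity.unit").alias("valueQuantity_unit"),
    col("encounter.reference").alias("encounter_reference"),
)
flat.show(100, truncate=False)
```

Persisting a flattened frame like this back out as Parquet is roughly the shape of work the ingestion and flatten notebooks in this repo perform.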
/artifacts/sqlscript/Spark DB Exploration Scripts.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Spark DB Exploration Scripts",
3 | "properties": {
4 | "folder": {
5 | "name": "Exploration"
6 | },
7 | "content": {
8 | "query": "SELECT TOP (100) [Claim_id]\n,[insurance_coverage.display]\n,[insurance_focal]\n,[insurance_sequence]\n FROM [fhirdbexploration].[dbo].[claiminsurance]\n\n SELECT TOP (100) [Claim_id]\n,[procedure_reference]\n,[procedure_sequence]\n FROM [fhirdbexploration].[dbo].[claimprocedure]\n\n SELECT TOP (100) [Claim_id]\n,[diagnosis_reference]\n,[diagnosis_sequence]\n FROM [fhirdbexploration].[dbo].[patientdianosis]",
9 | "metadata": {
10 | "language": "sql"
11 | },
12 | "currentConnection": {
13 | "databaseName": "FHIRRef",
14 | "poolName": "Built-in"
15 | },
16 | "resultLimit": 5000
17 | },
18 | "type": "SqlQuery"
19 | }
20 | }
--------------------------------------------------------------------------------
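The script above explores the fhirdbexploration Spark database from the serverless pool. Because Spark database metadata is shared across engines in Synapse, the same tables can also be read from a notebook; a small sketch, assuming a Spark session in the same workspace and reusing two of the table names from the script:

```python
# Small sketch: read the same Spark database tables from a Synapse notebook via Spark SQL.
# Assumption: the fhirdbexploration database and its tables already exist in this workspace.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

for table in ["claiminsurance", "claimprocedure"]:
    spark.sql(f"SELECT * FROM fhirdbexploration.{table} LIMIT 100").show(truncate=False)
```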
/artifacts/sqlscript/Table Row Count.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Table Row Count",
3 | "properties": {
4 | "content": {
5 | "query": "SELECT COUNT(*)\nFROM [fhir].[ClaimDiagnosis]\n\nSELECT COUNT(*)\nFROM [fhir].[ClaimInsurance]\n\nSELECT COUNT(*)\nFROM [fhir].[ClaimProcedure]\n\nSELECT COUNT(*)\nFROM [fhir].[PatientAddress]\n\nSELECT COUNT(*)\nFROM [fhir].[PatientIdentifier]",
6 | "metadata": {
7 | "language": "sql"
8 | },
9 | "currentConnection": {
10 | "databaseName": "synapsee2edw",
11 | "poolName": "synapsee2edw"
12 | },
13 | "resultLimit": 5000
14 | },
15 | "type": "SqlQuery"
16 | }
17 | }
--------------------------------------------------------------------------------
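The row-count script runs against the dedicated pool (database and pool both named synapsee2edw in currentConnection). For the same sanity check from outside Synapse Studio, a hedged pyodbc sketch; the ODBC driver name, the Azure AD interactive authentication choice, and the server address (built from the ServerName default shown in the pipeline parameters earlier) are assumptions:

```python
# Hedged sketch: repeat the row-count checks against the dedicated SQL pool from Python.
# Assumptions: pyodbc plus "ODBC Driver 18 for SQL Server" are installed, Azure AD
# interactive sign-in is acceptable, and server/database names follow the defaults above.
import pyodbc

conn_str = (
    "Driver={ODBC Driver 18 for SQL Server};"
    "Server=tcp:synapsee2e.sql.azuresynapse.net,1433;"
    "Database=synapsee2edw;"
    "Authentication=ActiveDirectoryInteractive;"
)

tables = [
    "fhir.ClaimDiagnosis",
    "fhir.ClaimInsurance",
    "fhir.ClaimProcedure",
    "fhir.PatientAddress",
    "fhir.PatientIdentifier",
]

conn = pyodbc.connect(conn_str)
try:
    cursor = conn.cursor()
    for table in tables:
        count = cursor.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
        print(f"{table}: {count} rows")
finally:
    conn.close()
```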
/mybigdata/credential/WorkspaceSystemIdentity.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "WorkspaceSystemIdentity",
3 | "properties": {
4 | "type": "ManagedIdentity"
5 | }
6 | }
--------------------------------------------------------------------------------
/mybigdata/integrationRuntime/AutoResolveIntegrationRuntime.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "AutoResolveIntegrationRuntime",
3 | "properties": {
4 | "type": "Managed",
5 | "typeProperties": {
6 | "computeProperties": {
7 | "location": "AutoResolve",
8 | "dataFlowProperties": {
9 | "computeType": "General",
10 | "coreCount": 8,
11 | "timeToLive": 0
12 | }
13 | }
14 | }
15 | }
16 | }
--------------------------------------------------------------------------------
/mybigdata/linkedService/mybigdatademows-WorkspaceDefaultSqlServer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "mybigdatademows-WorkspaceDefaultSqlServer",
3 | "type": "Microsoft.Synapse/workspaces/linkedservices",
4 | "properties": {
5 | "typeProperties": {
6 | "connectionString": "Data Source=tcp:mybigdatademows.sql.azuresynapse.net,1433;Initial Catalog=@{linkedService().DBName}"
7 | },
8 | "parameters": {
9 | "DBName": {
10 | "type": "String"
11 | }
12 | },
13 | "type": "AzureSqlDW",
14 | "connectVia": {
15 | "referenceName": "AutoResolveIntegrationRuntime",
16 | "type": "IntegrationRuntimeReference"
17 | },
18 | "annotations": []
19 | }
20 | }
--------------------------------------------------------------------------------
/mybigdata/linkedService/mybigdatademows-WorkspaceDefaultStorage.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "mybigdatademows-WorkspaceDefaultStorage",
3 | "type": "Microsoft.Synapse/workspaces/linkedservices",
4 | "properties": {
5 | "typeProperties": {
6 | "url": "https://mybigdatademos.dfs.core.windows.net/"
7 | },
8 | "type": "AzureBlobFS",
9 | "connectVia": {
10 | "referenceName": "AutoResolveIntegrationRuntime",
11 | "type": "IntegrationRuntimeReference"
12 | },
13 | "annotations": []
14 | }
15 | }
--------------------------------------------------------------------------------
/mybigdata/publish_config.json:
--------------------------------------------------------------------------------
1 | {"publishBranch":"workspace_publish"}
--------------------------------------------------------------------------------