├── .gitignore ├── ARMTemplate ├── azuredeploy.json ├── azuredeploy.parameters.json └── metadata.json ├── Exercise00-Setup ├── README.20230512.001.old └── README.md ├── Exercise01-Claims └── README.md ├── Exercise02-Observations └── README.md ├── Exercise03-Patients └── README.md ├── Images ├── Forking.gif ├── IngestingGif.gif ├── KQLgif01.gif ├── KQLgif02.gif ├── KQLgif03.gif ├── KQLgif04.gif ├── KQLgif05.gif ├── KQLgif06.gif ├── KQLgif07.gif ├── KQLgif08.gif ├── KQLgif09.gif └── deploytoazure.svg ├── LICENSE ├── README.md ├── SECURITY.md ├── Troubleshooting └── Readme.md ├── artifacts ├── Observation Table Creation.kql ├── Observations Analytics w KQL.kql ├── credential │ └── WorkspaceSystemIdentity.json ├── dataflow │ └── PatientJSON_Flatten_large.json ├── dataset │ ├── ClaimDiagnosisParquetLarge.json │ ├── ClaimDiagnosisSQL.json │ ├── ClaimInsurance.json │ ├── ClaimInsuranceParquetLarge.json │ ├── ClaimProcedureParquetLarge.json │ ├── ClaimProcedureSQL.json │ ├── ObservationMain_LargeParquet.json │ ├── Observation_SQLDS.json │ ├── PatientAddressParquetLarge.json │ ├── PatientAddressSQL.json │ ├── PatientExtensionParquetLarge.json │ ├── PatientIdentifierParquetLarge.json │ ├── PatientIdentifierSQLLarge.json │ ├── PatientRawParquetLarge.json │ ├── Sink_DataPrep_Curated_DS.json │ ├── Sink_DataPrep_DS.json │ ├── Sink_DataPrep_Processed_DS.json │ ├── Source_DataPrep_Curated_DS.json │ ├── Source_DataPrep_DS.json │ └── Source_DataPrep_Processed_DS.json ├── integrationRuntime │ └── AutoResolveIntegrationRuntime.json ├── linkedService │ ├── Source_Dataset_LS.json │ ├── StorageLS.json │ └── SynapseDedicatedPoolLS.json ├── notebook │ ├── ClaimParquetFlatten_Large.json │ ├── Claim_Ingestion_NDJSON2Parquet.json │ ├── Lake Database And Table Creation.json │ ├── ObservationParquetFlatten_Large.json │ ├── Observation_Ingestion_NDJSON2Parquet.json │ └── Patient_Ingestion_NDJSON2Parquet.json ├── pipeline │ ├── Copy_Data_Source_To_Raw_PL.json │ ├── FHIR_Pipeline4Claim_Spark_OC.json │ ├── FHIR_Pipeline4Observation_Spark_OC.json │ └── FHIR_Pipeline4Patient_DataFlow_OC.json ├── publish_config.json └── sqlscript │ ├── JSON_exploration_w_Serverless_Demo_OC.json │ ├── Spark DB Exploration Scripts.json │ └── Table Row Count.json └── mybigdata ├── credential └── WorkspaceSystemIdentity.json ├── integrationRuntime └── AutoResolveIntegrationRuntime.json ├── linkedService ├── mybigdatademows-WorkspaceDefaultSqlServer.json └── mybigdatademows-WorkspaceDefaultStorage.json └── publish_config.json /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | *.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 | *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # TeamCity is a build add-in 131 | _TeamCity* 132 | 133 | # DotCover is a Code Coverage Tool 134 | *.dotCover 135 | 136 | # AxoCover is a Code Coverage Tool 137 | .axoCover/* 138 | !.axoCover/settings.json 139 | 140 | # Visual Studio code coverage results 141 | *.coverage 142 | *.coveragexml 143 | 144 | # NCrunch 145 | _NCrunch_* 146 | .*crunch*.local.xml 147 | nCrunchTemp_* 148 | 149 | # MightyMoose 150 | *.mm.* 151 | AutoTest.Net/ 152 | 153 | # Web workbench (sass) 154 | .sass-cache/ 155 | 156 | # Installshield output folder 157 | [Ee]xpress/ 158 | 159 | # DocProject is a documentation generator add-in 160 | DocProject/buildhelp/ 161 | DocProject/Help/*.HxT 162 | DocProject/Help/*.HxC 163 | DocProject/Help/*.hhc 164 | DocProject/Help/*.hhk 165 | DocProject/Help/*.hhp 166 | DocProject/Help/Html2 167 | DocProject/Help/html 168 | 169 | # Click-Once directory 170 | publish/ 171 | 172 | # Publish Web Output 173 | *.[Pp]ublish.xml 174 | *.azurePubxml 175 | # Note: Comment the next line if you want to checkin your web deploy settings, 176 | # but database connection strings (with potential passwords) will be unencrypted 177 | *.pubxml 178 | *.publishproj 179 | 180 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 181 | # checkin your Azure Web App publish settings, but sensitive information contained 182 | # in these scripts will be unencrypted 183 | PublishScripts/ 184 | 185 | # NuGet Packages 186 | *.nupkg 187 | # NuGet Symbol Packages 188 | *.snupkg 189 | # The packages folder can be ignored because of Package Restore 190 | **/[Pp]ackages/* 191 | # except build/, which is used as an MSBuild target. 192 | !**/[Pp]ackages/build/ 193 | # Uncomment if necessary however generally it will be regenerated when needed 194 | #!**/[Pp]ackages/repositories.config 195 | # NuGet v3's project.json files produces more ignorable files 196 | *.nuget.props 197 | *.nuget.targets 198 | 199 | # Microsoft Azure Build Output 200 | csx/ 201 | *.build.csdef 202 | 203 | # Microsoft Azure Emulator 204 | ecf/ 205 | rcf/ 206 | 207 | # Windows Store app package directories and files 208 | AppPackages/ 209 | BundleArtifacts/ 210 | Package.StoreAssociation.xml 211 | _pkginfo.txt 212 | *.appx 213 | *.appxbundle 214 | *.appxupload 215 | 216 | # Visual Studio cache files 217 | # files ending in .cache can be ignored 218 | *.[Cc]ache 219 | # but keep track of directories ending in .cache 220 | !?*.[Cc]ache/ 221 | 222 | # Others 223 | ClientBin/ 224 | ~$* 225 | *~ 226 | *.dbmdl 227 | *.dbproj.schemaview 228 | *.jfm 229 | *.pfx 230 | *.publishsettings 231 | orleans.codegen.cs 232 | 233 | # Including strong name files can present a security risk 234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 235 | #*.snk 236 | 237 | # Since there are multiple workflows, uncomment next line to ignore bower_components 238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 239 | #bower_components/ 240 | 241 | # RIA/Silverlight projects 242 | Generated_Code/ 243 | 244 | # Backup & report files from converting an old project file 245 | # to a newer Visual Studio version. Backup files are not needed, 246 | # because we have git ;-) 247 | _UpgradeReport_Files/ 248 | Backup*/ 249 | UpgradeLog*.XML 250 | UpgradeLog*.htm 251 | ServiceFabricBackup/ 252 | *.rptproj.bak 253 | 254 | # SQL Server files 255 | *.mdf 256 | *.ldf 257 | *.ndf 258 | 259 | # Business Intelligence projects 260 | *.rdl.data 261 | *.bim.layout 262 | *.bim_*.settings 263 | *.rptproj.rsuser 264 | *- [Bb]ackup.rdl 265 | *- [Bb]ackup ([0-9]).rdl 266 | *- [Bb]ackup ([0-9][0-9]).rdl 267 | 268 | # Microsoft Fakes 269 | FakesAssemblies/ 270 | 271 | # GhostDoc plugin setting file 272 | *.GhostDoc.xml 273 | 274 | # Node.js Tools for Visual Studio 275 | .ntvs_analysis.dat 276 | node_modules/ 277 | 278 | # Visual Studio 6 build log 279 | *.plg 280 | 281 | # Visual Studio 6 workspace options file 282 | *.opt 283 | 284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
285 | *.vbw 286 | 287 | # Visual Studio LightSwitch build output 288 | **/*.HTMLClient/GeneratedArtifacts 289 | **/*.DesktopClient/GeneratedArtifacts 290 | **/*.DesktopClient/ModelManifest.xml 291 | **/*.Server/GeneratedArtifacts 292 | **/*.Server/ModelManifest.xml 293 | _Pvt_Extensions 294 | 295 | # Paket dependency manager 296 | .paket/paket.exe 297 | paket-files/ 298 | 299 | # FAKE - F# Make 300 | .fake/ 301 | 302 | # CodeRush personal settings 303 | .cr/personal 304 | 305 | # Python Tools for Visual Studio (PTVS) 306 | __pycache__/ 307 | *.pyc 308 | 309 | # Cake - Uncomment if you are using it 310 | # tools/** 311 | # !tools/packages.config 312 | 313 | # Tabs Studio 314 | *.tss 315 | 316 | # Telerik's JustMock configuration file 317 | *.jmconfig 318 | 319 | # BizTalk build output 320 | *.btp.cs 321 | *.btm.cs 322 | *.odx.cs 323 | *.xsd.cs 324 | 325 | # OpenCover UI analysis results 326 | OpenCover/ 327 | 328 | # Azure Stream Analytics local run output 329 | ASALocalRun/ 330 | 331 | # MSBuild Binary and Structured Log 332 | *.binlog 333 | 334 | # NVidia Nsight GPU debugger configuration file 335 | *.nvuser 336 | 337 | # MFractors (Xamarin productivity tool) working folder 338 | .mfractor/ 339 | 340 | # Local History for Visual Studio 341 | .localhistory/ 342 | 343 | # BeatPulse healthcheck temp database 344 | healthchecksdb 345 | 346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 347 | MigrationBackup/ 348 | 349 | # Ionide (cross platform F# VS Code tools) working folder 350 | .ionide/ -------------------------------------------------------------------------------- /ARMTemplate/azuredeploy.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "allowAllConnections": { 6 | "type": "string", 7 | "allowedValues": [ 8 | "true", 9 | "false" 10 | ], 11 | "defaultValue": "true" 12 | }, 13 | "StorageAcccountName": { 14 | "type": "string", 15 | "defaultValue": "", 16 | "metadata": { 17 | "description": "Name of the StorageAccount" 18 | } 19 | }, 20 | "StorageContainerName": { 21 | "type": "string", 22 | "defaultValue": "", 23 | "metadata": { 24 | "description": "Name of the Container in Storage Account" 25 | } 26 | }, 27 | "WorkspaceName": { 28 | "type": "string", 29 | "defaultValue": "", 30 | "metadata": { 31 | "description": "Name of the Synapse Workspace" 32 | } 33 | }, 34 | "ManagedResourceGroupName": { 35 | "type": "string", 36 | "defaultValue": "", 37 | "metadata": { 38 | "description": "Name of the Managed Resource Group for Synapse" 39 | } 40 | }, 41 | "SqlPoolName": { 42 | "type": "string", 43 | "defaultValue": "", 44 | "metadata": { 45 | "description": "Name of the dedicated sql pool" 46 | } 47 | }, 48 | "SparkPoolName": { 49 | "type": "string", 50 | "defaultValue": "", 51 | "maxLength": 15, 52 | "metadata": { 53 | "description": "Name of the synapse spark pool" 54 | } 55 | }, 56 | "sparkDeployment": { 57 | "type": "string", 58 | "defaultValue": "true", 59 | "allowedValues": [ 60 | "true", 61 | "false" 62 | ], 63 | "metadata": { 64 | "description": "'True' deploys an Apache Spark pool as well as a SQL pool. 'False' does not deploy an Apache Spark pool." 
65 | } 66 | }, 67 | "sparkNodeSize": { 68 | "type": "string", 69 | "defaultValue": "Medium", 70 | "allowedValues": [ 71 | "Small", 72 | "Medium", 73 | "Large" 74 | ], 75 | "metadata": { 76 | "description": "This parameter will determine the node size if SparkDeployment is true" 77 | } 78 | }, 79 | "sqlAdministratorLogin": { 80 | "type": "string", 81 | "metadata": { 82 | "description": "The username of the SQL Administrator" 83 | } 84 | }, 85 | "sqlAdministratorLoginPassword": { 86 | "type": "securestring", 87 | "metadata": { 88 | "description": "The password for the SQL Administrator" 89 | } 90 | }, 91 | "sku": { 92 | "type": "string", 93 | "defaultValue": "DW1000c", 94 | "allowedValues": [ 95 | "DW100c", 96 | "DW200c", 97 | "DW300c", 98 | "DW400c", 99 | "DW500c", 100 | "DW1000c", 101 | "DW1500c", 102 | "DW2000c", 103 | "DW2500c", 104 | "DW3000c" 105 | ], 106 | "metadata": { 107 | "description": "Select the SKU of the SQL pool." 108 | } 109 | }, 110 | "metadataSync": { 111 | "type": "bool", 112 | "defaultValue": false, 113 | "metadata": { 114 | "description": "Choose whether you want to synchronise metadata." 115 | } 116 | }, 117 | "githubUsername": { 118 | "type": "string", 119 | "defaultValue": "", 120 | "metadata": { 121 | "description": "Username of your github account hosting synapse workspace resources" 122 | } 123 | } 124 | }, 125 | "variables": { 126 | "_artifactsLocation": "[deployment().properties.templatelink.uri]", 127 | "location": "[resourceGroup().location]", 128 | "deploymentType": "synapseworkspacedeployment", 129 | "dlsName": "[toLower(parameters('StorageAcccountName'))]", 130 | "dlsFsName": "[toLower(parameters('StorageContainerName'))]", 131 | "sqlPoolName": "[toLower(parameters('SqlPoolName'))]", 132 | "workspaceName": "[toLower(parameters('WorkspaceName'))]", 133 | "sparkPoolName": "[toLower(parameters('SparkPoolName'))]", 134 | "storageAccountId": "[resourceId('Microsoft.Storage/storageAccounts', variables('dlsName'))]" 135 | }, 136 | "resources": [ 137 | { 138 | "type": "Microsoft.Storage/storageAccounts", 139 | "apiVersion": "2019-06-01", 140 | "name": "[variables('dlsName')]", 141 | "location": "[variables('location')]", 142 | "sku": { 143 | "name": "Standard_LRS" 144 | }, 145 | "kind": "StorageV2", 146 | "properties": { 147 | "accessTier": "Hot", 148 | "supportsHttpsTrafficOnly": true, 149 | "isHnsEnabled": true 150 | }, 151 | "resources": [ 152 | { 153 | "name": "[concat('default/', variables('dlsFsName'))]", 154 | "type": "blobServices/containers", 155 | "apiVersion": "2019-06-01", 156 | "dependsOn": [ 157 | "[variables('dlsName')]" 158 | ], 159 | "properties": { 160 | "publicAccess": "None" 161 | } 162 | } 163 | ] 164 | }, 165 | { 166 | "type": "Microsoft.Synapse/workspaces", 167 | "apiVersion": "2021-06-01-preview", 168 | "name": "[variables('workspaceName')]", 169 | "location": "[variables('location')]", 170 | "identity": { 171 | "type": "SystemAssigned" 172 | }, 173 | "dependsOn": [ 174 | "[variables('dlsName')]", 175 | "[variables('dlsFsName')]" 176 | ], 177 | "properties": { 178 | "defaultDataLakeStorage": { 179 | "accountUrl": "[reference(variables('dlsName')).primaryEndpoints.dfs]", 180 | "filesystem": "[variables('dlsFsName')]" 181 | }, 182 | "sqlAdministratorLogin": "[parameters('sqlAdministratorLogin')]", 183 | "sqlAdministratorLoginPassword": "[parameters('sqlAdministratorLoginPassword')]", 184 | "publicNetworkAccess": "Enabled", 185 | "managedResourceGroupName": "[parameters('ManagedResourceGroupName')]", 186 | "workspaceRepositoryConfiguration": { 
187 | "type": "WorkspaceGitHubConfiguration", 188 | "hostName": "https://github.com", 189 | "accountName": "[parameters('githubUsername')]", 190 | "repositoryName": "AzureSynapseEndToEndDemo", 191 | "rootFolder": "/artifacts", 192 | "collaborationBranch": "main" 193 | } 194 | }, 195 | "resources": [ 196 | { 197 | "condition": "[equals(parameters('allowAllConnections'),'true')]", 198 | "type": "firewallrules", 199 | "apiVersion": "2019-06-01-preview", 200 | "name": "allowAll", 201 | "location": "[variables('location')]", 202 | "dependsOn": [ "[variables('workspaceName')]" ], 203 | "properties": { 204 | "startIpAddress": "0.0.0.0", 205 | "endIpAddress": "255.255.255.255" 206 | } 207 | }, 208 | { 209 | "type": "firewallrules", 210 | "apiVersion": "2019-06-01-preview", 211 | "name": "AllowAllWindowsAzureIps", 212 | "location": "[variables('location')]", 213 | "dependsOn": [ "[variables('workspaceName')]" ], 214 | "properties": { 215 | "startIpAddress": "0.0.0.0", 216 | "endIpAddress": "0.0.0.0" 217 | } 218 | }, 219 | { 220 | "type": "managedIdentitySqlControlSettings", 221 | "apiVersion": "2019-06-01-preview", 222 | "name": "default", 223 | "location": "[variables('location')]", 224 | "dependsOn": [ "[variables('workspaceName')]" ], 225 | "properties": { 226 | "grantSqlControlToManagedIdentity": { 227 | "desiredState": "Enabled" 228 | } 229 | } 230 | } 231 | ] 232 | }, 233 | { 234 | "type": "Microsoft.Synapse/workspaces/sqlPools", 235 | "apiVersion": "2019-06-01-preview", 236 | "name": "[concat(variables('workspaceName'), '/', variables('sqlPoolName'))]", 237 | "location": "[variables('location')]", 238 | "sku": { 239 | "name": "[parameters('sku')]" 240 | }, 241 | "dependsOn": [ 242 | "[variables('workspaceName')]" 243 | ], 244 | "properties": { 245 | "createMode": "Default", 246 | "collation": "SQL_Latin1_General_CP1_CI_AS" 247 | }, 248 | "resources": [ 249 | { 250 | "condition": "[parameters('metadataSync')]", 251 | "type": "metadataSync", 252 | "apiVersion": "2019-06-01-preview", 253 | "name": "config", 254 | "location": "[variables('location')]", 255 | "dependsOn": [ 256 | "[variables('sqlPoolName')]" 257 | ], 258 | "properties": { 259 | "Enabled": "[parameters('metadataSync')]" 260 | } 261 | } 262 | ] 263 | }, 264 | { 265 | "condition": "[equals(parameters('sparkDeployment'),'true')]", 266 | "type": "Microsoft.Synapse/workspaces/bigDataPools", 267 | "apiVersion": "2019-06-01-preview", 268 | "name": "[concat(variables('workspaceName'), '/', variables('sparkPoolName'))]", 269 | "location": "[variables('location')]", 270 | "dependsOn": [ 271 | "[variables('workspaceName')]" 272 | ], 273 | "properties": { 274 | "nodeCount": 5, 275 | "nodeSizeFamily": "MemoryOptimized", 276 | "nodeSize": "[parameters('sparkNodeSize')]", 277 | "autoScale": { 278 | "enabled": true, 279 | "minNodeCount": 3, 280 | "maxNodeCount": 5 281 | }, 282 | "autoPause": { 283 | "enabled": true, 284 | "delayInMinutes": 15 285 | }, 286 | "sparkVersion": "3.1" 287 | } 288 | }, 289 | { 290 | "scope": "[concat('Microsoft.Storage/storageAccounts/', variables('dlsName'))]", 291 | "type": "Microsoft.Authorization/roleAssignments", 292 | "apiVersion": "2020-04-01-preview", 293 | "name": "[guid(uniqueString(variables('dlsName')))]", 294 | "location": "[variables('location')]", 295 | "dependsOn": [ 296 | "[variables('workspaceName')]" 297 | ], 298 | "properties": { 299 | "roleDefinitionId": "[resourceId('Microsoft.Authorization/roleDefinitions', 'ba92f5b4-2d11-453d-a403-e96b0029c9fe')]", 300 | "principalId": 
"[reference(resourceId('Microsoft.Synapse/workspaces', variables('workspaceName')), '2019-06-01-preview', 'Full').identity.principalId]", 301 | "principalType": "ServicePrincipal" 302 | } 303 | } 304 | ] 305 | } 306 | -------------------------------------------------------------------------------- /ARMTemplate/azuredeploy.parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "sqlAdministratorLogin": { 6 | "value": "sqladmin" 7 | }, 8 | "sqlAdministratorLoginPassword": { 9 | "value": "Temp12345" 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /ARMTemplate/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://aka.ms/azure-quickstart-templates-metadata-schema#", 3 | "itemDisplayName": "Azure Synapse End-End deployment", 4 | "description": "This template creates a Azure Synapse Workspace, including SQL Pools and optional Apache Spark Pools", 5 | "summary": "Azure Synapse Workspace", 6 | "validationType": "Manual", 7 | "githubUsername": "microsoft", 8 | "dateUpdated": "2020-09-10", 9 | "type": "QuickStart", 10 | "environments": [ 11 | "AzureCloud" 12 | ] 13 | } 14 | 15 | -------------------------------------------------------------------------------- /Exercise00-Setup/README.20230512.001.old: -------------------------------------------------------------------------------- 1 | ## Deploy Azure Synapse Demo in Your Azure Environment 2 | 3 | ### Pre-requisites to Deploy Synapse end-to-end Demo 4 | 5 | * You must have a github account 6 | * You must have an active azure subscription 7 | 8 | ### Deployment Steps 9 | Please follow the below steps to successfully deploy a Synapse workspace and its artifacts on your Azure subscription 10 | 11 | ### [![Deploy To Azure](/Images/deploytoazure.svg?sanitize=true)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fmicrosoft%2FAzureSynapseEndToEndDemo%2Fmain%2FARMTemplate%2Fazuredeploy.json) 12 | 13 | * **Deploy to Azure** button takes you to the https://ms.portal.azure.com/#create/Microsoft.Template webpage. Please provide subscription, resource group, region, storage account name, storage container name, workspace name, dedicated sql pool name, spark pool name, spark pool node size, sql administration username/password, sku (dedicated sql pool Data Warehouse Units), and github username parameter values. 14 | 15 | >:exclamation::point_right:**It's incredibly important that you write down all the values in the above step. Many will need to be supplied later as parameters.** 16 | 17 | >*Note: The github username should be the target github account. Example: If https://github.com/microsoft/AzureSynapseEndToEndDemo is the github project url, then "microsoft" is github account name.* 18 | 19 | * Click on the **Review + Create** button to trigger deployment validation. If deployment validation is successful, the single click deployment will deploy a Synapse Workspace, Dedicated SQL Pool, and Spark Pool. This deployment also enables git configuration so all the required artifacts for the end-to-end demo are committed to your user github project. This completes the Azure Synapse end-to-end code deployment step. 
20 | 21 | >*Note: If deployment is incomplete, please look at the resource group activity log and find the latest deployment errors for more information* 22 | 23 | ### Demo Setup 24 | 25 | First you will need to fill in a parameter before you can complete the exercises. We need to provide the linked service to your storage account with the storage account name you chose during deployment. 26 | 27 | ![image](https://user-images.githubusercontent.com/59613090/192065803-c1c7ccd8-0ab5-487f-aeca-0bb957d9e24e.png) 28 | 29 | 30 | Once you click on the linked service name it will open a panel where we can make changes and provide the correct parameter for our storage account. 31 | 32 | ![image](https://user-images.githubusercontent.com/59613090/192065892-d103a4b9-dffb-4198-8036-28ab4045382a.png) 33 | 34 | 35 | Now that the parameter is complete you'll need to copy the demo data from our Microsoft repository to your data lake. The data used in these exercises is synthetic health care data generated from [Synthea](https://synthea.mitre.org/) using their [Data Generator](https://github.com/synthetichealth/synthea/wiki/Basic-Setup-and-Running) and is all open source. Alternatively, you could generate the data yourself and copy it to your lake. To begin the copy you need to open the Data Prep Pipeline. 36 | 37 | ![image](https://user-images.githubusercontent.com/59613090/192581982-60376d3f-201c-4416-bd9e-57f41c81f285.png) 38 | 39 | 40 | Once you have the pipeline open, you can execute it by clicking debug. When you click debug a flyout panel will open on the right side asking for two runtime parameters. First is the name of the storage account you chose during deployment and the second has the default value of '1tb'. You have a choice between two data source sizes and can choose either '1tb' or '30tb'. If you stick with the '1tb' default you can always go back later, run the pipeline again choosing '30tb' and copy that to your data lake as well. 41 | 42 | ![image](https://user-images.githubusercontent.com/59613090/193362543-5b3cc7a2-59a4-44cb-a40c-99d5e90d75b9.png) 43 | 44 | ![image](https://user-images.githubusercontent.com/59613090/193361209-7b9ba056-d7b4-4415-baeb-6b7f012b1d47.png) 45 | 46 | 47 | ## Congratulations on completing setup. You are now ready to move to [Exercise 01 - Claims](https://github.com/microsoft/AzureSynapseEndToEndDemo/blob/main/Exercise01-Claims/README.md) 48 | -------------------------------------------------------------------------------- /Exercise00-Setup/README.md: -------------------------------------------------------------------------------- 1 | ## Deploy Azure Synapse Demo in Your Azure Environment 2 | 3 | ### Pre-requisites to Deploy Synapse end-to-end Demo 4 | 5 | * You must have a github account 6 | * You must have an active azure subscription 7 | 8 | ### Deployment Steps 9 | Please follow the below steps to successfully deploy a Synapse workspace and its artifacts on your Azure subscription 10 | 11 | * Fork microsoft/AzureSynapseEndToEndDemo project to your local github account. Make sure to check "Copy the main branch only". 
12 | 13 | ![Forking](/Images/Forking.gif) 14 | 15 | * Once you fork the AzureSynapseEndToEndDemo project to your github account, please click on **Deploy to Azure** button to start the deployment 16 | 17 | [![Deploy To Azure](/Images/deploytoazure.svg?sanitize=true)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fmicrosoft%2FAzureSynapseEndToEndDemo%2Fmain%2FARMTemplate%2Fazuredeploy.json) 18 | 19 | * **Deploy to Azure** button takes you to the https://ms.portal.azure.com/#create/Microsoft.Template webpage. Please provide subscription, resource group, region, storage account name, storage container name, workspace name, dedicated sql pool name, spark pool name, spark pool node size, sql administration username/password, sku (dedicated sql pool Data Warehouse Units), and github username parameter values. 20 | 21 | >:exclamation::point_right:**It's incredibly important that you write down all the values in the above step. Many will need to be supplied later as parameters.** 22 | 23 | >*Note: The github username should be the target github account where you forked the project. Example: If https://github.com/JohnDoe/AzureSynapseEndToEndDemo is the github project url, then "JohnDoe" is github account name.* 24 | 25 | * Click on the **Review + Create** button to trigger deployment validation. If deployment validation is successful, the single click deployment will deploy a Synapse Workspace, Dedicated SQL Pool, and Spark Pool. This deployment also enables git configuration so all the required artifacts for the end-to-end demo are committed to your user github project. This completes the Azure Synapse end-to-end code deployment step. 26 | 27 | >*Note: If deployment is incomplete, please look at the resource group activity log and find the latest deployment errors for more information* 28 | 29 | ### Demo Setup 30 | 31 | First you will need to fill in a parameter before you can complete the exercises. We need to provide the linked service to your storage account with the storage account name you chose during deployment. 32 | 33 | ![image](https://user-images.githubusercontent.com/59613090/192065803-c1c7ccd8-0ab5-487f-aeca-0bb957d9e24e.png) 34 | 35 | 36 | Once you click on the linked service name it will open a panel where we can make changes and provide the correct parameter for our storage account. 37 | 38 | ![image](https://user-images.githubusercontent.com/59613090/192065892-d103a4b9-dffb-4198-8036-28ab4045382a.png) 39 | 40 | 41 | Now that the parameter is complete you'll need to copy the demo data from our Microsoft repository to your data lake. The data used in these exercises is synthetic health care data generated from [Synthea](https://synthea.mitre.org/) using their [Data Generator](https://github.com/synthetichealth/synthea/wiki/Basic-Setup-and-Running) and is all open source. Alternatively, you could generate the data yourself and copy it to your lake. To begin the copy you need to open the Data Prep Pipeline. 42 | 43 | ![image](https://user-images.githubusercontent.com/59613090/192581982-60376d3f-201c-4416-bd9e-57f41c81f285.png) 44 | 45 | 46 | Once you have the pipeline open, you can execute it by clicking debug. When you click debug a flyout panel will open on the right side asking for two runtime parameters. First is the name of the storage account you chose during deployment and the second has the default value of '1tb'. You have a choice between two data source sizes and can choose either '1tb' or '30tb'. 
If you stick with the '1tb' default you can always go back later, run the pipeline again choosing '30tb' and copy that to your data lake as well. 47 | 48 | ![image](https://user-images.githubusercontent.com/59613090/193362543-5b3cc7a2-59a4-44cb-a40c-99d5e90d75b9.png) 49 | 50 | ![image](https://user-images.githubusercontent.com/59613090/193361209-7b9ba056-d7b4-4415-baeb-6b7f012b1d47.png) 51 | 52 | 53 | ## Congratulations on completing setup. You are now ready to move to [Exercise 01 - Claims](/Exercise01-Claims/README.md) 54 | -------------------------------------------------------------------------------- /Exercise01-Claims/README.md: -------------------------------------------------------------------------------- 1 | # Objective 2 | * This pipeline takes the JSON data that is in FHIR standard format from our "raw" ADLS container and converts it to parquet. Since Parquet is a columnar compressed file format this makes it much faster to import and work with the data. We store the parquet output in our "processed" container in ADLS under a folder called claim. 3 | ![image](https://user-images.githubusercontent.com/59613090/193112535-e9c68b13-95e0-4463-a572-4cdc1b8d694d.png) 4 | 5 | 6 | * We plan to eventually load this data into Dedicated SQL Pool across 3 tables representing Diagnosis, Insurance, and Procedures. We need to extract the data needed for each table, clean it, and write it back to ADLS. The second activity in our pipeline handles all of this in a single Synapse Spark Notebook. 7 | ![image](https://user-images.githubusercontent.com/59613090/193112711-8c6733f5-87e7-4639-a2e3-58afb6f2b414.png) 8 | 9 | 10 | * Now that the data is prepared and cleaned we are ready to load it into our Dedicated Pool, but we need to create the schema and tables first. We have a script activity that will run against our Dedicated Pool to create these artifacts for us. 11 | 12 | >*Note: Make sure your Dedicated Pool is running prior to executing this pipeline. You can see this in the SQL Pools tab under the Manage Hub.* 13 | 14 | ![image](https://user-images.githubusercontent.com/59613090/193114025-309980e7-7c0a-415a-912b-fa8832c109ea.png) 15 | 16 | 17 | * We are now all setup with data ready to go and tables to load it in and we'll use a Copy Activity for each table and load them in parallel. 18 | ![image](https://user-images.githubusercontent.com/59613090/193114655-add929b4-a317-49b5-8a48-014a5e15ddaa.png) 19 | 20 | 21 | * There is one last thing in our pipeline. We have some data engineers that will need to explore the data in the lake to understand how they can enable new work streams for the business. They are currently skilling up in PySpark, but until then we need to give them the ability to explore the data through TSQL. We have created a Notebook activity that creates the meta data for SQL Tables on top of the data in our data lake. You'll be able to play around with some exploration scripts in a later activity. 22 | ![image](https://user-images.githubusercontent.com/59613090/193118363-04c00d81-a374-4113-9a97-e3d3f3994807.png) 23 | 24 | 25 | 26 | # STEP 1: Parameter Setup 27 | Prior to running the claims pipeline (FHIR_Pipeline4Claim_Spark_OC) you will need to set the pipeline parameters to use the artifact names you chose during deployment. Go to the integrate hub, expand the claims folder, and select the pipeline to open it. 
28 | 29 | ![image](https://user-images.githubusercontent.com/59613090/192874762-8647fe4e-23c5-4430-b0c1-d557812de371.png) 30 | 31 | 32 | Once the pipeline opens you will need to click somewhere on the canvas (open space or background) to see the pipeline level parameters. This means that NONE of the activities should be highlighted or selected. Now select the Parameters tab in the bottom pane to view the pipeline level parameters. 33 | 34 | ![image](https://user-images.githubusercontent.com/59613090/192875386-3f5eb80a-1920-40b7-aefc-483b4b5853d4.png) 35 | 36 | Change the default value for each of the following five parameters to what you chose during deployment: 37 | * StorageName - This is the name of your Synapse workspace ADLS account 38 | * DatabaseName - This is the name of your database in Synapse Dedicated SQL Pool 39 | * ServerName - This is the name of your Synapse Dedicated SQL Pool 40 | * SparkPoolName - This is the name of your Synapse Spark Pool 41 | * DatasetSize - This is either "1tb" or "30tb" depending on which size dataset you want to use 42 | 43 | # STEP 2: Execute Pipeline 44 | * You need to hit the debug button to kick off the pipeline run. 45 | >*Note: Make sure your Dedicated Pool is running prior to executing this pipeline. You can see this in the SQL Pools tab under the Manage Hub.* 46 | 47 | ![image](https://user-images.githubusercontent.com/59613090/192880611-b693730c-4b2d-4145-b5af-931f6a808050.png) 48 | 49 | 50 | ## Congratulations on completing Exercise 01. You are now ready to move to [Exercise 02 - Observations](/Exercise02-Observations/README.md) 51 | -------------------------------------------------------------------------------- /Exercise02-Observations/README.md: -------------------------------------------------------------------------------- 1 | # Objective 1: Dedicated SQL Pool 2 | * This pipeline takes the JSON data that is in FHIR standard format from our "raw" ADLS container and converts it to parquet. Since Parquet is a columnar compressed file format this makes it much faster to import and work with the data. We store the parquet output in our "processed" container in ADLS under a folder called "Observation". 3 | ![image](https://user-images.githubusercontent.com/59613090/193125969-460256eb-b025-4e56-8e16-ad10677b54f2.png) 4 | 5 | 6 | * We plan to eventually load this data into Dedicated SQL Pool in a table called [fhir].[ObservationMain]. We need to extract the data needed for the table, clean it, and write it back to ADLS. The second activity in our pipeline handles all of this in a single Synapse Spark Notebook. 7 | ![image](https://user-images.githubusercontent.com/59613090/193125858-14673041-4408-4afb-a0f2-3017de0c4550.png) 8 | 9 | 10 | * Now that the data is prepared and cleaned we are ready to load it into our Dedicated Pool, but we need to create the tables first. We have a script activity that will run against our Dedicated Pool to create these artifacts for us. 11 | 12 | >*Note: Make sure your Dedicated Pool is running prior to executing this pipeline. You can see this in the SQL Pools tab under the Manage Hub.* 13 | 14 | ![image](https://user-images.githubusercontent.com/59613090/193132332-12689085-0516-45ac-ab6d-ddd91ba3928d.png) 15 | 16 | 17 | * We are now all setup with data ready to go and a table to load it in and we'll use a Copy Activity to perform the load. 
18 | ![image](https://user-images.githubusercontent.com/59613090/193132405-61afde90-c500-4097-8060-58bc00f0411e.png)
19 | 
20 | 
21 | # STEP 1: Parameter Setup
22 | Prior to running the observations pipeline (FHIR_Pipeline4Observation_Spark_OC) you will need to set the pipeline parameters to use the artifact names you chose during deployment. Go to the integrate hub, expand the observation folder, and select the pipeline to open it.
23 | 
24 | ![image](https://user-images.githubusercontent.com/59613090/193133194-68a05a70-e2c7-43b9-81fa-393b2050b231.png)
25 | 
26 | 
27 | Once the pipeline opens you will need to click somewhere on the canvas (open space or background) to see the pipeline level parameters. This means that NONE of the activities should be highlighted or selected. Now select the Parameters tab in the bottom pane to view the pipeline level parameters.
28 | 
29 | ![image](https://user-images.githubusercontent.com/59613090/193143735-1d23e579-ba28-442e-94bb-e7f95c8c1be5.png)
30 | 
31 | 
32 | Change the default value for each of the following five parameters to what you chose during deployment:
33 | * StorageName - This is the name of your Synapse workspace ADLS account
34 | * DatabaseName - This is the name of your database in Synapse Dedicated SQL Pool
35 | * ServerName - This is the name of your Synapse Dedicated SQL Pool
36 | * SparkPoolName - This is the name of your Synapse Spark Pool
37 | * DatasetSize - This is either "1tb" or "30tb" depending on which size dataset you want to use
38 | 
39 | # STEP 2: Execute Pipeline
40 | * You need to hit the debug button to kick off the pipeline run.
41 | >*Note: Make sure your Dedicated Pool is running prior to executing this pipeline. You can see this in the SQL Pools tab under the Manage Hub.*
42 | 
43 | ![image](https://user-images.githubusercontent.com/59613090/193143925-bcab20eb-d2d1-4b40-81fd-b464d2ad90d2.png)
44 | 
45 | # Objective 2: ADX Pool
46 | * Objective 2 focuses on the steps to ingest, analyze and visualize the Observations data (time-series data) utilizing the Data Explorer Pool on Azure Synapse Analytics.
47 | * More specifically, we will be ingesting the data within the "Observation_Main" folder that is in the *curated* container within our Azure Data Lake Storage Gen2. This "Observation_Main" folder contains the bulk of the Observations data, already cleaned and prepped and stored in Parquet format.
48 | 
49 | # STEP 1: Create a Data Explorer Pool
50 | 
51 | 1. In Synapse Studio, on the left-side pane, select **Manage > Data Explorer pools**
52 | 2. Select **New**, and then enter the following details on the **Basics** tab:
53 | | Setting | Value | Description |
54 | |:------|:------|:------
55 | | Data Explorer Pool Name | **adxpoolmedicaldata** | This is the name the Data Explorer pool will have |
56 | | Workload | **Compute Optimized** | This workload provides a higher CPU to SSD storage ratio. |
57 | | Node Size | **Small (4 cores)** | Set this to the smallest size to reduce costs for this quickstart |
58 | 3. Select **Review + Create > Create.** Your Data Explorer pool will start the provisioning process. Once it is complete, move on to the next step.
59 | 
60 | ![Creating ADX pool](https://github.com/Azure/Test-Drive-Azure-Synapse-with-a-1-click-POC/raw/nataliarodri906-patch-1/images/gif1.gif)
61 | 
62 | # STEP 2: Create a Data Explorer Database
63 | 
64 | 1. In Synapse Studio, on the left-side pane, select **Data**.
65 | 2. Select + (Add new resource) > **Data Explorer Database** and enter the following information:
66 | | Setting | Value | Description |
67 | |:------|:------|:------
68 | | Data Explorer Pool Name | **adxpoolmedicaldata** | The name of the Data Explorer pool to use |
69 | | Name | **ObservationData** | This database name must be unique within the cluster. |
70 | | Default retention period | **365** | The time span (in days) for which it's guaranteed that the data is kept available to query. The time span is measured from the time that data is ingested. |
71 | | Default cache period | **31** | The time span (in days) for which to keep frequently queried data available in SSD storage or RAM, rather than in longer-term storage
72 | 3. Select **Create** to create the database. Creation typically takes less than a minute.
73 | 
74 | ![Creating Data Explorer Pool](https://github.com/Azure/Test-Drive-Azure-Synapse-with-a-1-click-POC/raw/nataliarodri906-patch-1/images/gif2.gif)
75 | 
76 | # STEP 3: Ingesting Data
77 | 
78 | 1. In Synapse Studio, on the left-side pane, select **Data**.
79 | 
80 | 2. Right-click the ADX database *ObservationData* and click on **Open in Azure Data Explorer**. This opens the Azure Data Explorer web UI.
81 | 
82 | 3. Once in the web UI click on the **Data** tab on the left. This opens the ADX "One-Click UI", where you can quickly ingest data, create database tables, and automatically map the table schema.
83 | 
84 | 4. Click on **Ingest data**, and then enter the following details:
85 | 
86 | | Setting | Value | Description |
87 | |:------|:------|:------
88 | | Cluster | **adxpoolmedicaldata** | Enter the name of the Data Explorer pool you created, or use the *Add connection* button to add the connection URI |
89 | | Database | **ObservationData** | Enter the name of the database you created |
90 | | New Table | **ObservationCurated** | Enter the name for the table that will hold the Observations data |
91 | 
92 | 5. Select **Next**, and then enter the following information for **Source**:
93 | 
94 | | Setting | Value | Description |
95 | |:------|:------|:------
96 | | Source Type | **ADLS Gen2 Container** | Choose your source type
97 | | Ingestion Type | **One-time + Continuous** | Choose the type of ingestion you would like to perform (view i for more info.)
98 | | Select source | **Select Container** | This allows you to select the container from your Azure Subscription
99 | | Storage subscription | *NA* | Enter your Azure subscription name
100 | | Storage account | **storagemedicaldata** | Enter your storage account name
101 | | Container | **curated** | Enter the *curated* container name as this is where the data resides
102 | | Sample Size | *NA* | Leave blank
103 | | Folder path | **fhir/1tb/Observation_main** | Find this path under Directory Properties for the *Observation_main* folder
104 | | File extension | **.parquet** | This is the format of the data
105 | 
106 | 6. Select **Next: Schema**; this page displays the schema and a partial data preview of the **ObservationCurated** table that will be created.
107 | * On the left-hand menu you will see *Compression Type*, *Data format*, *Nested Levels* and *Mapping name*; leave these configurations as displayed for this demo.
108 | * Change the data type for **Observation_id**, **encounter_id_reference** and **patient_id_reference** columns from string to **guid** data type. Do this by right-clicking on each of these columns and clicking on the **Change Data Type** button.
109 | * Similarly, change the data type for **issued** and **effectiveDateTime** columns from string to **datetime**.
110 | * Click on the caret in the top right-hand corner to open the *Command viewer*. Here you can view the KQL code that is running in the background, such as the Create Table command. This command creates the table where the data will be stored, *ObservationCurated*, with all of its column data types.
111 | 
112 | 7. Select **Next: Start Ingestion** to begin the ingestion process for the data. It is complete once all the files display a green checkmark. This should take approximately 10 minutes. Click **Close** to complete.
113 | * Note: You can also ingest the data using the pipeline named *ObservationsData_ToSDXPool*, which uses a COPY activity to bring the data into ADX. However, you must manually create a table in your ADX database prior to copying the data. Under KQL scripts you can find the *Observation Table Creation* script to create the table. After the table has been successfully created with the correct data types for the columns, you can run the pipeline with your respective parameters.
114 | 
115 | ![Ingesting Data](https://github.com/nataliarodri906/AzureSynapseEndToEndDemo/blob/7dd6b66ab99ce0b5aed9feb2b0aa43811dccb5f5/Images/IngestingGif.gif)
116 | 
117 | # STEP 4: Analyze & Visualize Data using KQL
118 | 1. In Synapse Studio, on the left-side pane, select **Develop**.
119 | 2. Under the 'Notebooks' dropdown on the left side of the screen, click on the KQL notebook named **'Observations Analytics w KQL'**.
120 | 3. Once in the notebook, ensure you are connected to your ADX pool **'adxpoolmedicaldata'** and database **'ObservationData'**, and then run each of the sections (a-i) of the script separately and observe the results:
121 | 
122 | *a.* Gets a quick preview of the **'ObservationCurated'** table.
123 | 
124 | ![Take](https://github.com/nataliarodri906/AzureSynapseEndToEndDemo/blob/7dd6b66ab99ce0b5aed9feb2b0aa43811dccb5f5/Images/KQLgif01.gif)
125 | 
126 | *b.* Counts the number of Observations in the **'ObservationCurated'** table.
127 | 
128 | ![count](https://github.com/nataliarodri906/AzureSynapseEndToEndDemo/blob/7dd6b66ab99ce0b5aed9feb2b0aa43811dccb5f5/Images/KQLgif02.gif)
129 | 
130 | *c.* Summarizes the minimum and maximum of the *'issued'* column (date issued) in the **'ObservationCurated'** table.
131 | 
132 | ![summarize1](https://github.com/nataliarodri906/AzureSynapseEndToEndDemo/blob/7dd6b66ab99ce0b5aed9feb2b0aa43811dccb5f5/Images/KQLgif03.gif)
133 | 
134 | *d.* Summarizes the count of records in the **'ObservationCurated'** table by grouping them into daily intervals based on the *'issued'* column.
135 | 
136 | ![summarize2](https://github.com/nataliarodri906/AzureSynapseEndToEndDemo/blob/7dd6b66ab99ce0b5aed9feb2b0aa43811dccb5f5/Images/KQLgif04.gif)
137 | 
138 | *e.* Visualizes the time-series chart for the **'ObservationCurated'** table based on issued date. More specifically, it filters the table to select records with an issued datetime between July 15th, 1910 and June 20th, 2021 (which are the min and max issued dates found in step c), then counts the number of observations in 30-day intervals within that time range, and finally visualizes the results as a time chart.
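If you would rather paste queries directly into the Azure Data Explorer web UI or a KQL script instead of stepping through the notebook, the block below is a minimal sketch of the kind of KQL that steps a-e above (and the anomaly-detection steps f-h below) describe. It assumes the table and column names used in this exercise (ObservationCurated, issued) and the 1.5 anomaly threshold mentioned in step g; the column aliases are illustrative, and the exact queries shipped in the 'Observations Analytics w KQL' script may differ slightly.

```kql
// a. Quick preview of the table
ObservationCurated
| take 10

// b. Total number of observation records
ObservationCurated
| count

// c. Earliest and latest issued dates
ObservationCurated
| summarize MinIssued = min(issued), MaxIssued = max(issued)

// d. Record counts grouped into daily intervals
ObservationCurated
| summarize Observations = count() by bin(issued, 1d)
| order by issued asc

// e. Observation counts in 30-day intervals over the full date range, as a time chart
ObservationCurated
| where issued between (datetime(1910-07-15) .. datetime(2021-06-20))
| summarize Observations = count() by bin(issued, 30d)
| render timechart

// f-g. Daily counts for 2011-2019 with anomaly detection (threshold 1.5), as an anomaly chart
ObservationCurated
| make-series Observations=count() default=0 on issued from datetime(2011-01-01) to datetime(2019-01-01) step 1d
| extend anomalies = series_decompose_anomalies(Observations, 1.5)
| render anomalychart with (anomalycolumns=anomalies, title="Anomalies for daily medical observations during 8 years")

// h. Expand the series and keep only the days flagged as anomalous
ObservationCurated
| make-series Observations=count() default=0 on issued from datetime(2011-01-01) to datetime(2019-01-01) step 1d
| extend anomalies = series_decompose_anomalies(Observations, 1.5)
| mv-expand issued to typeof(datetime), Observations to typeof(long), anomalies to typeof(double)
| where anomalies != 0
```

In the ADX web UI each blank-line-separated block runs as its own query, so you can run them one at a time and compare the output with the GIFs in this section.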
139 | 
140 | ![timechart1](https://github.com/nataliarodri906/AzureSynapseEndToEndDemo/blob/7dd6b66ab99ce0b5aed9feb2b0aa43811dccb5f5/Images/KQLgif05.gif)
141 | 
142 | *f.* Now we are trimming the dataset to analyze daily observations during a relatively normal time period (8 years between 2011 and 2019). Ultimately, we visualize the time chart again, and it shows the pattern of observations day by day for 8 years.
143 | 
144 | ![timechart2](https://github.com/nataliarodri906/AzureSynapseEndToEndDemo/blob/7dd6b66ab99ce0b5aed9feb2b0aa43811dccb5f5/Images/KQLgif06.gif)
145 | 
146 | *g.* We are now identifying anomalies within these 8 years (2011 to 2019) using the time-series chart developed in the previous step. More specifically, it uses the *"series_decompose_anomalies"* function to identify anomalies in the observations count data with a threshold of 1.5. Then, it visualizes the anomalies as an anomaly chart titled "Anomalies for daily medical observations during 8 years". Anomalies can be seen as red dots on the chart.
147 | 
148 | ![anomalieschart](https://github.com/nataliarodri906/AzureSynapseEndToEndDemo/blob/7dd6b66ab99ce0b5aed9feb2b0aa43811dccb5f5/Images/KQLgif07.gif)
149 | 
150 | *h.* Now we are listing the anomalies in a table. More specifically,
151 | it uses the *"series_decompose_anomalies"* function to identify anomalies in the observations count data and extends the table with an *'anomalies'* column. The *"mv-expand"* function is used to expand the table to separate rows for each observations count and its corresponding anomaly value and issued datetime. The code then filters the table to only include rows where the anomaly value is not equal to zero.
152 | 
153 | ![anomalieslist](https://github.com/nataliarodri906/AzureSynapseEndToEndDemo/blob/7dd6b66ab99ce0b5aed9feb2b0aa43811dccb5f5/Images/KQLgif08.gif)
154 | 
155 | *i.* Finally, we are using this query to separate the anomaly properties. More specifically, it uses the *"series_decompose_anomalies"* function to decompose the observations data into anomalies, score, and baseline values. The table is expanded to separate rows for each issued datetime, observations count, score, baseline, and anomaly (where the anomalies column is set to null if the anomaly value is 0). This query can later be used to visualize the time-series chart with anomalies in Power BI (reference FHSI_Dual_Dims v2).
156 | 
157 | ![decomposeanomalies](https://github.com/nataliarodri906/AzureSynapseEndToEndDemo/blob/7dd6b66ab99ce0b5aed9feb2b0aa43811dccb5f5/Images/KQLgif09.gif)
158 | 
159 | # Summarization
160 | 
161 | - Overall, we were able to see how Synapse Studio is a powerful tool that brings real value to analytical workloads in Azure. It offers a comprehensive suite of analytical components, including Pipelines, Spark, SQL, Data Explorer, and Power BI, all within a single Azure resource. This integrated approach enhances efficiency and delivers significant benefits to users.
162 | - More specifically, as we saw here, Azure Data Explorer (ADX) is a valuable component offered by Synapse Studio. ADX proved to be a fast and highly scalable analytics service optimized for querying and analyzing large volumes of diverse data in real time. In this case, we saw that it is particularly well-suited for working with time-series data due to its efficient storage and querying capabilities. Additionally, its integration with Synapse Studio allows users to perform ad-hoc data exploration and gain instant insights from massive datasets, facilitating rapid decision-making.
163 | - By providing these purpose-built analytical components within a single Azure resource, Synapse Studio eliminates the complexity and overhead associated with managing multiple tools and integrations. It offers a cohesive environment for end-to-end data analytics, catering to diverse workload needs efficiently and effectively.
164 | 
165 | 
166 | 
167 | 
168 | ## Congratulations on completing Exercise 02. You are now ready to move to [Exercise 03 - Patients](/Exercise03-Patients/README.md)
169 | 
--------------------------------------------------------------------------------
/Exercise03-Patients/README.md:
--------------------------------------------------------------------------------
1 | # Objective
2 | * This pipeline takes the JSON data that is in FHIR standard format from our "raw" ADLS container and converts it to parquet. Since Parquet is a columnar compressed file format, this makes it much faster to import and work with the data. We store the parquet output in our "processed" container in ADLS under a folder called "Patient".
3 | ![image](https://user-images.githubusercontent.com/59613090/193136231-4a965468-3d2e-4f24-80a3-c76d0bd8a387.png)
4 | 
5 | 
6 | * We plan to eventually load this data into Dedicated SQL Pool across 2 tables representing Patient Addresses and Patient Identifiers. We need to extract the data needed for each table, clean it, and write it back to ADLS. The second activity in our pipeline handles all of this inside a Data Flow Activity. This could have been done in a Spark notebook like in the previous 2 exercises, but this will let you compare the two methods.
7 | ![image](https://user-images.githubusercontent.com/59613090/193136520-d0bcbab6-fc6f-4896-8ab4-19798833c384.png)
8 | 
9 | 
10 | * Now that the data is prepared and cleaned we are ready to load it into our Dedicated Pool, but we need to create the tables first. We have a script activity that will run against our Dedicated Pool to create these artifacts for us.
11 | 
12 | >*Note: Make sure your Dedicated Pool is running prior to executing this pipeline. You can see this in the SQL Pools tab under the Manage Hub.*
13 | 
14 | ![image](https://user-images.githubusercontent.com/59613090/193136622-d84e952a-60f2-47f9-8895-717bf9e81c6b.png)
15 | 
16 | 
17 | * We are now all set up with data ready to go and a table to load it into, and we'll use a Copy Activity to perform the load.
18 | ![image](https://user-images.githubusercontent.com/59613090/193144477-0a98c0d6-1f98-45b6-8e9a-42745960b837.png)
19 | 
20 | 
21 | # STEP 1: Parameter Setup
22 | Prior to running the Patient pipeline (FHIR_Pipeline4Patient_DataFlow_OC) you will need to set the pipeline parameters to use the artifact names you chose during deployment. Go to the integrate hub, expand the patient folder, and select the pipeline to open it.
23 | 
24 | ![image](https://user-images.githubusercontent.com/59613090/193138455-cbb13596-0a2c-4353-808d-92958a7772f6.png)
25 | 
26 | 
27 | Once the pipeline opens you will need to click somewhere on the canvas (open space or background) to see the pipeline level parameters. This means that NONE of the activities should be highlighted or selected. Now select the Parameters tab in the bottom pane to view the pipeline level parameters.
28 | 29 | ![image](https://user-images.githubusercontent.com/59613090/193144874-eb863277-d90b-4f32-a208-6d8d8f2aff96.png) 30 | 31 | 32 | Change the default value for each of the following five parameters to what you chose during deployment: 33 | * StorageName - This is the name of your Synapse workspace ADLS account 34 | * DatabaseName - This is the name of your database in Synapse Dedicated SQL Pool 35 | * ServerName - This is the name of your Synapse Dedicated SQL Pool 36 | * SparkPoolName - This is the name of your Synapse Spark Pool 37 | * DatasetSize - This is either "1tb" or "30tb" depending on which size dataset you want to use 38 | 39 | # STEP 2: Execute Pipeline 40 | * Since this pipeline has a data flow we'll kick it off a bit differently than the previous exercises. You will want to flip the radio button for "Data Flow Debug", hit the drop down arrow next to debug, and select the last option "Use Activity Runtime". 41 | >*Note: Make sure your Dedicated Pool is running prior to executing this pipeline. You can see this in the SQL Pools tab under the Manage Hub.* 42 | 43 | ![image](https://user-images.githubusercontent.com/59613090/193149525-ada67b9c-90b6-466e-bde7-5546f33fac56.png) 44 | 45 | 46 | ## Congratulations on completing Exercise 03. 47 | -------------------------------------------------------------------------------- /Images/Forking.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/Forking.gif -------------------------------------------------------------------------------- /Images/IngestingGif.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/IngestingGif.gif -------------------------------------------------------------------------------- /Images/KQLgif01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif01.gif -------------------------------------------------------------------------------- /Images/KQLgif02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif02.gif -------------------------------------------------------------------------------- /Images/KQLgif03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif03.gif -------------------------------------------------------------------------------- /Images/KQLgif04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif04.gif -------------------------------------------------------------------------------- /Images/KQLgif05.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif05.gif 
--------------------------------------------------------------------------------
/Images/KQLgif06.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif06.gif
--------------------------------------------------------------------------------
/Images/KQLgif07.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif07.gif
--------------------------------------------------------------------------------
/Images/KQLgif08.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif08.gif
--------------------------------------------------------------------------------
/Images/KQLgif09.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/AzureSynapseEndToEndDemo/c0a7e065ba86a9bad1c0b21694f85c62429d3e3d/Images/KQLgif09.gif
--------------------------------------------------------------------------------
/Images/deploytoazure.svg:
--------------------------------------------------------------------------------
(SVG markup for the "Deploy to Azure" button image; not reproduced here)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) Microsoft Corporation.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Azure Synapse End-to-End Demo
2 | 
3 | This repository provides one-click infrastructure and artifact deployment for Azure Synapse Analytics to get you started with Big Data Analytics on a
4 | large health care sample dataset. You will learn how to ingest, process, and serve large volumes of data using various components of Synapse.
5 | 6 | ## Reference Architecture 7 | ![image](https://user-images.githubusercontent.com/59613090/192642933-23285334-d36c-40e7-8fc1-2e3ed9006ba0.png) 8 | 9 | ## CONTENTS 10 | * [Exercise 00 - Setup](Exercise00-Setup/README.md) 11 | * [Exercise 01 - Claims](Exercise01-Claims/README.md) 12 | * [Exercise 02 - Observations](Exercise02-Observations/README.md) 13 | * [Exercise 03 - Patients](Exercise03-Patients/README.md) 14 | * [Troubleshooting](Troubleshooting/Readme.md) 15 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /Troubleshooting/Readme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Jump To Issue 4 | 1. [Missing Pipelines or Data](#missing-pipes) 5 | 6 | 7 | 8 | ---- 9 | 10 | # Missing Pipelines or Data 11 | After you deploy the solution and open Synapse Studio, you may notice that there are no pipelines or data. You might also see a notification that "Publishing in workspace mode is disabled". 12 | 13 | ![image](https://user-images.githubusercontent.com/59613090/233481119-d184c4d3-27a2-4bb4-9b2a-90e3e6a5be4e.png) 14 | 15 | 16 | # Resolution - Missing Pipelines or Data 17 | Not all artifacts can be deployed if Azure Synapse is not registered with your GitHub account. You need to disconnect and reconnect GitHub from your Synapse workspace to force registration. Once that is complete, you will need to redeploy the solution. 18 | 19 | First you need to switch from Synapse Live mode to GitHub. 20 | ![image](https://user-images.githubusercontent.com/59613090/233480998-b4cb25aa-9973-43ca-8884-2e41887dd49f.png) 21 | 22 | Next you need to head to the **_Manage Hub_** and select **_GitHub Configuration_** in the navigation pane. 23 | ![image](https://user-images.githubusercontent.com/59613090/233492816-ce4d13ca-11c1-4bd1-8612-21fa885cdcda.png) 24 | 25 | Now you need to disconnect your workspace from your GitHub account. 26 | ![image](https://user-images.githubusercontent.com/59613090/233493184-878cfe0e-9fec-45ec-b1d5-c736212478af.png) 27 | 28 | Once you have successfully disconnected GitHub from your workspace, it should look like this. 29 | ![image](https://user-images.githubusercontent.com/59613090/233494498-5ad84f71-d4f7-4855-a2cf-7e669564e3bc.png) 30 | 31 | Now that you have disconnected, it's time to reconnect so that Azure Synapse can be properly registered with your GitHub account. Hit the **_Configure_** button and select GitHub as your repository type. 32 | ![image](https://user-images.githubusercontent.com/59613090/233498108-e6e0af27-9ebd-4377-a667-750f9c5b5d49.png) 33 | 34 | Enter your GitHub repository Owner Name and select **_Continue_** 35 | 36 | ![image](https://user-images.githubusercontent.com/59613090/233498324-a489eec1-a2d1-47e6-a0cc-ef3aedfe756e.png) 37 | 38 | Now you will get a pop-up asking you to authorize Azure Synapse for your GitHub account. Select **_Authorize Azure Synapse_** 39 | ![image](https://user-images.githubusercontent.com/59613090/233499635-cbde78cb-6505-4908-baa5-1cc4b1925c4b.png) 40 | 41 | Next you need to grab the URL for your repository to finish the configuration. You can get this by clicking the **_<>Code_** button toward the top of your repository and then selecting the **_Local_** tab. Now you can just click the copy button next to your HTTPS URL.
42 | ![image](https://user-images.githubusercontent.com/59613090/234100272-c92007d6-3622-45ac-b518-4b03cd8414ee.png) 43 | 44 | Now go back to your repository configuration and make sure you have the following things set and hit **_Apply_** when done: 45 | - "**_Use Repository Link_**" should be selected 46 | - Paste your URL in the **_Git Repository Link_** field 47 | - Use "**_main_**" for the **_Collaboration Branch_** field 48 | - Choose a Publish Branch like **_workspace_publish_** 49 | - #### You MUST set the **_Root Folder_** field to **_/artifacts_** including the slash that precedes it 50 | - #### The **_Import Existing Resources_** field MUST BE UNCHECKED 51 | ![image](https://user-images.githubusercontent.com/59613090/234103325-dcf52e98-6e85-4f14-b32e-b4e1c9618a10.png) 52 | 53 | If you get an error about not having permissions to import the repository you need to **_UNCHECK_** the import setting. 54 | ![image](https://user-images.githubusercontent.com/59613090/234103995-00e54a20-66cf-4367-b787-5cc113a3dcf0.png) 55 | 56 | [Back to Issue List](#issue-list) 57 | --- 58 | -------------------------------------------------------------------------------- /artifacts/Observation Table Creation.kql: -------------------------------------------------------------------------------- 1 | // Create Table 2 | .create table ObservationsTable (Observation_id: guid, resourceType: string, issued: datetime, status: string, patient_id_reference: guid, encounter_id_reference: guid, effectiveDateTime: datetime , valueQuantity_code: string, valueQuantity_system: string, valueQuantity_unit: string, valueQuantity_value: real, valueString: string) -------------------------------------------------------------------------------- /artifacts/Observations Analytics w KQL.kql: -------------------------------------------------------------------------------- 1 | // a. Quick preview of data 2 | ObservationCurated 3 | | take 100 4 | 5 | // b. Total number of observations 6 | ObservationCurated 7 | | count 8 | 9 | // c. Observations min and max date 10 | ObservationCurated 11 | | summarize min(issued), max(issued) 12 | 13 | // d. Summarizes the count of observations by grouping them into daily intervals based on the issued date. 14 | ObservationCurated 15 | | summarize count() by bin(issued, 1d) 16 | 17 | // e. Visualize timeseries chart 18 | ObservationCurated 19 | | where issued between (datetime(1910-07-15T12:49:47.219Z)..datetime(2021-06-20T11:41:23.934Z)) 20 | | make-series observationscount=count() on issued from datetime(1910-07-15T12:49:47.219Z) to datetime(2021-06-20T11:41:23.934Z) step 30d 21 | | render timechart 22 | 23 | // f. Trimming dataset to analyze daily observations during a relatively normal time period (8 years between 2011 and 2019) 24 | ObservationCurated 25 | | where issued between (datetime(2011-08-05 00:00:00.0)..datetime(2019-01-01 00:00:00.0)) 26 | | make-series observationscount=count() on issued from datetime(2011-08-05 00:00:00.0) to datetime(2019-01-01 00:00:00.0) step 1d 27 | | render timechart 28 | 29 | // g. We are now identifying anomalies between these 8 years (2011 and 2019) using the timeseries chart developed in the previous step. 
30 | ObservationCurated 31 | | where issued between (datetime(2011-08-05 00:00:00.0)..datetime(2019-01-01 00:00:00.0)) 32 | | make-series observationscount=count() on issued from datetime(2011-08-05 00:00:00.0) to datetime(2019-01-01 00:00:00.0) step 1d 33 | | extend anomalies = series_decompose_anomalies(observationscount, 1.5) 34 | | render anomalychart with(anomalycolumns=anomalies, title='Anomalies for daily medical observations during 8 years') 35 | 36 | // h. List Anomalies 37 | ObservationCurated 38 | | make-series observationscount=count() on issued from datetime(2011-08-05 00:00:00.0) to datetime(2019-01-01 00:00:00.0) step 1d 39 | | extend anomalies = series_decompose_anomalies(observationscount, 1.5) 40 | | mv-expand observationscount, anomalies, issued 41 | | where toint(anomalies) <> 0 42 | | sort by todatetime(issued) 43 | 44 | // i. Separate anomaly properties and use query to create power BI report 45 | ObservationCurated 46 | | make-series observationscount=count() on issued from datetime(2011-08-05 00:00:00.0) to datetime(2019-01-01 00:00:00.0) step 1d 47 | | extend (anomalies, score, baseline) = series_decompose_anomalies(observationscount, 1.5) 48 | | mv-expand anomalies, issued, observationscount, score, baseline 49 | | project anomalies = iff(toint(anomalies) == 0, int(null),toint(anomalies)), issued, observationscount, score, baseline 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /artifacts/credential/WorkspaceSystemIdentity.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "WorkspaceSystemIdentity", 3 | "properties": { 4 | "type": "ManagedIdentity" 5 | } 6 | } -------------------------------------------------------------------------------- /artifacts/dataflow/PatientJSON_Flatten_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "PatientJSON_Flatten_large", 3 | "properties": { 4 | "folder": { 5 | "name": "Patient" 6 | }, 7 | "type": "MappingDataFlow", 8 | "typeProperties": { 9 | "sources": [ 10 | { 11 | "dataset": { 12 | "referenceName": "PatientRawParquetLarge", 13 | "type": "DatasetReference" 14 | }, 15 | "name": "PatientNDJSON" 16 | } 17 | ], 18 | "sinks": [ 19 | { 20 | "dataset": { 21 | "referenceName": "PatientIdentifierParquetLarge", 22 | "type": "DatasetReference" 23 | }, 24 | "name": "sinkPatientIdentifier" 25 | }, 26 | { 27 | "dataset": { 28 | "referenceName": "PatientExtensionParquetLarge", 29 | "type": "DatasetReference" 30 | }, 31 | "name": "sinkPatientExtension" 32 | }, 33 | { 34 | "dataset": { 35 | "referenceName": "PatientAddressParquetLarge", 36 | "type": "DatasetReference" 37 | }, 38 | "name": "sinkPatientAddress" 39 | } 40 | ], 41 | "transformations": [ 42 | { 43 | "name": "PatientIdentifierFlatten" 44 | }, 45 | { 46 | "name": "PatientExtensionFlatten" 47 | }, 48 | { 49 | "name": "PatientAddressFlatten" 50 | } 51 | ], 52 | "scriptLines": [ 53 | "source(output(", 54 | " address as (city as string, country as string, extension as (extension as (url as string, valueDecimal as double)[], url as string)[], line as string[], postalCode as string, state as string)[],", 55 | " birthDate as string,", 56 | " communication as (language as (coding as (code as string, display as string, system as string)[], text as string))[],", 57 | " deceasedDateTime as string,", 58 | " extension as (url as string, valueAddress as (city as string, country as string, state as string), valueDecimal as double, 
valueString as string)[],", 59 | " gender as string,", 60 | " id as string,", 61 | " identifier as (system as string, type as (coding as (code as string, display as string, system as string)[], text as string), value as string)[],", 62 | " maritalStatus as (coding as (code as string, display as string, system as string)[], text as string),", 63 | " multipleBirthBoolean as boolean,", 64 | " multipleBirthInteger as long,", 65 | " name as (family as string, given as string[], prefix as string[], suffix as string[], use as string)[],", 66 | " resourceType as string,", 67 | " telecom as (system as string, use as string, value as string)[],", 68 | " text as (div as string, status as string)", 69 | " ),", 70 | " allowSchemaDrift: true,", 71 | " validateSchema: false,", 72 | " ignoreNoFilesFound: false,", 73 | " format: 'parquet') ~> PatientNDJSON", 74 | "PatientNDJSON foldDown(unroll(identifier.type.coding),", 75 | " mapColumn(", 76 | " patient_id = id,", 77 | " birthDate,", 78 | " deceasedDateTime,", 79 | " gender,", 80 | " text = maritalStatus.text,", 81 | " multipleBirthBoolean,", 82 | " multipleBirthInteger,", 83 | " resourceType,", 84 | " div = text.div,", 85 | " status = text.status,", 86 | " {identifier.system} = identifier.system,", 87 | " {identifier.type.coding.code} = identifier.type.coding.code,", 88 | " {identifier.type.coding.display} = identifier.type.coding.display,", 89 | " {identifier.type.coding.system} = identifier.type.coding.system,", 90 | " {identifier.type.text} = identifier.type.text,", 91 | " {identifier.value} = identifier.value", 92 | " ),", 93 | " skipDuplicateMapInputs: false,", 94 | " skipDuplicateMapOutputs: false) ~> PatientIdentifierFlatten", 95 | "PatientNDJSON foldDown(unroll(extension),", 96 | " mapColumn(", 97 | " patient_id = id,", 98 | " url = extension.url,", 99 | " {extension.valueAddress.city} = extension.valueAddress.city,", 100 | " {extension.valueAddress.country} = extension.valueAddress.country,", 101 | " {extension.valueAddress.state} = extension.valueAddress.state,", 102 | " {extension.valueDecimal} = extension.valueDecimal,", 103 | " {extension.valueString} = extension.valueString", 104 | " ),", 105 | " skipDuplicateMapInputs: false,", 106 | " skipDuplicateMapOutputs: false) ~> PatientExtensionFlatten", 107 | "PatientNDJSON foldDown(unroll(address.extension.extension),", 108 | " mapColumn(", 109 | " id,", 110 | " {address.city} = address.city,", 111 | " {address.country} = address.country,", 112 | " {address.extension.extension.url} = address.extension.extension.url,", 113 | " {address.extension.extension.valueDecimal} = address.extension.extension.valueDecimal,", 114 | " {address.extension.url} = address.extension.url,", 115 | " {address.postalCode} = address.postalCode,", 116 | " {address.state} = address.state", 117 | " ),", 118 | " skipDuplicateMapInputs: true,", 119 | " skipDuplicateMapOutputs: false) ~> PatientAddressFlatten", 120 | "PatientIdentifierFlatten sink(allowSchemaDrift: true,", 121 | " validateSchema: false,", 122 | " format: 'parquet',", 123 | " truncate: true,", 124 | " umask: 0022,", 125 | " preCommands: [],", 126 | " postCommands: [],", 127 | " skipDuplicateMapInputs: true,", 128 | " skipDuplicateMapOutputs: true) ~> sinkPatientIdentifier", 129 | "PatientExtensionFlatten sink(allowSchemaDrift: true,", 130 | " validateSchema: false,", 131 | " format: 'parquet',", 132 | " truncate: true,", 133 | " umask: 0022,", 134 | " preCommands: [],", 135 | " postCommands: [],", 136 | " skipDuplicateMapInputs: true,", 137 | " 
skipDuplicateMapOutputs: true) ~> sinkPatientExtension", 138 | "PatientAddressFlatten sink(allowSchemaDrift: true,", 139 | " validateSchema: false,", 140 | " format: 'parquet',", 141 | " truncate: true,", 142 | " umask: 0022,", 143 | " preCommands: [],", 144 | " postCommands: [],", 145 | " skipDuplicateMapInputs: true,", 146 | " skipDuplicateMapOutputs: true) ~> sinkPatientAddress" 147 | ] 148 | } 149 | } 150 | } -------------------------------------------------------------------------------- /artifacts/dataset/ClaimDiagnosisParquetLarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ClaimDiagnosisParquetLarge", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "StorageLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageName": { 9 | "value": "@dataset().StorageName", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "parameters": { 15 | "StorageName": { 16 | "type": "string" 17 | }, 18 | "FolderPath": { 19 | "type": "string" 20 | } 21 | }, 22 | "annotations": [], 23 | "type": "Parquet", 24 | "typeProperties": { 25 | "location": { 26 | "type": "AzureBlobFSLocation", 27 | "folderPath": { 28 | "value": "@dataset().FolderPath", 29 | "type": "Expression" 30 | }, 31 | "fileSystem": "curated" 32 | }, 33 | "compressionCodec": "snappy" 34 | }, 35 | "schema": [ 36 | { 37 | "name": "id", 38 | "type": "UTF8" 39 | }, 40 | { 41 | "name": "resourceType", 42 | "type": "UTF8" 43 | }, 44 | { 45 | "name": "status", 46 | "type": "UTF8" 47 | }, 48 | { 49 | "name": "billablePeriod_end", 50 | "type": "UTF8" 51 | }, 52 | { 53 | "name": "billablePeriod_start", 54 | "type": "UTF8" 55 | }, 56 | { 57 | "name": "created", 58 | "type": "UTF8" 59 | }, 60 | { 61 | "name": "patient_display", 62 | "type": "UTF8" 63 | }, 64 | { 65 | "name": "patient_reference", 66 | "type": "UTF8" 67 | }, 68 | { 69 | "name": "prescription_reference", 70 | "type": "UTF8" 71 | }, 72 | { 73 | "name": "provider_display", 74 | "type": "UTF8" 75 | }, 76 | { 77 | "name": "provider_reference", 78 | "type": "UTF8" 79 | }, 80 | { 81 | "name": "total_currency", 82 | "type": "UTF8" 83 | }, 84 | { 85 | "name": "total_value", 86 | "type": "DOUBLE" 87 | }, 88 | { 89 | "name": "use", 90 | "type": "UTF8" 91 | }, 92 | { 93 | "name": "display", 94 | "type": "UTF8" 95 | }, 96 | { 97 | "name": "focal", 98 | "type": "BOOLEAN" 99 | }, 100 | { 101 | "name": "sequence", 102 | "type": "INT64" 103 | } 104 | ] 105 | } 106 | } -------------------------------------------------------------------------------- /artifacts/dataset/ClaimDiagnosisSQL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ClaimDiagnosisSQL", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "SynapseDedicatedPoolLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "DatabaseName": { 9 | "value": "@dataset().DatabaseName", 10 | "type": "Expression" 11 | }, 12 | "ServerName": { 13 | "value": "@dataset().ServerName", 14 | "type": "Expression" 15 | } 16 | } 17 | }, 18 | "parameters": { 19 | "DatabaseName": { 20 | "type": "string" 21 | }, 22 | "ServerName": { 23 | "type": "string" 24 | } 25 | }, 26 | "annotations": [], 27 | "type": "AzureSqlDWTable", 28 | "schema": [], 29 | "typeProperties": { 30 | "schema": "fhir", 31 | "table": "ClaimDiagnosis" 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /artifacts/dataset/ClaimInsurance.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "ClaimInsurance", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "SynapseDedicatedPoolLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "DatabaseName": { 9 | "value": "@dataset().DatabaseName", 10 | "type": "Expression" 11 | }, 12 | "ServerName": { 13 | "value": "@dataset().ServerName", 14 | "type": "Expression" 15 | } 16 | } 17 | }, 18 | "parameters": { 19 | "DatabaseName": { 20 | "type": "string" 21 | }, 22 | "ServerName": { 23 | "type": "string" 24 | } 25 | }, 26 | "annotations": [], 27 | "type": "AzureSqlDWTable", 28 | "schema": [], 29 | "typeProperties": { 30 | "schema": "fhir", 31 | "table": "ClaimInsurance" 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /artifacts/dataset/ClaimInsuranceParquetLarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ClaimInsuranceParquetLarge", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "StorageLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageName": { 9 | "value": "@dataset().StorageName", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "parameters": { 15 | "StorageName": { 16 | "type": "string" 17 | }, 18 | "FolderPath": { 19 | "type": "string" 20 | } 21 | }, 22 | "annotations": [], 23 | "type": "Parquet", 24 | "typeProperties": { 25 | "location": { 26 | "type": "AzureBlobFSLocation", 27 | "fileName": "*", 28 | "folderPath": { 29 | "value": "@dataset().FolderPath", 30 | "type": "Expression" 31 | }, 32 | "fileSystem": "curated" 33 | }, 34 | "compressionCodec": "snappy" 35 | }, 36 | "schema": [ 37 | { 38 | "name": "id", 39 | "type": "UTF8" 40 | }, 41 | { 42 | "name": "resourceType", 43 | "type": "UTF8" 44 | }, 45 | { 46 | "name": "status", 47 | "type": "UTF8" 48 | }, 49 | { 50 | "name": "billablePeriod_end", 51 | "type": "UTF8" 52 | }, 53 | { 54 | "name": "billablePeriod_start", 55 | "type": "UTF8" 56 | }, 57 | { 58 | "name": "created", 59 | "type": "UTF8" 60 | }, 61 | { 62 | "name": "patient_display", 63 | "type": "UTF8" 64 | }, 65 | { 66 | "name": "patient_reference", 67 | "type": "UTF8" 68 | }, 69 | { 70 | "name": "prescription_reference", 71 | "type": "UTF8" 72 | }, 73 | { 74 | "name": "provider_display", 75 | "type": "UTF8" 76 | }, 77 | { 78 | "name": "provider_reference", 79 | "type": "UTF8" 80 | }, 81 | { 82 | "name": "total_currency", 83 | "type": "UTF8" 84 | }, 85 | { 86 | "name": "total_value", 87 | "type": "DOUBLE" 88 | }, 89 | { 90 | "name": "use", 91 | "type": "UTF8" 92 | }, 93 | { 94 | "name": "display", 95 | "type": "UTF8" 96 | }, 97 | { 98 | "name": "focal", 99 | "type": "BOOLEAN" 100 | }, 101 | { 102 | "name": "sequence", 103 | "type": "INT64" 104 | } 105 | ] 106 | } 107 | } -------------------------------------------------------------------------------- /artifacts/dataset/ClaimProcedureParquetLarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ClaimProcedureParquetLarge", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "StorageLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageName": { 9 | "value": "@dataset().StorageName", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "parameters": { 15 | "StorageName": { 16 | "type": "string" 17 | }, 18 | "FolderPath": { 19 | "type": "string" 20 | } 21 | }, 22 | "annotations": [], 23 | 
"type": "Parquet", 24 | "typeProperties": { 25 | "location": { 26 | "type": "AzureBlobFSLocation", 27 | "fileName": "*", 28 | "folderPath": { 29 | "value": "@dataset().FolderPath", 30 | "type": "Expression" 31 | }, 32 | "fileSystem": "curated" 33 | }, 34 | "compressionCodec": "snappy" 35 | }, 36 | "schema": [ 37 | { 38 | "name": "id", 39 | "type": "UTF8" 40 | }, 41 | { 42 | "name": "resourceType", 43 | "type": "UTF8" 44 | }, 45 | { 46 | "name": "status", 47 | "type": "UTF8" 48 | }, 49 | { 50 | "name": "billablePeriod_end", 51 | "type": "UTF8" 52 | }, 53 | { 54 | "name": "billablePeriod_start", 55 | "type": "UTF8" 56 | }, 57 | { 58 | "name": "created", 59 | "type": "UTF8" 60 | }, 61 | { 62 | "name": "patient_display", 63 | "type": "UTF8" 64 | }, 65 | { 66 | "name": "patient_reference", 67 | "type": "UTF8" 68 | }, 69 | { 70 | "name": "prescription_reference", 71 | "type": "UTF8" 72 | }, 73 | { 74 | "name": "provider_display", 75 | "type": "UTF8" 76 | }, 77 | { 78 | "name": "provider_reference", 79 | "type": "UTF8" 80 | }, 81 | { 82 | "name": "total_currency", 83 | "type": "UTF8" 84 | }, 85 | { 86 | "name": "total_value", 87 | "type": "DOUBLE" 88 | }, 89 | { 90 | "name": "use", 91 | "type": "UTF8" 92 | }, 93 | { 94 | "name": "reference", 95 | "type": "UTF8" 96 | }, 97 | { 98 | "name": "sequence", 99 | "type": "INT64" 100 | } 101 | ] 102 | } 103 | } -------------------------------------------------------------------------------- /artifacts/dataset/ClaimProcedureSQL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ClaimProcedureSQL", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "SynapseDedicatedPoolLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "DatabaseName": { 9 | "value": "@dataset().DatabaseName", 10 | "type": "Expression" 11 | }, 12 | "ServerName": { 13 | "value": "@dataset().ServerName", 14 | "type": "Expression" 15 | } 16 | } 17 | }, 18 | "parameters": { 19 | "DatabaseName": { 20 | "type": "string" 21 | }, 22 | "ServerName": { 23 | "type": "string" 24 | } 25 | }, 26 | "annotations": [], 27 | "type": "AzureSqlDWTable", 28 | "schema": [], 29 | "typeProperties": { 30 | "schema": "fhir", 31 | "table": "ClaimProcedure" 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /artifacts/dataset/ObservationMain_LargeParquet.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ObservationMain_LargeParquet", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "StorageLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageName": { 9 | "value": "@dataset().StorageName", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "parameters": { 15 | "StorageName": { 16 | "type": "string" 17 | }, 18 | "DatasetSize": { 19 | "type": "string" 20 | } 21 | }, 22 | "annotations": [], 23 | "type": "Parquet", 24 | "typeProperties": { 25 | "location": { 26 | "type": "AzureBlobFSLocation", 27 | "fileName": "*", 28 | "folderPath": { 29 | "value": "@dataset().DatasetSize", 30 | "type": "Expression" 31 | }, 32 | "fileSystem": "curated" 33 | }, 34 | "compressionCodec": "snappy" 35 | }, 36 | "schema": [] 37 | }, 38 | "type": "Microsoft.Synapse/workspaces/datasets" 39 | } -------------------------------------------------------------------------------- /artifacts/dataset/Observation_SQLDS.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": 
"Observation_SQLDS", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "SynapseDedicatedPoolLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "DatabaseName": { 9 | "value": "@dataset().DatabaseName", 10 | "type": "Expression" 11 | }, 12 | "ServerName": { 13 | "value": "@dataset().ServerName", 14 | "type": "Expression" 15 | } 16 | } 17 | }, 18 | "parameters": { 19 | "DatabaseName": { 20 | "type": "string" 21 | }, 22 | "ServerName": { 23 | "type": "string" 24 | } 25 | }, 26 | "annotations": [], 27 | "type": "AzureSqlDWTable", 28 | "schema": [], 29 | "typeProperties": { 30 | "schema": "fhir", 31 | "table": "ObservationMain" 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /artifacts/dataset/PatientAddressParquetLarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "PatientAddressParquetLarge", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "StorageLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageName": { 9 | "value": "@dataset().StorageName", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "parameters": { 15 | "StorageName": { 16 | "type": "string" 17 | }, 18 | "DatasetSize": { 19 | "type": "string" 20 | } 21 | }, 22 | "annotations": [], 23 | "type": "Parquet", 24 | "typeProperties": { 25 | "location": { 26 | "type": "AzureBlobFSLocation", 27 | "folderPath": { 28 | "value": "@concat('fhir/',dataset().DatasetSize,'/PatientAddress')", 29 | "type": "Expression" 30 | }, 31 | "fileSystem": "curated" 32 | }, 33 | "compressionCodec": "snappy" 34 | }, 35 | "schema": [] 36 | }, 37 | "type": "Microsoft.Synapse/workspaces/datasets" 38 | } -------------------------------------------------------------------------------- /artifacts/dataset/PatientAddressSQL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "PatientAddressSQL", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "SynapseDedicatedPoolLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "DatabaseName": { 9 | "value": "@dataset().DatabaseName", 10 | "type": "Expression" 11 | }, 12 | "ServerName": { 13 | "value": "@dataset().ServerName", 14 | "type": "Expression" 15 | } 16 | } 17 | }, 18 | "parameters": { 19 | "DatabaseName": { 20 | "type": "string" 21 | }, 22 | "ServerName": { 23 | "type": "string" 24 | } 25 | }, 26 | "annotations": [], 27 | "type": "AzureSqlDWTable", 28 | "schema": [], 29 | "typeProperties": { 30 | "schema": "fhir", 31 | "table": "PatientAddress" 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /artifacts/dataset/PatientExtensionParquetLarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "PatientExtensionParquetLarge", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "StorageLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageName": { 9 | "value": "@dataset().StorageName", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "parameters": { 15 | "StorageName": { 16 | "type": "string" 17 | }, 18 | "DatasetSize": { 19 | "type": "string" 20 | } 21 | }, 22 | "annotations": [], 23 | "type": "Parquet", 24 | "typeProperties": { 25 | "location": { 26 | "type": "AzureBlobFSLocation", 27 | "folderPath": { 28 | "value": "@concat('fhir/',dataset().DatasetSize,'/PatientExtension')", 29 | "type": "Expression" 30 
| }, 31 | "fileSystem": "curated" 32 | }, 33 | "compressionCodec": "snappy" 34 | }, 35 | "schema": [] 36 | }, 37 | "type": "Microsoft.Synapse/workspaces/datasets" 38 | } -------------------------------------------------------------------------------- /artifacts/dataset/PatientIdentifierParquetLarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "PatientIdentifierParquetLarge", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "StorageLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageName": { 9 | "value": "@dataset().StorageName", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "parameters": { 15 | "StorageName": { 16 | "type": "string" 17 | }, 18 | "DatasetSize": { 19 | "type": "string" 20 | } 21 | }, 22 | "annotations": [], 23 | "type": "Parquet", 24 | "typeProperties": { 25 | "location": { 26 | "type": "AzureBlobFSLocation", 27 | "folderPath": { 28 | "value": "@concat('fhir/',dataset().DatasetSize,'/PatientIdentifier')", 29 | "type": "Expression" 30 | }, 31 | "fileSystem": "curated" 32 | }, 33 | "compressionCodec": "snappy" 34 | }, 35 | "schema": [] 36 | }, 37 | "type": "Microsoft.Synapse/workspaces/datasets" 38 | } -------------------------------------------------------------------------------- /artifacts/dataset/PatientIdentifierSQLLarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "PatientIdentifierSQLLarge", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "SynapseDedicatedPoolLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "DatabaseName": { 9 | "value": "@dataset().DatabaseName", 10 | "type": "Expression" 11 | }, 12 | "ServerName": { 13 | "value": "@dataset().ServerName", 14 | "type": "Expression" 15 | } 16 | } 17 | }, 18 | "parameters": { 19 | "DatabaseName": { 20 | "type": "string" 21 | }, 22 | "ServerName": { 23 | "type": "string" 24 | } 25 | }, 26 | "annotations": [], 27 | "type": "AzureSqlDWTable", 28 | "schema": [], 29 | "typeProperties": { 30 | "schema": "fhir", 31 | "table": "PatientIdentifier" 32 | } 33 | }, 34 | "type": "Microsoft.Synapse/workspaces/datasets" 35 | } -------------------------------------------------------------------------------- /artifacts/dataset/PatientRawParquetLarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "PatientRawParquetLarge", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "StorageLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageName": { 9 | "value": "@dataset().StorageName", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "parameters": { 15 | "StorageName": { 16 | "type": "string" 17 | }, 18 | "DatasetSize": { 19 | "type": "string" 20 | } 21 | }, 22 | "annotations": [], 23 | "type": "Parquet", 24 | "typeProperties": { 25 | "location": { 26 | "type": "AzureBlobFSLocation", 27 | "folderPath": { 28 | "value": "@concat('fhir/',dataset().DatasetSize,'/Patient')", 29 | "type": "Expression" 30 | }, 31 | "fileSystem": "processed" 32 | }, 33 | "compressionCodec": "snappy" 34 | }, 35 | "schema": [] 36 | } 37 | } -------------------------------------------------------------------------------- /artifacts/dataset/Sink_DataPrep_Curated_DS.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Sink_DataPrep_Curated_DS", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": 
"StorageLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageName": { 9 | "value": "@dataset().StorageName", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "parameters": { 15 | "StorageName": { 16 | "type": "string" 17 | } 18 | }, 19 | "annotations": [], 20 | "type": "Binary", 21 | "typeProperties": { 22 | "location": { 23 | "type": "AzureBlobFSLocation", 24 | "fileSystem": "curated" 25 | } 26 | } 27 | } 28 | } -------------------------------------------------------------------------------- /artifacts/dataset/Sink_DataPrep_DS.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Sink_DataPrep_DS", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "StorageLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageName": { 9 | "value": "@dataset().StorageName", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "parameters": { 15 | "StorageName": { 16 | "type": "string" 17 | }, 18 | "DatasetSize": { 19 | "type": "string" 20 | } 21 | }, 22 | "annotations": [], 23 | "type": "Binary", 24 | "typeProperties": { 25 | "location": { 26 | "type": "AzureBlobFSLocation", 27 | "folderPath": { 28 | "value": "@concat('fhir_ndjson/', dataset().DatasetSize)", 29 | "type": "Expression" 30 | }, 31 | "fileSystem": "raw" 32 | } 33 | } 34 | } 35 | } -------------------------------------------------------------------------------- /artifacts/dataset/Sink_DataPrep_Processed_DS.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Sink_DataPrep_Processed_DS", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "StorageLS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageName": { 9 | "value": "@dataset().StorageName", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "parameters": { 15 | "StorageName": { 16 | "type": "string" 17 | } 18 | }, 19 | "annotations": [], 20 | "type": "Binary", 21 | "typeProperties": { 22 | "location": { 23 | "type": "AzureBlobFSLocation", 24 | "fileSystem": "processed" 25 | } 26 | } 27 | } 28 | } -------------------------------------------------------------------------------- /artifacts/dataset/Source_DataPrep_Curated_DS.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Source_DataPrep_Curated_DS", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "Source_Dataset_LS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageWithSaSUrl": { 9 | "value": "@concat('https://medicaldl.blob.core.windows.net/source?sp=rl&st=2023-04-05T05:00:00Z&se=2028-04-05T05:00:00Z&spr=https&sv=2021-12-02&sr=c&sig=t7P4PfO0HqRHAW%2FJQsMH9K3cgf9MguIgSKGdNYoyar4%3D')", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "annotations": [], 15 | "type": "Binary", 16 | "typeProperties": { 17 | "location": { 18 | "type": "AzureBlobStorageLocation", 19 | "fileName": "Create_Curated.txt", 20 | "container": "source" 21 | } 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /artifacts/dataset/Source_DataPrep_DS.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Source_DataPrep_DS", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "Source_Dataset_LS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageWithSaSUrl": { 9 | "value": 
"@concat('https://medicaldl.blob.core.windows.net/source?SAS=REDACTED)", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "parameters": { 15 | "DatasetSize": { 16 | "type": "string" 17 | } 18 | }, 19 | "annotations": [], 20 | "type": "Binary", 21 | "typeProperties": { 22 | "location": { 23 | "type": "AzureBlobStorageLocation", 24 | "folderPath": { 25 | "value": "@dataset().DatasetSize", 26 | "type": "Expression" 27 | }, 28 | "container": "source" 29 | } 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /artifacts/dataset/Source_DataPrep_Processed_DS.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Source_DataPrep_Processed_DS", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "Source_Dataset_LS", 6 | "type": "LinkedServiceReference", 7 | "parameters": { 8 | "StorageWithSaSUrl": { 9 | "value": "@concat('https://medicaldl.blob.core.windows.net/source?sp=rl&st=2023-04-05T05:00:00Z&se=2028-04-05T05:00:00Z&spr=https&sv=2021-12-02&sr=c&sig=t7P4PfO0HqRHAW%2FJQsMH9K3cgf9MguIgSKGdNYoyar4%3D')", 10 | "type": "Expression" 11 | } 12 | } 13 | }, 14 | "annotations": [], 15 | "type": "Binary", 16 | "typeProperties": { 17 | "location": { 18 | "type": "AzureBlobStorageLocation", 19 | "fileName": "Create_Processed.txt", 20 | "container": "source" 21 | } 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /artifacts/integrationRuntime/AutoResolveIntegrationRuntime.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AutoResolveIntegrationRuntime", 3 | "properties": { 4 | "type": "Managed", 5 | "typeProperties": { 6 | "computeProperties": { 7 | "location": "AutoResolve", 8 | "dataFlowProperties": { 9 | "computeType": "General", 10 | "coreCount": 4, 11 | "timeToLive": 0 12 | } 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /artifacts/linkedService/Source_Dataset_LS.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Source_Dataset_LS", 3 | "type": "Microsoft.Synapse/workspaces/linkedservices", 4 | "properties": { 5 | "parameters": { 6 | "StorageWithSaSUrl": { 7 | "type": "string" 8 | } 9 | }, 10 | "annotations": [], 11 | "type": "AzureBlobStorage", 12 | "typeProperties": { 13 | "sasUri": "@{linkedService().StorageWithSaSUrl}" 14 | }, 15 | "connectVia": { 16 | "referenceName": "AutoResolveIntegrationRuntime", 17 | "type": "IntegrationRuntimeReference" 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /artifacts/linkedService/StorageLS.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "StorageLS", 3 | "properties": { 4 | "parameters": { 5 | "StorageName": { 6 | "type": "string", 7 | "defaultValue": "synapsee2elake" 8 | } 9 | }, 10 | "annotations": [], 11 | "type": "AzureBlobFS", 12 | "typeProperties": { 13 | "url": "@{concat('https://',linkedService().StorageName,'.dfs.core.windows.net')}" 14 | }, 15 | "connectVia": { 16 | "referenceName": "AutoResolveIntegrationRuntime", 17 | "type": "IntegrationRuntimeReference" 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /artifacts/linkedService/SynapseDedicatedPoolLS.json: -------------------------------------------------------------------------------- 1 | { 
2 | "name": "SynapseDedicatedPoolLS", 3 | "properties": { 4 | "parameters": { 5 | "DatabaseName": { 6 | "type": "string", 7 | "defaultValue": "healthcare" 8 | }, 9 | "ServerName": { 10 | "type": "string", 11 | "defaultValue": "health" 12 | } 13 | }, 14 | "annotations": [], 15 | "type": "AzureSqlDW", 16 | "typeProperties": { 17 | "connectionString": "Integrated Security=False;Encrypt=True;Connection Timeout=30;Data Source=\"@{concat(linkedService().ServerName,'.sql.azuresynapse.net')}\";Initial Catalog=@{linkedService().DatabaseName}" 18 | }, 19 | "connectVia": { 20 | "referenceName": "AutoResolveIntegrationRuntime", 21 | "type": "IntegrationRuntimeReference" 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /artifacts/notebook/ClaimParquetFlatten_Large.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ClaimParquetFlatten_Large", 3 | "properties": { 4 | "folder": { 5 | "name": "Claims" 6 | }, 7 | "nbformat": 4, 8 | "nbformat_minor": 2, 9 | "bigDataPool": { 10 | "referenceName": "healthcare", 11 | "type": "BigDataPoolReference" 12 | }, 13 | "sessionProperties": { 14 | "driverMemory": "112g", 15 | "driverCores": 16, 16 | "executorMemory": "112g", 17 | "executorCores": 16, 18 | "numExecutors": 4, 19 | "conf": { 20 | "spark.dynamicAllocation.enabled": "false", 21 | "spark.dynamicAllocation.minExecutors": "4", 22 | "spark.dynamicAllocation.maxExecutors": "4", 23 | "spark.autotune.trackingId": "f6bbd8e2-1229-423c-bf5f-0432650ae015" 24 | } 25 | }, 26 | "metadata": { 27 | "saveOutput": true, 28 | "enableDebugMode": false, 29 | "kernelspec": { 30 | "name": "synapse_pyspark", 31 | "display_name": "Synapse PySpark" 32 | }, 33 | "language_info": { 34 | "name": "python" 35 | }, 36 | "a365ComputeOptions": { 37 | "id": "/subscriptions/7e416de3-c506-4776-8270-83fd73c6cc37/resourceGroups/syne2e/providers/Microsoft.Synapse/workspaces/health/bigDataPools/healthcare", 38 | "name": "healthcare", 39 | "type": "Spark", 40 | "endpoint": "https://health.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/healthcare", 41 | "auth": { 42 | "type": "AAD", 43 | "authResource": "https://dev.azuresynapse.net" 44 | }, 45 | "sparkVersion": "3.1", 46 | "nodeCount": 3, 47 | "cores": 16, 48 | "memory": 112, 49 | "automaticScaleJobs": false 50 | }, 51 | "sessionKeepAliveTimeout": 180 52 | }, 53 | "cells": [ 54 | { 55 | "cell_type": "code", 56 | "metadata": { 57 | "tags": [ 58 | "parameters" 59 | ] 60 | }, 61 | "source": [ 62 | "StorageName = \"medicaldl\"\r\n", 63 | "DatasetSize = \"1tb\"" 64 | ], 65 | "execution_count": 1 66 | }, 67 | { 68 | "cell_type": "code", 69 | "metadata": { 70 | "jupyter": { 71 | "source_hidden": false, 72 | "outputs_hidden": false 73 | }, 74 | "nteract": { 75 | "transient": { 76 | "deleting": false 77 | } 78 | } 79 | }, 80 | "source": [ 81 | "curated_location = \"abfss://curated@\" + StorageName + \".dfs.core.windows.net/fhir/\"+ DatasetSize +\"/\"\r\n", 82 | "processed_location = \"abfss://processed@\" + StorageName + \".dfs.core.windows.net/fhir/\"+ DatasetSize +\"/\"\r\n", 83 | "write_mode=\"overwrite\"" 84 | ], 85 | "execution_count": null 86 | }, 87 | { 88 | "cell_type": "code", 89 | "metadata": { 90 | "jupyter": { 91 | "source_hidden": false, 92 | "outputs_hidden": false 93 | }, 94 | "nteract": { 95 | "transient": { 96 | "deleting": false 97 | } 98 | } 99 | }, 100 | "source": [ 101 | "Claim_df=spark.read.parquet(processed_location+\"Claim/\")" 102 | ], 103 | 
"execution_count": 2 104 | }, 105 | { 106 | "cell_type": "code", 107 | "metadata": { 108 | "jupyter": { 109 | "source_hidden": false, 110 | "outputs_hidden": false 111 | }, 112 | "nteract": { 113 | "transient": { 114 | "deleting": false 115 | } 116 | }, 117 | "collapsed": false 118 | }, 119 | "source": [ 120 | "display(Claim_df.limit(10))" 121 | ], 122 | "execution_count": 3 123 | }, 124 | { 125 | "cell_type": "code", 126 | "metadata": { 127 | "jupyter": { 128 | "source_hidden": false, 129 | "outputs_hidden": false 130 | }, 131 | "nteract": { 132 | "transient": { 133 | "deleting": false 134 | } 135 | } 136 | }, 137 | "source": [ 138 | "Claim_df.printSchema()" 139 | ], 140 | "execution_count": 4 141 | }, 142 | { 143 | "cell_type": "code", 144 | "metadata": { 145 | "jupyter": { 146 | "source_hidden": false, 147 | "outputs_hidden": false 148 | }, 149 | "nteract": { 150 | "transient": { 151 | "deleting": false 152 | } 153 | } 154 | }, 155 | "source": [ 156 | "from pyspark.sql.functions import explode\r\n", 157 | "from pyspark.sql.functions import regexp_replace\r\n", 158 | "\r\n", 159 | "Claim_main_explode_df = Claim_df.select(\r\n", 160 | " \"id\",\"resourceType\",\"status\",\"billablePeriod.end\",\"billablePeriod.start\",\"created\",\"patient.display\",\"patient.reference\",\r\n", 161 | " \"prescription.reference\",\"provider.display\",\"provider.reference\",\"total.currency\",\"total.value\",\"use\")\r\n", 162 | "\r\n", 163 | "#use toDF() to rename the columns\r\n", 164 | "Claim_main_df= Claim_main_explode_df.toDF(*( \r\n", 165 | " \"Claim_id\",\"resourceType\",\"status\",\"billablePeriod_end\",\"billablePeriod_start\",\"created\",\r\n", 166 | " \"patient_display\",\"patient_id_reference\",\r\n", 167 | " \"prescription_reference\",\"provider_display\",\"provider_org_id_reference\",\"total_currency\",\"total_value\",\"use\"))\r\n", 168 | "\r\n", 169 | "Claim_main_df = Claim_main_df.withColumn(\"patient_id_reference\",regexp_replace(\"patient_id_reference\",\"Patient/\",\"\")).withColumn(\r\n", 170 | " \"provider_org_id_reference\",regexp_replace(\"provider_org_id_reference\",\"Organization/\",\"\"))\r\n", 171 | "\r\n", 172 | "\r\n", 173 | "# adding schema optimization\r\n", 174 | "# Arshad" 175 | ], 176 | "execution_count": 5 177 | }, 178 | { 179 | "cell_type": "code", 180 | "metadata": { 181 | "jupyter": { 182 | "source_hidden": false, 183 | "outputs_hidden": false 184 | }, 185 | "nteract": { 186 | "transient": { 187 | "deleting": false 188 | } 189 | } 190 | }, 191 | "source": [ 192 | "Claim_main_df.printSchema()" 193 | ], 194 | "execution_count": 6 195 | }, 196 | { 197 | "cell_type": "code", 198 | "metadata": { 199 | "jupyter": { 200 | "source_hidden": false, 201 | "outputs_hidden": false 202 | }, 203 | "nteract": { 204 | "transient": { 205 | "deleting": false 206 | } 207 | }, 208 | "collapsed": false 209 | }, 210 | "source": [ 211 | "display(Claim_main_df.limit(10))" 212 | ], 213 | "execution_count": 7 214 | }, 215 | { 216 | "cell_type": "code", 217 | "metadata": { 218 | "jupyter": { 219 | "source_hidden": false, 220 | "outputs_hidden": false 221 | }, 222 | "nteract": { 223 | "transient": { 224 | "deleting": false 225 | } 226 | } 227 | }, 228 | "source": [ 229 | "Claim_main_df.count()" 230 | ], 231 | "execution_count": 8 232 | }, 233 | { 234 | "cell_type": "code", 235 | "metadata": { 236 | "jupyter": { 237 | "source_hidden": false, 238 | "outputs_hidden": false 239 | }, 240 | "nteract": { 241 | "transient": { 242 | "deleting": false 243 | } 244 | } 245 | }, 246 | "source": [ 247 | 
"Claim_main_df.write.mode(write_mode).parquet(curated_location+\"Claim_main/\")" 248 | ], 249 | "execution_count": 16 250 | }, 251 | { 252 | "cell_type": "code", 253 | "metadata": { 254 | "jupyter": { 255 | "source_hidden": false, 256 | "outputs_hidden": false 257 | }, 258 | "nteract": { 259 | "transient": { 260 | "deleting": false 261 | } 262 | } 263 | }, 264 | "source": [ 265 | "from pyspark.sql.functions import explode\r\n", 266 | "\r\n", 267 | "Claim_insurance_explode_df = Claim_df.select(\"id\", explode(Claim_df.insurance))\r\n", 268 | "\r\n", 269 | "#use toDF() to rename the columns\r\n", 270 | "Claim_insurance_df= Claim_insurance_explode_df.toDF(*( \"Claim_id\",\"insurance\"))\r\n", 271 | "\r\n", 272 | "Claim_insurance_df = Claim_insurance_df.select(\r\n", 273 | " \"Claim_id\",\"insurance.coverage.display\",\"insurance.focal\",\"insurance.sequence\"\r\n", 274 | " ).toDF(*( \r\n", 275 | " \"Claim_id\",\"insurance_coverage.display\",\"insurance_focal\",\"insurance_sequence\" ))\r\n", 276 | "\r\n", 277 | "# base_df_explode = base_df.select(explode(base_df.entry))" 278 | ], 279 | "execution_count": 9 280 | }, 281 | { 282 | "cell_type": "code", 283 | "metadata": { 284 | "jupyter": { 285 | "source_hidden": false, 286 | "outputs_hidden": false 287 | }, 288 | "nteract": { 289 | "transient": { 290 | "deleting": false 291 | } 292 | } 293 | }, 294 | "source": [ 295 | "Claim_insurance_df.printSchema()" 296 | ], 297 | "execution_count": 10 298 | }, 299 | { 300 | "cell_type": "code", 301 | "metadata": { 302 | "jupyter": { 303 | "source_hidden": false, 304 | "outputs_hidden": false 305 | }, 306 | "nteract": { 307 | "transient": { 308 | "deleting": false 309 | } 310 | }, 311 | "collapsed": false 312 | }, 313 | "source": [ 314 | "display(Claim_insurance_df.limit(10))" 315 | ], 316 | "execution_count": 11 317 | }, 318 | { 319 | "cell_type": "code", 320 | "metadata": { 321 | "jupyter": { 322 | "source_hidden": false, 323 | "outputs_hidden": false 324 | }, 325 | "nteract": { 326 | "transient": { 327 | "deleting": false 328 | } 329 | } 330 | }, 331 | "source": [ 332 | "Claim_insurance_df.count()" 333 | ], 334 | "execution_count": 12 335 | }, 336 | { 337 | "cell_type": "code", 338 | "metadata": { 339 | "jupyter": { 340 | "source_hidden": false, 341 | "outputs_hidden": false 342 | }, 343 | "nteract": { 344 | "transient": { 345 | "deleting": false 346 | } 347 | } 348 | }, 349 | "source": [ 350 | "Claim_insurance_df.write.mode(write_mode).parquet(curated_location+\"Claim_insurance/\")" 351 | ], 352 | "execution_count": 17 353 | }, 354 | { 355 | "cell_type": "code", 356 | "metadata": { 357 | "jupyter": { 358 | "source_hidden": false, 359 | "outputs_hidden": false 360 | }, 361 | "nteract": { 362 | "transient": { 363 | "deleting": false 364 | } 365 | } 366 | }, 367 | "source": [ 368 | "from pyspark.sql.functions import explode\r\n", 369 | "from pyspark.sql.functions import regexp_replace\r\n", 370 | "\r\n", 371 | "Claim_diagnosis_explode_df = Claim_df.select(\r\n", 372 | " \"id\", explode(Claim_df.diagnosis))\r\n", 373 | "\r\n", 374 | "#use toDF() to rename the columns\r\n", 375 | "Claim_diagnosis_df= Claim_diagnosis_explode_df.toDF(*( \"id\", \"diagnosis\"))\r\n", 376 | "\r\n", 377 | "Claim_diagnosis_df = Claim_diagnosis_df.select(\r\n", 378 | " \"id\",\"diagnosis.diagnosisReference.reference\",\"diagnosis.sequence\"\r\n", 379 | " ).toDF(*( \r\n", 380 | " \"Claim_id\",\"diagnosis_reference\",\"diagnosis_sequence\"))\r\n", 381 | "Claim_diagnosis_df=Claim_diagnosis_df.withColumn(\r\n", 382 | " 
\"diagnosis_reference\",regexp_replace(\"diagnosis_reference\",\"Condition/\",\"\")) " 383 | ], 384 | "execution_count": 3 385 | }, 386 | { 387 | "cell_type": "code", 388 | "metadata": { 389 | "jupyter": { 390 | "source_hidden": false, 391 | "outputs_hidden": false 392 | }, 393 | "nteract": { 394 | "transient": { 395 | "deleting": false 396 | } 397 | } 398 | }, 399 | "source": [ 400 | "Claim_diagnosis_df.printSchema()" 401 | ], 402 | "execution_count": 20 403 | }, 404 | { 405 | "cell_type": "code", 406 | "metadata": { 407 | "jupyter": { 408 | "source_hidden": false, 409 | "outputs_hidden": false 410 | }, 411 | "nteract": { 412 | "transient": { 413 | "deleting": false 414 | } 415 | } 416 | }, 417 | "source": [ 418 | "Claim_diagnosis_df.write.mode(write_mode).parquet(curated_location+\"Claim_diagnosis/\")" 419 | ], 420 | "execution_count": 4 421 | }, 422 | { 423 | "cell_type": "code", 424 | "metadata": { 425 | "jupyter": { 426 | "source_hidden": false, 427 | "outputs_hidden": false 428 | }, 429 | "nteract": { 430 | "transient": { 431 | "deleting": false 432 | } 433 | } 434 | }, 435 | "source": [ 436 | "from pyspark.sql.functions import explode\r\n", 437 | "from pyspark.sql.functions import regexp_replace\r\n", 438 | "\r\n", 439 | "Claim_procedure_explode_df = Claim_df.select(\"id\",explode(Claim_df.procedure))\r\n", 440 | "\r\n", 441 | "#use toDF() to rename the columns\r\n", 442 | "Claim_procedure_df= Claim_procedure_explode_df.toDF(*( \"id\",\"procedure\"))\r\n", 443 | "\r\n", 444 | "Claim_procedure_df = Claim_procedure_df.select(\r\n", 445 | " \"id\",\"procedure.procedureReference.reference\",\"procedure.sequence\"\r\n", 446 | " ).toDF(*(\r\n", 447 | " \"Claim_id\",\"procedure_reference\",\"procedure_sequence\"))\r\n", 448 | "Claim_procedure_df=Claim_procedure_df.withColumn(\"procedure_reference\",regexp_replace(\"procedure_reference\",\"Procedure/\",\"\"))" 449 | ], 450 | "execution_count": 3 451 | }, 452 | { 453 | "cell_type": "code", 454 | "metadata": { 455 | "jupyter": { 456 | "source_hidden": false, 457 | "outputs_hidden": false 458 | }, 459 | "nteract": { 460 | "transient": { 461 | "deleting": false 462 | } 463 | }, 464 | "collapsed": false 465 | }, 466 | "source": [ 467 | "display(Claim_procedure_df.limit(10))" 468 | ], 469 | "execution_count": 4 470 | }, 471 | { 472 | "cell_type": "code", 473 | "metadata": { 474 | "jupyter": { 475 | "source_hidden": false, 476 | "outputs_hidden": false 477 | }, 478 | "nteract": { 479 | "transient": { 480 | "deleting": false 481 | } 482 | } 483 | }, 484 | "source": [ 485 | "Claim_procedure_df.write.mode(write_mode).parquet(curated_location+\"Claim_procedure/\")" 486 | ], 487 | "execution_count": 5 488 | }, 489 | { 490 | "cell_type": "code", 491 | "metadata": { 492 | "jupyter": { 493 | "source_hidden": false, 494 | "outputs_hidden": false 495 | }, 496 | "nteract": { 497 | "transient": { 498 | "deleting": false 499 | } 500 | } 501 | }, 502 | "source": [ 503 | "#Claim_procedure_df.write.format(\"delta\").save(curated_location+\"Claim_procedure_delta/\")\r\n", 504 | "#Claim_diagnosis_df.write.format(\"delta\").save(curated_location+\"Claim_diagnosis_delta/\")\r\n", 505 | "#Claim_insurance_df.write.format(\"delta\").save(curated_location+\"Claim_insurance_delta/\")" 506 | ], 507 | "execution_count": 11 508 | } 509 | ] 510 | } 511 | } -------------------------------------------------------------------------------- /artifacts/notebook/Claim_Ingestion_NDJSON2Parquet.json: -------------------------------------------------------------------------------- 1 | 
{ 2 | "name": "Claim_Ingestion_NDJSON2Parquet", 3 | "properties": { 4 | "folder": { 5 | "name": "Claims" 6 | }, 7 | "nbformat": 4, 8 | "nbformat_minor": 2, 9 | "bigDataPool": { 10 | "referenceName": "healthcare", 11 | "type": "BigDataPoolReference" 12 | }, 13 | "sessionProperties": { 14 | "driverMemory": "112g", 15 | "driverCores": 16, 16 | "executorMemory": "112g", 17 | "executorCores": 16, 18 | "numExecutors": 4, 19 | "conf": { 20 | "spark.dynamicAllocation.enabled": "false", 21 | "spark.dynamicAllocation.minExecutors": "4", 22 | "spark.dynamicAllocation.maxExecutors": "4", 23 | "spark.autotune.trackingId": "c4357b4e-2833-4f00-89f0-e12e26a21fb1" 24 | } 25 | }, 26 | "metadata": { 27 | "saveOutput": true, 28 | "enableDebugMode": false, 29 | "kernelspec": { 30 | "name": "synapse_pyspark", 31 | "display_name": "python" 32 | }, 33 | "language_info": { 34 | "name": "python" 35 | }, 36 | "a365ComputeOptions": { 37 | "id": "/subscriptions/7e416de3-c506-4776-8270-83fd73c6cc37/resourceGroups/syne2e/providers/Microsoft.Synapse/workspaces/health/bigDataPools/healthcare", 38 | "name": "healthcare", 39 | "type": "Spark", 40 | "endpoint": "https://health.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/healthcare", 41 | "auth": { 42 | "type": "AAD", 43 | "authResource": "https://dev.azuresynapse.net" 44 | }, 45 | "sparkVersion": "3.1", 46 | "nodeCount": 3, 47 | "cores": 16, 48 | "memory": 112, 49 | "automaticScaleJobs": false 50 | }, 51 | "sessionKeepAliveTimeout": 30 52 | }, 53 | "cells": [ 54 | { 55 | "cell_type": "code", 56 | "metadata": { 57 | "tags": [ 58 | "parameters" 59 | ] 60 | }, 61 | "source": [ 62 | "StorageName = \"medicaldl\"\r\n", 63 | "DatasetSize = \"1tb\"" 64 | ], 65 | "execution_count": null 66 | }, 67 | { 68 | "cell_type": "code", 69 | "metadata": { 70 | "jupyter": { 71 | "source_hidden": false, 72 | "outputs_hidden": false 73 | }, 74 | "nteract": { 75 | "transient": { 76 | "deleting": false 77 | } 78 | } 79 | }, 80 | "source": [ 81 | "raw_location = \"abfss://raw@\" + StorageName + \".dfs.core.windows.net/fhir_ndjson/\"+ DatasetSize +\"/*/\"\r\n", 82 | "processed_location = \"abfss://processed@\" + StorageName+ \".dfs.core.windows.net/fhir/\"+ DatasetSize +\"/\"\r\n", 83 | "write_mode=\"overwrite\"" 84 | ], 85 | "execution_count": null 86 | }, 87 | { 88 | "cell_type": "code", 89 | "metadata": { 90 | "jupyter": { 91 | "source_hidden": false, 92 | "outputs_hidden": false 93 | }, 94 | "nteract": { 95 | "transient": { 96 | "deleting": false 97 | } 98 | } 99 | }, 100 | "source": [ 101 | "Claim_df = spark.read.option(\"multiline\", \"false\").json(raw_location+\"Claim.ndjson\")\r\n", 102 | "Claim_df.write.mode(write_mode).parquet(processed_location+\"Claim/\")\r\n", 103 | "#display(Claim_df.limit(10))\r\n", 104 | "#Claim_df.count()" 105 | ], 106 | "execution_count": null 107 | } 108 | ] 109 | } 110 | } -------------------------------------------------------------------------------- /artifacts/notebook/Lake Database And Table Creation.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Lake Database And Table Creation", 3 | "properties": { 4 | "folder": { 5 | "name": "Exploration" 6 | }, 7 | "nbformat": 4, 8 | "nbformat_minor": 2, 9 | "bigDataPool": { 10 | "referenceName": "healthcare", 11 | "type": "BigDataPoolReference" 12 | }, 13 | "sessionProperties": { 14 | "driverMemory": "112g", 15 | "driverCores": 16, 16 | "executorMemory": "112g", 17 | "executorCores": 16, 18 | "numExecutors": 2, 19 | "conf": { 20 | 
"spark.dynamicAllocation.enabled": "false", 21 | "spark.dynamicAllocation.minExecutors": "2", 22 | "spark.dynamicAllocation.maxExecutors": "2", 23 | "spark.autotune.trackingId": "cdcb3e0e-61cf-4e33-8866-6df1ba4e31ee" 24 | } 25 | }, 26 | "metadata": { 27 | "saveOutput": true, 28 | "enableDebugMode": false, 29 | "kernelspec": { 30 | "name": "synapse_pyspark", 31 | "display_name": "Synapse PySpark" 32 | }, 33 | "language_info": { 34 | "name": "python" 35 | }, 36 | "a365ComputeOptions": { 37 | "id": "/subscriptions/7e416de3-c506-4776-8270-83fd73c6cc37/resourceGroups/syne2e/providers/Microsoft.Synapse/workspaces/health/bigDataPools/healthcare", 38 | "name": "healthcare", 39 | "type": "Spark", 40 | "endpoint": "https://health.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/healthcare", 41 | "auth": { 42 | "type": "AAD", 43 | "authResource": "https://dev.azuresynapse.net" 44 | }, 45 | "sparkVersion": "3.1", 46 | "nodeCount": 3, 47 | "cores": 16, 48 | "memory": 112, 49 | "automaticScaleJobs": false 50 | }, 51 | "sessionKeepAliveTimeout": 30 52 | }, 53 | "cells": [ 54 | { 55 | "cell_type": "code", 56 | "metadata": { 57 | "jupyter": { 58 | "source_hidden": false, 59 | "outputs_hidden": false 60 | }, 61 | "nteract": { 62 | "transient": { 63 | "deleting": false 64 | } 65 | }, 66 | "tags": [ 67 | "parameters" 68 | ] 69 | }, 70 | "source": [ 71 | "StorageName = \"medicaldl\"\r\n", 72 | "DatasetSize = \"1tb\"" 73 | ], 74 | "execution_count": 36 75 | }, 76 | { 77 | "cell_type": "code", 78 | "metadata": { 79 | "microsoft": { 80 | "language": "sparksql" 81 | }, 82 | "collapsed": false 83 | }, 84 | "source": [ 85 | "%%sql\r\n", 86 | "DROP DATABASE IF EXISTS fhirdbexploration CASCADE;\r\n", 87 | "CREATE DATABASE fhirdbexploration;" 88 | ], 89 | "execution_count": 20 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "jupyter": { 95 | "source_hidden": false, 96 | "outputs_hidden": false 97 | }, 98 | "nteract": { 99 | "transient": { 100 | "deleting": false 101 | } 102 | }, 103 | "microsoft": { 104 | "language": "sparksql" 105 | }, 106 | "collapsed": false 107 | }, 108 | "source": [ 109 | "%%sql\r\n", 110 | "USE fhirdbexploration" 111 | ], 112 | "execution_count": 23 113 | }, 114 | { 115 | "cell_type": "code", 116 | "metadata": { 117 | "jupyter": { 118 | "source_hidden": false, 119 | "outputs_hidden": false 120 | }, 121 | "nteract": { 122 | "transient": { 123 | "deleting": false 124 | } 125 | } 126 | }, 127 | "source": [ 128 | "curated_location = \"abfss://curated@\" + StorageName + \".dfs.core.windows.net/fhir/\"+ DatasetSize +\"/\"" 129 | ], 130 | "execution_count": 33 131 | }, 132 | { 133 | "cell_type": "code", 134 | "metadata": { 135 | "jupyter": { 136 | "source_hidden": false, 137 | "outputs_hidden": false 138 | }, 139 | "nteract": { 140 | "transient": { 141 | "deleting": false 142 | } 143 | } 144 | }, 145 | "source": [ 146 | "claimDiagonisLocation = curated_location + \"Claim_diagnosis/\"\r\n", 147 | "spark.conf.set('claimDiagonisLocation',claimDiagonisLocation)\r\n", 148 | "\r\n", 149 | "claimInsuranceLocation = curated_location + \"Claim_insurance/\"\r\n", 150 | "spark.conf.set('claimInsuranceLocation',claimInsuranceLocation)\r\n", 151 | "\r\n", 152 | "claimProcedureLocation = curated_location + \"Claim_procedure/\"\r\n", 153 | "spark.conf.set('claimProcedureLocation',claimProcedureLocation)" 154 | ], 155 | "execution_count": 34 156 | }, 157 | { 158 | "cell_type": "code", 159 | "metadata": { 160 | "jupyter": { 161 | "source_hidden": false, 162 | "outputs_hidden": false 
163 | }, 164 | "nteract": { 165 | "transient": { 166 | "deleting": false 167 | } 168 | }, 169 | "microsoft": { 170 | "language": "sparksql" 171 | }, 172 | "collapsed": false 173 | }, 174 | "source": [ 175 | "%%sql\r\n", 176 | "DROP TABLE IF EXISTS fhirdbexploration.claimdiagnosis;\r\n", 177 | "CREATE TABLE fhirdbexploration.claimdiagnosis USING PARQUET LOCATION '${claimDiagonisLocation}';\r\n", 178 | "\r\n", 179 | "DROP TABLE IF EXISTS fhirdbexploration.claiminsurance;\r\n", 180 | "CREATE TABLE fhirdbexploration.claiminsurance USING PARQUET LOCATION '${claimInsuranceLocation}';\r\n", 181 | "\r\n", 182 | "DROP TABLE IF EXISTS fhirdbexploration.claimprocedure;\r\n", 183 | "CREATE TABLE fhirdbexploration.claimprocedure USING PARQUET LOCATION '${claimProcedureLocation}';" 184 | ], 185 | "execution_count": 35 186 | } 187 | ] 188 | } 189 | } -------------------------------------------------------------------------------- /artifacts/notebook/ObservationParquetFlatten_Large.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ObservationParquetFlatten_Large", 3 | "properties": { 4 | "folder": { 5 | "name": "Observation" 6 | }, 7 | "nbformat": 4, 8 | "nbformat_minor": 2, 9 | "bigDataPool": { 10 | "referenceName": "healthcare", 11 | "type": "BigDataPoolReference" 12 | }, 13 | "sessionProperties": { 14 | "driverMemory": "112g", 15 | "driverCores": 16, 16 | "executorMemory": "112g", 17 | "executorCores": 16, 18 | "numExecutors": 4, 19 | "conf": { 20 | "spark.dynamicAllocation.enabled": "false", 21 | "spark.dynamicAllocation.minExecutors": "4", 22 | "spark.dynamicAllocation.maxExecutors": "4", 23 | "spark.autotune.trackingId": "0ff2572c-9472-4bb0-980c-22f1e1f08db5" 24 | } 25 | }, 26 | "metadata": { 27 | "saveOutput": true, 28 | "enableDebugMode": false, 29 | "kernelspec": { 30 | "name": "synapse_pyspark", 31 | "display_name": "Synapse PySpark" 32 | }, 33 | "language_info": { 34 | "name": "python" 35 | }, 36 | "a365ComputeOptions": { 37 | "id": "/subscriptions/7e416de3-c506-4776-8270-83fd73c6cc37/resourceGroups/syne2e/providers/Microsoft.Synapse/workspaces/health/bigDataPools/healthcare", 38 | "name": "healthcare", 39 | "type": "Spark", 40 | "endpoint": "https://health.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/healthcare", 41 | "auth": { 42 | "type": "AAD", 43 | "authResource": "https://dev.azuresynapse.net" 44 | }, 45 | "sparkVersion": "3.1", 46 | "nodeCount": 3, 47 | "cores": 16, 48 | "memory": 112, 49 | "automaticScaleJobs": false 50 | }, 51 | "sessionKeepAliveTimeout": 30 52 | }, 53 | "cells": [ 54 | { 55 | "cell_type": "code", 56 | "metadata": { 57 | "jupyter": { 58 | "source_hidden": false, 59 | "outputs_hidden": false 60 | }, 61 | "nteract": { 62 | "transient": { 63 | "deleting": false 64 | } 65 | }, 66 | "tags": [ 67 | "parameters" 68 | ] 69 | }, 70 | "source": [ 71 | "StorageName = \"medicaldl\"\r\n", 72 | "DatasetSize = \"1tb\"" 73 | ], 74 | "execution_count": null 75 | }, 76 | { 77 | "cell_type": "code", 78 | "source": [ 79 | "curated_location = \"abfss://curated@\"+ StorageName +\".dfs.core.windows.net/fhir/\"+DatasetSize+\"/\"\r\n", 80 | "processed_location = \"abfss://processed@\"+StorageName+\".dfs.core.windows.net/fhir/\"+DatasetSize+\"/\"\r\n", 81 | "write_mode=\"overwrite\"" 82 | ], 83 | "execution_count": 1 84 | }, 85 | { 86 | "cell_type": "code", 87 | "metadata": { 88 | "jupyter": { 89 | "source_hidden": false, 90 | "outputs_hidden": false 91 | }, 92 | "nteract": { 93 | "transient": { 94 | "deleting": 
false 95 | } 96 | } 97 | }, 98 | "source": [ 99 | "Observation_df=spark.read.parquet(processed_location+\"Observation/\")" 100 | ], 101 | "execution_count": 2 102 | }, 103 | { 104 | "cell_type": "code", 105 | "metadata": { 106 | "jupyter": { 107 | "source_hidden": false, 108 | "outputs_hidden": true 109 | }, 110 | "nteract": { 111 | "transient": { 112 | "deleting": false 113 | } 114 | }, 115 | "collapsed": false 116 | }, 117 | "source": [ 118 | "#display(Observation_df.limit(10))" 119 | ], 120 | "execution_count": 3 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "jupyter": { 126 | "source_hidden": false, 127 | "outputs_hidden": false 128 | }, 129 | "nteract": { 130 | "transient": { 131 | "deleting": false 132 | } 133 | } 134 | }, 135 | "source": [ 136 | "from pyspark.sql.functions import regexp_replace\r\n", 137 | "\r\n", 138 | "Observation_df=Observation_df.withColumn(\"subject\",regexp_replace(\"subject.reference\",\"Patient/\",\"\")).withColumn(\"encounter\",regexp_replace(\"encounter.reference\",\"Encounter/\",\"\"))\r\n", 139 | "Observation_df=Observation_df.withColumnRenamed(\"id\",\"Observation_id\")\r\n", 140 | "Observation_df=Observation_df.withColumnRenamed(\"subject\",\"patient_id_reference\")\r\n", 141 | "Observation_df=Observation_df.withColumnRenamed(\"encounter\",\"encounter_id_reference\")" 142 | ], 143 | "execution_count": 3 144 | }, 145 | { 146 | "cell_type": "code", 147 | "metadata": { 148 | "jupyter": { 149 | "source_hidden": false, 150 | "outputs_hidden": true 151 | }, 152 | "nteract": { 153 | "transient": { 154 | "deleting": false 155 | } 156 | } 157 | }, 158 | "source": [ 159 | "Observation_df.printSchema()" 160 | ], 161 | "execution_count": 6 162 | }, 163 | { 164 | "cell_type": "code", 165 | "metadata": { 166 | "jupyter": { 167 | "source_hidden": false, 168 | "outputs_hidden": false 169 | }, 170 | "nteract": { 171 | "transient": { 172 | "deleting": false 173 | } 174 | } 175 | }, 176 | "source": [ 177 | "from pyspark.sql.functions import explode\r\n", 178 | "\r\n", 179 | "Observation_main_df = Observation_df.select(\r\n", 180 | " \"Observation_id\",\"resourceType\",\"issued\",\"status\", \"patient_id_reference\",\"encounter_id_reference\",\"effectiveDateTime\",\r\n", 181 | " \"valueQuantity.code\",\"valueQuantity.system\",\"valueQuantity.unit\",\"valueQuantity.value\",\"valueString\").toDF(*(\r\n", 182 | " \"Observation_id\",\"resourceType\",\"issued\",\"status\",\"patient_id_reference\",\"encounter_id_reference\",\"effectiveDateTime\",\r\n", 183 | " \"valueQuantity_code\",\"valueQuantity_system\",\r\n", 184 | " \"valueQuantity_unit\",\"valueQuantity_value\",\"valueString\"))" 185 | ], 186 | "execution_count": 8 187 | }, 188 | { 189 | "cell_type": "code", 190 | "metadata": { 191 | "jupyter": { 192 | "source_hidden": false, 193 | "outputs_hidden": false 194 | }, 195 | "nteract": { 196 | "transient": { 197 | "deleting": false 198 | } 199 | } 200 | }, 201 | "source": [ 202 | "Observation_main_df.write.mode(write_mode).parquet(curated_location+\"Observation_main/\")\r\n", 203 | "#Observation_main_df.write.format(\"delta\").saveAsTable(\"fhirlakedb.Observation_main\")\r\n", 204 | "#Patient_identifier_df.write.format(\"delta\").save(curated_location+\"Condition_clinicalStatus_delta/\")" 205 | ], 206 | "execution_count": 9 207 | }, 208 | { 209 | "cell_type": "code", 210 | "metadata": { 211 | "jupyter": { 212 | "source_hidden": false, 213 | "outputs_hidden": false 214 | }, 215 | "nteract": { 216 | "transient": { 217 | "deleting": false 218 | } 219 | } 
220 | }, 221 | "source": [ 222 | "from pyspark.sql.functions import explode\r\n", 223 | "\r\n", 224 | "Observation_category_explode_df = Observation_df.select(\r\n", 225 | " \"Observation_id\",\r\n", 226 | " explode(Observation_df.category)).toDF(*(\"Observation_id\",\"category\"))\r\n", 227 | "\r\n", 228 | "Observation_category_explode2_df = Observation_category_explode_df.select(\r\n", 229 | " \"Observation_id\",\r\n", 230 | " explode(Observation_category_explode_df.category.coding)).toDF(*(\"Observation_id\",\"category\")) \r\n", 231 | "\r\n", 232 | "Observation_category_df = Observation_category_explode2_df.select(\"Observation_id\",\"category.*\").toDF(*(\"Observation_id\",\"category_code\",\"category_display\",\"category_system\"))" 233 | ], 234 | "execution_count": 10 235 | }, 236 | { 237 | "cell_type": "code", 238 | "metadata": { 239 | "jupyter": { 240 | "source_hidden": false, 241 | "outputs_hidden": false 242 | }, 243 | "nteract": { 244 | "transient": { 245 | "deleting": false 246 | } 247 | } 248 | }, 249 | "source": [ 250 | "Observation_category_df.write.mode(write_mode).parquet(curated_location+\"Observation_category/\")" 251 | ], 252 | "execution_count": 13 253 | }, 254 | { 255 | "cell_type": "code", 256 | "metadata": { 257 | "jupyter": { 258 | "source_hidden": false, 259 | "outputs_hidden": false 260 | }, 261 | "nteract": { 262 | "transient": { 263 | "deleting": false 264 | } 265 | } 266 | }, 267 | "source": [ 268 | "from pyspark.sql.functions import explode\r\n", 269 | "\r\n", 270 | "Observation_code_explode_df = Observation_df.select(\r\n", 271 | " \"Observation_id\",\"code.text\",explode(Observation_df.code.coding)).toDF(*(\"Observation_id\",\"text\",\"coding\"))\r\n", 272 | "Observation_code_df = Observation_code_explode_df.select(\"Observation_id\",\"text\",\"coding.*\").toDF(*(\"Observation_id\",\"code_text\",\"coding_code\",\"coding_display\",\"coding_system\")) " 273 | ], 274 | "execution_count": 15 275 | }, 276 | { 277 | "cell_type": "code", 278 | "metadata": { 279 | "jupyter": { 280 | "source_hidden": false, 281 | "outputs_hidden": false 282 | }, 283 | "nteract": { 284 | "transient": { 285 | "deleting": false 286 | } 287 | } 288 | }, 289 | "source": [ 290 | "Observation_code_df.write.mode(write_mode).parquet(curated_location+\"Observation_code/\")" 291 | ], 292 | "execution_count": 16 293 | }, 294 | { 295 | "cell_type": "code", 296 | "metadata": { 297 | "jupyter": { 298 | "source_hidden": false, 299 | "outputs_hidden": false 300 | }, 301 | "nteract": { 302 | "transient": { 303 | "deleting": false 304 | } 305 | } 306 | }, 307 | "source": [ 308 | "from pyspark.sql.functions import explode\r\n", 309 | "from pyspark.sql.functions import explode_outer\r\n", 310 | "\r\n", 311 | "Observation_component_explode_df = Observation_df.select(\r\n", 312 | " \"Observation_id\",explode(Observation_df.component))\r\n", 313 | "\r\n", 314 | "Observation_component_explode2_df = Observation_component_explode_df.select(\"Observation_id\",explode_outer(Observation_component_explode_df.col.code.coding),\"col.code.text\",\r\n", 315 | " \"col.valueQuantity.code\",\"col.valueQuantity.system\",\"col.valueQuantity.unit\",\"col.valueQuantity.value\").toDF(*(\"Observation_id\",\"component_code\",\"component_text\",\r\n", 316 | " \"component_valueQuantity_code\",\"component_valueQuantity_system\",\"component_valueQuantity_unit\",\"component_valueQuantity_value\")) \r\n", 317 | "\r\n", 318 | "Observation_component_df = 
Observation_component_explode2_df.select(\"Observation_id\",\"component_code.*\",\"component_text\",\r\n", 319 | " \"component_valueQuantity_code\",\"component_valueQuantity_system\",\"component_valueQuantity_unit\",\"component_valueQuantity_value\").toDF(*(\"Observation_id\",\"component_code\",\r\n", 320 | " \"component_display\",\"component_system\",\"component_text\",\r\n", 321 | " \"component_valueQuantity_code\",\"component_valueQuantity_system\",\"component_valueQuantity_unit\",\"component_valueQuantity_value\"))" 322 | ], 323 | "execution_count": 4 324 | }, 325 | { 326 | "cell_type": "code", 327 | "metadata": { 328 | "jupyter": { 329 | "source_hidden": false, 330 | "outputs_hidden": false 331 | }, 332 | "nteract": { 333 | "transient": { 334 | "deleting": false 335 | } 336 | } 337 | }, 338 | "source": [ 339 | "Observation_component_df.write.mode(write_mode).parquet(curated_location+\"Observation_component/\")" 340 | ], 341 | "execution_count": 7 342 | }, 343 | { 344 | "cell_type": "code", 345 | "metadata": { 346 | "jupyter": { 347 | "source_hidden": false, 348 | "outputs_hidden": false 349 | }, 350 | "nteract": { 351 | "transient": { 352 | "deleting": false 353 | } 354 | } 355 | }, 356 | "source": [ 357 | "from pyspark.sql.functions import explode\r\n", 358 | "\r\n", 359 | "Observation_valueCodeableConcept_explode_df = Observation_df.select(\r\n", 360 | " \"Observation_id\",explode(Observation_df.valueCodeableConcept.coding),\"valueCodeableConcept.text\").toDF(*(\"Observation_id\",\"coding\",\"valueCodeableConcept_text\"))\r\n", 361 | "\r\n", 362 | "Observation_valueCodeableConcept_df = Observation_valueCodeableConcept_explode_df.select(\"Observation_id\",\"coding.*\",\"valueCodeableConcept_text\").toDF(*(\"Observation_id\",\r\n", 363 | " \"valueCodeableConcept_code\", \"valueCodeableConcept_display\",\"valueCodeableConcept_system\",\"valueCodeableConcept_text\"))" 364 | ], 365 | "execution_count": 22 366 | }, 367 | { 368 | "cell_type": "code", 369 | "metadata": { 370 | "jupyter": { 371 | "source_hidden": false, 372 | "outputs_hidden": false 373 | }, 374 | "nteract": { 375 | "transient": { 376 | "deleting": false 377 | } 378 | } 379 | }, 380 | "source": [ 381 | "Observation_valueCodeableConcept_df.write.mode(write_mode).parquet(curated_location+\"Observation_valueCodeableConcept/\")" 382 | ], 383 | "execution_count": 26 384 | } 385 | ] 386 | } 387 | } -------------------------------------------------------------------------------- /artifacts/notebook/Observation_Ingestion_NDJSON2Parquet.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Observation_Ingestion_NDJSON2Parquet", 3 | "properties": { 4 | "folder": { 5 | "name": "Observation" 6 | }, 7 | "nbformat": 4, 8 | "nbformat_minor": 2, 9 | "bigDataPool": { 10 | "referenceName": "healthcare", 11 | "type": "BigDataPoolReference" 12 | }, 13 | "sessionProperties": { 14 | "driverMemory": "112g", 15 | "driverCores": 16, 16 | "executorMemory": "112g", 17 | "executorCores": 16, 18 | "numExecutors": 4, 19 | "conf": { 20 | "spark.dynamicAllocation.enabled": "false", 21 | "spark.dynamicAllocation.minExecutors": "4", 22 | "spark.dynamicAllocation.maxExecutors": "4", 23 | "spark.autotune.trackingId": "b3cc26e4-f21d-4a99-b27a-9a7b169b7dd3" 24 | } 25 | }, 26 | "metadata": { 27 | "saveOutput": true, 28 | "enableDebugMode": false, 29 | "kernelspec": { 30 | "name": "synapse_pyspark", 31 | "display_name": "Synapse PySpark" 32 | }, 33 | "language_info": { 34 | "name": "python" 35 | }, 36 | 
"a365ComputeOptions": { 37 | "id": "/subscriptions/7e416de3-c506-4776-8270-83fd73c6cc37/resourceGroups/syne2e/providers/Microsoft.Synapse/workspaces/health/bigDataPools/healthcare", 38 | "name": "healthcare", 39 | "type": "Spark", 40 | "endpoint": "https://health.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/healthcare", 41 | "auth": { 42 | "type": "AAD", 43 | "authResource": "https://dev.azuresynapse.net" 44 | }, 45 | "sparkVersion": "3.1", 46 | "nodeCount": 3, 47 | "cores": 16, 48 | "memory": 112, 49 | "automaticScaleJobs": false 50 | }, 51 | "sessionKeepAliveTimeout": 30 52 | }, 53 | "cells": [ 54 | { 55 | "cell_type": "code", 56 | "metadata": { 57 | "tags": [ 58 | "parameters" 59 | ] 60 | }, 61 | "source": [ 62 | "StorageName = \"medicaldl\"\r\n", 63 | "DatasetSize = \"1tb\"" 64 | ], 65 | "execution_count": 2 66 | }, 67 | { 68 | "cell_type": "code", 69 | "metadata": { 70 | "jupyter": { 71 | "source_hidden": false, 72 | "outputs_hidden": false 73 | }, 74 | "nteract": { 75 | "transient": { 76 | "deleting": false 77 | } 78 | } 79 | }, 80 | "source": [ 81 | "raw_location = \"abfss://raw@\" + StorageName + \".dfs.core.windows.net/fhir_ndjson/\"+ DatasetSize +\"/*/\"\r\n", 82 | "processed_location = \"abfss://processed@\" + StorageName + \".dfs.core.windows.net/fhir/\"+ DatasetSize +\"/\"\r\n", 83 | "write_mode=\"overwrite\"" 84 | ], 85 | "execution_count": null 86 | }, 87 | { 88 | "cell_type": "code", 89 | "metadata": { 90 | "jupyter": { 91 | "source_hidden": false, 92 | "outputs_hidden": false 93 | }, 94 | "nteract": { 95 | "transient": { 96 | "deleting": false 97 | } 98 | } 99 | }, 100 | "source": [ 101 | "Observation_df = spark.read.option(\"multiline\", \"false\").json(raw_location+\"Observation.ndjson\")\r\n", 102 | "Observation_df.write.mode(write_mode).parquet(processed_location+\"Observation/\")\r\n", 103 | "#display(Observation_df.limit(10))\r\n", 104 | "#Observation_df.count()" 105 | ], 106 | "execution_count": 3 107 | } 108 | ] 109 | } 110 | } -------------------------------------------------------------------------------- /artifacts/notebook/Patient_Ingestion_NDJSON2Parquet.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Patient_Ingestion_NDJSON2Parquet", 3 | "properties": { 4 | "folder": { 5 | "name": "Patient" 6 | }, 7 | "nbformat": 4, 8 | "nbformat_minor": 2, 9 | "bigDataPool": { 10 | "referenceName": "healthcare", 11 | "type": "BigDataPoolReference" 12 | }, 13 | "sessionProperties": { 14 | "driverMemory": "112g", 15 | "driverCores": 16, 16 | "executorMemory": "112g", 17 | "executorCores": 16, 18 | "numExecutors": 2, 19 | "conf": { 20 | "spark.dynamicAllocation.enabled": "false", 21 | "spark.dynamicAllocation.minExecutors": "2", 22 | "spark.dynamicAllocation.maxExecutors": "2", 23 | "spark.autotune.trackingId": "3c6ed713-0024-4eb0-ba62-6f58dc091de4" 24 | } 25 | }, 26 | "metadata": { 27 | "saveOutput": true, 28 | "enableDebugMode": false, 29 | "kernelspec": { 30 | "name": "synapse_pyspark", 31 | "display_name": "python" 32 | }, 33 | "language_info": { 34 | "name": "python" 35 | }, 36 | "a365ComputeOptions": { 37 | "id": "/subscriptions/7e416de3-c506-4776-8270-83fd73c6cc37/resourceGroups/syne2e/providers/Microsoft.Synapse/workspaces/health/bigDataPools/healthcare", 38 | "name": "healthcare", 39 | "type": "Spark", 40 | "endpoint": "https://health.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/healthcare", 41 | "auth": { 42 | "type": "AAD", 43 | "authResource": 
"https://dev.azuresynapse.net" 44 | }, 45 | "sparkVersion": "3.1", 46 | "nodeCount": 3, 47 | "cores": 16, 48 | "memory": 112, 49 | "automaticScaleJobs": false 50 | }, 51 | "sessionKeepAliveTimeout": 30 52 | }, 53 | "cells": [ 54 | { 55 | "cell_type": "code", 56 | "metadata": { 57 | "tags": [ 58 | "parameters" 59 | ] 60 | }, 61 | "source": [ 62 | "StorageName = \"medicaldl\"\r\n", 63 | "DatasetSize = \"1tb\"" 64 | ], 65 | "execution_count": null 66 | }, 67 | { 68 | "cell_type": "code", 69 | "metadata": { 70 | "jupyter": { 71 | "source_hidden": false, 72 | "outputs_hidden": false 73 | }, 74 | "nteract": { 75 | "transient": { 76 | "deleting": false 77 | } 78 | } 79 | }, 80 | "source": [ 81 | "raw_location = \"abfss://raw@\" + StorageName + \".dfs.core.windows.net/fhir_ndjson/\"+ DatasetSize +\"/*/\"\r\n", 82 | "processed_location = \"abfss://processed@\" + StorageName + \".dfs.core.windows.net/fhir/\"+ DatasetSize +\"/\"\r\n", 83 | "write_mode=\"overwrite\"" 84 | ], 85 | "execution_count": null 86 | }, 87 | { 88 | "cell_type": "code", 89 | "metadata": { 90 | "jupyter": { 91 | "source_hidden": false, 92 | "outputs_hidden": false 93 | }, 94 | "nteract": { 95 | "transient": { 96 | "deleting": false 97 | } 98 | } 99 | }, 100 | "source": [ 101 | "patient_df = spark.read.option(\"multiline\", \"false\").json(raw_location+\"Patient.ndjson\")\r\n", 102 | "patient_df.write.mode(write_mode).parquet(processed_location+\"Patient/\")\r\n", 103 | "#display(patient_df.limit(10))\r\n", 104 | "#patient_df.count()" 105 | ], 106 | "execution_count": null 107 | } 108 | ] 109 | } 110 | } -------------------------------------------------------------------------------- /artifacts/pipeline/Copy_Data_Source_To_Raw_PL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Copy_Data_Source_To_Raw_PL", 3 | "properties": { 4 | "activities": [ 5 | { 6 | "name": "Copy Source Data To Raw Container", 7 | "type": "Copy", 8 | "dependsOn": [], 9 | "policy": { 10 | "timeout": "0.12:00:00", 11 | "retry": 0, 12 | "retryIntervalInSeconds": 30, 13 | "secureOutput": false, 14 | "secureInput": false 15 | }, 16 | "userProperties": [], 17 | "typeProperties": { 18 | "source": { 19 | "type": "BinarySource", 20 | "storeSettings": { 21 | "type": "AzureBlobStorageReadSettings", 22 | "recursive": true 23 | }, 24 | "formatSettings": { 25 | "type": "BinaryReadSettings" 26 | } 27 | }, 28 | "sink": { 29 | "type": "BinarySink", 30 | "storeSettings": { 31 | "type": "AzureBlobFSWriteSettings", 32 | "copyBehavior": "PreserveHierarchy" 33 | } 34 | }, 35 | "enableStaging": false 36 | }, 37 | "inputs": [ 38 | { 39 | "referenceName": "Source_DataPrep_DS", 40 | "type": "DatasetReference", 41 | "parameters": { 42 | "DatasetSize": { 43 | "value": "@pipeline().parameters.DatasetSize", 44 | "type": "Expression" 45 | } 46 | } 47 | } 48 | ], 49 | "outputs": [ 50 | { 51 | "referenceName": "Sink_DataPrep_DS", 52 | "type": "DatasetReference", 53 | "parameters": { 54 | "StorageName": { 55 | "value": "@pipeline().parameters.TargetStorageName", 56 | "type": "Expression" 57 | }, 58 | "DatasetSize": { 59 | "value": "@pipeline().parameters.DatasetSize", 60 | "type": "Expression" 61 | } 62 | } 63 | } 64 | ] 65 | }, 66 | { 67 | "name": "Create Curated Container", 68 | "type": "Copy", 69 | "dependsOn": [ 70 | { 71 | "activity": "Copy Source Data To Raw Container", 72 | "dependencyConditions": [ 73 | "Succeeded" 74 | ] 75 | } 76 | ], 77 | "policy": { 78 | "timeout": "0.12:00:00", 79 | "retry": 0, 80 | 
"retryIntervalInSeconds": 30, 81 | "secureOutput": false, 82 | "secureInput": false 83 | }, 84 | "userProperties": [], 85 | "typeProperties": { 86 | "source": { 87 | "type": "BinarySource", 88 | "storeSettings": { 89 | "type": "AzureBlobStorageReadSettings", 90 | "recursive": true 91 | }, 92 | "formatSettings": { 93 | "type": "BinaryReadSettings" 94 | } 95 | }, 96 | "sink": { 97 | "type": "BinarySink", 98 | "storeSettings": { 99 | "type": "AzureBlobFSWriteSettings", 100 | "copyBehavior": "PreserveHierarchy" 101 | } 102 | }, 103 | "enableStaging": false 104 | }, 105 | "inputs": [ 106 | { 107 | "referenceName": "Source_DataPrep_Curated_DS", 108 | "type": "DatasetReference" 109 | } 110 | ], 111 | "outputs": [ 112 | { 113 | "referenceName": "Sink_DataPrep_Curated_DS", 114 | "type": "DatasetReference", 115 | "parameters": { 116 | "StorageName": { 117 | "value": "@pipeline().parameters.TargetStorageName", 118 | "type": "Expression" 119 | } 120 | } 121 | } 122 | ] 123 | }, 124 | { 125 | "name": "Create Processed Container", 126 | "type": "Copy", 127 | "dependsOn": [ 128 | { 129 | "activity": "Copy Source Data To Raw Container", 130 | "dependencyConditions": [ 131 | "Succeeded" 132 | ] 133 | } 134 | ], 135 | "policy": { 136 | "timeout": "0.12:00:00", 137 | "retry": 0, 138 | "retryIntervalInSeconds": 30, 139 | "secureOutput": false, 140 | "secureInput": false 141 | }, 142 | "userProperties": [], 143 | "typeProperties": { 144 | "source": { 145 | "type": "BinarySource", 146 | "storeSettings": { 147 | "type": "AzureBlobStorageReadSettings", 148 | "recursive": true 149 | }, 150 | "formatSettings": { 151 | "type": "BinaryReadSettings" 152 | } 153 | }, 154 | "sink": { 155 | "type": "BinarySink", 156 | "storeSettings": { 157 | "type": "AzureBlobFSWriteSettings", 158 | "copyBehavior": "PreserveHierarchy" 159 | } 160 | }, 161 | "enableStaging": false 162 | }, 163 | "inputs": [ 164 | { 165 | "referenceName": "Source_DataPrep_Processed_DS", 166 | "type": "DatasetReference" 167 | } 168 | ], 169 | "outputs": [ 170 | { 171 | "referenceName": "Sink_DataPrep_Processed_DS", 172 | "type": "DatasetReference", 173 | "parameters": { 174 | "StorageName": { 175 | "value": "@pipeline().parameters.TargetStorageName", 176 | "type": "Expression" 177 | } 178 | } 179 | } 180 | ] 181 | } 182 | ], 183 | "parameters": { 184 | "TargetStorageName": { 185 | "type": "string" 186 | }, 187 | "DatasetSize": { 188 | "type": "string", 189 | "defaultValue": "1tb" 190 | } 191 | }, 192 | "folder": { 193 | "name": "Data Prep" 194 | }, 195 | "annotations": [] 196 | } 197 | } -------------------------------------------------------------------------------- /artifacts/pipeline/FHIR_Pipeline4Claim_Spark_OC.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "FHIR_Pipeline4Claim_Spark_OC", 3 | "properties": { 4 | "activities": [ 5 | { 6 | "name": "NDJSON_Ingestion_Claim", 7 | "type": "SynapseNotebook", 8 | "dependsOn": [], 9 | "policy": { 10 | "timeout": "7.00:00:00", 11 | "retry": 0, 12 | "retryIntervalInSeconds": 30, 13 | "secureOutput": false, 14 | "secureInput": false 15 | }, 16 | "userProperties": [], 17 | "typeProperties": { 18 | "notebook": { 19 | "referenceName": "Claim_Ingestion_NDJSON2Parquet", 20 | "type": "NotebookReference" 21 | }, 22 | "parameters": { 23 | "StorageName": { 24 | "value": { 25 | "value": "@pipeline().parameters.StorageName", 26 | "type": "Expression" 27 | }, 28 | "type": "string" 29 | }, 30 | "DatasetSize": { 31 | "value": { 32 | "value": 
"@pipeline().parameters.DatasetSize", 33 | "type": "Expression" 34 | }, 35 | "type": "string" 36 | } 37 | }, 38 | "snapshot": true, 39 | "sparkPool": { 40 | "referenceName": { 41 | "value": "@pipeline().parameters.SparkPoolName", 42 | "type": "Expression" 43 | }, 44 | "type": "BigDataPoolReference" 45 | }, 46 | "executorSize": null, 47 | "conf": { 48 | "spark.dynamicAllocation.enabled": null, 49 | "spark.dynamicAllocation.minExecutors": null, 50 | "spark.dynamicAllocation.maxExecutors": null 51 | }, 52 | "driverSize": null, 53 | "numExecutors": null 54 | } 55 | }, 56 | { 57 | "name": "ClaimParquetFlatten_Large", 58 | "type": "SynapseNotebook", 59 | "dependsOn": [ 60 | { 61 | "activity": "NDJSON_Ingestion_Claim", 62 | "dependencyConditions": [ 63 | "Succeeded" 64 | ] 65 | } 66 | ], 67 | "policy": { 68 | "timeout": "7.00:00:00", 69 | "retry": 0, 70 | "retryIntervalInSeconds": 30, 71 | "secureOutput": false, 72 | "secureInput": false 73 | }, 74 | "userProperties": [], 75 | "typeProperties": { 76 | "notebook": { 77 | "referenceName": "ClaimParquetFlatten_Large", 78 | "type": "NotebookReference" 79 | }, 80 | "parameters": { 81 | "StorageName": { 82 | "value": { 83 | "value": "@pipeline().parameters.StorageName", 84 | "type": "Expression" 85 | }, 86 | "type": "string" 87 | }, 88 | "DatasetSize": { 89 | "value": { 90 | "value": "@pipeline().parameters.DatasetSize", 91 | "type": "Expression" 92 | }, 93 | "type": "string" 94 | } 95 | }, 96 | "snapshot": true, 97 | "sparkPool": { 98 | "referenceName": { 99 | "value": "@pipeline().parameters.SparkPoolName", 100 | "type": "Expression" 101 | }, 102 | "type": "BigDataPoolReference" 103 | }, 104 | "conf": { 105 | "spark.dynamicAllocation.enabled": null, 106 | "spark.dynamicAllocation.minExecutors": null, 107 | "spark.dynamicAllocation.maxExecutors": null 108 | }, 109 | "numExecutors": null 110 | } 111 | }, 112 | { 113 | "name": "ClaimDiagnosis2SQL", 114 | "type": "Copy", 115 | "dependsOn": [ 116 | { 117 | "activity": "Create Tables", 118 | "dependencyConditions": [ 119 | "Succeeded" 120 | ] 121 | } 122 | ], 123 | "policy": { 124 | "timeout": "7.00:00:00", 125 | "retry": 0, 126 | "retryIntervalInSeconds": 30, 127 | "secureOutput": false, 128 | "secureInput": false 129 | }, 130 | "userProperties": [], 131 | "typeProperties": { 132 | "source": { 133 | "type": "ParquetSource", 134 | "storeSettings": { 135 | "type": "AzureBlobFSReadSettings", 136 | "recursive": true, 137 | "wildcardFolderPath": { 138 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/Claim_diagnosis')", 139 | "type": "Expression" 140 | }, 141 | "wildcardFileName": "*.parquet", 142 | "enablePartitionDiscovery": false 143 | } 144 | }, 145 | "sink": { 146 | "type": "SqlDWSink", 147 | "allowCopyCommand": true, 148 | "tableOption": "autoCreate", 149 | "disableMetricsCollection": false 150 | }, 151 | "enableStaging": true, 152 | "stagingSettings": { 153 | "linkedServiceName": { 154 | "referenceName": "StorageLS", 155 | "type": "LinkedServiceReference", 156 | "parameters": { 157 | "StorageName": { 158 | "value": "@pipeline().parameters.StorageName", 159 | "type": "Expression" 160 | } 161 | } 162 | }, 163 | "path": "staging" 164 | } 165 | }, 166 | "inputs": [ 167 | { 168 | "referenceName": "ClaimDiagnosisParquetLarge", 169 | "type": "DatasetReference", 170 | "parameters": { 171 | "StorageName": { 172 | "value": "@pipeline().parameters.StorageName", 173 | "type": "Expression" 174 | }, 175 | "FolderPath": { 176 | "value": 
"@concat('fhir/',pipeline().parameters.DatasetSize,'/Claim_diagnosis')", 177 | "type": "Expression" 178 | } 179 | } 180 | } 181 | ], 182 | "outputs": [ 183 | { 184 | "referenceName": "ClaimDiagnosisSQL", 185 | "type": "DatasetReference", 186 | "parameters": { 187 | "DatabaseName": { 188 | "value": "@pipeline().parameters.DatabaseName", 189 | "type": "Expression" 190 | }, 191 | "ServerName": { 192 | "value": "@pipeline().parameters.ServerName", 193 | "type": "Expression" 194 | } 195 | } 196 | } 197 | ] 198 | }, 199 | { 200 | "name": "ClaimInsurance2SQL", 201 | "type": "Copy", 202 | "dependsOn": [ 203 | { 204 | "activity": "Create Tables", 205 | "dependencyConditions": [ 206 | "Succeeded" 207 | ] 208 | } 209 | ], 210 | "policy": { 211 | "timeout": "7.00:00:00", 212 | "retry": 0, 213 | "retryIntervalInSeconds": 30, 214 | "secureOutput": false, 215 | "secureInput": false 216 | }, 217 | "userProperties": [], 218 | "typeProperties": { 219 | "source": { 220 | "type": "ParquetSource", 221 | "storeSettings": { 222 | "type": "AzureBlobFSReadSettings", 223 | "recursive": true, 224 | "wildcardFolderPath": { 225 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/Claim_insurance')", 226 | "type": "Expression" 227 | }, 228 | "wildcardFileName": "*.parquet", 229 | "enablePartitionDiscovery": false 230 | } 231 | }, 232 | "sink": { 233 | "type": "SqlDWSink", 234 | "allowCopyCommand": true, 235 | "tableOption": "autoCreate", 236 | "disableMetricsCollection": false 237 | }, 238 | "enableStaging": true, 239 | "stagingSettings": { 240 | "linkedServiceName": { 241 | "referenceName": "StorageLS", 242 | "type": "LinkedServiceReference", 243 | "parameters": { 244 | "StorageName": { 245 | "value": "@pipeline().parameters.StorageName", 246 | "type": "Expression" 247 | } 248 | } 249 | }, 250 | "path": "staging" 251 | } 252 | }, 253 | "inputs": [ 254 | { 255 | "referenceName": "ClaimInsuranceParquetLarge", 256 | "type": "DatasetReference", 257 | "parameters": { 258 | "StorageName": { 259 | "value": "@pipeline().parameters.StorageName", 260 | "type": "Expression" 261 | }, 262 | "FolderPath": { 263 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/Claim_insurance')", 264 | "type": "Expression" 265 | } 266 | } 267 | } 268 | ], 269 | "outputs": [ 270 | { 271 | "referenceName": "ClaimInsurance", 272 | "type": "DatasetReference", 273 | "parameters": { 274 | "DatabaseName": { 275 | "value": "@pipeline().parameters.DatabaseName", 276 | "type": "Expression" 277 | }, 278 | "ServerName": { 279 | "value": "@pipeline().parameters.ServerName", 280 | "type": "Expression" 281 | } 282 | } 283 | } 284 | ] 285 | }, 286 | { 287 | "name": "ClaimProcedure2SQL", 288 | "type": "Copy", 289 | "dependsOn": [ 290 | { 291 | "activity": "Create Tables", 292 | "dependencyConditions": [ 293 | "Succeeded" 294 | ] 295 | } 296 | ], 297 | "policy": { 298 | "timeout": "7.00:00:00", 299 | "retry": 0, 300 | "retryIntervalInSeconds": 30, 301 | "secureOutput": false, 302 | "secureInput": false 303 | }, 304 | "userProperties": [], 305 | "typeProperties": { 306 | "source": { 307 | "type": "ParquetSource", 308 | "storeSettings": { 309 | "type": "AzureBlobFSReadSettings", 310 | "recursive": true, 311 | "wildcardFolderPath": { 312 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/Claim_procedure')", 313 | "type": "Expression" 314 | }, 315 | "wildcardFileName": "*.parquet", 316 | "enablePartitionDiscovery": false 317 | } 318 | }, 319 | "sink": { 320 | "type": "SqlDWSink", 321 | "allowCopyCommand": true, 322 | "tableOption": 
"autoCreate", 323 | "disableMetricsCollection": false 324 | }, 325 | "enableStaging": true, 326 | "stagingSettings": { 327 | "linkedServiceName": { 328 | "referenceName": "StorageLS", 329 | "type": "LinkedServiceReference", 330 | "parameters": { 331 | "StorageName": { 332 | "value": "@pipeline().parameters.StorageName", 333 | "type": "Expression" 334 | } 335 | } 336 | }, 337 | "path": "staging" 338 | } 339 | }, 340 | "inputs": [ 341 | { 342 | "referenceName": "ClaimProcedureParquetLarge", 343 | "type": "DatasetReference", 344 | "parameters": { 345 | "StorageName": { 346 | "value": "@pipeline().parameters.StorageName", 347 | "type": "Expression" 348 | }, 349 | "FolderPath": { 350 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/Claim_procedure')", 351 | "type": "Expression" 352 | } 353 | } 354 | } 355 | ], 356 | "outputs": [ 357 | { 358 | "referenceName": "ClaimProcedureSQL", 359 | "type": "DatasetReference", 360 | "parameters": { 361 | "DatabaseName": { 362 | "value": "@pipeline().parameters.DatabaseName", 363 | "type": "Expression" 364 | }, 365 | "ServerName": { 366 | "value": "@pipeline().parameters.ServerName", 367 | "type": "Expression" 368 | } 369 | } 370 | } 371 | ] 372 | }, 373 | { 374 | "name": "Create Tables", 375 | "type": "Script", 376 | "dependsOn": [ 377 | { 378 | "activity": "ClaimParquetFlatten_Large", 379 | "dependencyConditions": [ 380 | "Succeeded" 381 | ] 382 | } 383 | ], 384 | "policy": { 385 | "timeout": "0.12:00:00", 386 | "retry": 0, 387 | "retryIntervalInSeconds": 30, 388 | "secureOutput": false, 389 | "secureInput": false 390 | }, 391 | "userProperties": [], 392 | "linkedServiceName": { 393 | "referenceName": "SynapseDedicatedPoolLS", 394 | "type": "LinkedServiceReference", 395 | "parameters": { 396 | "DatabaseName": { 397 | "value": "@pipeline().parameters.DatabaseName", 398 | "type": "Expression" 399 | }, 400 | "ServerName": { 401 | "value": "@pipeline().parameters.ServerName", 402 | "type": "Expression" 403 | } 404 | } 405 | }, 406 | "typeProperties": { 407 | "scripts": [ 408 | { 409 | "type": "Query", 410 | "text": "IF NOT EXISTS (SELECT * FROM sys.schemas WHERE name='fhir')\n\tEXEC('CREATE SCHEMA [fhir]')\n\nIF OBJECT_ID('fhir.ClaimDiagnosis') IS NOT NULL\nBEGIN\n DROP TABLE [fhir].[ClaimDiagnosis]\nEND\n\nCREATE TABLE [fhir].[ClaimDiagnosis]\n( \n\t[Claim_id] [nvarchar](64) NULL,\n\t[diagnosis_reference] [nvarchar](64) NULL,\n\t[diagnosis_sequence] [bigint] NULL\n)\nWITH\n(\n\tDISTRIBUTION = HASH ( [Claim_id] ),\n\tCLUSTERED COLUMNSTORE INDEX\n)\n\nIF OBJECT_ID('fhir.ClaimInsurance') IS NOT NULL\nBEGIN\n DROP TABLE [fhir].[ClaimInsurance]\nEND\n\nCREATE TABLE [fhir].[ClaimInsurance]\n( \n\t[Claim_id] [nvarchar](64) NULL,\n\t[insurance_coverage.display] [nvarchar](64) NULL,\n\t[insurance_focal] [bit] NULL,\n\t[insurance_sequence] [bigint] NULL\n)\nWITH\n(\n\tDISTRIBUTION = HASH ( [Claim_id] ),\n\tCLUSTERED COLUMNSTORE INDEX\n)\n\nIF OBJECT_ID('fhir.ClaimProcedure') IS NOT NULL\nBEGIN\n DROP TABLE [fhir].[ClaimProcedure]\nEND\n\nCREATE TABLE [fhir].[ClaimProcedure]\n( \n\t[Claim_id] [nvarchar](64) NULL,\n\t[procedure_reference] [nvarchar](64) NULL,\n\t[procedure_sequence] [bigint] NULL\n)\nWITH\n(\n\tDISTRIBUTION = HASH ( [Claim_id] ),\n\tCLUSTERED COLUMNSTORE INDEX\n)" 411 | } 412 | ] 413 | } 414 | }, 415 | { 416 | "name": "LakeDatabase And Table Creation", 417 | "type": "SynapseNotebook", 418 | "dependsOn": [ 419 | { 420 | "activity": "ClaimParquetFlatten_Large", 421 | "dependencyConditions": [ 422 | "Succeeded" 423 | ] 424 | } 425 | ], 426 | 
"policy": { 427 | "timeout": "7.00:00:00", 428 | "retry": 0, 429 | "retryIntervalInSeconds": 30, 430 | "secureOutput": false, 431 | "secureInput": false 432 | }, 433 | "userProperties": [], 434 | "typeProperties": { 435 | "notebook": { 436 | "referenceName": "Lake Database And Table Creation", 437 | "type": "NotebookReference" 438 | }, 439 | "parameters": { 440 | "StorageName": { 441 | "value": { 442 | "value": "@pipeline().parameters.StorageName", 443 | "type": "Expression" 444 | }, 445 | "type": "string" 446 | }, 447 | "DatasetSize": { 448 | "value": { 449 | "value": "@pipeline().parameters.DatasetSize", 450 | "type": "Expression" 451 | }, 452 | "type": "string" 453 | } 454 | }, 455 | "snapshot": true, 456 | "sparkPool": { 457 | "referenceName": { 458 | "value": "@pipeline().parameters.SparkPoolName", 459 | "type": "Expression" 460 | }, 461 | "type": "BigDataPoolReference" 462 | }, 463 | "conf": { 464 | "spark.dynamicAllocation.enabled": null, 465 | "spark.dynamicAllocation.minExecutors": null, 466 | "spark.dynamicAllocation.maxExecutors": null 467 | }, 468 | "numExecutors": null 469 | } 470 | } 471 | ], 472 | "parameters": { 473 | "StorageName": { 474 | "type": "string", 475 | "defaultValue": "synapsee2elake" 476 | }, 477 | "DatabaseName": { 478 | "type": "string", 479 | "defaultValue": "synapsee2edw" 480 | }, 481 | "ServerName": { 482 | "type": "string", 483 | "defaultValue": "synapsee2e" 484 | }, 485 | "SparkPoolName": { 486 | "type": "string", 487 | "defaultValue": "synapsee2espark" 488 | }, 489 | "DatasetSize": { 490 | "type": "string", 491 | "defaultValue": "1tb" 492 | } 493 | }, 494 | "folder": { 495 | "name": "Claims" 496 | }, 497 | "annotations": [] 498 | } 499 | } -------------------------------------------------------------------------------- /artifacts/pipeline/FHIR_Pipeline4Observation_Spark_OC.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "FHIR_Pipeline4Observation_Spark_OC", 3 | "properties": { 4 | "activities": [ 5 | { 6 | "name": "NDJSON_Ingestion_Observation", 7 | "type": "SynapseNotebook", 8 | "dependsOn": [], 9 | "policy": { 10 | "timeout": "0.12:00:00", 11 | "retry": 0, 12 | "retryIntervalInSeconds": 30, 13 | "secureOutput": false, 14 | "secureInput": false 15 | }, 16 | "userProperties": [], 17 | "typeProperties": { 18 | "notebook": { 19 | "referenceName": "Observation_Ingestion_NDJSON2Parquet", 20 | "type": "NotebookReference" 21 | }, 22 | "parameters": { 23 | "StorageName": { 24 | "value": { 25 | "value": "@pipeline().parameters.StorageName", 26 | "type": "Expression" 27 | }, 28 | "type": "string" 29 | } 30 | }, 31 | "snapshot": true, 32 | "sparkPool": { 33 | "referenceName": { 34 | "value": "@pipeline().parameters.SparkPoolName", 35 | "type": "Expression" 36 | }, 37 | "type": "BigDataPoolReference" 38 | }, 39 | "executorSize": null, 40 | "conf": { 41 | "spark.dynamicAllocation.enabled": null, 42 | "spark.dynamicAllocation.minExecutors": null, 43 | "spark.dynamicAllocation.maxExecutors": null 44 | }, 45 | "driverSize": null, 46 | "numExecutors": null 47 | } 48 | }, 49 | { 50 | "name": "ObservationParquetFlatten_Large", 51 | "type": "SynapseNotebook", 52 | "dependsOn": [ 53 | { 54 | "activity": "NDJSON_Ingestion_Observation", 55 | "dependencyConditions": [ 56 | "Succeeded" 57 | ] 58 | } 59 | ], 60 | "policy": { 61 | "timeout": "0.12:00:00", 62 | "retry": 0, 63 | "retryIntervalInSeconds": 30, 64 | "secureOutput": false, 65 | "secureInput": false 66 | }, 67 | "userProperties": [], 68 | "typeProperties": { 69 | 
"notebook": { 70 | "referenceName": "ObservationParquetFlatten_Large", 71 | "type": "NotebookReference" 72 | }, 73 | "parameters": { 74 | "StorageName": { 75 | "value": { 76 | "value": "@pipeline().parameters.StorageName", 77 | "type": "Expression" 78 | }, 79 | "type": "string" 80 | } 81 | }, 82 | "snapshot": true, 83 | "sparkPool": { 84 | "referenceName": { 85 | "value": "@pipeline().parameters.SparkPoolName", 86 | "type": "Expression" 87 | }, 88 | "type": "BigDataPoolReference" 89 | }, 90 | "executorSize": null, 91 | "conf": { 92 | "spark.dynamicAllocation.enabled": null, 93 | "spark.dynamicAllocation.minExecutors": null, 94 | "spark.dynamicAllocation.maxExecutors": null 95 | }, 96 | "driverSize": null, 97 | "numExecutors": null 98 | } 99 | }, 100 | { 101 | "name": "Observation_Parquet_large2SQL", 102 | "type": "Copy", 103 | "dependsOn": [ 104 | { 105 | "activity": "Create Tables", 106 | "dependencyConditions": [ 107 | "Succeeded" 108 | ] 109 | } 110 | ], 111 | "policy": { 112 | "timeout": "0.12:00:00", 113 | "retry": 0, 114 | "retryIntervalInSeconds": 30, 115 | "secureOutput": false, 116 | "secureInput": false 117 | }, 118 | "userProperties": [], 119 | "typeProperties": { 120 | "source": { 121 | "type": "ParquetSource", 122 | "storeSettings": { 123 | "type": "AzureBlobFSReadSettings", 124 | "recursive": true, 125 | "wildcardFolderPath": { 126 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/Observation_main')", 127 | "type": "Expression" 128 | }, 129 | "wildcardFileName": "*.parquet", 130 | "enablePartitionDiscovery": false 131 | } 132 | }, 133 | "sink": { 134 | "type": "SqlDWSink", 135 | "allowCopyCommand": true, 136 | "tableOption": "autoCreate", 137 | "disableMetricsCollection": false 138 | }, 139 | "enableStaging": true, 140 | "stagingSettings": { 141 | "linkedServiceName": { 142 | "referenceName": "StorageLS", 143 | "type": "LinkedServiceReference", 144 | "parameters": { 145 | "StorageName": { 146 | "value": "@pipeline().parameters.StorageName", 147 | "type": "Expression" 148 | } 149 | } 150 | }, 151 | "path": "staging" 152 | } 153 | }, 154 | "inputs": [ 155 | { 156 | "referenceName": "ObservationMain_LargeParquet", 157 | "type": "DatasetReference", 158 | "parameters": { 159 | "StorageName": { 160 | "value": "@pipeline().parameters.StorageName", 161 | "type": "Expression" 162 | }, 163 | "DatasetSize": { 164 | "value": "@pipeline().parameters.DatasetSize", 165 | "type": "Expression" 166 | } 167 | } 168 | } 169 | ], 170 | "outputs": [ 171 | { 172 | "referenceName": "Observation_SQLDS", 173 | "type": "DatasetReference", 174 | "parameters": { 175 | "DatabaseName": { 176 | "value": "@pipeline().parameters.DatabaseName", 177 | "type": "Expression" 178 | }, 179 | "ServerName": { 180 | "value": "@pipeline().parameters.ServerName", 181 | "type": "Expression" 182 | } 183 | } 184 | } 185 | ] 186 | }, 187 | { 188 | "name": "Create Tables", 189 | "type": "Script", 190 | "dependsOn": [ 191 | { 192 | "activity": "ObservationParquetFlatten_Large", 193 | "dependencyConditions": [ 194 | "Succeeded" 195 | ] 196 | } 197 | ], 198 | "policy": { 199 | "timeout": "0.12:00:00", 200 | "retry": 0, 201 | "retryIntervalInSeconds": 30, 202 | "secureOutput": false, 203 | "secureInput": false 204 | }, 205 | "userProperties": [], 206 | "linkedServiceName": { 207 | "referenceName": "SynapseDedicatedPoolLS", 208 | "type": "LinkedServiceReference", 209 | "parameters": { 210 | "DatabaseName": { 211 | "value": "@pipeline().parameters.DatabaseName", 212 | "type": "Expression" 213 | }, 214 | "ServerName": 
{ 215 | "value": "@pipeline().parameters.ServerName", 216 | "type": "Expression" 217 | } 218 | } 219 | }, 220 | "typeProperties": { 221 | "scripts": [ 222 | { 223 | "type": "Query", 224 | "text": "IF OBJECT_ID('fhir.ObservationMain') IS NOT NULL\r\nBEGIN\r\n DROP TABLE [fhir].[ObservationMain]\r\nEND\r\n\r\nCREATE TABLE [fhir].[ObservationMain]\r\n( \r\n\t[Observation_id] [varchar](64) NULL,\r\n\t[resourceType] [varchar](100) NULL,\r\n\t[issued] VARCHAR(30) NULL,\r\n\t[status] [varchar](10) NULL,\r\n\t[patient_id_reference] [varchar](64) NULL,\r\n\t[encounter_id_reference] [varchar](64) NULL,\r\n\t[effectiveDateTime] VARCHAR(30) NULL,\r\n\t[valueQuantity_code] [varchar](50) NULL,\r\n\t[valueQuantity_system] [varchar](100) NULL,\r\n\t[valueQuantity_unit] [varchar](50) NULL,\r\n\t[valueQuantity_value] [float] NULL,\r\n\t[valueString] [nvarchar](200) NULL\r\n)\r\nWITH\r\n(\r\n\tDISTRIBUTION = ROUND_ROBIN,\r\n\tHEAP\r\n)" 225 | } 226 | ] 227 | } 228 | } 229 | ], 230 | "parameters": { 231 | "StorageName": { 232 | "type": "string", 233 | "defaultValue": "synapsee2elake" 234 | }, 235 | "DatabaseName": { 236 | "type": "string", 237 | "defaultValue": "synapsee2edw" 238 | }, 239 | "ServerName": { 240 | "type": "string", 241 | "defaultValue": "synapsee2e" 242 | }, 243 | "SparkPoolName": { 244 | "type": "string", 245 | "defaultValue": "synapsee2espark" 246 | }, 247 | "DatasetSize": { 248 | "type": "string", 249 | "defaultValue": "1tb" 250 | } 251 | }, 252 | "folder": { 253 | "name": "Observation" 254 | }, 255 | "annotations": [] 256 | } 257 | } -------------------------------------------------------------------------------- /artifacts/pipeline/FHIR_Pipeline4Patient_DataFlow_OC.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "FHIR_Pipeline4Patient_DataFlow_OC", 3 | "properties": { 4 | "activities": [ 5 | { 6 | "name": "NDJSON_Ingestion_Patient", 7 | "type": "SynapseNotebook", 8 | "dependsOn": [], 9 | "policy": { 10 | "timeout": "7.00:00:00", 11 | "retry": 0, 12 | "retryIntervalInSeconds": 30, 13 | "secureOutput": false, 14 | "secureInput": false 15 | }, 16 | "userProperties": [], 17 | "typeProperties": { 18 | "notebook": { 19 | "referenceName": "Patient_Ingestion_NDJSON2Parquet", 20 | "type": "NotebookReference" 21 | }, 22 | "parameters": { 23 | "StorageName": { 24 | "value": { 25 | "value": "@pipeline().parameters.StorageName", 26 | "type": "Expression" 27 | }, 28 | "type": "string" 29 | }, 30 | "DatasetSize": { 31 | "value": { 32 | "value": "@pipeline().parameters.DatasetSize", 33 | "type": "Expression" 34 | }, 35 | "type": "string" 36 | } 37 | }, 38 | "snapshot": true, 39 | "sparkPool": { 40 | "referenceName": { 41 | "value": "@pipeline().parameters.SparkPoolName", 42 | "type": "Expression" 43 | }, 44 | "type": "BigDataPoolReference" 45 | }, 46 | "conf": { 47 | "spark.dynamicAllocation.enabled": null, 48 | "spark.dynamicAllocation.minExecutors": null, 49 | "spark.dynamicAllocation.maxExecutors": null 50 | }, 51 | "numExecutors": null 52 | } 53 | }, 54 | { 55 | "name": "PatientParquet2Sink", 56 | "type": "ExecuteDataFlow", 57 | "dependsOn": [ 58 | { 59 | "activity": "NDJSON_Ingestion_Patient", 60 | "dependencyConditions": [ 61 | "Succeeded" 62 | ] 63 | } 64 | ], 65 | "policy": { 66 | "timeout": "1.00:00:00", 67 | "retry": 0, 68 | "retryIntervalInSeconds": 30, 69 | "secureOutput": false, 70 | "secureInput": false 71 | }, 72 | "userProperties": [], 73 | "typeProperties": { 74 | "dataflow": { 75 | "referenceName": "PatientJSON_Flatten_large", 76 | 
"type": "DataFlowReference", 77 | "datasetParameters": { 78 | "PatientNDJSON": { 79 | "StorageName": { 80 | "value": "@pipeline().parameters.StorageName", 81 | "type": "Expression" 82 | }, 83 | "DatasetSize": { 84 | "value": "@pipeline().parameters.DatasetSize", 85 | "type": "Expression" 86 | } 87 | }, 88 | "sinkPatientIdentifier": { 89 | "StorageName": { 90 | "value": "@pipeline().parameters.StorageName", 91 | "type": "Expression" 92 | }, 93 | "DatasetSize": { 94 | "value": "@pipeline().parameters.DatasetSize", 95 | "type": "Expression" 96 | } 97 | }, 98 | "sinkPatientExtension": { 99 | "StorageName": { 100 | "value": "@pipeline().parameters.StorageName", 101 | "type": "Expression" 102 | }, 103 | "DatasetSize": { 104 | "value": "@pipeline().parameters.DatasetSize", 105 | "type": "Expression" 106 | } 107 | }, 108 | "sinkPatientAddress": { 109 | "StorageName": { 110 | "value": "@pipeline().parameters.StorageName", 111 | "type": "Expression" 112 | }, 113 | "DatasetSize": { 114 | "value": "@pipeline().parameters.DatasetSize", 115 | "type": "Expression" 116 | } 117 | } 118 | } 119 | }, 120 | "compute": { 121 | "coreCount": 8, 122 | "computeType": "General" 123 | }, 124 | "traceLevel": "Fine", 125 | "runConcurrently": true 126 | } 127 | }, 128 | { 129 | "name": "PatientAddress_large2SQL", 130 | "type": "Copy", 131 | "dependsOn": [ 132 | { 133 | "activity": "Create Tables", 134 | "dependencyConditions": [ 135 | "Succeeded" 136 | ] 137 | } 138 | ], 139 | "policy": { 140 | "timeout": "7.00:00:00", 141 | "retry": 0, 142 | "retryIntervalInSeconds": 30, 143 | "secureOutput": false, 144 | "secureInput": false 145 | }, 146 | "userProperties": [], 147 | "typeProperties": { 148 | "source": { 149 | "type": "ParquetSource", 150 | "storeSettings": { 151 | "type": "AzureBlobFSReadSettings", 152 | "recursive": true, 153 | "wildcardFolderPath": { 154 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/PatientAddress')", 155 | "type": "Expression" 156 | }, 157 | "wildcardFileName": "*.parquet", 158 | "enablePartitionDiscovery": false 159 | } 160 | }, 161 | "sink": { 162 | "type": "SqlDWSink", 163 | "allowCopyCommand": true, 164 | "tableOption": "autoCreate", 165 | "disableMetricsCollection": false 166 | }, 167 | "enableStaging": true, 168 | "stagingSettings": { 169 | "linkedServiceName": { 170 | "referenceName": "StorageLS", 171 | "type": "LinkedServiceReference", 172 | "parameters": { 173 | "StorageName": { 174 | "value": "@pipeline().parameters.StorageName", 175 | "type": "Expression" 176 | } 177 | } 178 | }, 179 | "path": "staging" 180 | }, 181 | "translator": { 182 | "type": "TabularTranslator", 183 | "mappings": [ 184 | { 185 | "source": { 186 | "name": "id", 187 | "type": "String" 188 | }, 189 | "sink": { 190 | "name": "id", 191 | "type": "String" 192 | } 193 | }, 194 | { 195 | "source": { 196 | "name": "address.city", 197 | "type": "String" 198 | }, 199 | "sink": { 200 | "name": "address.city", 201 | "type": "String" 202 | } 203 | }, 204 | { 205 | "source": { 206 | "name": "address.country", 207 | "type": "String" 208 | }, 209 | "sink": { 210 | "name": "address.country", 211 | "type": "String" 212 | } 213 | }, 214 | { 215 | "source": { 216 | "name": "address.extension.extension.url", 217 | "type": "String" 218 | }, 219 | "sink": { 220 | "name": "address.extension.extension.url", 221 | "type": "String" 222 | } 223 | }, 224 | { 225 | "source": { 226 | "name": "address.extension.extension.valueDecimal", 227 | "type": "Double" 228 | }, 229 | "sink": { 230 | "name": 
"address.extension.extension.valueDecimal", 231 | "type": "Double" 232 | } 233 | }, 234 | { 235 | "source": { 236 | "name": "address.extension.url", 237 | "type": "String" 238 | }, 239 | "sink": { 240 | "name": "address.extension.url", 241 | "type": "String" 242 | } 243 | }, 244 | { 245 | "source": { 246 | "name": "address.postalCode", 247 | "type": "String" 248 | }, 249 | "sink": { 250 | "name": "address.postalCode", 251 | "type": "String" 252 | } 253 | }, 254 | { 255 | "source": { 256 | "name": "address.state", 257 | "type": "String" 258 | }, 259 | "sink": { 260 | "name": "address.state", 261 | "type": "String" 262 | } 263 | } 264 | ] 265 | } 266 | }, 267 | "inputs": [ 268 | { 269 | "referenceName": "PatientAddressParquetLarge", 270 | "type": "DatasetReference", 271 | "parameters": { 272 | "StorageName": { 273 | "value": "@pipeline().parameters.StorageName", 274 | "type": "Expression" 275 | }, 276 | "DatasetSize": { 277 | "value": "@pipeline().parameters.DatasetSize", 278 | "type": "Expression" 279 | } 280 | } 281 | } 282 | ], 283 | "outputs": [ 284 | { 285 | "referenceName": "PatientAddressSQL", 286 | "type": "DatasetReference", 287 | "parameters": { 288 | "DatabaseName": { 289 | "value": "@pipeline().parameters.DatabaseName", 290 | "type": "Expression" 291 | }, 292 | "ServerName": { 293 | "value": "@pipeline().parameters.ServerName", 294 | "type": "Expression" 295 | } 296 | } 297 | } 298 | ] 299 | }, 300 | { 301 | "name": "PatientIdentifier_large2SQL", 302 | "type": "Copy", 303 | "dependsOn": [ 304 | { 305 | "activity": "Create Tables", 306 | "dependencyConditions": [ 307 | "Succeeded" 308 | ] 309 | } 310 | ], 311 | "policy": { 312 | "timeout": "7.00:00:00", 313 | "retry": 0, 314 | "retryIntervalInSeconds": 30, 315 | "secureOutput": false, 316 | "secureInput": false 317 | }, 318 | "userProperties": [], 319 | "typeProperties": { 320 | "source": { 321 | "type": "ParquetSource", 322 | "storeSettings": { 323 | "type": "AzureBlobFSReadSettings", 324 | "recursive": true, 325 | "wildcardFolderPath": { 326 | "value": "@concat('fhir/',pipeline().parameters.DatasetSize,'/PatientIdentifier')", 327 | "type": "Expression" 328 | }, 329 | "wildcardFileName": "*.parquet", 330 | "enablePartitionDiscovery": false 331 | } 332 | }, 333 | "sink": { 334 | "type": "SqlDWSink", 335 | "allowCopyCommand": true, 336 | "tableOption": "autoCreate", 337 | "disableMetricsCollection": false 338 | }, 339 | "enableStaging": true, 340 | "stagingSettings": { 341 | "linkedServiceName": { 342 | "referenceName": "StorageLS", 343 | "type": "LinkedServiceReference", 344 | "parameters": { 345 | "StorageName": { 346 | "value": "@pipeline().parameters.StorageName", 347 | "type": "Expression" 348 | } 349 | } 350 | }, 351 | "path": "staging" 352 | }, 353 | "translator": { 354 | "type": "TabularTranslator", 355 | "mappings": [ 356 | { 357 | "source": { 358 | "name": "patient_id", 359 | "type": "String" 360 | }, 361 | "sink": { 362 | "name": "patient_id", 363 | "type": "String" 364 | } 365 | }, 366 | { 367 | "source": { 368 | "name": "birthDate", 369 | "type": "String" 370 | }, 371 | "sink": { 372 | "name": "birthDate", 373 | "type": "String" 374 | } 375 | }, 376 | { 377 | "source": { 378 | "name": "deceasedDateTime", 379 | "type": "String" 380 | }, 381 | "sink": { 382 | "name": "deceasedDateTime", 383 | "type": "String" 384 | } 385 | }, 386 | { 387 | "source": { 388 | "name": "gender", 389 | "type": "String" 390 | }, 391 | "sink": { 392 | "name": "gender", 393 | "type": "String" 394 | } 395 | }, 396 | { 397 | "source": { 398 | 
"name": "text", 399 | "type": "String" 400 | }, 401 | "sink": { 402 | "name": "text", 403 | "type": "String" 404 | } 405 | }, 406 | { 407 | "source": { 408 | "name": "multipleBirthBoolean", 409 | "type": "Boolean" 410 | }, 411 | "sink": { 412 | "name": "multipleBirthBoolean", 413 | "type": "Boolean" 414 | } 415 | }, 416 | { 417 | "source": { 418 | "name": "multipleBirthInteger", 419 | "type": "Int64" 420 | }, 421 | "sink": { 422 | "name": "multipleBirthInteger", 423 | "type": "Int64" 424 | } 425 | }, 426 | { 427 | "source": { 428 | "name": "resourceType", 429 | "type": "String" 430 | }, 431 | "sink": { 432 | "name": "resourceType", 433 | "type": "String" 434 | } 435 | }, 436 | { 437 | "source": { 438 | "name": "div", 439 | "type": "String" 440 | }, 441 | "sink": { 442 | "name": "div", 443 | "type": "String" 444 | } 445 | }, 446 | { 447 | "source": { 448 | "name": "status", 449 | "type": "String" 450 | }, 451 | "sink": { 452 | "name": "status", 453 | "type": "String" 454 | } 455 | }, 456 | { 457 | "source": { 458 | "name": "identifier.system", 459 | "type": "String" 460 | }, 461 | "sink": { 462 | "name": "identifier.system", 463 | "type": "String" 464 | } 465 | }, 466 | { 467 | "source": { 468 | "name": "identifier.type.coding.code", 469 | "type": "String" 470 | }, 471 | "sink": { 472 | "name": "identifier.type.coding.code", 473 | "type": "String" 474 | } 475 | }, 476 | { 477 | "source": { 478 | "name": "identifier.type.coding.display", 479 | "type": "String" 480 | }, 481 | "sink": { 482 | "name": "identifier.type.coding.display", 483 | "type": "String" 484 | } 485 | }, 486 | { 487 | "source": { 488 | "name": "identifier.type.coding.system", 489 | "type": "String" 490 | }, 491 | "sink": { 492 | "name": "identifier.type.coding.system", 493 | "type": "String" 494 | } 495 | }, 496 | { 497 | "source": { 498 | "name": "identifier.type.text", 499 | "type": "String" 500 | }, 501 | "sink": { 502 | "name": "identifier.type.text", 503 | "type": "String" 504 | } 505 | }, 506 | { 507 | "source": { 508 | "name": "identifier.value", 509 | "type": "String" 510 | }, 511 | "sink": { 512 | "name": "identifier.value", 513 | "type": "String" 514 | } 515 | } 516 | ] 517 | } 518 | }, 519 | "inputs": [ 520 | { 521 | "referenceName": "PatientIdentifierParquetLarge", 522 | "type": "DatasetReference", 523 | "parameters": { 524 | "StorageName": { 525 | "value": "@pipeline().parameters.StorageName", 526 | "type": "Expression" 527 | }, 528 | "DatasetSize": { 529 | "value": "@pipeline().parameters.DatasetSize", 530 | "type": "Expression" 531 | } 532 | } 533 | } 534 | ], 535 | "outputs": [ 536 | { 537 | "referenceName": "PatientIdentifierSQLLarge", 538 | "type": "DatasetReference", 539 | "parameters": { 540 | "DatabaseName": { 541 | "value": "@pipeline().parameters.DatabaseName", 542 | "type": "Expression" 543 | }, 544 | "ServerName": { 545 | "value": "@pipeline().parameters.ServerName", 546 | "type": "Expression" 547 | } 548 | } 549 | } 550 | ] 551 | }, 552 | { 553 | "name": "Create Tables", 554 | "type": "Script", 555 | "dependsOn": [ 556 | { 557 | "activity": "PatientParquet2Sink", 558 | "dependencyConditions": [ 559 | "Succeeded" 560 | ] 561 | } 562 | ], 563 | "policy": { 564 | "timeout": "0.12:00:00", 565 | "retry": 0, 566 | "retryIntervalInSeconds": 30, 567 | "secureOutput": false, 568 | "secureInput": false 569 | }, 570 | "userProperties": [], 571 | "linkedServiceName": { 572 | "referenceName": "SynapseDedicatedPoolLS", 573 | "type": "LinkedServiceReference", 574 | "parameters": { 575 | "DatabaseName": { 576 | 
"value": "@pipeline().parameters.DatabaseName", 577 | "type": "Expression" 578 | }, 579 | "ServerName": { 580 | "value": "@pipeline().parameters.ServerName", 581 | "type": "Expression" 582 | } 583 | } 584 | }, 585 | "typeProperties": { 586 | "scripts": [ 587 | { 588 | "type": "Query", 589 | "text": "IF OBJECT_ID('fhir.PatientAddress') IS NOT NULL\r\nBEGIN\r\n DROP TABLE [fhir].[PatientAddress]\r\nEND\r\n\r\n\r\nCREATE TABLE [fhir].[PatientAddress]\r\n( \r\n\t[id] [nvarchar](64) NULL,\r\n\t[address.city] [nvarchar](50) NULL,\r\n\t[address.country] [nvarchar](50) NULL,\r\n\t[address.extension.extension.url] [nvarchar](50) NULL,\r\n\t[address.extension.extension.valueDecimal] [float] NULL,\r\n\t[address.extension.url] [nvarchar](1000) NULL,\r\n\t[address.postalCode] [nvarchar](50) NULL,\r\n\t[address.state] [nvarchar](50) NULL\r\n)\r\nWITH\r\n(\r\n\tDISTRIBUTION = ROUND_ROBIN,\r\n\tHEAP\r\n)\r\n\r\nIF OBJECT_ID('fhir.PatientIdentifier') IS NOT NULL\r\nBEGIN\r\n DROP TABLE [fhir].[PatientIdentifier]\r\nEND\r\n\r\nCREATE TABLE [fhir].[PatientIdentifier]\r\n( \r\n\t[patient_id] [nvarchar](64) NULL,\r\n\t[birthDate] [nvarchar](200) NULL,\r\n\t[deceasedDateTime] [nvarchar](200) NULL,\r\n\t[gender] [nvarchar](50) NULL,\r\n\t[text] [nvarchar](1000) NULL,\r\n\t[multipleBirthBoolean] [bit] NULL,\r\n\t[multipleBirthInteger] [bigint] NULL,\r\n\t[resourceType] [nvarchar](100) NULL,\r\n\t[div] [nvarchar](max) NULL,\r\n\t[status] [nvarchar](500) NULL,\r\n\t[identifier.system] [nvarchar](2000) NULL,\r\n\t[identifier.type.coding.code] [nvarchar](500) NULL,\r\n\t[identifier.type.coding.display] [nvarchar](1000) NULL,\r\n\t[identifier.type.coding.system] [nvarchar](1000) NULL,\r\n\t[identifier.type.text] [nvarchar](1000) NULL,\r\n\t[identifier.value] [nvarchar](640) NULL\r\n)\r\nWITH\r\n(\r\n\tDISTRIBUTION = ROUND_ROBIN,\r\n\tHEAP\r\n)" 590 | } 591 | ] 592 | } 593 | } 594 | ], 595 | "parameters": { 596 | "StorageName": { 597 | "type": "string", 598 | "defaultValue": "synapsee2elake" 599 | }, 600 | "DatabaseName": { 601 | "type": "string", 602 | "defaultValue": "synapsee2edw" 603 | }, 604 | "ServerName": { 605 | "type": "string", 606 | "defaultValue": "synapsee2e" 607 | }, 608 | "SparkPoolName": { 609 | "type": "string", 610 | "defaultValue": "synapsee2espark" 611 | }, 612 | "DatasetSize": { 613 | "type": "string", 614 | "defaultValue": "1tb" 615 | } 616 | }, 617 | "folder": { 618 | "name": "Patient" 619 | }, 620 | "annotations": [] 621 | } 622 | } -------------------------------------------------------------------------------- /artifacts/publish_config.json: -------------------------------------------------------------------------------- 1 | {"publishBranch":"workspace_publish"} -------------------------------------------------------------------------------- /artifacts/sqlscript/JSON_exploration_w_Serverless_Demo_OC.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "JSON_exploration_w_Serverless_Demo_OC", 3 | "properties": { 4 | "description": "Json data (FHIR NDJSON) exploration with Serverless", 5 | "folder": { 6 | "name": "Exploration" 7 | }, 8 | "content": { 9 | "query": "SELECT TOP 100 *\nFROM\n OPENROWSET(\n BULK 'https://medicaldl.dfs.core.windows.net/raw/fhir_ndjson/1tb/*/Observation.ndjson',\n FORMAT = 'CSV',\n FIELDQUOTE = '0x0b',\n FIELDTERMINATOR ='0x0b'\n )\n WITH (\n line varchar(max)\n ) AS [result]\n\nSELECT TOP 100 *\nFROM\n OPENROWSET(\n BULK 'Observation.ndjson',\n DATA_SOURCE = 'JSONSource',\n FORMAT = 'CSV',\n FIELDQUOTE = '0x0b',\n 
FIELDTERMINATOR ='0x0b'\n )\n WITH (\n line varchar(max)\n ) AS [result]\n\n/* Flatten the NDJSON with JSON_VALUE, JSON_QUERY on CROSS APPLY OPENJSON */\n/* reference: https://diangermishuizen.com/query-json-data-in-sql-server-and-synapse-analytics/ */\n/* Query 1, with only JSON_VALUE, JSON_QUERY */\nSELECT top 100\n JSON_VALUE(line, '$.resourceType') AS resourceType,\n JSON_VALUE(line, '$.id') AS id,\n JSON_VALUE(line, '$.status') AS status,\n JSON_query(line, '$.category') AS category_string ,\n JSON_query(line, '$.code') AS code_string\nFROM\n OPENROWSET(\n BULK 'Observation.ndjson',\n DATA_SOURCE = 'JSONSource',\n FORMAT = 'CSV',\n -- FIELDQUOTE and FIELDTERMINATOR are set to 0x0b as we do not expect to find it in the file.\n FIELDQUOTE = '0x0b',\n FIELDTERMINATOR ='0x0b'\n )\n WITH (\n line varchar(max)\n ) AS [result]\n\n/* Query 2, add CROSS APPLY OPENJSON to read array */\nSELECT top 100\n JSON_VALUE(line, '$.resourceType') AS resourceType,\n JSON_VALUE(line, '$.id') AS id,\n JSON_VALUE(line, '$.status') AS status,\n JSON_query(line, '$.valueQuantity') AS valueQuantity_string,\n valueQuantity_NestedArray_value,\n valueQuantity_NestedArray_unit ,\n JSON_query(line, '$.category') AS category_string\nFROM\n OPENROWSET(\n BULK 'Observation.ndjson',\n DATA_SOURCE = 'JSONSource',\n FORMAT = 'CSV',\n -- FIELDQUOTE and FIELDTERMINATOR are set to 0x0b as we do not expect to find it in the file.\n FIELDQUOTE = '0x0b',\n FIELDTERMINATOR ='0x0b'\n )\n WITH (\n line varchar(max)\n ) AS [result]\nCROSS APPLY OPENJSON \n (JSON_QUERY([line], '$.valueQuantity')) /*Note, if you want only the top most record from this array, replace this line with \"(JSON_QUERY([jsonContent], '$.attribute_with_nested_array[0]'))\"*/\nWITH(\n [valueQuantity_NestedArray_value] varchar(255) '$.value',\n [valueQuantity_NestedArray_unit] varchar(255) '$.unit'\n) AS [valueQuantity_NestedArray]\n\n\n/* Query 3, multiple CROSS APPLY OPENJSON to read nested arrays */\nSELECT top 100\n JSON_VALUE(line, '$.resourceType') AS resourceType,\n JSON_VALUE(line, '$.id') AS id,\n JSON_VALUE(line, '$.status') AS status,\n JSON_query(line, '$.valueQuantity') AS valueQuantity_string,\n valueQuantity_NestedArray_value,\n valueQuantity_NestedArray_unit ,\n JSON_query(line, '$.category') AS category_string,\n encounter_reference\nFROM\n OPENROWSET(\n BULK 'Observation.ndjson',\n DATA_SOURCE = 'JSONSource',\n FORMAT = 'CSV',\n -- FIELDQUOTE and FIELDTERMINATOR are set to 0x0b as we do not expect to find it in the file.\n FIELDQUOTE = '0x0b',\n FIELDTERMINATOR ='0x0b'\n )\n WITH (\n line varchar(max)\n ) AS [result]\nCROSS APPLY OPENJSON \n (JSON_QUERY([line], '$.valueQuantity')) /*Note, if you want only the top most record from this array, replace this line with \"(JSON_QUERY([jsonContent], '$.attribute_with_nested_array[0]'))\"*/\nWITH(\n [valueQuantity_NestedArray_value] varchar(255) '$.value',\n [valueQuantity_NestedArray_unit] varchar(255) '$.unit'\n) AS [valueQuantity_NestedArray]\nCROSS APPLY OPENJSON \n (JSON_QUERY([line], '$.encounter')) \nWITH(\n [encounter_reference] varchar(255) '$.reference'\n) AS [encounter_reference_NestedArray]", 10 | "metadata": { 11 | "language": "sql" 12 | }, 13 | "currentConnection": { 14 | "databaseName": "FHIRRef", 15 | "poolName": "Built-in" 16 | }, 17 | "resultLimit": 5000 18 | }, 19 | "type": "SqlQuery" 20 | } 21 | } -------------------------------------------------------------------------------- /artifacts/sqlscript/Spark DB Exploration Scripts.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "Spark DB Exploration Scripts", 3 | "properties": { 4 | "folder": { 5 | "name": "Exploration" 6 | }, 7 | "content": { 8 | "query": "SELECT TOP (100) [Claim_id]\n,[insurance_coverage.display]\n,[insurance_focal]\n,[insurance_sequence]\n FROM [fhirdbexploration].[dbo].[claiminsurance]\n\n SELECT TOP (100) [Claim_id]\n,[procedure_reference]\n,[procedure_sequence]\n FROM [fhirdbexploration].[dbo].[claimprocedure]\n\n SELECT TOP (100) [Claim_id]\n,[diagnosis_reference]\n,[diagnosis_sequence]\n FROM [fhirdbexploration].[dbo].[patientdianosis]", 9 | "metadata": { 10 | "language": "sql" 11 | }, 12 | "currentConnection": { 13 | "databaseName": "FHIRRef", 14 | "poolName": "Built-in" 15 | }, 16 | "resultLimit": 5000 17 | }, 18 | "type": "SqlQuery" 19 | } 20 | } -------------------------------------------------------------------------------- /artifacts/sqlscript/Table Row Count.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Table Row Count", 3 | "properties": { 4 | "content": { 5 | "query": "SELECT COUNT(*)\nFROM [fhir].[ClaimDiagnosis]\n\nSELECT COUNT(*)\nFROM [fhir].[ClaimInsurance]\n\nSELECT COUNT(*)\nFROM [fhir].[ClaimProcedure]\n\nSELECT COUNT(*)\nFROM [fhir].[PatientAddress]\n\nSELECT COUNT(*)\nFROM [fhir].[PatientIdentifier]", 6 | "metadata": { 7 | "language": "sql" 8 | }, 9 | "currentConnection": { 10 | "databaseName": "synapsee2edw", 11 | "poolName": "synapsee2edw" 12 | }, 13 | "resultLimit": 5000 14 | }, 15 | "type": "SqlQuery" 16 | } 17 | } -------------------------------------------------------------------------------- /mybigdata/credential/WorkspaceSystemIdentity.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "WorkspaceSystemIdentity", 3 | "properties": { 4 | "type": "ManagedIdentity" 5 | } 6 | } -------------------------------------------------------------------------------- /mybigdata/integrationRuntime/AutoResolveIntegrationRuntime.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AutoResolveIntegrationRuntime", 3 | "properties": { 4 | "type": "Managed", 5 | "typeProperties": { 6 | "computeProperties": { 7 | "location": "AutoResolve", 8 | "dataFlowProperties": { 9 | "computeType": "General", 10 | "coreCount": 8, 11 | "timeToLive": 0 12 | } 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /mybigdata/linkedService/mybigdatademows-WorkspaceDefaultSqlServer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mybigdatademows-WorkspaceDefaultSqlServer", 3 | "type": "Microsoft.Synapse/workspaces/linkedservices", 4 | "properties": { 5 | "typeProperties": { 6 | "connectionString": "Data Source=tcp:mybigdatademows.sql.azuresynapse.net,1433;Initial Catalog=@{linkedService().DBName}" 7 | }, 8 | "parameters": { 9 | "DBName": { 10 | "type": "String" 11 | } 12 | }, 13 | "type": "AzureSqlDW", 14 | "connectVia": { 15 | "referenceName": "AutoResolveIntegrationRuntime", 16 | "type": "IntegrationRuntimeReference" 17 | }, 18 | "annotations": [] 19 | } 20 | } -------------------------------------------------------------------------------- /mybigdata/linkedService/mybigdatademows-WorkspaceDefaultStorage.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": 
"mybigdatademows-WorkspaceDefaultStorage", 3 | "type": "Microsoft.Synapse/workspaces/linkedservices", 4 | "properties": { 5 | "typeProperties": { 6 | "url": "https://mybigdatademos.dfs.core.windows.net/" 7 | }, 8 | "type": "AzureBlobFS", 9 | "connectVia": { 10 | "referenceName": "AutoResolveIntegrationRuntime", 11 | "type": "IntegrationRuntimeReference" 12 | }, 13 | "annotations": [] 14 | } 15 | } -------------------------------------------------------------------------------- /mybigdata/publish_config.json: -------------------------------------------------------------------------------- 1 | {"publishBranch":"workspace_publish"} --------------------------------------------------------------------------------