├── .github ├── CODE_OF_CONDUCT.md ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md └── notebooks ├── 00-create-parquet-file.ipynb ├── 01-load-into-single-table.ipynb ├── 02-load-into-partitioned-table.ipynb ├── 03a-parallel-switch-in-load-into-partitioned-table-many.ipynb ├── 03b-parallel-switch-in-load-into-partitioned-table-single.ipynb └── read-from-azure-sql ├── fast-read.ipynb └── push-down-queries.ipynb /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 4 | > Please provide us with the following information: 5 | > --------------------------------------------------------------- 6 | 7 | ### This issue is for a: (mark with an `x`) 8 | ``` 9 | - [ ] bug report -> please search issues before submitting 10 | - [ ] feature request 11 | - [ ] documentation issue or request 12 | - [ ] regression (a behavior that used to work and stopped in a new release) 13 | ``` 14 | 15 | ### Minimal steps to reproduce 16 | > 17 | 18 | ### Any log messages given by the failure 19 | > 20 | 21 | ### Expected/desired behavior 22 | > 23 | 24 | ### OS and Version? 25 | > Windows 7, 8 or 10. Linux (which distribution). macOS (Yosemite? El Capitan? Sierra?) 26 | 27 | ### Versions 28 | > 29 | 30 | ### Mention any other details that might be useful 31 | 32 | > --------------------------------------------------------------- 33 | > Thanks! We'll be in touch soon. 34 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Purpose 2 | 3 | * ... 4 | 5 | ## Does this introduce a breaking change? 6 | 7 | ``` 8 | [ ] Yes 9 | [ ] No 10 | ``` 11 | 12 | ## Pull Request Type 13 | What kind of change does this Pull Request introduce? 14 | 15 | 16 | ``` 17 | [ ] Bugfix 18 | [ ] Feature 19 | [ ] Code style update (formatting, local variables) 20 | [ ] Refactoring (no functional changes, no api changes) 21 | [ ] Documentation content changes 22 | [ ] Other... Please describe: 23 | ``` 24 | 25 | ## How to Test 26 | * Get the code 27 | 28 | ``` 29 | git clone [repo-address] 30 | cd [repo-name] 31 | git checkout [branch-name] 32 | npm install 33 | ``` 34 | 35 | * Test the code 36 | 37 | ``` 38 | ``` 39 | 40 | ## What to Check 41 | Verify that the following are valid 42 | * ... 43 | 44 | ## Other Information 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | *.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 | *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # TeamCity is a build add-in 131 | _TeamCity* 132 | 133 | # DotCover is a Code Coverage Tool 134 | *.dotCover 135 | 136 | # AxoCover is a Code Coverage Tool 137 | .axoCover/* 138 | !.axoCover/settings.json 139 | 140 | # Visual Studio code coverage results 141 | *.coverage 142 | *.coveragexml 143 | 144 | # NCrunch 145 | _NCrunch_* 146 | .*crunch*.local.xml 147 | nCrunchTemp_* 148 | 149 | # MightyMoose 150 | *.mm.* 151 | AutoTest.Net/ 152 | 153 | # Web workbench (sass) 154 | .sass-cache/ 155 | 156 | # Installshield output folder 157 | [Ee]xpress/ 158 | 159 | # DocProject is a documentation generator add-in 160 | DocProject/buildhelp/ 161 | DocProject/Help/*.HxT 162 | DocProject/Help/*.HxC 163 | DocProject/Help/*.hhc 164 | DocProject/Help/*.hhk 165 | DocProject/Help/*.hhp 166 | DocProject/Help/Html2 167 | DocProject/Help/html 168 | 169 | # Click-Once directory 170 | publish/ 171 | 172 | # Publish Web Output 173 | *.[Pp]ublish.xml 174 | *.azurePubxml 175 | # Note: Comment the next line if you want to checkin your web deploy settings, 176 | # but database connection strings (with potential passwords) will be unencrypted 177 | *.pubxml 178 | *.publishproj 179 | 180 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 181 | # checkin your Azure Web App publish settings, but sensitive information contained 182 | # in these scripts will be unencrypted 183 | PublishScripts/ 184 | 185 | # NuGet Packages 186 | *.nupkg 187 | # NuGet Symbol Packages 188 | *.snupkg 189 | # The packages folder can be ignored because of Package Restore 190 | **/[Pp]ackages/* 191 | # except build/, which is used as an MSBuild target. 192 | !**/[Pp]ackages/build/ 193 | # Uncomment if necessary however generally it will be regenerated when needed 194 | #!**/[Pp]ackages/repositories.config 195 | # NuGet v3's project.json files produces more ignorable files 196 | *.nuget.props 197 | *.nuget.targets 198 | 199 | # Microsoft Azure Build Output 200 | csx/ 201 | *.build.csdef 202 | 203 | # Microsoft Azure Emulator 204 | ecf/ 205 | rcf/ 206 | 207 | # Windows Store app package directories and files 208 | AppPackages/ 209 | BundleArtifacts/ 210 | Package.StoreAssociation.xml 211 | _pkginfo.txt 212 | *.appx 213 | *.appxbundle 214 | *.appxupload 215 | 216 | # Visual Studio cache files 217 | # files ending in .cache can be ignored 218 | *.[Cc]ache 219 | # but keep track of directories ending in .cache 220 | !?*.[Cc]ache/ 221 | 222 | # Others 223 | ClientBin/ 224 | ~$* 225 | *~ 226 | *.dbmdl 227 | *.dbproj.schemaview 228 | *.jfm 229 | *.pfx 230 | *.publishsettings 231 | orleans.codegen.cs 232 | 233 | # Including strong name files can present a security risk 234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 235 | #*.snk 236 | 237 | # Since there are multiple workflows, uncomment next line to ignore bower_components 238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 239 | #bower_components/ 240 | 241 | # RIA/Silverlight projects 242 | Generated_Code/ 243 | 244 | # Backup & report files from converting an old project file 245 | # to a newer Visual Studio version. Backup files are not needed, 246 | # because we have git ;-) 247 | _UpgradeReport_Files/ 248 | Backup*/ 249 | UpgradeLog*.XML 250 | UpgradeLog*.htm 251 | ServiceFabricBackup/ 252 | *.rptproj.bak 253 | 254 | # SQL Server files 255 | *.mdf 256 | *.ldf 257 | *.ndf 258 | 259 | # Business Intelligence projects 260 | *.rdl.data 261 | *.bim.layout 262 | *.bim_*.settings 263 | *.rptproj.rsuser 264 | *- [Bb]ackup.rdl 265 | *- [Bb]ackup ([0-9]).rdl 266 | *- [Bb]ackup ([0-9][0-9]).rdl 267 | 268 | # Microsoft Fakes 269 | FakesAssemblies/ 270 | 271 | # GhostDoc plugin setting file 272 | *.GhostDoc.xml 273 | 274 | # Node.js Tools for Visual Studio 275 | .ntvs_analysis.dat 276 | node_modules/ 277 | 278 | # Visual Studio 6 build log 279 | *.plg 280 | 281 | # Visual Studio 6 workspace options file 282 | *.opt 283 | 284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
285 | *.vbw 286 | 287 | # Visual Studio LightSwitch build output 288 | **/*.HTMLClient/GeneratedArtifacts 289 | **/*.DesktopClient/GeneratedArtifacts 290 | **/*.DesktopClient/ModelManifest.xml 291 | **/*.Server/GeneratedArtifacts 292 | **/*.Server/ModelManifest.xml 293 | _Pvt_Extensions 294 | 295 | # Paket dependency manager 296 | .paket/paket.exe 297 | paket-files/ 298 | 299 | # FAKE - F# Make 300 | .fake/ 301 | 302 | # CodeRush personal settings 303 | .cr/personal 304 | 305 | # Python Tools for Visual Studio (PTVS) 306 | __pycache__/ 307 | *.pyc 308 | 309 | # Cake - Uncomment if you are using it 310 | # tools/** 311 | # !tools/packages.config 312 | 313 | # Tabs Studio 314 | *.tss 315 | 316 | # Telerik's JustMock configuration file 317 | *.jmconfig 318 | 319 | # BizTalk build output 320 | *.btp.cs 321 | *.btm.cs 322 | *.odx.cs 323 | *.xsd.cs 324 | 325 | # OpenCover UI analysis results 326 | OpenCover/ 327 | 328 | # Azure Stream Analytics local run output 329 | ASALocalRun/ 330 | 331 | # MSBuild Binary and Structured Log 332 | *.binlog 333 | 334 | # NVidia Nsight GPU debugger configuration file 335 | *.nvuser 336 | 337 | # MFractors (Xamarin productivity tool) working folder 338 | .mfractor/ 339 | 340 | # Local History for Visual Studio 341 | .localhistory/ 342 | 343 | # BeatPulse healthcheck temp database 344 | healthchecksdb 345 | 346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 347 | MigrationBackup/ 348 | 349 | # Ionide (cross platform F# VS Code tools) working folder 350 | .ionide/ 351 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [project-title] Changelog 2 | 3 | 4 | # x.y.z (yyyy-mm-dd) 5 | 6 | *Features* 7 | * ... 8 | 9 | *Bug Fixes* 10 | * ... 11 | 12 | *Breaking Changes* 13 | * ... 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to [project-title] 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 6 | 7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 10 | 11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 14 | 15 | - [Code of Conduct](#coc) 16 | - [Issues and Bugs](#issue) 17 | - [Feature Requests](#feature) 18 | - [Submission Guidelines](#submit) 19 | 20 | ## Code of Conduct 21 | Help us keep this project open and inclusive. Please read and follow our [Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 22 | 23 | ## Found an Issue? 
24 | If you find a bug in the source code or a mistake in the documentation, you can help us by 25 | [submitting an issue](#submit-issue) to the GitHub Repository. Even better, you can 26 | [submit a Pull Request](#submit-pr) with a fix. 27 | 28 | ## Want a Feature? 29 | You can *request* a new feature by [submitting an issue](#submit-issue) to the GitHub 30 | Repository. If you would like to *implement* a new feature, please submit an issue with 31 | a proposal for your work first, to be sure that we can use it. 32 | 33 | * **Small Features** can be crafted and directly [submitted as a Pull Request](#submit-pr). 34 | 35 | ## Submission Guidelines 36 | 37 | ### Submitting an Issue 38 | Before you submit an issue, search the archive, maybe your question was already answered. 39 | 40 | If your issue appears to be a bug, and hasn't been reported, open a new issue. 41 | Help us to maximize the effort we can spend fixing issues and adding new 42 | features, by not reporting duplicate issues. Providing the following information will increase the 43 | chances of your issue being dealt with quickly: 44 | 45 | * **Overview of the Issue** - if an error is being thrown a non-minified stack trace helps 46 | * **Version** - what version is affected (e.g. 0.1.2) 47 | * **Motivation for or Use Case** - explain what are you trying to do and why the current behavior is a bug for you 48 | * **Browsers and Operating System** - is this a problem with all browsers? 49 | * **Reproduce the Error** - provide a live example or a unambiguous set of steps 50 | * **Related Issues** - has a similar issue been reported before? 51 | * **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be 52 | causing the problem (line of code or commit) 53 | 54 | You can file new issues by providing the above information at the corresponding repository's issues link: https://github.com/[organization-name]/[repository-name]/issues/new]. 55 | 56 | ### Submitting a Pull Request (PR) 57 | Before you submit your Pull Request (PR) consider the following guidelines: 58 | 59 | * Search the repository (https://github.com/[organization-name]/[repository-name]/pulls) for an open or closed PR 60 | that relates to your submission. You don't want to duplicate effort. 61 | 62 | * Make your changes in a new git fork: 63 | 64 | * Commit your changes using a descriptive commit message 65 | * Push your fork to GitHub: 66 | * In GitHub, create a pull request 67 | * If we suggest changes then: 68 | * Make the required updates. 69 | * Rebase your fork and force push to your GitHub repository (this will update your Pull Request): 70 | 71 | ```shell 72 | git rebase master -i 73 | git push -f 74 | ``` 75 | 76 | That's it! Thank you for your contribution! 77 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | page_type: sample 3 | languages: 4 | - tsql 5 | - sql 6 | - scala 7 | products: 8 | - azure 9 | - azure-databricks 10 | - azure-blob-storage 11 | - azure-key-vault 12 | - azure-sql-database 13 | description: "Fast Data Loading in Azure SQL DB using Azure Databricks" 14 | urlFragment: "azure-sql-db-databricks" 15 | --- 16 | 17 | # Fast Data Loading in Azure SQL DB using Azure Databricks 18 | 19 | ![License](https://img.shields.io/badge/license-MIT-green.svg) 20 | 21 | 28 | 29 | Azure Databricks and Azure SQL database can be used amazingly well together. This repo will help you to use the [latest connector](https://github.com/microsoft/sql-spark-connector) to load data into Azure SQL as fast as possible, using table partitions and column-store and all the known best-practices. 30 | 31 | - [Partitioned Tables and Indexes](https://docs.microsoft.com/en-us/sql/relational-databases/partitions/partitioned-tables-and-indexes) 32 | - [Columnstore indexes: Overview](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-overview) 33 | - [Columnstore indexes - Data loading guidance](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-data-loading-guidance) 34 | 35 | ## Samples 36 | 37 | All the samples start from a partitioned Parquet file, created with data generated from the famous TPC-H benchmark. Free tools are available on TPC-H website to generate a dataset with the size you want: 38 | 39 | http://www.tpc.org/tpch/ 40 | 41 | Once the Parquet file is available, 42 | 43 | - [Create Parquet File](./notebooks/00-create-parquet-file.ipynb) 44 | 45 | the samples will guide you through the most common scenarios 46 | 47 | - [Loading a non-partitioned table](./notebooks/01-load-into-single-table.ipynb) 48 | - [Loading a partitioned table](./notebooks/02-load-into-partitioned-table.ipynb) 49 | - [Loading a partitioned table via switch-in](./notebooks/03a-parallel-switch-in-load-into-partitioned-table-many.ipynb) 50 | 51 | all samples will also show how to correctly load table if there are already indexes or if you want to use a column-store in Azure SQL. 
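If you just want the shape of the write call that all the notebooks build on, here is a minimal sketch of the bulk-load pattern (Scala, on a Databricks cluster with the `com.microsoft.azure:spark-mssql-connector:1.0.0` library installed). The `<...>` values are placeholders to replace with your own; in the notebooks the real values are read from an Azure Key Vault-backed secret scope:

```scala
// Minimal sketch of the bulk-load pattern used throughout the notebooks.
// All <...> values are placeholders; the notebooks read the real ones from an
// Azure Key Vault-backed secret scope via dbutils.secrets.get(...).
val df  = spark.read.parquet("wasbs://<container>@<storage-account>.blob.core.windows.net/<path-to-parquet>")
val url = "jdbc:sqlserver://<server>.database.windows.net;databaseName=<database>;"

df.write
  .format("com.microsoft.sqlserver.jdbc.spark")   // sql-spark-connector data source
  .mode("overwrite")
  .option("truncate", "true")                     // keep the target table, just empty it
  .option("url", url)
  .option("dbtable", "dbo.LINEITEM_LOADTEST")
  .option("user", "<user>")
  .option("password", "<password>")
  .option("reliabilityLevel", "BEST_EFFORT")
  .option("tableLock", "true")                    // use "false" when loading partitioned or columnstore tables
  .option("batchsize", "100000")                  // use "1048576" when the target has a columnstore index
  .save()
```

The notebooks below walk through when to change `tableLock` and `batchsize` depending on the indexes and partitioning of the target table.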
52 | 53 | ### Bonus Samples: Reading data as fast as possible 54 | 55 | Though this repo focuses on writing data as fast as possible into Azure SQL, you may also want to know how to do the opposite: how to read data as fast as possible from Azure SQL into Apache Spark / Azure Databricks. For this reason, in the folder `notebooks/read-from-azure-sql` you will find two samples that show how to do exactly that: 56 | 57 | - [Fast Reading from Azure SQL](./notebooks/read-from-azure-sql/fast-read.ipynb) 58 | - [Pushing queries to Azure SQL](./notebooks/read-from-azure-sql/push-down-queries.ipynb) 59 | 60 | ## Contributing 61 | 62 | This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 63 | 64 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. 65 | 66 | This project has adopted the Microsoft Open Source Code of Conduct. For more information, see the Code of Conduct FAQ or contact opencode@microsoft.com with any additional questions or comments. 67 | -------------------------------------------------------------------------------- /notebooks/00-create-parquet-file.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# 00 - Create Parquet file used in subsequent samples\n\nThis notebook will create the parquet file used in subsequent samples"],"metadata":{}},{"cell_type":"markdown","source":["Define variables used throughout the script. Azure Key Vault has been used to securely store sensitive data. More info here: [Create an Azure Key Vault-backed secret scope](https://docs.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes#--create-an-azure-key-vault-backed-secret-scope)"],"metadata":{}},{"cell_type":"code","source":["val scope = \"key-vault-secrets\"\n\nval storageAccount = \"dmstore2\";\nval storageKey = dbutils.secrets.get(scope, \"dmstore2-2\");\n\nval parquetLocation = s\"wasbs://tpch@$storageAccount.blob.core.windows.net/10GB/parquet/lineitem\""],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
scope: String = key-vault-secrets\nstorageAccount: String = dmstore2\nstorageKey: String = [REDACTED]\nparquetLocation: String = wasbs://tpch@dmstore2.blob.core.windows.net/10GB/parquet/lineitem\n
"]}}],"execution_count":3},{"cell_type":"markdown","source":["Configure Spark to access Azure Blob Store"],"metadata":{}},{"cell_type":"code","source":["spark.conf.set(s\"fs.azure.account.key.$storageAccount.blob.core.windows.net\", storageKey);"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":5},{"cell_type":"markdown","source":["Load data from generated 10GB TPC-H LINEITEM file. Tools to generate TPC-H data can be found here:\nhttp://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp"],"metadata":{}},{"cell_type":"code","source":["import org.apache.spark.sql.types._\n\nval li1 = spark\n .read\n .format(\"csv\")\n .option(\"sep\", \"|\")\n .schema(\"\"\"\n L_ORDERKEY INTEGER,\n L_PARTKEY INTEGER,\n L_SUPPKEY INTEGER,\n L_LINENUMBER INTEGER,\n L_QUANTITY DECIMAL(15,2),\n L_EXTENDEDPRICE DECIMAL(15,2),\n L_DISCOUNT DECIMAL(15,2),\n L_TAX DECIMAL(15,2),\n L_RETURNFLAG CHAR(1),\n L_LINESTATUS CHAR(1),\n L_SHIPDATE DATE,\n L_COMMITDATE DATE,\n L_RECEIPTDATE DATE,\n L_SHIPINSTRUCT CHAR(25),\n L_SHIPMODE CHAR(10),\n L_COMMENT VARCHAR(44)\n \"\"\")\n .load(s\"wasbs://tpch@$storageAccount.blob.core.windows.net/10GB/lineitem.tbl\")\n;"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
import org.apache.spark.sql.types._\nli1: org.apache.spark.sql.DataFrame = [L_ORDERKEY: int, L_PARTKEY: int ... 14 more fields]\n
"]}}],"execution_count":7},{"cell_type":"markdown","source":["Create a temporary view to make it easier to manipulate schema and data"],"metadata":{}},{"cell_type":"code","source":["li1.createOrReplaceTempView(\"LINEITEM\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":9},{"cell_type":"markdown","source":["Add a new column that will be used for partitioning"],"metadata":{}},{"cell_type":"code","source":["var li2 = spark.sql(\"SELECT *, YEAR(L_COMMITDATE) * 100 + MONTH(L_COMMITDATE) AS L_PARTITION_KEY FROM LINEITEM\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
li2: org.apache.spark.sql.DataFrame = [L_ORDERKEY: int, L_PARTKEY: int ... 15 more fields]\n
"]}}],"execution_count":11},{"cell_type":"markdown","source":["Repartition data using the newly created column"],"metadata":{}},{"cell_type":"code","source":["val li3 = li2.repartition($\"L_PARTITION_KEY\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
li3: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [L_ORDERKEY: int, L_PARTKEY: int ... 15 more fields]\n
"]}}],"execution_count":13},{"cell_type":"markdown","source":["Save dataframe into parquet format, making sure parquet will be saved using the same partitioning logic used for the dataframe"],"metadata":{}},{"cell_type":"code","source":["li3.write.mode(\"overwrite\").partitionBy(\"L_PARTITION_KEY\").parquet(parquetLocation)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":15},{"cell_type":"markdown","source":["As a test read back the parquet files"],"metadata":{}},{"cell_type":"code","source":["val li4 = spark.read.parquet(parquetLocation)\nli4.rdd.getNumPartitions"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
li4: org.apache.spark.sql.DataFrame = [L_ORDERKEY: int, L_PARTKEY: int ... 15 more fields]\nres6: Int = 20\n
"]}}],"execution_count":17},{"cell_type":"markdown","source":["Peek at first 10 partitions"],"metadata":{}},{"cell_type":"code","source":["display(li4.groupBy($\"L_PARTITION_KEY\").count().orderBy($\"L_PARTITION_KEY\").limit(10))"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["
L_PARTITION_KEY | count
199201 | 412
199202 | 190252
199203 | 582150
199204 | 748645
199205 | 770266
199206 | 746006
199207 | 772006
199208 | 770213
199209 | 748997
199210 | 771256
"]}}],"execution_count":19}],"metadata":{"name":"00-create-parquet-file","notebookId":1331848450253195},"nbformat":4,"nbformat_minor":0} -------------------------------------------------------------------------------- /notebooks/01-load-into-single-table.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# 01 - Load data into an Azure SQL non-partitioned table\n\nThe sample is using the new sql-spark-connector (https://github.com/microsoft/sql-spark-connector). Make sure you have installed it before running this notebook. Maven Coordinates: `com.microsoft.azure:spark-mssql-connector:1.0.0`\n\n## Notes on terminology\n\nThe term \"row-store\" is used to identify and index that is not using the [column-store layout](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-overview) to store its data.\n\n## Samples\n\nIn this notebook there are three samples\n\n- Load data into a table without indexes\n- Load data into a table with row-store indexes\n- Load data into a table with columns-store indexes\n\n## Supported Azure Databricks Versions\n\nDatabricks supported versions: Spark 2.4.5 and Scala 2.11"],"metadata":{}},{"cell_type":"markdown","source":["## Setup"],"metadata":{}},{"cell_type":"markdown","source":["Define variables used thoughout the script. Azure Key Value has been used to securely store sensitive data. More info here: [Create an Azure Key Vault-backed secret scope](https://docs.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes#--create-an-azure-key-vault-backed-secret-scope)"],"metadata":{}},{"cell_type":"code","source":["val scope = \"key-vault-secrets\"\n\nval storageAccount = \"dmstore2\";\nval storageKey = dbutils.secrets.get(scope, \"dmstore2-2\");\n\nval server = dbutils.secrets.get(scope, \"srv001\").concat(\".database.windows.net\");\nval database = \"ApacheSpark\";\nval user = dbutils.secrets.get(scope, \"dbuser001\");\nval password = dbutils.secrets.get(scope, \"dbpwd001\");\nval table = \"dbo.LINEITEM_LOADTEST\"\n\nval url = s\"jdbc:sqlserver://$server;databaseName=$database;\""],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
scope: String = key-vault-secrets\nstorageAccount: String = dmstore2\nstorageKey: String = [REDACTED]\nserver: String = [REDACTED].database.windows.net\ndatabase: String = ApacheSpark\nuser: String = [REDACTED]\npassword: String = [REDACTED]\ntable: String = dbo.LINEITEM_LOADTEST\nurl: String = jdbc:sqlserver://[REDACTED].database.windows.net;databaseName=ApacheSpark;\n
"]}}],"execution_count":4},{"cell_type":"markdown","source":["Configure Spark to access Azure Blob Store"],"metadata":{}},{"cell_type":"code","source":["spark.conf.set(s\"fs.azure.account.key.$storageAccount.blob.core.windows.net\", storageKey);"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":6},{"cell_type":"markdown","source":["Load the Parquet file generated in `00-create-parquet-file` notebook that contains LINEITEM data partitioned by Year and Month"],"metadata":{}},{"cell_type":"code","source":["val li = spark.read.parquet(s\"wasbs://tpch@$storageAccount.blob.core.windows.net/10GB/parquet/lineitem\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
li: org.apache.spark.sql.DataFrame = [L_ORDERKEY: int, L_PARTKEY: int ... 15 more fields]\n
"]}}],"execution_count":8},{"cell_type":"markdown","source":["Loaded data is split in 20 dataframe partitions"],"metadata":{}},{"cell_type":"code","source":["li.rdd.getNumPartitions"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res2: Int = 20\n
"]}}],"execution_count":10},{"cell_type":"markdown","source":["Show schema of loaded data"],"metadata":{}},{"cell_type":"code","source":["li.printSchema"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
root\n-- L_ORDERKEY: integer (nullable = true)\n-- L_PARTKEY: integer (nullable = true)\n-- L_SUPPKEY: integer (nullable = true)\n-- L_LINENUMBER: integer (nullable = true)\n-- L_QUANTITY: decimal(15,2) (nullable = true)\n-- L_EXTENDEDPRICE: decimal(15,2) (nullable = true)\n-- L_DISCOUNT: decimal(15,2) (nullable = true)\n-- L_TAX: decimal(15,2) (nullable = true)\n-- L_RETURNFLAG: string (nullable = true)\n-- L_LINESTATUS: string (nullable = true)\n-- L_SHIPDATE: date (nullable = true)\n-- L_COMMITDATE: date (nullable = true)\n-- L_RECEIPTDATE: date (nullable = true)\n-- L_SHIPINSTRUCT: string (nullable = true)\n-- L_SHIPMODE: string (nullable = true)\n-- L_COMMENT: string (nullable = true)\n-- L_PARTITION_KEY: integer (nullable = true)\n\n
"]}}],"execution_count":12},{"cell_type":"markdown","source":["Now, make sure you create on your Azure SQL the following LINEITEM table:\n```sql\ncreate table [dbo].[LINEITEM_LOADTEST]\n(\n\t[L_ORDERKEY] [int] not null,\n\t[L_PARTKEY] [int] not null,\n\t[L_SUPPKEY] [int] not null,\n\t[L_LINENUMBER] [int] not null,\n\t[L_QUANTITY] [decimal](15, 2) not null,\n\t[L_EXTENDEDPRICE] [decimal](15, 2) not null,\n\t[L_DISCOUNT] [decimal](15, 2) not null,\n\t[L_TAX] [decimal](15, 2) not null,\n\t[L_RETURNFLAG] [char](1) not null,\n\t[L_LINESTATUS] [char](1) not null,\n\t[L_SHIPDATE] [date] not null,\n\t[L_COMMITDATE] [date] not null,\n\t[L_RECEIPTDATE] [date] not null,\n\t[L_SHIPINSTRUCT] [char](25) not null,\n\t[L_SHIPMODE] [char](10) not null,\n\t[L_COMMENT] [varchar](44) not null,\n\t[L_PARTITION_KEY] [int] not null\n) \n```"],"metadata":{}},{"cell_type":"markdown","source":["## Load data into a table with no indexes\n\nIn Azure SQL terminology an Heap is a table with no clustered index. In this sample we'll load data into a table that as no index (clustered or non-clustered) as is not partitioned. This is the simplest scenario possibile and allows parallel load of data.\n\n### Note:\nParallel load *cannot* happen if you have row-store indexes on the table. If you want to bulk load data in parallel into a table that has row-store indexes, you must use partitioning. If you are planning to add indexes to your table, and data to be loaded in the table is in the terabyte range, you want to use partitioing and have indexes created before bulk loading data into Azure SQL, as otherwise creating index once the table is already loaded will use a significat amout of resources."],"metadata":{}},{"cell_type":"markdown","source":["To enable parallel load the option `tableLock` must be set to `true`. This will prevent any other access to the table, other then the one done for performing the bulk load operations."],"metadata":{}},{"cell_type":"code","source":["li.write\n .format(\"com.microsoft.sqlserver.jdbc.spark\") \n .mode(\"overwrite\") \n .option(\"truncate\", \"true\") \n .option(\"url\", url) \n .option(\"dbtable\", \"dbo.LINEITEM_LOADTEST\") \n .option(\"user\", user) \n .option(\"password\", password) \n .option(\"reliabilityLevel\", \"BEST_EFFORT\") \n .option(\"tableLock\", \"true\") \n .option(\"batchsize\", \"100000\")\n .option(\"schemaCheckEnabled\", \"false\") // needed to avoid clash of NULLable columns vs NON-NULLable colums\n .save()"],"metadata":{},"outputs":[],"execution_count":16},{"cell_type":"markdown","source":["## Load data into a table with row-store indexes\n\nIf table is not partitioned, there are no options to bulk load data in parallel into the desired table. The only way to avoid locking and deadlocks is to load everything by serializing the bulk load operations. 
As you can expect, performance won't be optimal."],"metadata":{}},{"cell_type":"markdown","source":["Create the following indexes on the table:\n```sql\ncreate clustered index IXC on dbo.[LINEITEM_LOADTEST] ([L_COMMITDATE]);\n\ncreate unique nonclustered index IX1 on dbo.[LINEITEM_LOADTEST] ([L_ORDERKEY], [L_LINENUMBER]);\n\ncreate nonclustered index IX2 on dbo.[LINEITEM_LOADTEST] ([L_PARTKEY]); \n```"],"metadata":{}},{"cell_type":"markdown","source":["Load data by coalescing all dataframe partitions into just one"],"metadata":{}},{"cell_type":"code","source":["val url = s\"jdbc:sqlserver://$server;databaseName=$database;\"\n\nli.coalesce(1)\n .write\n .format(\"com.microsoft.sqlserver.jdbc.spark\") \n .mode(\"overwrite\") \n .option(\"truncate\", \"true\") \n .option(\"url\", url) \n .option(\"dbtable\", \"dbo.LINEITEM_LOADTEST\") \n .option(\"user\", user) \n .option(\"password\", password) \n .option(\"reliabilityLevel\", \"BEST_EFFORT\") \n .option(\"tableLock\", \"false\") \n .option(\"batchsize\", \"100000\")\n .option(\"schemaCheckEnabled\", \"false\")\n .save()"],"metadata":{},"outputs":[],"execution_count":20},{"cell_type":"markdown","source":["## Load data into a table with (only) column-store indexes\n\nIf a table has only column-store indexes, data load can happen in parallel, as there is no sorting needed."],"metadata":{}},{"cell_type":"markdown","source":["Empty the table if needed, to speed up index deletion:\n\n```sql\ntruncate table dbo.[LINEITEM_LOADTEST];\n```\n\nDrop the previously created indexes if needed:\n```sql\ndrop index IXC on dbo.[LINEITEM_LOADTEST];\ndrop index IX1 on dbo.[LINEITEM_LOADTEST];\ndrop index IX2 on dbo.[LINEITEM_LOADTEST];\n```\n\nAnd then create a clustered columnstore index:\n\n```sql\ncreate clustered columnstore index IXCCS on dbo.[LINEITEM_LOADTEST]\n```"],"metadata":{}},{"cell_type":"markdown","source":["Load data using [columnstore data loading best practices](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-data-loading-guidance), by loading 1048576 rows at a time, to land directly into a compressed segment. The `tableLock` option must be set to `false` to avoid a table lock that would prevent parallel load. Data will be loaded in parallel, using as many Apache Spark workers as are available."],"metadata":{}},{"cell_type":"code","source":["li.write \n .format(\"com.microsoft.sqlserver.jdbc.spark\") \n .mode(\"overwrite\") \n .option(\"truncate\", \"true\") \n .option(\"url\", url) \n .option(\"dbtable\", \"dbo.LINEITEM_LOADTEST\") \n .option(\"user\", user) \n .option(\"password\", password) \n .option(\"reliabilityLevel\", \"BEST_EFFORT\") \n .option(\"tableLock\", \"false\") \n .option(\"batchsize\", \"1048576\") \n .option(\"schemaCheckEnabled\", \"false\")\n .save()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":24},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":25}],"metadata":{"name":"01-load-into-single-table","notebookId":1331848450253174},"nbformat":4,"nbformat_minor":0} -------------------------------------------------------------------------------- /notebooks/02-load-into-partitioned-table.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# 02 - Load data into an Azure SQL partitioned table\n\nAzure SQL supports [table and index partitioning](https://docs.microsoft.com/en-us/sql/relational-databases/partitions/partitioned-tables-and-indexes). If a table is partitioned, data can be loaded in parallel without the need to put a lock on the entire table. In order to allow parallel partitions to be loaded, the source RDD/DataFrame/Dataset and the target Azure SQL table *MUST* have compatible partitions, which means that one RDD partition ends up exactly in one or more than one Azure SQL partitions, and those are not used by other RDD partitions.\n\nWhen table is partitioned, data *can* be bulk loaded in parallel also if there are indexes on the table. Especially on very large databases _this is the recommended approach_. The bulk load process will be a bit slower, but you'll not need to create indexes after having loaded the data. Creation of indexes on huge, already loaded, tables is a very expensive operation that you would like to avoid if possibile.\n\nThe sample is using the new sql-spark-connector (https://github.com/microsoft/sql-spark-connector). Make sure to have it installed in the cluster before running the notebook.\n\n## Dataframe and Azure SQL partitions\n\nBoth Azure SQL and Azure Databricks (more specifically, Spark, and even more specifically a Spark Dataframe) are able to use take advantage of partitioning to more easily deal with large amounts of data. Partitions allow to work on subset of data, and usually you can do such work in parallel, spreading the workload on several CPU and/or nodes.\n\n## Notes on terminology\n\nThe term \"row-store\" is used to identify and index that is not using the [column-store layout](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-overview) to store its data.\n\n## Samples\n\nIn this notebook there are two samples\n\n- Load data into a partitioned table with row-store indexes\n- Load data into a partitioned table with columns-store indexes\n\n## Supported Azure Databricks Versions\n\nDatabricks supported versions: Spark 2.4.5 and Scala 2.11"],"metadata":{}},{"cell_type":"markdown","source":["## Setup"],"metadata":{}},{"cell_type":"markdown","source":["Define variables used thoughout the script. Azure Key Value has been used to securely store sensitive data. 
More info here: [Create an Azure Key Vault-backed secret scope](https://docs.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes#--create-an-azure-key-vault-backed-secret-scope)"],"metadata":{}},{"cell_type":"code","source":["val scope = \"key-vault-secrets\"\n\nval storageAccount = \"dmstore2\";\nval storageKey = dbutils.secrets.get(scope, \"dmstore2-2\");\n\nval server = dbutils.secrets.get(scope, \"srv001\").concat(\".database.windows.net\");\nval database = \"ApacheSpark\";\nval user = dbutils.secrets.get(scope, \"dbuser001\");\nval password = dbutils.secrets.get(scope, \"dbpwd001\");\n\nval url = s\"jdbc:sqlserver://$server;databaseName=$database;\""],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
scope: String = key-vault-secrets\nstorageAccount: String = dmstore2\nstorageKey: String = [REDACTED]\nserver: String = [REDACTED].database.windows.net\ndatabase: String = ApacheSpark\nuser: String = [REDACTED]\npassword: String = [REDACTED]\nurl: String = jdbc:sqlserver://[REDACTED].database.windows.net;databaseName=ApacheSpark;\n
"]}}],"execution_count":4},{"cell_type":"markdown","source":["Configure Spark to access Azure Blob Store"],"metadata":{}},{"cell_type":"code","source":["spark.conf.set(s\"fs.azure.account.key.$storageAccount.blob.core.windows.net\", storageKey);"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":6},{"cell_type":"markdown","source":["Load the Parquet file generated in `00-create-parquet-file` notebook that contains LINEITEM data partitioned by Year and Month"],"metadata":{}},{"cell_type":"code","source":["val li = spark.read.parquet(s\"wasbs://tpch@$storageAccount.blob.core.windows.net/10GB/parquet/lineitem\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
li: org.apache.spark.sql.DataFrame = [L_ORDERKEY: int, L_PARTKEY: int ... 15 more fields]\n
"]}}],"execution_count":8},{"cell_type":"markdown","source":["Loaded data is split in 20 dataframe partitions"],"metadata":{}},{"cell_type":"code","source":["li.rdd.getNumPartitions"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res2: Int = 20\n
"]}}],"execution_count":10},{"cell_type":"markdown","source":["Show schema of loaded data"],"metadata":{}},{"cell_type":"code","source":["li.printSchema"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
root\n-- L_ORDERKEY: integer (nullable = true)\n-- L_PARTKEY: integer (nullable = true)\n-- L_SUPPKEY: integer (nullable = true)\n-- L_LINENUMBER: integer (nullable = true)\n-- L_QUANTITY: decimal(15,2) (nullable = true)\n-- L_EXTENDEDPRICE: decimal(15,2) (nullable = true)\n-- L_DISCOUNT: decimal(15,2) (nullable = true)\n-- L_TAX: decimal(15,2) (nullable = true)\n-- L_RETURNFLAG: string (nullable = true)\n-- L_LINESTATUS: string (nullable = true)\n-- L_SHIPDATE: date (nullable = true)\n-- L_COMMITDATE: date (nullable = true)\n-- L_RECEIPTDATE: date (nullable = true)\n-- L_SHIPINSTRUCT: string (nullable = true)\n-- L_SHIPMODE: string (nullable = true)\n-- L_COMMENT: string (nullable = true)\n-- L_PARTITION_KEY: integer (nullable = true)\n\n
"]}}],"execution_count":12},{"cell_type":"markdown","source":["Make sure you create on your Azure SQL the following `LINEITEM` table, partitioned by `L_PARTITION_KEY`:\n\n```sql\ncreate partition function pf_LINEITEM(int)\nas range left for values \n(\n\t199201,199202,199203,199204,199205,199206,199207,199208,199209,199210,199211,199212,\n\t199301,199302,199303,199304,199305,199306,199307,199308,199309,199310,199311,199312,\n\t199401,199402,199403,199404,199405,199406,199407,199408,199409,199410,199411,199412,\n\t199501,199502,199503,199504,199505,199506,199507,199508,199509,199510,199511,199512,\n\t199601,199602,199603,199604,199605,199606,199607,199608,199609,199610,199611,199612,\n\t199701,199702,199703,199704,199705,199706,199707,199708,199709,199710,199711,199712,\n\t199801,199802,199803,199804,199805,199806,199807,199808,199809,199810\n);\n\ncreate partition scheme ps_LINEITEM\nas partition pf_LINEITEM\nall to ([Primary])\n;\n\ndrop table if exists [dbo].[LINEITEM_LOADTEST];\ncreate table [dbo].[LINEITEM_LOADTEST]\n(\n\t[L_ORDERKEY] [int] not null,\n\t[L_PARTKEY] [int] not null,\n\t[L_SUPPKEY] [int] not null,\n\t[L_LINENUMBER] [int] not null,\n\t[L_QUANTITY] [decimal](15, 2) not null,\n\t[L_EXTENDEDPRICE] [decimal](15, 2) not null,\n\t[L_DISCOUNT] [decimal](15, 2) not null,\n\t[L_TAX] [decimal](15, 2) not null,\n\t[L_RETURNFLAG] [char](1) not null,\n\t[L_LINESTATUS] [char](1) not null,\n\t[L_SHIPDATE] [date] not null,\n\t[L_COMMITDATE] [date] not null,\n\t[L_RECEIPTDATE] [date] not null,\n\t[L_SHIPINSTRUCT] [char](25) not null,\n\t[L_SHIPMODE] [char](10) not null,\n\t[L_COMMENT] [varchar](44) not null,\n\t[L_PARTITION_KEY] [int] not null\n) on ps_LINEITEM([L_PARTITION_KEY])\n```"],"metadata":{}},{"cell_type":"markdown","source":["You can check that Azure SQL table is partitioned by running the following T-SQL command:\n\n```sql\nSELECT\n schema_name(t.schema_id) as [schema_name],\n t.[name] as table_name,\n i.[name] as index_name,\n ps.[partition_id],\n ps.partition_number,\n p.data_compression_desc,\n i.[type_desc], \n ps.row_count,\n (ps.used_page_count * 8.) / 1024. / 1024. as size_in_gb\nfrom\n sys.dm_db_partition_stats as ps \ninner join \n sys.partitions as p on ps.partition_id = p.partition_id\ninner join\n sys.tables as t on t.object_id = ps.object_id\ninner join\n sys.indexes as i on ps.object_id = i.object_id and ps.index_id = i.index_id\nwhere\n t.[name] = 'LINEITEM_LOADTEST' and t.[schema_id] = schema_id('dbo')\norder by\n [schema_name], table_name, index_name, partition_number\n```"],"metadata":{}},{"cell_type":"markdown","source":["## Load data into a partitioned table with row-store indexes"],"metadata":{}},{"cell_type":"markdown","source":["On the target table create the Clustered Index and a couple of Non-Clustered Index. In order to allow parallel partitioned load, also indexes must use the same partitioning function used by the table\n\n```sql\ncreate clustered index IXC on dbo.[LINEITEM_LOADTEST] ([L_COMMITDATE]) \non ps_LINEITEM([L_PARTITION_KEY]);\n\ncreate unique nonclustered index IX1 on dbo.[LINEITEM_LOADTEST] ([L_ORDERKEY], [L_LINENUMBER], [L_PARTITION_KEY]) \non ps_LINEITEM([L_PARTITION_KEY]);\n\ncreate nonclustered index IX2 on dbo.[LINEITEM_LOADTEST] ([L_PARTKEY], [L_PARTITION_KEY]) \non ps_LINEITEM([L_PARTITION_KEY]);\n```"],"metadata":{}},{"cell_type":"markdown","source":["As DataFrame and Azure SQL Table are both partitioned by `L_PARTITION_KEY`, there isn't much left to do and the connector will take care of everything for us. 
`tableLock` must be set to `false` to avoid a table lock that would prevent parallel partitioned load. Thanks to partitions, acquired locks will not interfere with each other."],"metadata":{}},{"cell_type":"code","source":["li.write \n .format(\"com.microsoft.sqlserver.jdbc.spark\") \n .mode(\"overwrite\") \n .option(\"truncate\", \"true\") \n .option(\"url\", url) \n .option(\"dbtable\", \"dbo.LINEITEM_LOADTEST\") \n .option(\"user\", user) \n .option(\"password\", password) \n .option(\"reliabilityLevel\", \"BEST_EFFORT\") \n .option(\"tableLock\", \"false\") \n .option(\"batchsize\", \"100000\") \n .option(\"schemaCheckEnabled\", \"false\") // needed to avoid clash of NULLable columns vs NON-NULLable columns\n .save()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":18},{"cell_type":"markdown","source":["## Load data into a partitioned table with column-store index"],"metadata":{}},{"cell_type":"markdown","source":["Empty the test table if needed, to speed up index deletion\n\n```sql\ntruncate table dbo.[LINEITEM_LOADTEST];\n```\n\nDrop the previously create indexes if needed:\n```sql\ndrop index IXC on dbo.[LINEITEM_LOADTEST];\ndrop index IX1 on dbo.[LINEITEM_LOADTEST];\ndrop index IX2 on dbo.[LINEITEM_LOADTEST];\n```\n\nAnd then create a clustered columnstore index:\n\n```sql\ncreate clustered columnstore index IXCCS on dbo.[LINEITEM_LOADTEST]\non ps_LINEITEM([L_PARTITION_KEY]);\n```"],"metadata":{}},{"cell_type":"markdown","source":["Load data using [columnstore data loading best pratices](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-data-loading-guidance), by loading 1048576 rows at time, to land directly into a compressed segment. Locking the table must be set to `false` to avoid locking. Data with be loaded in parallel, using as many as Apache Spark workers are available."],"metadata":{}},{"cell_type":"code","source":["li.write \n .format(\"com.microsoft.sqlserver.jdbc.spark\") \n .mode(\"overwrite\") \n .option(\"truncate\", \"true\") \n .option(\"url\", url) \n .option(\"dbtable\", \"dbo.LINEITEM_LOADTEST\") \n .option(\"user\", user) \n .option(\"password\", password) \n .option(\"reliabilityLevel\", \"BEST_EFFORT\") \n .option(\"tableLock\", \"false\") \n .option(\"batchsize\", \"1048576\") \n .option(\"schemaCheckEnabled\", \"false\")\n .save()"],"metadata":{},"outputs":[],"execution_count":22}],"metadata":{"name":"02-load-into-partitioned-table","notebookId":1536696850337469},"nbformat":4,"nbformat_minor":0} -------------------------------------------------------------------------------- /notebooks/03a-parallel-switch-in-load-into-partitioned-table-many.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# 03a - Parallel Switch-In Load Into Partitioned Table\n\nIf you have to load data into a table that is also actively used by users, you cannot just run a bulk copy operation on such table. If you plan to use `tableLock` option, users will not be able to access data for the whole duration of the bulk load; even if you don't plan to use `tableLock` option, a bulk load operation will still impact and interfere with conccurrent operations running on the table partition.\n\nTo get more details on partitioning, take a look at the `02-load-into-partitioned-table` notebook.\n\nThe solution to be able to bulk load data and at the same time have the table usable by applications and users is simple: load another table instead, and then \"switch-in\" that table into the target one. More details on this pattern can be found in [this post](https://www.cathrinewilhelmsen.net/2015/04/19/table-partitioning-in-sql-server-partition-switching/) written by the Data Platform MVP Cathrine Wilhelmsen. \n\nBeside improving concurrency during bulk load operation, you also have another benefit that can be very useful. When not using the switch-in ability just discussed, it's usually better to load the table with indexes already created, as for very big tables, creating an index can completely drain all the resources avaiable to your Azure SQL database. 
By using this tecnique you are actually using a \"divide-et-impera\" approach, so that you can load data into a staging table with no indexes, where you'll have the best load performance possible, and then create the needed index later, with much lower impact on resources. The lower resource impact is due to the fact that you are only load data that will go into a single partition, not the whole table, and thus should be smaller and much more manageable. By repeating this process for all partitions you need to load, you can load data without impacting to much on Azure SQL resources and thus query performances.\n\nDue to the fact that Apache Spark RDD partitions and Azure SQL partitions are in a 1:N relationship, is not possibile for the Azure SQL Connector to easily determine which staging table should be used and how to do the switch-in. Luckily we can do this operation manually, using a [well documented technique](https://docs.databricks.com/notebooks/notebook-workflows.html), helping Apache Spark to maximize parallelism to load Azure SQL partitions.\n\nThe sample is using the new sql-spark-connector (https://github.com/microsoft/sql-spark-connector). Make sure you have installed it before running this notebook.\n\n## Notes on terminology\n\nThe term \"row-store\" is used to identify and index that is not using the [column-store layout](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-overview) to store its data.\n\n## Sample\n\nThis notebook is used to parallelize the work done by another notebook (`03b-parallel-switch-in-load-into-partitioned-table-single.ipynb`), that is actually the one loading the data into a staging table via bulk copy and than doing the switch-in operation. \n\n## Supported Azure Databricks Versions\n\nDatabricks supported versions: Spark 2.4.5 and Scala 2.11"],"metadata":{}},{"cell_type":"markdown","source":["# Create Target Table\nCreate table and its indexes"],"metadata":{}},{"cell_type":"markdown","source":["Make sure you create on your Azure SQL the following `LINEITEM` table, partitioned by `L_PARTITION_KEY`:\n\n```sql\ncreate partition function pf_LINEITEM(int)\nas range left for values \n(\n\t199201,199202,199203,199204,199205,199206,199207,199208,199209,199210,199211,199212,\n\t199301,199302,199303,199304,199305,199306,199307,199308,199309,199310,199311,199312,\n\t199401,199402,199403,199404,199405,199406,199407,199408,199409,199410,199411,199412,\n\t199501,199502,199503,199504,199505,199506,199507,199508,199509,199510,199511,199512,\n\t199601,199602,199603,199604,199605,199606,199607,199608,199609,199610,199611,199612,\n\t199701,199702,199703,199704,199705,199706,199707,199708,199709,199710,199711,199712,\n\t199801,199802,199803,199804,199805,199806,199807,199808,199809,199810\n);\n\ncreate partition scheme ps_LINEITEM\nas partition pf_LINEITEM\nall to ([Primary])\n;\n\ncreate table [dbo].[LINEITEM_LOADTEST]\n(\n\t[L_ORDERKEY] [int] not null,\n\t[L_PARTKEY] [int] not null,\n\t[L_SUPPKEY] [int] not null,\n\t[L_LINENUMBER] [int] not null,\n\t[L_QUANTITY] [decimal](15, 2) not null,\n\t[L_EXTENDEDPRICE] [decimal](15, 2) not null,\n\t[L_DISCOUNT] [decimal](15, 2) not null,\n\t[L_TAX] [decimal](15, 2) not null,\n\t[L_RETURNFLAG] [char](1) not null,\n\t[L_LINESTATUS] [char](1) not null,\n\t[L_SHIPDATE] [date] not null,\n\t[L_COMMITDATE] [date] not null,\n\t[L_RECEIPTDATE] [date] not null,\n\t[L_SHIPINSTRUCT] [char](25) not null,\n\t[L_SHIPMODE] [char](10) not null,\n\t[L_COMMENT] [varchar](44) not null,\n\t[L_PARTITION_KEY] 
[int] not null\n) on ps_LINEITEM([L_PARTITION_KEY])\n;\n\ncreate clustered index IXC on dbo.[LINEITEM_LOADTEST] ([L_COMMITDATE]) \non ps_LINEITEM([L_PARTITION_KEY]);\n\ncreate unique nonclustered index IX1 on dbo.[LINEITEM_LOADTEST] ([L_ORDERKEY], [L_LINENUMBER], [L_PARTITION_KEY]) \non ps_LINEITEM([L_PARTITION_KEY]);\n\ncreate nonclustered index IX2 on dbo.[LINEITEM_LOADTEST] ([L_PARTKEY], [L_PARTITION_KEY]) \non ps_LINEITEM([L_PARTITION_KEY]);\n```"],"metadata":{}},{"cell_type":"markdown","source":["## Create support function\nTo be able to execute a switch-in load, parallel load must be managed manually, as T-SQL code must be execute before and after each Azure SQL partition (not Dataframe partition! Remember that a Dataframe partition can target multiple Azure SQL partitions) has been loaded bia bulk load operation. By using the [tecnique explained in the official Databricks documentation](https://docs.databricks.com/notebooks/notebook-workflows.html#api) it is possibile to execute a notebook in parallel, by implementing the following function."],"metadata":{}},{"cell_type":"code","source":["import scala.concurrent.{Future, Await}\nimport scala.concurrent.duration._\nimport scala.util.control.NonFatal\n\ncase class NotebookData(path: String, timeout: Int, parameters: Map[String, String] = Map.empty[String, String])\n\ndef parallelNotebooks(notebooks: Seq[NotebookData]): Future[Seq[String]] = {\n import scala.concurrent.{Future, blocking, Await}\n import java.util.concurrent.Executors\n import scala.concurrent.ExecutionContext\n import com.databricks.WorkflowException\n\n val numNotebooksInParallel = 4 \n // If you create too many notebooks in parallel the driver may crash when you submit all of the jobs at once. \n // This code limits the number of parallel notebooks.\n implicit val ec = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(numNotebooksInParallel))\n val ctx = dbutils.notebook.getContext()\n \n Future.sequence(\n notebooks.map { notebook => \n Future {\n dbutils.notebook.setContext(ctx)\n if (notebook.parameters.nonEmpty)\n dbutils.notebook.run(notebook.path, notebook.timeout, notebook.parameters)\n else\n dbutils.notebook.run(notebook.path, notebook.timeout)\n }\n .recover {\n case NonFatal(e) => s\"ERROR: ${e.getMessage}\"\n }\n }\n )\n}"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
import scala.concurrent.{Future, Await}\nimport scala.concurrent.duration._\nimport scala.util.control.NonFatal\ndefined class NotebookData\nparallelNotebooks: (notebooks: Seq[NotebookData])scala.concurrent.Future[Seq[String]]\n
"]}}],"execution_count":5},{"cell_type":"markdown","source":["## Run Parallel Load"],"metadata":{}},{"cell_type":"markdown","source":["Create a Sequence with Azure SQL partitions to be loaded is stored"],"metadata":{}},{"cell_type":"code","source":["import spark.implicits._\nimport org.apache.spark.sql._\n\ncase class partitionToProcess(partitionKey:Int)\n\nval ptp = Seq(\n partitionToProcess(199702),\n partitionToProcess(199703),\n partitionToProcess(199704),\n partitionToProcess(199706),\n partitionToProcess(199707),\n partitionToProcess(199708)\n)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
import spark.implicits._\nimport org.apache.spark.sql._\ndefined class partitionToProcess\nptp: Seq[partitionToProcess] = List(partitionToProcess(199702), partitionToProcess(199703), partitionToProcess(199704), partitionToProcess(199706), partitionToProcess(199707), partitionToProcess(199708))\n
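The `ptp` sequence above hard-codes the partition keys to load. For larger ranges it can be convenient to generate the keys instead of listing them by hand. The following is a minimal sketch, assuming the same `yyyyMM` integer key format used by `pf_LINEITEM`; the helper `monthKeys` is hypothetical and not part of the original notebook:

```scala
// Minimal sketch (assumption: partition keys use the yyyyMM integer format of pf_LINEITEM).
// Generates the keys for a contiguous range of months instead of hard-coding them.
def monthKeys(fromYear: Int, fromMonth: Int, toYear: Int, toMonth: Int): Seq[Int] =
  (fromYear * 12 + (fromMonth - 1) to toYear * 12 + (toMonth - 1))
    .map(m => (m / 12) * 100 + (m % 12) + 1)

// Contiguous range 199702..199708; the notebook's ptp above intentionally picks specific keys instead.
val ptpGenerated = monthKeys(1997, 2, 1997, 8).map(k => partitionToProcess(k))
```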
"]}}],"execution_count":8},{"cell_type":"markdown","source":["Execute in parallel several instances of the notebook that load a specific partition, using a different partition key for each instance"],"metadata":{}},{"cell_type":"code","source":["import scala.concurrent.Await\nimport scala.concurrent.duration._\nimport scala.language.postfixOps\n\nval timeOut = 600 // seconds\nval ipynb = \"./03b-parallel-switch-in-load-into-partitioned-table-single\"\n\nval notebooks = ptp.map(p => NotebookData(ipynb, timeOut, Map(\"partitionKey\" -> p.partitionKey.toString)))"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
import scala.concurrent.Await\nimport scala.concurrent.duration._\nimport scala.language.postfixOps\ntimeOut: Int = 600\nipynb: String = ./03b-parallel-switch-in-load-into-partitioned-table-single\nnotebooks: Seq[NotebookData] = List(NotebookData(./03b-parallel-switch-in-load-into-partitioned-table-single,600,Map(partitionKey -> 199702)), NotebookData(./03b-parallel-switch-in-load-into-partitioned-table-single,600,Map(partitionKey -> 199703)), NotebookData(./03b-parallel-switch-in-load-into-partitioned-table-single,600,Map(partitionKey -> 199704)), NotebookData(./03b-parallel-switch-in-load-into-partitioned-table-single,600,Map(partitionKey -> 199706)), NotebookData(./03b-parallel-switch-in-load-into-partitioned-table-single,600,Map(partitionKey -> 199707)), NotebookData(./03b-parallel-switch-in-load-into-partitioned-table-single,600,Map(partitionKey -> 199708)))\n
"]}}],"execution_count":10},{"cell_type":"code","source":["val res = parallelNotebooks(notebooks)\n\nAwait.result(res, (timeOut * ptp.size seconds)) // this is a blocking call.\n\nres.value"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res: scala.concurrent.Future[Seq[String]] = Future(Success(List(199702, 199703, 199704, 199706, 199707, 199708)))\nres3: Option[scala.util.Try[Seq[String]]] = Some(Success(List(199702, 199703, 199704, 199706, 199707, 199708)))\n
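Because `parallelNotebooks` maps a failed notebook run to a string starting with `ERROR: ` (see the `recover` block above), the collected results can be split into successful and failed partition loads. A minimal sketch, reusing the `res`, `timeOut` and `ptp` values defined above:

```scala
// Minimal sketch: separate successful partition loads from failures, based on the
// "ERROR: ..." strings produced by the recover block in parallelNotebooks.
import scala.concurrent.Await
import scala.concurrent.duration._

val results = Await.result(res, (timeOut * ptp.size).seconds)
val (failed, succeeded) = results.partition(_.startsWith("ERROR:"))

println(s"Loaded partitions: ${succeeded.mkString(", ")}")
if (failed.nonEmpty) println(s"Failed runs:\n${failed.mkString("\n")}")
```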
"]}}],"execution_count":11},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":12}],"metadata":{"name":"03a-parallel-switch-in-load-into-partitioned-table-many","notebookId":964636935775876},"nbformat":4,"nbformat_minor":0} -------------------------------------------------------------------------------- /notebooks/03b-parallel-switch-in-load-into-partitioned-table-single.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# 03b - Parallel Switch-In Load Into Partitioned Table - Sigle Partition Load\n\nThis notebook will bulk load data into exactly one Azure SQL partition. It accepts a Partition Key as a parameter, and that value will be used to load all data that belongs to that partition. In this sample column used to partition data is the `L_PARTITION_KEY` column, which is an integer, so the provided partition key *must be* an integer too.\n\nData is not loaded directly into the selected partition, but a staging table is created, loaded and then switched into the target table, becoming the desired partition.\n\nMore info on this switch-in technique can be found in the related notebook: `03a-parallel-switch-in-load-into-partitioned-table-many`\n\nThe sample is using the new sql-spark-connector (https://github.com/microsoft/sql-spark-connector). Make sure you have installed it before running this notebook.\n\n## Notes on terminology\n\nThe term \"row-store\" is used to identify and index that is not using the [column-store layout](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-overview) to store its data.\n\n## Sample\n\nThis notebook is used to load exactly on partition of a partitioned table by loading a staging table and then switching it in into the target table. The process is the following:\n\n- Create a staging table\n- Load staging table\n- Create indexes\n- Create check constraints\n- Execute switch-in operation\n\nMore details on this pattern can be found in [this post](https://www.cathrinewilhelmsen.net/2015/04/19/table-partitioning-in-sql-server-partition-switching/) written by the Data Platform MVP Cathrine Wilhelmsen. \n\n## Supported Azure Databricks Versions\n\nDatabricks supported versions: Spark 2.4.5 and Scala 2.11"],"metadata":{}},{"cell_type":"markdown","source":["## Setup"],"metadata":{}},{"cell_type":"markdown","source":["Define notebook parameter:"],"metadata":{}},{"cell_type":"code","source":["dbutils.widgets.text(\"partitionKey\", \"0\", \"Partition Key\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":4},{"cell_type":"markdown","source":["Define variables used thoughout the script. Azure Key Value has been used to securely store sensitive data. More info here: [Create an Azure Key Vault-backed secret scope](https://docs.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes#--create-an-azure-key-vault-backed-secret-scope)"],"metadata":{}},{"cell_type":"code","source":["val partitionKey = dbutils.widgets.get(\"partitionKey\").toInt\nval prevPartitionKey = partitionKey\n\nval scope = \"key-vault-secrets\"\n\nval storageAccount = \"dmstore2\";\nval storageKey = dbutils.secrets.get(scope, \"dmstore2-2\");\n\nval server = dbutils.secrets.get(scope, \"srv001\").concat(\".database.windows.net\");\nval database = \"ApacheSpark\";\nval user = dbutils.secrets.get(scope, \"dbuser001\");\nval password = dbutils.secrets.get(scope, \"dbpwd001\");\nval table = \"dbo.LINEITEM_LOADTEST\"\n\nval url = s\"jdbc:sqlserver://$server;databaseName=$database;\""],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
partitionKey: Int = 199810\nprevPartitionKey: Int = 199810\nscope: String = key-vault-secrets\nstorageAccount: String = dmstore2\nstorageKey: String = [REDACTED]\nserver: String = [REDACTED].database.windows.net\ndatabase: String = ApacheSpark\nuser: String = [REDACTED]\npassword: String = [REDACTED]\ntable: String = dbo.LINEITEM_LOADTEST\nurl: String = jdbc:sqlserver://[REDACTED].database.windows.net;databaseName=ApacheSpark;\n
"]}}],"execution_count":6},{"cell_type":"markdown","source":["Configure Spark to access Azure Blob Store"],"metadata":{}},{"cell_type":"code","source":["spark.conf.set(s\"fs.azure.account.key.$storageAccount.blob.core.windows.net\", storageKey);"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":8},{"cell_type":"markdown","source":["Load the Parquet file generated in `00-create-parquet-file` notebook that contains LINEITEM data partitioned by Year and Month. Make sure only the specified partion is loaded"],"metadata":{}},{"cell_type":"code","source":["val li = spark\n .read\n .parquet(s\"wasbs://tpch@$storageAccount.blob.core.windows.net/10GB/parquet/lineitem\")\n .filter($\"L_PARTITION_KEY\" === partitionKey)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
li: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [L_ORDERKEY: int, L_PARTKEY: int ... 15 more fields]\n
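To double-check that only the requested partition's data will be read (rather than the full 10GB LINEITEM dataset), the physical plan can be inspected. A minimal sketch; whether the filter shows up under `PartitionFilters` or `PushedFilters` depends on how the Parquet data was written in `00-create-parquet-file`:

```scala
// Minimal sketch: verify that the filter on L_PARTITION_KEY is applied at scan time,
// so only the requested partition key is read from the Parquet source.
li.explain(true)

// Optional sanity check before the bulk load (this triggers a Spark job):
// println(s"Rows to load for partition $partitionKey: ${li.count()}")
```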
"]}}],"execution_count":10},{"cell_type":"markdown","source":["Create the T-SQL script need to extract information on the partition that will be loaded into Azure SQL"],"metadata":{}},{"cell_type":"code","source":["val sqlPartitionValueInfo = \ns\"\"\"\nSELECT\n\t*\nFROM\n(\n\tSELECT\n\t\tprv.[boundary_id] AS partitionId,\n\t\tCAST(prv.[value] AS INT) AS [value],\n\t\tCAST(LAG(prv.[value]) OVER (ORDER BY prv.[boundary_id]) AS INT) AS [prevValue],\n\t\tCAST(LEAD(prv.[value]) OVER (ORDER BY prv.[boundary_id]) AS INT) AS [nextValue]\n\tFROM\n\t\tsys.[indexes] i\n\tINNER JOIN\n\t\tsys.[data_spaces] dp ON i.[data_space_id] = dp.[data_space_id]\n\tINNER JOIN\n\t\tsys.[partition_schemes] ps ON dp.[data_space_id] = ps.[data_space_id]\n\tINNER JOIN\n\t\tsys.[partition_range_values] prv ON [prv].[function_id] = [ps].[function_id]\n\tWHERE\n\t\ti.[object_id] = OBJECT_ID('${table}')\n\tAND\n\t\ti.[index_id] IN (0,1)\n) AS [pi]\nWHERE\n\t[value] = ${partitionKey}\n\"\"\""],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
sqlPartitionValueInfo: String =\n"\nSELECT\n\t*\nFROM\n(\n\tSELECT\n\t\tprv.[boundary_id] AS partitionId,\n\t\tCAST(prv.[value] AS INT) AS [value],\n\t\tCAST(LAG(prv.[value]) OVER (ORDER BY prv.[boundary_id]) AS INT) AS [prevValue],\n\t\tCAST(LEAD(prv.[value]) OVER (ORDER BY prv.[boundary_id]) AS INT) AS [nextValue]\n\tFROM\n\t\tsys.[indexes] i\n\tINNER JOIN\n\t\tsys.[data_spaces] dp ON i.[data_space_id] = dp.[data_space_id]\n\tINNER JOIN\n\t\tsys.[partition_schemes] ps ON dp.[data_space_id] = ps.[data_space_id]\n\tINNER JOIN\n\t\tsys.[partition_range_values] prv ON [prv].[function_id] = [ps].[function_id]\n\tWHERE\n\t\ti.[object_id] = OBJECT_ID('dbo.LINEITEM_LOADTEST')\n\tAND\n\t\ti.[index_id] IN (0,1)\n) AS [pi]\nWHERE\n\t[value] = 199810\n"\n
"]}}],"execution_count":12},{"cell_type":"markdown","source":["Setup JDBC connection, needed to execute ad-hoc T-SQL statement on Azure SQL"],"metadata":{}},{"cell_type":"code","source":["val connectionProperties = new java.util.Properties()\nconnectionProperties.put(\"user\", user)\nconnectionProperties.put(\"password\", password)\nconnectionProperties.setProperty(\"Driver\", \"com.microsoft.sqlserver.jdbc.SQLServerDriver\")\nval conn = java.sql.DriverManager.getConnection(url, connectionProperties)\nval st = conn.createStatement()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
connectionProperties: java.util.Properties = {user=[REDACTED], password=[REDACTED], Driver=com.microsoft.sqlserver.jdbc.SQLServerDriver}\nconn: java.sql.Connection = ConnectionID:18 ClientConnectionId: cba0a4b8-ec8c-419a-9158-f970f6cf3bb4\nst: java.sql.Statement = SQLServerStatement:35\n
"]}}],"execution_count":14},{"cell_type":"markdown","source":["Load Azure SQL partition metadata"],"metadata":{}},{"cell_type":"code","source":["case class PartitionInfo(partitionId: Int, value: Int, prevValue: Option[Int], nextValue: Option[Int]);\nval piDF = spark.read.jdbc(url, s\"($sqlPartitionValueInfo) AS t\", connectionProperties)\ndisplay(piDF)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["
partitionId | value | prevValue | nextValue
82 | 199810 | 199809 | null
"]}}],"execution_count":16},{"cell_type":"code","source":["val pi = piDF.as[PartitionInfo].collect()(0)\nprint(pi)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
PartitionInfo(82,199810,Some(199809),None)pi: PartitionInfo = PartitionInfo(82,199810,Some(199809),None)\n
"]}}],"execution_count":17},{"cell_type":"markdown","source":["Create on Azure SQL a staging table where data will be bulk loaded"],"metadata":{}},{"cell_type":"code","source":["st.execute(s\"DROP TABLE IF EXISTS ${table}_STG_${partitionKey}\")\nst.execute(s\"SELECT TOP (0) * INTO ${table}_STG_${partitionKey} FROM ${table}\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res24: Boolean = false\n
"]}}],"execution_count":19},{"cell_type":"markdown","source":["Load the staging table"],"metadata":{}},{"cell_type":"code","source":["li.write \n .format(\"com.microsoft.sqlserver.jdbc.spark\") \n .mode(\"overwrite\") \n .option(\"truncate\", \"true\") \n .option(\"url\", url) \n .option(\"dbtable\", s\"${table}_STG_${partitionKey}\") \n .option(\"user\", user) \n .option(\"password\", password) \n .option(\"reliabilityLevel\", \"BEST_EFFORT\") \n .option(\"tableLock\", \"false\") \n .option(\"batchsize\", \"100000\") \n .option(\"schemaCheckEnabled\", \"false\")\n .save()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":21},{"cell_type":"markdown","source":["Create the same indexes that the target table has, in order to allow switch-in"],"metadata":{}},{"cell_type":"code","source":["st.execute(s\"CREATE CLUSTERED INDEX IXC ON ${table}_STG_${partitionKey} ([L_COMMITDATE], [L_PARTITION_KEY])\")\nst.execute(s\"CREATE UNIQUE NONCLUSTERED INDEX IX1 ON ${table}_STG_${partitionKey} ([L_ORDERKEY], [L_LINENUMBER], [L_PARTITION_KEY])\")\nst.execute(s\"CREATE NONCLUSTERED INDEX IX2 ON ${table}_STG_${partitionKey} ([L_PARTKEY], [L_PARTITION_KEY])\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res26: Boolean = false\n
"]}}],"execution_count":23},{"cell_type":"markdown","source":["Add a check constraint on the table to allow switch-in"],"metadata":{}},{"cell_type":"code","source":["if (pi.prevValue == None) {\n st.execute(s\"ALTER TABLE ${table}_STG_${partitionKey} ADD CONSTRAINT ck_partition_${partitionKey} CHECK (L_PARTITION_KEY <= ${pi.value})\")\n} else {\n st.execute(s\"ALTER TABLE ${table}_STG_${partitionKey} ADD CONSTRAINT ck_partition_${partitionKey} CHECK (L_PARTITION_KEY > ${pi.prevValue.get} AND L_PARTITION_KEY <= ${pi.value})\")\n}"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res27: Boolean = false\n
"]}}],"execution_count":25},{"cell_type":"markdown","source":["Delete data in existing partition of target table, execute the switch-in and drop the staging table"],"metadata":{}},{"cell_type":"code","source":["st.execute(s\"TRUNCATE TABLE ${table} WITH (PARTITIONS (${pi.partitionId}))\")\nst.execute(s\"ALTER TABLE ${table}_STG_${partitionKey} SWITCH TO ${table} PARTITION ${pi.partitionId}\")\nst.execute(s\"DROP TABLE ${table}_STG_${partitionKey}\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res122: Boolean = false\n
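After the switch-in it can be useful to confirm that the rows actually landed in the expected partition of the target table. A minimal sketch, reusing the open JDBC statement `st` and the `pi` metadata loaded above; `$PARTITION.pf_LINEITEM` refers to the partition function created earlier:

```scala
// Minimal sketch: count the rows now sitting in the target partition after the switch-in.
// $PARTITION.pf_LINEITEM maps a partition key to its partition number.
val rs = st.executeQuery(
  s"SELECT COUNT(*) AS cnt FROM ${table} WHERE $$PARTITION.pf_LINEITEM(L_PARTITION_KEY) = ${pi.partitionId}"
)
rs.next()
println(s"Rows in partition ${pi.partitionId}: ${rs.getInt("cnt")}")
rs.close()
```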
"]}}],"execution_count":27},{"cell_type":"markdown","source":["Done!"],"metadata":{}},{"cell_type":"code","source":["dbutils.notebook.exit(partitionKey.toString)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/plain":["199810"]}}],"execution_count":29}],"metadata":{"name":"03b-parallel-switch-in-load-into-partitioned-table-single","notebookId":964636935775860},"nbformat":4,"nbformat_minor":0} -------------------------------------------------------------------------------- /notebooks/read-from-azure-sql/push-down-queries.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# Push Down Queries\n\nSample that shows how to push queries to Azure SQL"],"metadata":{}},{"cell_type":"markdown","source":["Define variables used thoughout the script. Azure Key Value has been used to securely store sensitive data. More info here: [Create an Azure Key Vault-backed secret scope](https://docs.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes#--create-an-azure-key-vault-backed-secret-scope)"],"metadata":{}},{"cell_type":"code","source":["val scope = \"key-vault-secrets\"\n\nval server = dbutils.secrets.get(scope, \"srv001\")\nval database = \"ApacheSpark\"\n\nval jdbcUrl = s\"jdbc:sqlserver://$server.database.windows.net;database=$database;\"\n\nval connectionProperties = new java.util.Properties()\nconnectionProperties.put(\"user\", dbutils.secrets.get(scope, \"dbuser001\"))\nconnectionProperties.put(\"password\", dbutils.secrets.get(scope, \"dbpwd001\"))\nconnectionProperties.setProperty(\"Driver\", \"com.microsoft.sqlserver.jdbc.SQLServerDriver\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
scope: String = key-vault-secrets\nserver: String = [REDACTED]\ndatabase: String = ApacheSpark\njdbcUrl: String = jdbc:sqlserver://[REDACTED].database.windows.net;database=ApacheSpark;\nconnectionProperties: java.util.Properties = {user=[REDACTED], password=[REDACTED], Driver=com.microsoft.sqlserver.jdbc.SQLServerDriver}\nres0: Object = null\n
"]}}],"execution_count":3},{"cell_type":"markdown","source":["A pushdown query is executed as a subquery by Azure SQL, so you *MUST* alias the subquery and put it in parenthesis: `(<...>) AS subquery`"],"metadata":{}},{"cell_type":"code","source":["val pushDown = \"\"\"(\nSELECT\n\tL_COMMITDATE,\n\tCOUNT(*) AS TotalOrders\nFROM\n\tdbo.LINEITEM\nGROUP BY\n\tL_COMMITDATE\n) AS SubQuery\"\"\"\n\nval li = spark.read.jdbc(jdbcUrl, pushDown, connectionProperties)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
pushDown: String =\n(\nSELECT\n\tL_COMMITDATE,\n\tCOUNT(*) AS TotalOrders\nFROM\n\tdbo.LINEITEM\nGROUP BY\n\tL_COMMITDATE\n) AS SubQuery\nli: org.apache.spark.sql.DataFrame = [L_COMMITDATE: date, TotalOrders: int]\n
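The same pattern can be parameterized from Scala, so that both the filter and the aggregation are executed by Azure SQL before any data reaches Spark. A minimal sketch (the date range is illustrative):

```scala
// Minimal sketch: push the filter and the aggregation down to Azure SQL
// by building the subquery string from Scala values (dates are illustrative).
val fromDate = "1995-01-01"
val toDate   = "1995-12-31"

val pushDownFiltered = s"""(
SELECT L_COMMITDATE, COUNT(*) AS TotalOrders
FROM dbo.LINEITEM
WHERE L_COMMITDATE BETWEEN '$fromDate' AND '$toDate'
GROUP BY L_COMMITDATE
) AS SubQuery"""

val li1995 = spark.read.jdbc(jdbcUrl, pushDownFiltered, connectionProperties)
display(li1995.orderBy("L_COMMITDATE"))
```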
"]}}],"execution_count":5},{"cell_type":"code","source":["li.printSchema"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
root\n |-- L_COMMITDATE: date (nullable = true)\n |-- TotalOrders: integer (nullable = true)\n\n
"]}}],"execution_count":6},{"cell_type":"code","source":["display(li.where(\"L_COMMITDATE='1995-01-01'\"))"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["
L_COMMITDATE | TotalOrders
1995-01-01 | 24781
"]}}],"execution_count":7}],"metadata":{"name":"push-down-queries","notebookId":2236457168077307},"nbformat":4,"nbformat_minor":0} --------------------------------------------------------------------------------