├── .github ├── CODE_OF_CONDUCT.md ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md └── notebooks ├── 00-create-parquet-file.ipynb ├── 01-load-into-single-table.ipynb ├── 02-load-into-partitioned-table.ipynb ├── 03a-parallel-switch-in-load-into-partitioned-table-many.ipynb ├── 03b-parallel-switch-in-load-into-partitioned-table-single.ipynb └── read-from-azure-sql ├── fast-read.ipynb └── push-down-queries.ipynb /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 4 | > Please provide us with the following information: 5 | > --------------------------------------------------------------- 6 | 7 | ### This issue is for a: (mark with an `x`) 8 | ``` 9 | - [ ] bug report -> please search issues before submitting 10 | - [ ] feature request 11 | - [ ] documentation issue or request 12 | - [ ] regression (a behavior that used to work and stopped in a new release) 13 | ``` 14 | 15 | ### Minimal steps to reproduce 16 | > 17 | 18 | ### Any log messages given by the failure 19 | > 20 | 21 | ### Expected/desired behavior 22 | > 23 | 24 | ### OS and Version? 25 | > Windows 7, 8 or 10. Linux (which distribution). macOS (Yosemite? El Capitan? Sierra?) 26 | 27 | ### Versions 28 | > 29 | 30 | ### Mention any other details that might be useful 31 | 32 | > --------------------------------------------------------------- 33 | > Thanks! We'll be in touch soon. 34 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Purpose 2 | 3 | * ... 4 | 5 | ## Does this introduce a breaking change? 6 | 7 | ``` 8 | [ ] Yes 9 | [ ] No 10 | ``` 11 | 12 | ## Pull Request Type 13 | What kind of change does this Pull Request introduce? 14 | 15 | 16 | ``` 17 | [ ] Bugfix 18 | [ ] Feature 19 | [ ] Code style update (formatting, local variables) 20 | [ ] Refactoring (no functional changes, no api changes) 21 | [ ] Documentation content changes 22 | [ ] Other... Please describe: 23 | ``` 24 | 25 | ## How to Test 26 | * Get the code 27 | 28 | ``` 29 | git clone [repo-address] 30 | cd [repo-name] 31 | git checkout [branch-name] 32 | npm install 33 | ``` 34 | 35 | * Test the code 36 | 37 | ``` 38 | ``` 39 | 40 | ## What to Check 41 | Verify that the following are valid 42 | * ... 43 | 44 | ## Other Information 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | *.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 | *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # TeamCity is a build add-in 131 | _TeamCity* 132 | 133 | # DotCover is a Code Coverage Tool 134 | *.dotCover 135 | 136 | # AxoCover is a Code Coverage Tool 137 | .axoCover/* 138 | !.axoCover/settings.json 139 | 140 | # Visual Studio code coverage results 141 | *.coverage 142 | *.coveragexml 143 | 144 | # NCrunch 145 | _NCrunch_* 146 | .*crunch*.local.xml 147 | nCrunchTemp_* 148 | 149 | # MightyMoose 150 | *.mm.* 151 | AutoTest.Net/ 152 | 153 | # Web workbench (sass) 154 | .sass-cache/ 155 | 156 | # Installshield output folder 157 | [Ee]xpress/ 158 | 159 | # DocProject is a documentation generator add-in 160 | DocProject/buildhelp/ 161 | DocProject/Help/*.HxT 162 | DocProject/Help/*.HxC 163 | DocProject/Help/*.hhc 164 | DocProject/Help/*.hhk 165 | DocProject/Help/*.hhp 166 | DocProject/Help/Html2 167 | DocProject/Help/html 168 | 169 | # Click-Once directory 170 | publish/ 171 | 172 | # Publish Web Output 173 | *.[Pp]ublish.xml 174 | *.azurePubxml 175 | # Note: Comment the next line if you want to checkin your web deploy settings, 176 | # but database connection strings (with potential passwords) will be unencrypted 177 | *.pubxml 178 | *.publishproj 179 | 180 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 181 | # checkin your Azure Web App publish settings, but sensitive information contained 182 | # in these scripts will be unencrypted 183 | PublishScripts/ 184 | 185 | # NuGet Packages 186 | *.nupkg 187 | # NuGet Symbol Packages 188 | *.snupkg 189 | # The packages folder can be ignored because of Package Restore 190 | **/[Pp]ackages/* 191 | # except build/, which is used as an MSBuild target. 192 | !**/[Pp]ackages/build/ 193 | # Uncomment if necessary however generally it will be regenerated when needed 194 | #!**/[Pp]ackages/repositories.config 195 | # NuGet v3's project.json files produces more ignorable files 196 | *.nuget.props 197 | *.nuget.targets 198 | 199 | # Microsoft Azure Build Output 200 | csx/ 201 | *.build.csdef 202 | 203 | # Microsoft Azure Emulator 204 | ecf/ 205 | rcf/ 206 | 207 | # Windows Store app package directories and files 208 | AppPackages/ 209 | BundleArtifacts/ 210 | Package.StoreAssociation.xml 211 | _pkginfo.txt 212 | *.appx 213 | *.appxbundle 214 | *.appxupload 215 | 216 | # Visual Studio cache files 217 | # files ending in .cache can be ignored 218 | *.[Cc]ache 219 | # but keep track of directories ending in .cache 220 | !?*.[Cc]ache/ 221 | 222 | # Others 223 | ClientBin/ 224 | ~$* 225 | *~ 226 | *.dbmdl 227 | *.dbproj.schemaview 228 | *.jfm 229 | *.pfx 230 | *.publishsettings 231 | orleans.codegen.cs 232 | 233 | # Including strong name files can present a security risk 234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 235 | #*.snk 236 | 237 | # Since there are multiple workflows, uncomment next line to ignore bower_components 238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 239 | #bower_components/ 240 | 241 | # RIA/Silverlight projects 242 | Generated_Code/ 243 | 244 | # Backup & report files from converting an old project file 245 | # to a newer Visual Studio version. Backup files are not needed, 246 | # because we have git ;-) 247 | _UpgradeReport_Files/ 248 | Backup*/ 249 | UpgradeLog*.XML 250 | UpgradeLog*.htm 251 | ServiceFabricBackup/ 252 | *.rptproj.bak 253 | 254 | # SQL Server files 255 | *.mdf 256 | *.ldf 257 | *.ndf 258 | 259 | # Business Intelligence projects 260 | *.rdl.data 261 | *.bim.layout 262 | *.bim_*.settings 263 | *.rptproj.rsuser 264 | *- [Bb]ackup.rdl 265 | *- [Bb]ackup ([0-9]).rdl 266 | *- [Bb]ackup ([0-9][0-9]).rdl 267 | 268 | # Microsoft Fakes 269 | FakesAssemblies/ 270 | 271 | # GhostDoc plugin setting file 272 | *.GhostDoc.xml 273 | 274 | # Node.js Tools for Visual Studio 275 | .ntvs_analysis.dat 276 | node_modules/ 277 | 278 | # Visual Studio 6 build log 279 | *.plg 280 | 281 | # Visual Studio 6 workspace options file 282 | *.opt 283 | 284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
285 | *.vbw 286 | 287 | # Visual Studio LightSwitch build output 288 | **/*.HTMLClient/GeneratedArtifacts 289 | **/*.DesktopClient/GeneratedArtifacts 290 | **/*.DesktopClient/ModelManifest.xml 291 | **/*.Server/GeneratedArtifacts 292 | **/*.Server/ModelManifest.xml 293 | _Pvt_Extensions 294 | 295 | # Paket dependency manager 296 | .paket/paket.exe 297 | paket-files/ 298 | 299 | # FAKE - F# Make 300 | .fake/ 301 | 302 | # CodeRush personal settings 303 | .cr/personal 304 | 305 | # Python Tools for Visual Studio (PTVS) 306 | __pycache__/ 307 | *.pyc 308 | 309 | # Cake - Uncomment if you are using it 310 | # tools/** 311 | # !tools/packages.config 312 | 313 | # Tabs Studio 314 | *.tss 315 | 316 | # Telerik's JustMock configuration file 317 | *.jmconfig 318 | 319 | # BizTalk build output 320 | *.btp.cs 321 | *.btm.cs 322 | *.odx.cs 323 | *.xsd.cs 324 | 325 | # OpenCover UI analysis results 326 | OpenCover/ 327 | 328 | # Azure Stream Analytics local run output 329 | ASALocalRun/ 330 | 331 | # MSBuild Binary and Structured Log 332 | *.binlog 333 | 334 | # NVidia Nsight GPU debugger configuration file 335 | *.nvuser 336 | 337 | # MFractors (Xamarin productivity tool) working folder 338 | .mfractor/ 339 | 340 | # Local History for Visual Studio 341 | .localhistory/ 342 | 343 | # BeatPulse healthcheck temp database 344 | healthchecksdb 345 | 346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 347 | MigrationBackup/ 348 | 349 | # Ionide (cross platform F# VS Code tools) working folder 350 | .ionide/ 351 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [project-title] Changelog 2 | 3 | 4 | # x.y.z (yyyy-mm-dd) 5 | 6 | *Features* 7 | * ... 8 | 9 | *Bug Fixes* 10 | * ... 11 | 12 | *Breaking Changes* 13 | * ... 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to [project-title] 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 6 | 7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 10 | 11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 14 | 15 | - [Code of Conduct](#coc) 16 | - [Issues and Bugs](#issue) 17 | - [Feature Requests](#feature) 18 | - [Submission Guidelines](#submit) 19 | 20 | ## Code of Conduct 21 | Help us keep this project open and inclusive. Please read and follow our [Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 22 | 23 | ## Found an Issue? 
24 | If you find a bug in the source code or a mistake in the documentation, you can help us by 25 | [submitting an issue](#submit-issue) to the GitHub Repository. Even better, you can 26 | [submit a Pull Request](#submit-pr) with a fix. 27 | 28 | ## Want a Feature? 29 | You can *request* a new feature by [submitting an issue](#submit-issue) to the GitHub 30 | Repository. If you would like to *implement* a new feature, please submit an issue with 31 | a proposal for your work first, to be sure that we can use it. 32 | 33 | * **Small Features** can be crafted and directly [submitted as a Pull Request](#submit-pr). 34 | 35 | ## Submission Guidelines 36 | 37 | ### Submitting an Issue 38 | Before you submit an issue, search the archive, maybe your question was already answered. 39 | 40 | If your issue appears to be a bug, and hasn't been reported, open a new issue. 41 | Help us to maximize the effort we can spend fixing issues and adding new 42 | features, by not reporting duplicate issues. Providing the following information will increase the 43 | chances of your issue being dealt with quickly: 44 | 45 | * **Overview of the Issue** - if an error is being thrown a non-minified stack trace helps 46 | * **Version** - what version is affected (e.g. 0.1.2) 47 | * **Motivation for or Use Case** - explain what are you trying to do and why the current behavior is a bug for you 48 | * **Browsers and Operating System** - is this a problem with all browsers? 49 | * **Reproduce the Error** - provide a live example or a unambiguous set of steps 50 | * **Related Issues** - has a similar issue been reported before? 51 | * **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be 52 | causing the problem (line of code or commit) 53 | 54 | You can file new issues by providing the above information at the corresponding repository's issues link: https://github.com/[organization-name]/[repository-name]/issues/new]. 55 | 56 | ### Submitting a Pull Request (PR) 57 | Before you submit your Pull Request (PR) consider the following guidelines: 58 | 59 | * Search the repository (https://github.com/[organization-name]/[repository-name]/pulls) for an open or closed PR 60 | that relates to your submission. You don't want to duplicate effort. 61 | 62 | * Make your changes in a new git fork: 63 | 64 | * Commit your changes using a descriptive commit message 65 | * Push your fork to GitHub: 66 | * In GitHub, create a pull request 67 | * If we suggest changes then: 68 | * Make the required updates. 69 | * Rebase your fork and force push to your GitHub repository (this will update your Pull Request): 70 | 71 | ```shell 72 | git rebase master -i 73 | git push -f 74 | ``` 75 | 76 | That's it! Thank you for your contribution! 77 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | page_type: sample 3 | languages: 4 | - tsql 5 | - sql 6 | - scala 7 | products: 8 | - azure 9 | - azure-databricks 10 | - azure-blob-storage 11 | - azure-key-vault 12 | - azure-sql-database 13 | description: "Fast Data Loading in Azure SQL DB using Azure Databricks" 14 | urlFragment: "azure-sql-db-databricks" 15 | --- 16 | 17 | # Fast Data Loading in Azure SQL DB using Azure Databricks 18 | 19 | ![License](https://img.shields.io/badge/license-MIT-green.svg) 20 | 21 | 28 | 29 | Azure Databricks and Azure SQL database can be used amazingly well together. This repo will help you to use the [latest connector](https://github.com/microsoft/sql-spark-connector) to load data into Azure SQL as fast as possible, using table partitions and column-store and all the known best-practices. 30 | 31 | - [Partitioned Tables and Indexes](https://docs.microsoft.com/en-us/sql/relational-databases/partitions/partitioned-tables-and-indexes) 32 | - [Columnstore indexes: Overview](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-overview) 33 | - [Columnstore indexes - Data loading guidance](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-data-loading-guidance) 34 | 35 | ## Samples 36 | 37 | All the samples start from a partitioned Parquet file, created with data generated from the famous TPC-H benchmark. Free tools are available on TPC-H website to generate a dataset with the size you want: 38 | 39 | http://www.tpc.org/tpch/ 40 | 41 | Once the Parquet file is available, 42 | 43 | - [Create Parquet File](./notebooks/00-create-parquet-file.ipynb) 44 | 45 | the samples will guide you through the most common scenarios 46 | 47 | - [Loading a non-partitioned table](./notebooks/01-load-into-single-table.ipynb) 48 | - [Loading a partitioned table](./notebooks/02-load-into-partitioned-table.ipynb) 49 | - [Loading a partitioned table via switch-in](./notebooks/03a-parallel-switch-in-load-into-partitioned-table-many.ipynb) 50 | 51 | all samples will also show how to correctly load table if there are already indexes or if you want to use a column-store in Azure SQL. 
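If you just want the shape of the write call that all the notebooks build on, here is a minimal sketch of the bulk-load pattern (Scala, on a Databricks cluster with the `com.microsoft.azure:spark-mssql-connector:1.0.0` library installed). The `<...>` values are placeholders to replace with your own; in the notebooks the real values are read from an Azure Key Vault-backed secret scope:

```scala
// Minimal sketch of the bulk-load pattern used throughout the notebooks.
// All <...> values are placeholders; the notebooks read the real ones from an
// Azure Key Vault-backed secret scope via dbutils.secrets.get(...).
val df  = spark.read.parquet("wasbs://<container>@<storage-account>.blob.core.windows.net/<path-to-parquet>")
val url = "jdbc:sqlserver://<server>.database.windows.net;databaseName=<database>;"

df.write
  .format("com.microsoft.sqlserver.jdbc.spark")   // sql-spark-connector data source
  .mode("overwrite")
  .option("truncate", "true")                     // keep the target table, just empty it
  .option("url", url)
  .option("dbtable", "dbo.LINEITEM_LOADTEST")
  .option("user", "<user>")
  .option("password", "<password>")
  .option("reliabilityLevel", "BEST_EFFORT")
  .option("tableLock", "true")                    // use "false" when loading partitioned or columnstore tables
  .option("batchsize", "100000")                  // use "1048576" when the target has a columnstore index
  .save()
```

The notebooks below walk through when to change `tableLock` and `batchsize` depending on the indexes and partitioning of the target table.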
52 | 53 | ### Bonus Samples: Reading data as fast as possible 54 | 55 | Though this repo focuses on writing data as fast as possible into Azure SQL, you may also want to know how to do the opposite: how to read data as fast as possible from Azure SQL into Apache Spark / Azure Databricks. For this reason, in the folder `notebooks/read-from-azure-sql` you will find two samples that show how to do exactly that: 56 | 57 | - [Fast Reading from Azure SQL](./notebooks/read-from-azure-sql/fast-read.ipynb) 58 | - [Pushing queries to Azure SQL](./notebooks/read-from-azure-sql/push-down-queries.ipynb) 59 | 60 | ## Contributing 61 | 62 | This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 63 | 64 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. 65 | 66 | This project has adopted the Microsoft Open Source Code of Conduct. For more information, see the Code of Conduct FAQ or contact opencode@microsoft.com with any additional questions or comments. 67 | -------------------------------------------------------------------------------- /notebooks/00-create-parquet-file.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# 00 - Create Parquet file used in subsequent samples\n\nThis notebook will create the parquet file used in subsequent samples"],"metadata":{}},{"cell_type":"markdown","source":["Define variables used throughout the script. Azure Key Vault has been used to securely store sensitive data. More info here: [Create an Azure Key Vault-backed secret scope](https://docs.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes#--create-an-azure-key-vault-backed-secret-scope)"],"metadata":{}},{"cell_type":"code","source":["val scope = \"key-vault-secrets\"\n\nval storageAccount = \"dmstore2\";\nval storageKey = dbutils.secrets.get(scope, \"dmstore2-2\");\n\nval parquetLocation = s\"wasbs://tpch@$storageAccount.blob.core.windows.net/10GB/parquet/lineitem\""],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
scope: String = key-vault-secrets\nstorageAccount: String = dmstore2\nstorageKey: String = [REDACTED]\nparquetLocation: String = wasbs://tpch@dmstore2.blob.core.windows.net/10GB/parquet/lineitem\n
"]}}],"execution_count":3},{"cell_type":"markdown","source":["Configure Spark to access Azure Blob Store"],"metadata":{}},{"cell_type":"code","source":["spark.conf.set(s\"fs.azure.account.key.$storageAccount.blob.core.windows.net\", storageKey);"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":5},{"cell_type":"markdown","source":["Load data from generated 10GB TPC-H LINEITEM file. Tools to generate TPC-H data can be found here:\nhttp://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp"],"metadata":{}},{"cell_type":"code","source":["import org.apache.spark.sql.types._\n\nval li1 = spark\n .read\n .format(\"csv\")\n .option(\"sep\", \"|\")\n .schema(\"\"\"\n L_ORDERKEY INTEGER,\n L_PARTKEY INTEGER,\n L_SUPPKEY INTEGER,\n L_LINENUMBER INTEGER,\n L_QUANTITY DECIMAL(15,2),\n L_EXTENDEDPRICE DECIMAL(15,2),\n L_DISCOUNT DECIMAL(15,2),\n L_TAX DECIMAL(15,2),\n L_RETURNFLAG CHAR(1),\n L_LINESTATUS CHAR(1),\n L_SHIPDATE DATE,\n L_COMMITDATE DATE,\n L_RECEIPTDATE DATE,\n L_SHIPINSTRUCT CHAR(25),\n L_SHIPMODE CHAR(10),\n L_COMMENT VARCHAR(44)\n \"\"\")\n .load(s\"wasbs://tpch@$storageAccount.blob.core.windows.net/10GB/lineitem.tbl\")\n;"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
import org.apache.spark.sql.types._\nli1: org.apache.spark.sql.DataFrame = [L_ORDERKEY: int, L_PARTKEY: int ... 14 more fields]\n
"]}}],"execution_count":7},{"cell_type":"markdown","source":["Create a temporary view to make it easier to manipulate schema and data"],"metadata":{}},{"cell_type":"code","source":["li1.createOrReplaceTempView(\"LINEITEM\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":9},{"cell_type":"markdown","source":["Add a new column that will be used for partitioning"],"metadata":{}},{"cell_type":"code","source":["var li2 = spark.sql(\"SELECT *, YEAR(L_COMMITDATE) * 100 + MONTH(L_COMMITDATE) AS L_PARTITION_KEY FROM LINEITEM\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
li2: org.apache.spark.sql.DataFrame = [L_ORDERKEY: int, L_PARTKEY: int ... 15 more fields]\n
"]}}],"execution_count":11},{"cell_type":"markdown","source":["Repartition data using the newly created column"],"metadata":{}},{"cell_type":"code","source":["val li3 = li2.repartition($\"L_PARTITION_KEY\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
li3: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [L_ORDERKEY: int, L_PARTKEY: int ... 15 more fields]\n
"]}}],"execution_count":13},{"cell_type":"markdown","source":["Save dataframe into parquet format, making sure parquet will be saved using the same partitioning logic used for the dataframe"],"metadata":{}},{"cell_type":"code","source":["li3.write.mode(\"overwrite\").partitionBy(\"L_PARTITION_KEY\").parquet(parquetLocation)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":15},{"cell_type":"markdown","source":["As a test read back the parquet files"],"metadata":{}},{"cell_type":"code","source":["val li4 = spark.read.parquet(parquetLocation)\nli4.rdd.getNumPartitions"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
li4: org.apache.spark.sql.DataFrame = [L_ORDERKEY: int, L_PARTKEY: int ... 15 more fields]\nres6: Int = 20\n
"]}}],"execution_count":17},{"cell_type":"markdown","source":["Peek at first 10 partitions"],"metadata":{}},{"cell_type":"code","source":["display(li4.groupBy($\"L_PARTITION_KEY\").count().orderBy($\"L_PARTITION_KEY\").limit(10))"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["
L_PARTITION_KEY | count
199201 | 412
199202 | 190252
199203 | 582150
199204 | 748645
199205 | 770266
199206 | 746006
199207 | 772006
199208 | 770213
199209 | 748997
199210 | 771256
"]}}],"execution_count":19}],"metadata":{"name":"00-create-parquet-file","notebookId":1331848450253195},"nbformat":4,"nbformat_minor":0} -------------------------------------------------------------------------------- /notebooks/01-load-into-single-table.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# 01 - Load data into an Azure SQL non-partitioned table\n\nThe sample is using the new sql-spark-connector (https://github.com/microsoft/sql-spark-connector). Make sure you have installed it before running this notebook. Maven Coordinates: `com.microsoft.azure:spark-mssql-connector:1.0.0`\n\n## Notes on terminology\n\nThe term \"row-store\" is used to identify and index that is not using the [column-store layout](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-overview) to store its data.\n\n## Samples\n\nIn this notebook there are three samples\n\n- Load data into a table without indexes\n- Load data into a table with row-store indexes\n- Load data into a table with columns-store indexes\n\n## Supported Azure Databricks Versions\n\nDatabricks supported versions: Spark 2.4.5 and Scala 2.11"],"metadata":{}},{"cell_type":"markdown","source":["## Setup"],"metadata":{}},{"cell_type":"markdown","source":["Define variables used thoughout the script. Azure Key Value has been used to securely store sensitive data. More info here: [Create an Azure Key Vault-backed secret scope](https://docs.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes#--create-an-azure-key-vault-backed-secret-scope)"],"metadata":{}},{"cell_type":"code","source":["val scope = \"key-vault-secrets\"\n\nval storageAccount = \"dmstore2\";\nval storageKey = dbutils.secrets.get(scope, \"dmstore2-2\");\n\nval server = dbutils.secrets.get(scope, \"srv001\").concat(\".database.windows.net\");\nval database = \"ApacheSpark\";\nval user = dbutils.secrets.get(scope, \"dbuser001\");\nval password = dbutils.secrets.get(scope, \"dbpwd001\");\nval table = \"dbo.LINEITEM_LOADTEST\"\n\nval url = s\"jdbc:sqlserver://$server;databaseName=$database;\""],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
scope: String = key-vault-secrets\nstorageAccount: String = dmstore2\nstorageKey: String = [REDACTED]\nserver: String = [REDACTED].database.windows.net\ndatabase: String = ApacheSpark\nuser: String = [REDACTED]\npassword: String = [REDACTED]\ntable: String = dbo.LINEITEM_LOADTEST\nurl: String = jdbc:sqlserver://[REDACTED].database.windows.net;databaseName=ApacheSpark;\n
"]}}],"execution_count":4},{"cell_type":"markdown","source":["Configure Spark to access Azure Blob Store"],"metadata":{}},{"cell_type":"code","source":["spark.conf.set(s\"fs.azure.account.key.$storageAccount.blob.core.windows.net\", storageKey);"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":6},{"cell_type":"markdown","source":["Load the Parquet file generated in `00-create-parquet-file` notebook that contains LINEITEM data partitioned by Year and Month"],"metadata":{}},{"cell_type":"code","source":["val li = spark.read.parquet(s\"wasbs://tpch@$storageAccount.blob.core.windows.net/10GB/parquet/lineitem\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
li: org.apache.spark.sql.DataFrame = [L_ORDERKEY: int, L_PARTKEY: int ... 15 more fields]\n
"]}}],"execution_count":8},{"cell_type":"markdown","source":["Loaded data is split in 20 dataframe partitions"],"metadata":{}},{"cell_type":"code","source":["li.rdd.getNumPartitions"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res2: Int = 20\n
"]}}],"execution_count":10},{"cell_type":"markdown","source":["Show schema of loaded data"],"metadata":{}},{"cell_type":"code","source":["li.printSchema"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
root\n-- L_ORDERKEY: integer (nullable = true)\n-- L_PARTKEY: integer (nullable = true)\n-- L_SUPPKEY: integer (nullable = true)\n-- L_LINENUMBER: integer (nullable = true)\n-- L_QUANTITY: decimal(15,2) (nullable = true)\n-- L_EXTENDEDPRICE: decimal(15,2) (nullable = true)\n-- L_DISCOUNT: decimal(15,2) (nullable = true)\n-- L_TAX: decimal(15,2) (nullable = true)\n-- L_RETURNFLAG: string (nullable = true)\n-- L_LINESTATUS: string (nullable = true)\n-- L_SHIPDATE: date (nullable = true)\n-- L_COMMITDATE: date (nullable = true)\n-- L_RECEIPTDATE: date (nullable = true)\n-- L_SHIPINSTRUCT: string (nullable = true)\n-- L_SHIPMODE: string (nullable = true)\n-- L_COMMENT: string (nullable = true)\n-- L_PARTITION_KEY: integer (nullable = true)\n\n
"]}}],"execution_count":12},{"cell_type":"markdown","source":["Now, make sure you create on your Azure SQL the following LINEITEM table:\n```sql\ncreate table [dbo].[LINEITEM_LOADTEST]\n(\n\t[L_ORDERKEY] [int] not null,\n\t[L_PARTKEY] [int] not null,\n\t[L_SUPPKEY] [int] not null,\n\t[L_LINENUMBER] [int] not null,\n\t[L_QUANTITY] [decimal](15, 2) not null,\n\t[L_EXTENDEDPRICE] [decimal](15, 2) not null,\n\t[L_DISCOUNT] [decimal](15, 2) not null,\n\t[L_TAX] [decimal](15, 2) not null,\n\t[L_RETURNFLAG] [char](1) not null,\n\t[L_LINESTATUS] [char](1) not null,\n\t[L_SHIPDATE] [date] not null,\n\t[L_COMMITDATE] [date] not null,\n\t[L_RECEIPTDATE] [date] not null,\n\t[L_SHIPINSTRUCT] [char](25) not null,\n\t[L_SHIPMODE] [char](10) not null,\n\t[L_COMMENT] [varchar](44) not null,\n\t[L_PARTITION_KEY] [int] not null\n) \n```"],"metadata":{}},{"cell_type":"markdown","source":["## Load data into a table with no indexes\n\nIn Azure SQL terminology an Heap is a table with no clustered index. In this sample we'll load data into a table that as no index (clustered or non-clustered) as is not partitioned. This is the simplest scenario possibile and allows parallel load of data.\n\n### Note:\nParallel load *cannot* happen if you have row-store indexes on the table. If you want to bulk load data in parallel into a table that has row-store indexes, you must use partitioning. If you are planning to add indexes to your table, and data to be loaded in the table is in the terabyte range, you want to use partitioing and have indexes created before bulk loading data into Azure SQL, as otherwise creating index once the table is already loaded will use a significat amout of resources."],"metadata":{}},{"cell_type":"markdown","source":["To enable parallel load the option `tableLock` must be set to `true`. This will prevent any other access to the table, other then the one done for performing the bulk load operations."],"metadata":{}},{"cell_type":"code","source":["li.write\n .format(\"com.microsoft.sqlserver.jdbc.spark\") \n .mode(\"overwrite\") \n .option(\"truncate\", \"true\") \n .option(\"url\", url) \n .option(\"dbtable\", \"dbo.LINEITEM_LOADTEST\") \n .option(\"user\", user) \n .option(\"password\", password) \n .option(\"reliabilityLevel\", \"BEST_EFFORT\") \n .option(\"tableLock\", \"true\") \n .option(\"batchsize\", \"100000\")\n .option(\"schemaCheckEnabled\", \"false\") // needed to avoid clash of NULLable columns vs NON-NULLable colums\n .save()"],"metadata":{},"outputs":[],"execution_count":16},{"cell_type":"markdown","source":["## Load data into a table with row-store indexes\n\nIf table is not partitioned, there are no options to bulk load data in parallel into the desired table. The only way to avoid locking and deadlocks is to load everything by serializing the bulk load operations. 
As you can expect, performance won't be optimal."],"metadata":{}},{"cell_type":"markdown","source":["Create the following indexes on the table:\n```sql\ncreate clustered index IXC on dbo.[LINEITEM_LOADTEST] ([L_COMMITDATE]);\n\ncreate unique nonclustered index IX1 on dbo.[LINEITEM_LOADTEST] ([L_ORDERKEY], [L_LINENUMBER]);\n\ncreate nonclustered index IX2 on dbo.[LINEITEM_LOADTEST] ([L_PARTKEY]); \n```"],"metadata":{}},{"cell_type":"markdown","source":["Load data by coalescing all dataframe partitions into just one"],"metadata":{}},{"cell_type":"code","source":["val url = s\"jdbc:sqlserver://$server;databaseName=$database;\"\n\nli.coalesce(1)\n .write\n .format(\"com.microsoft.sqlserver.jdbc.spark\") \n .mode(\"overwrite\") \n .option(\"truncate\", \"true\") \n .option(\"url\", url) \n .option(\"dbtable\", \"dbo.LINEITEM_LOADTEST\") \n .option(\"user\", user) \n .option(\"password\", password) \n .option(\"reliabilityLevel\", \"BEST_EFFORT\") \n .option(\"tableLock\", \"false\") \n .option(\"batchsize\", \"100000\")\n .option(\"schemaCheckEnabled\", \"false\")\n .save()"],"metadata":{},"outputs":[],"execution_count":20},{"cell_type":"markdown","source":["## Load data into a table with (only) column-store indexes\n\nIf a table has only column-store indexes, data load can happen in parallel, as there is no sorting needed."],"metadata":{}},{"cell_type":"markdown","source":["Empty the table if needed, to speed up index deletion:\n\n```sql\ntruncate table dbo.[LINEITEM_LOADTEST];\n```\n\nDrop the previously created indexes if needed:\n```sql\ndrop index IXC on dbo.[LINEITEM_LOADTEST];\ndrop index IX1 on dbo.[LINEITEM_LOADTEST];\ndrop index IX2 on dbo.[LINEITEM_LOADTEST];\n```\n\nAnd then create a clustered columnstore index:\n\n```sql\ncreate clustered columnstore index IXCCS on dbo.[LINEITEM_LOADTEST]\n```"],"metadata":{}},{"cell_type":"markdown","source":["Load data using [columnstore data loading best practices](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-data-loading-guidance), by loading 1048576 rows at a time, to land directly into a compressed segment. The `tableLock` option must be set to `false` to avoid a table lock that would prevent parallel load. Data will be loaded in parallel, using as many Apache Spark workers as are available."],"metadata":{}},{"cell_type":"code","source":["li.write \n .format(\"com.microsoft.sqlserver.jdbc.spark\") \n .mode(\"overwrite\") \n .option(\"truncate\", \"true\") \n .option(\"url\", url) \n .option(\"dbtable\", \"dbo.LINEITEM_LOADTEST\") \n .option(\"user\", user) \n .option(\"password\", password) \n .option(\"reliabilityLevel\", \"BEST_EFFORT\") \n .option(\"tableLock\", \"false\") \n .option(\"batchsize\", \"1048576\") \n .option(\"schemaCheckEnabled\", \"false\")\n .save()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":24},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":25}],"metadata":{"name":"01-load-into-single-table","notebookId":1331848450253174},"nbformat":4,"nbformat_minor":0} -------------------------------------------------------------------------------- /notebooks/02-load-into-partitioned-table.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# 02 - Load data into an Azure SQL partitioned table\n\nAzure SQL supports [table and index partitioning](https://docs.microsoft.com/en-us/sql/relational-databases/partitions/partitioned-tables-and-indexes). If a table is partitioned, data can be loaded in parallel without the need to put a lock on the entire table. In order to allow parallel partitions to be loaded, the source RDD/DataFrame/Dataset and the target Azure SQL table *MUST* have compatible partitions, which means that one RDD partition ends up exactly in one or more than one Azure SQL partitions, and those are not used by other RDD partitions.\n\nWhen table is partitioned, data *can* be bulk loaded in parallel also if there are indexes on the table. Especially on very large databases _this is the recommended approach_. The bulk load process will be a bit slower, but you'll not need to create indexes after having loaded the data. Creation of indexes on huge, already loaded, tables is a very expensive operation that you would like to avoid if possibile.\n\nThe sample is using the new sql-spark-connector (https://github.com/microsoft/sql-spark-connector). Make sure to have it installed in the cluster before running the notebook.\n\n## Dataframe and Azure SQL partitions\n\nBoth Azure SQL and Azure Databricks (more specifically, Spark, and even more specifically a Spark Dataframe) are able to use take advantage of partitioning to more easily deal with large amounts of data. Partitions allow to work on subset of data, and usually you can do such work in parallel, spreading the workload on several CPU and/or nodes.\n\n## Notes on terminology\n\nThe term \"row-store\" is used to identify and index that is not using the [column-store layout](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-overview) to store its data.\n\n## Samples\n\nIn this notebook there are two samples\n\n- Load data into a partitioned table with row-store indexes\n- Load data into a partitioned table with columns-store indexes\n\n## Supported Azure Databricks Versions\n\nDatabricks supported versions: Spark 2.4.5 and Scala 2.11"],"metadata":{}},{"cell_type":"markdown","source":["## Setup"],"metadata":{}},{"cell_type":"markdown","source":["Define variables used thoughout the script. Azure Key Value has been used to securely store sensitive data. 
More info here: [Create an Azure Key Vault-backed secret scope](https://docs.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes#--create-an-azure-key-vault-backed-secret-scope)"],"metadata":{}},{"cell_type":"code","source":["val scope = \"key-vault-secrets\"\n\nval storageAccount = \"dmstore2\";\nval storageKey = dbutils.secrets.get(scope, \"dmstore2-2\");\n\nval server = dbutils.secrets.get(scope, \"srv001\").concat(\".database.windows.net\");\nval database = \"ApacheSpark\";\nval user = dbutils.secrets.get(scope, \"dbuser001\");\nval password = dbutils.secrets.get(scope, \"dbpwd001\");\n\nval url = s\"jdbc:sqlserver://$server;databaseName=$database;\""],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
scope: String = key-vault-secrets\nstorageAccount: String = dmstore2\nstorageKey: String = [REDACTED]\nserver: String = [REDACTED].database.windows.net\ndatabase: String = ApacheSpark\nuser: String = [REDACTED]\npassword: String = [REDACTED]\nurl: String = jdbc:sqlserver://[REDACTED].database.windows.net;databaseName=ApacheSpark;\n
"]}}],"execution_count":4},{"cell_type":"markdown","source":["Configure Spark to access Azure Blob Store"],"metadata":{}},{"cell_type":"code","source":["spark.conf.set(s\"fs.azure.account.key.$storageAccount.blob.core.windows.net\", storageKey);"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":6},{"cell_type":"markdown","source":["Load the Parquet file generated in `00-create-parquet-file` notebook that contains LINEITEM data partitioned by Year and Month"],"metadata":{}},{"cell_type":"code","source":["val li = spark.read.parquet(s\"wasbs://tpch@$storageAccount.blob.core.windows.net/10GB/parquet/lineitem\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
li: org.apache.spark.sql.DataFrame = [L_ORDERKEY: int, L_PARTKEY: int ... 15 more fields]\n
"]}}],"execution_count":8},{"cell_type":"markdown","source":["Loaded data is split in 20 dataframe partitions"],"metadata":{}},{"cell_type":"code","source":["li.rdd.getNumPartitions"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res2: Int = 20\n
"]}}],"execution_count":10},{"cell_type":"markdown","source":["Show schema of loaded data"],"metadata":{}},{"cell_type":"code","source":["li.printSchema"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
root\n-- L_ORDERKEY: integer (nullable = true)\n-- L_PARTKEY: integer (nullable = true)\n-- L_SUPPKEY: integer (nullable = true)\n-- L_LINENUMBER: integer (nullable = true)\n-- L_QUANTITY: decimal(15,2) (nullable = true)\n-- L_EXTENDEDPRICE: decimal(15,2) (nullable = true)\n-- L_DISCOUNT: decimal(15,2) (nullable = true)\n-- L_TAX: decimal(15,2) (nullable = true)\n-- L_RETURNFLAG: string (nullable = true)\n-- L_LINESTATUS: string (nullable = true)\n-- L_SHIPDATE: date (nullable = true)\n-- L_COMMITDATE: date (nullable = true)\n-- L_RECEIPTDATE: date (nullable = true)\n-- L_SHIPINSTRUCT: string (nullable = true)\n-- L_SHIPMODE: string (nullable = true)\n-- L_COMMENT: string (nullable = true)\n-- L_PARTITION_KEY: integer (nullable = true)\n\n
"]}}],"execution_count":12},{"cell_type":"markdown","source":["Make sure you create on your Azure SQL the following `LINEITEM` table, partitioned by `L_PARTITION_KEY`:\n\n```sql\ncreate partition function pf_LINEITEM(int)\nas range left for values \n(\n\t199201,199202,199203,199204,199205,199206,199207,199208,199209,199210,199211,199212,\n\t199301,199302,199303,199304,199305,199306,199307,199308,199309,199310,199311,199312,\n\t199401,199402,199403,199404,199405,199406,199407,199408,199409,199410,199411,199412,\n\t199501,199502,199503,199504,199505,199506,199507,199508,199509,199510,199511,199512,\n\t199601,199602,199603,199604,199605,199606,199607,199608,199609,199610,199611,199612,\n\t199701,199702,199703,199704,199705,199706,199707,199708,199709,199710,199711,199712,\n\t199801,199802,199803,199804,199805,199806,199807,199808,199809,199810\n);\n\ncreate partition scheme ps_LINEITEM\nas partition pf_LINEITEM\nall to ([Primary])\n;\n\ndrop table if exists [dbo].[LINEITEM_LOADTEST];\ncreate table [dbo].[LINEITEM_LOADTEST]\n(\n\t[L_ORDERKEY] [int] not null,\n\t[L_PARTKEY] [int] not null,\n\t[L_SUPPKEY] [int] not null,\n\t[L_LINENUMBER] [int] not null,\n\t[L_QUANTITY] [decimal](15, 2) not null,\n\t[L_EXTENDEDPRICE] [decimal](15, 2) not null,\n\t[L_DISCOUNT] [decimal](15, 2) not null,\n\t[L_TAX] [decimal](15, 2) not null,\n\t[L_RETURNFLAG] [char](1) not null,\n\t[L_LINESTATUS] [char](1) not null,\n\t[L_SHIPDATE] [date] not null,\n\t[L_COMMITDATE] [date] not null,\n\t[L_RECEIPTDATE] [date] not null,\n\t[L_SHIPINSTRUCT] [char](25) not null,\n\t[L_SHIPMODE] [char](10) not null,\n\t[L_COMMENT] [varchar](44) not null,\n\t[L_PARTITION_KEY] [int] not null\n) on ps_LINEITEM([L_PARTITION_KEY])\n```"],"metadata":{}},{"cell_type":"markdown","source":["You can check that Azure SQL table is partitioned by running the following T-SQL command:\n\n```sql\nSELECT\n schema_name(t.schema_id) as [schema_name],\n t.[name] as table_name,\n i.[name] as index_name,\n ps.[partition_id],\n ps.partition_number,\n p.data_compression_desc,\n i.[type_desc], \n ps.row_count,\n (ps.used_page_count * 8.) / 1024. / 1024. as size_in_gb\nfrom\n sys.dm_db_partition_stats as ps \ninner join \n sys.partitions as p on ps.partition_id = p.partition_id\ninner join\n sys.tables as t on t.object_id = ps.object_id\ninner join\n sys.indexes as i on ps.object_id = i.object_id and ps.index_id = i.index_id\nwhere\n t.[name] = 'LINEITEM_LOADTEST' and t.[schema_id] = schema_id('dbo')\norder by\n [schema_name], table_name, index_name, partition_number\n```"],"metadata":{}},{"cell_type":"markdown","source":["## Load data into a partitioned table with row-store indexes"],"metadata":{}},{"cell_type":"markdown","source":["On the target table create the Clustered Index and a couple of Non-Clustered Index. In order to allow parallel partitioned load, also indexes must use the same partitioning function used by the table\n\n```sql\ncreate clustered index IXC on dbo.[LINEITEM_LOADTEST] ([L_COMMITDATE]) \non ps_LINEITEM([L_PARTITION_KEY]);\n\ncreate unique nonclustered index IX1 on dbo.[LINEITEM_LOADTEST] ([L_ORDERKEY], [L_LINENUMBER], [L_PARTITION_KEY]) \non ps_LINEITEM([L_PARTITION_KEY]);\n\ncreate nonclustered index IX2 on dbo.[LINEITEM_LOADTEST] ([L_PARTKEY], [L_PARTITION_KEY]) \non ps_LINEITEM([L_PARTITION_KEY]);\n```"],"metadata":{}},{"cell_type":"markdown","source":["As DataFrame and Azure SQL Table are both partitioned by `L_PARTITION_KEY`, there isn't much left to do and the connector will take care of everything for us. 
`tableLock` must be set to `false` to avoid a table lock that would prevent parallel partitioned load. Thanks to partitions, acquired locks will not interfere with each other."],"metadata":{}},{"cell_type":"code","source":["li.write \n .format(\"com.microsoft.sqlserver.jdbc.spark\") \n .mode(\"overwrite\") \n .option(\"truncate\", \"true\") \n .option(\"url\", url) \n .option(\"dbtable\", \"dbo.LINEITEM_LOADTEST\") \n .option(\"user\", user) \n .option(\"password\", password) \n .option(\"reliabilityLevel\", \"BEST_EFFORT\") \n .option(\"tableLock\", \"false\") \n .option(\"batchsize\", \"100000\") \n .option(\"schemaCheckEnabled\", \"false\") // needed to avoid clash of NULLable columns vs NON-NULLable columns\n .save()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":18},{"cell_type":"markdown","source":["## Load data into a partitioned table with column-store index"],"metadata":{}},{"cell_type":"markdown","source":["Empty the test table if needed, to speed up index deletion\n\n```sql\ntruncate table dbo.[LINEITEM_LOADTEST];\n```\n\nDrop the previously create indexes if needed:\n```sql\ndrop index IXC on dbo.[LINEITEM_LOADTEST];\ndrop index IX1 on dbo.[LINEITEM_LOADTEST];\ndrop index IX2 on dbo.[LINEITEM_LOADTEST];\n```\n\nAnd then create a clustered columnstore index:\n\n```sql\ncreate clustered columnstore index IXCCS on dbo.[LINEITEM_LOADTEST]\non ps_LINEITEM([L_PARTITION_KEY]);\n```"],"metadata":{}},{"cell_type":"markdown","source":["Load data using [columnstore data loading best pratices](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-data-loading-guidance), by loading 1048576 rows at time, to land directly into a compressed segment. Locking the table must be set to `false` to avoid locking. Data with be loaded in parallel, using as many as Apache Spark workers are available."],"metadata":{}},{"cell_type":"code","source":["li.write \n .format(\"com.microsoft.sqlserver.jdbc.spark\") \n .mode(\"overwrite\") \n .option(\"truncate\", \"true\") \n .option(\"url\", url) \n .option(\"dbtable\", \"dbo.LINEITEM_LOADTEST\") \n .option(\"user\", user) \n .option(\"password\", password) \n .option(\"reliabilityLevel\", \"BEST_EFFORT\") \n .option(\"tableLock\", \"false\") \n .option(\"batchsize\", \"1048576\") \n .option(\"schemaCheckEnabled\", \"false\")\n .save()"],"metadata":{},"outputs":[],"execution_count":22}],"metadata":{"name":"02-load-into-partitioned-table","notebookId":1536696850337469},"nbformat":4,"nbformat_minor":0} -------------------------------------------------------------------------------- /notebooks/03a-parallel-switch-in-load-into-partitioned-table-many.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# 03a - Parallel Switch-In Load Into Partitioned Table\n\nIf you have to load data into a table that is also actively used by users, you cannot just run a bulk copy operation on such table. If you plan to use `tableLock` option, users will not be able to access data for the whole duration of the bulk load; even if you don't plan to use `tableLock` option, a bulk load operation will still impact and interfere with conccurrent operations running on the table partition.\n\nTo get more details on partitioning, take a look at the `02-load-into-partitioned-table` notebook.\n\nThe solution to be able to bulk load data and at the same time have the table usable by applications and users is simple: load another table instead, and then \"switch-in\" that table into the target one. More details on this pattern can be found in [this post](https://www.cathrinewilhelmsen.net/2015/04/19/table-partitioning-in-sql-server-partition-switching/) written by the Data Platform MVP Cathrine Wilhelmsen. \n\nBeside improving concurrency during bulk load operation, you also have another benefit that can be very useful. When not using the switch-in ability just discussed, it's usually better to load the table with indexes already created, as for very big tables, creating an index can completely drain all the resources avaiable to your Azure SQL database. 
By using this tecnique you are actually using a \"divide-et-impera\" approach, so that you can load data into a staging table with no indexes, where you'll have the best load performance possible, and then create the needed index later, with much lower impact on resources. The lower resource impact is due to the fact that you are only load data that will go into a single partition, not the whole table, and thus should be smaller and much more manageable. By repeating this process for all partitions you need to load, you can load data without impacting to much on Azure SQL resources and thus query performances.\n\nDue to the fact that Apache Spark RDD partitions and Azure SQL partitions are in a 1:N relationship, is not possibile for the Azure SQL Connector to easily determine which staging table should be used and how to do the switch-in. Luckily we can do this operation manually, using a [well documented technique](https://docs.databricks.com/notebooks/notebook-workflows.html), helping Apache Spark to maximize parallelism to load Azure SQL partitions.\n\nThe sample is using the new sql-spark-connector (https://github.com/microsoft/sql-spark-connector). Make sure you have installed it before running this notebook.\n\n## Notes on terminology\n\nThe term \"row-store\" is used to identify and index that is not using the [column-store layout](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-overview) to store its data.\n\n## Sample\n\nThis notebook is used to parallelize the work done by another notebook (`03b-parallel-switch-in-load-into-partitioned-table-single.ipynb`), that is actually the one loading the data into a staging table via bulk copy and than doing the switch-in operation. \n\n## Supported Azure Databricks Versions\n\nDatabricks supported versions: Spark 2.4.5 and Scala 2.11"],"metadata":{}},{"cell_type":"markdown","source":["# Create Target Table\nCreate table and its indexes"],"metadata":{}},{"cell_type":"markdown","source":["Make sure you create on your Azure SQL the following `LINEITEM` table, partitioned by `L_PARTITION_KEY`:\n\n```sql\ncreate partition function pf_LINEITEM(int)\nas range left for values \n(\n\t199201,199202,199203,199204,199205,199206,199207,199208,199209,199210,199211,199212,\n\t199301,199302,199303,199304,199305,199306,199307,199308,199309,199310,199311,199312,\n\t199401,199402,199403,199404,199405,199406,199407,199408,199409,199410,199411,199412,\n\t199501,199502,199503,199504,199505,199506,199507,199508,199509,199510,199511,199512,\n\t199601,199602,199603,199604,199605,199606,199607,199608,199609,199610,199611,199612,\n\t199701,199702,199703,199704,199705,199706,199707,199708,199709,199710,199711,199712,\n\t199801,199802,199803,199804,199805,199806,199807,199808,199809,199810\n);\n\ncreate partition scheme ps_LINEITEM\nas partition pf_LINEITEM\nall to ([Primary])\n;\n\ncreate table [dbo].[LINEITEM_LOADTEST]\n(\n\t[L_ORDERKEY] [int] not null,\n\t[L_PARTKEY] [int] not null,\n\t[L_SUPPKEY] [int] not null,\n\t[L_LINENUMBER] [int] not null,\n\t[L_QUANTITY] [decimal](15, 2) not null,\n\t[L_EXTENDEDPRICE] [decimal](15, 2) not null,\n\t[L_DISCOUNT] [decimal](15, 2) not null,\n\t[L_TAX] [decimal](15, 2) not null,\n\t[L_RETURNFLAG] [char](1) not null,\n\t[L_LINESTATUS] [char](1) not null,\n\t[L_SHIPDATE] [date] not null,\n\t[L_COMMITDATE] [date] not null,\n\t[L_RECEIPTDATE] [date] not null,\n\t[L_SHIPINSTRUCT] [char](25) not null,\n\t[L_SHIPMODE] [char](10) not null,\n\t[L_COMMENT] [varchar](44) not null,\n\t[L_PARTITION_KEY] 
[int] not null\n) on ps_LINEITEM([L_PARTITION_KEY])\n;\n\ncreate clustered index IXC on dbo.[LINEITEM_LOADTEST] ([L_COMMITDATE]) \non ps_LINEITEM([L_PARTITION_KEY]);\n\ncreate unique nonclustered index IX1 on dbo.[LINEITEM_LOADTEST] ([L_ORDERKEY], [L_LINENUMBER], [L_PARTITION_KEY]) \non ps_LINEITEM([L_PARTITION_KEY]);\n\ncreate nonclustered index IX2 on dbo.[LINEITEM_LOADTEST] ([L_PARTKEY], [L_PARTITION_KEY]) \non ps_LINEITEM([L_PARTITION_KEY]);\n```"],"metadata":{}},{"cell_type":"markdown","source":["## Create support function\nTo be able to execute a switch-in load, parallel load must be managed manually, as T-SQL code must be execute before and after each Azure SQL partition (not Dataframe partition! Remember that a Dataframe partition can target multiple Azure SQL partitions) has been loaded bia bulk load operation. By using the [tecnique explained in the official Databricks documentation](https://docs.databricks.com/notebooks/notebook-workflows.html#api) it is possibile to execute a notebook in parallel, by implementing the following function."],"metadata":{}},{"cell_type":"code","source":["import scala.concurrent.{Future, Await}\nimport scala.concurrent.duration._\nimport scala.util.control.NonFatal\n\ncase class NotebookData(path: String, timeout: Int, parameters: Map[String, String] = Map.empty[String, String])\n\ndef parallelNotebooks(notebooks: Seq[NotebookData]): Future[Seq[String]] = {\n import scala.concurrent.{Future, blocking, Await}\n import java.util.concurrent.Executors\n import scala.concurrent.ExecutionContext\n import com.databricks.WorkflowException\n\n val numNotebooksInParallel = 4 \n // If you create too many notebooks in parallel the driver may crash when you submit all of the jobs at once. \n // This code limits the number of parallel notebooks.\n implicit val ec = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(numNotebooksInParallel))\n val ctx = dbutils.notebook.getContext()\n \n Future.sequence(\n notebooks.map { notebook => \n Future {\n dbutils.notebook.setContext(ctx)\n if (notebook.parameters.nonEmpty)\n dbutils.notebook.run(notebook.path, notebook.timeout, notebook.parameters)\n else\n dbutils.notebook.run(notebook.path, notebook.timeout)\n }\n .recover {\n case NonFatal(e) => s\"ERROR: ${e.getMessage}\"\n }\n }\n )\n}"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
import scala.concurrent.{Future, Await}\nimport scala.concurrent.duration._\nimport scala.util.control.NonFatal\ndefined class NotebookData\nparallelNotebooks: (notebooks: Seq[NotebookData])scala.concurrent.Future[Seq[String]]\n
"]}}],"execution_count":5},{"cell_type":"markdown","source":["## Run Parallel Load"],"metadata":{}},{"cell_type":"markdown","source":["Create a Sequence with Azure SQL partitions to be loaded is stored"],"metadata":{}},{"cell_type":"code","source":["import spark.implicits._\nimport org.apache.spark.sql._\n\ncase class partitionToProcess(partitionKey:Int)\n\nval ptp = Seq(\n partitionToProcess(199702),\n partitionToProcess(199703),\n partitionToProcess(199704),\n partitionToProcess(199706),\n partitionToProcess(199707),\n partitionToProcess(199708)\n)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
import spark.implicits._\nimport org.apache.spark.sql._\ndefined class partitionToProcess\nptp: Seq[partitionToProcess] = List(partitionToProcess(199702), partitionToProcess(199703), partitionToProcess(199704), partitionToProcess(199706), partitionToProcess(199707), partitionToProcess(199708))\n
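The `ptp` sequence above hard-codes the partition keys to load. For larger ranges it can be convenient to generate the keys instead of listing them by hand. The following is a minimal sketch, assuming the same `yyyyMM` integer key format used by `pf_LINEITEM`; the helper `monthKeys` is hypothetical and not part of the original notebook:

```scala
// Minimal sketch (assumption: partition keys use the yyyyMM integer format of pf_LINEITEM).
// Generates the keys for a contiguous range of months instead of hard-coding them.
def monthKeys(fromYear: Int, fromMonth: Int, toYear: Int, toMonth: Int): Seq[Int] =
  (fromYear * 12 + (fromMonth - 1) to toYear * 12 + (toMonth - 1))
    .map(m => (m / 12) * 100 + (m % 12) + 1)

// Contiguous range 199702..199708; the notebook's ptp above intentionally picks specific keys instead.
val ptpGenerated = monthKeys(1997, 2, 1997, 8).map(k => partitionToProcess(k))
```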
"]}}],"execution_count":8},{"cell_type":"markdown","source":["Execute in parallel several instances of the notebook that load a specific partition, using a different partition key for each instance"],"metadata":{}},{"cell_type":"code","source":["import scala.concurrent.Await\nimport scala.concurrent.duration._\nimport scala.language.postfixOps\n\nval timeOut = 600 // seconds\nval ipynb = \"./03b-parallel-switch-in-load-into-partitioned-table-single\"\n\nval notebooks = ptp.map(p => NotebookData(ipynb, timeOut, Map(\"partitionKey\" -> p.partitionKey.toString)))"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
import scala.concurrent.Await\nimport scala.concurrent.duration._\nimport scala.language.postfixOps\ntimeOut: Int = 600\nipynb: String = ./03b-parallel-switch-in-load-into-partitioned-table-single\nnotebooks: Seq[NotebookData] = List(NotebookData(./03b-parallel-switch-in-load-into-partitioned-table-single,600,Map(partitionKey -> 199702)), NotebookData(./03b-parallel-switch-in-load-into-partitioned-table-single,600,Map(partitionKey -> 199703)), NotebookData(./03b-parallel-switch-in-load-into-partitioned-table-single,600,Map(partitionKey -> 199704)), NotebookData(./03b-parallel-switch-in-load-into-partitioned-table-single,600,Map(partitionKey -> 199706)), NotebookData(./03b-parallel-switch-in-load-into-partitioned-table-single,600,Map(partitionKey -> 199707)), NotebookData(./03b-parallel-switch-in-load-into-partitioned-table-single,600,Map(partitionKey -> 199708)))\n
"]}}],"execution_count":10},{"cell_type":"code","source":["val res = parallelNotebooks(notebooks)\n\nAwait.result(res, (timeOut * ptp.size seconds)) // this is a blocking call.\n\nres.value"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res: scala.concurrent.Future[Seq[String]] = Future(Success(List(199702, 199703, 199704, 199706, 199707, 199708)))\nres3: Option[scala.util.Try[Seq[String]]] = Some(Success(List(199702, 199703, 199704, 199706, 199707, 199708)))\n
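Because `parallelNotebooks` maps a failed notebook run to a string starting with `ERROR: ` (see the `recover` block above), the collected results can be split into successful and failed partition loads. A minimal sketch, reusing the `res`, `timeOut` and `ptp` values defined above:

```scala
// Minimal sketch: separate successful partition loads from failures, based on the
// "ERROR: ..." strings produced by the recover block in parallelNotebooks.
import scala.concurrent.Await
import scala.concurrent.duration._

val results = Await.result(res, (timeOut * ptp.size).seconds)
val (failed, succeeded) = results.partition(_.startsWith("ERROR:"))

println(s"Loaded partitions: ${succeeded.mkString(", ")}")
if (failed.nonEmpty) println(s"Failed runs:\n${failed.mkString("\n")}")
```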
"]}}],"execution_count":11},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":12}],"metadata":{"name":"03a-parallel-switch-in-load-into-partitioned-table-many","notebookId":964636935775876},"nbformat":4,"nbformat_minor":0} -------------------------------------------------------------------------------- /notebooks/03b-parallel-switch-in-load-into-partitioned-table-single.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# 03b - Parallel Switch-In Load Into Partitioned Table - Sigle Partition Load\n\nThis notebook will bulk load data into exactly one Azure SQL partition. It accepts a Partition Key as a parameter, and that value will be used to load all data that belongs to that partition. In this sample column used to partition data is the `L_PARTITION_KEY` column, which is an integer, so the provided partition key *must be* an integer too.\n\nData is not loaded directly into the selected partition, but a staging table is created, loaded and then switched into the target table, becoming the desired partition.\n\nMore info on this switch-in technique can be found in the related notebook: `03a-parallel-switch-in-load-into-partitioned-table-many`\n\nThe sample is using the new sql-spark-connector (https://github.com/microsoft/sql-spark-connector). Make sure you have installed it before running this notebook.\n\n## Notes on terminology\n\nThe term \"row-store\" is used to identify and index that is not using the [column-store layout](https://docs.microsoft.com/en-us/sql/relational-databases/indexes/columnstore-indexes-overview) to store its data.\n\n## Sample\n\nThis notebook is used to load exactly on partition of a partitioned table by loading a staging table and then switching it in into the target table. The process is the following:\n\n- Create a staging table\n- Load staging table\n- Create indexes\n- Create check constraints\n- Execute switch-in operation\n\nMore details on this pattern can be found in [this post](https://www.cathrinewilhelmsen.net/2015/04/19/table-partitioning-in-sql-server-partition-switching/) written by the Data Platform MVP Cathrine Wilhelmsen. \n\n## Supported Azure Databricks Versions\n\nDatabricks supported versions: Spark 2.4.5 and Scala 2.11"],"metadata":{}},{"cell_type":"markdown","source":["## Setup"],"metadata":{}},{"cell_type":"markdown","source":["Define notebook parameter:"],"metadata":{}},{"cell_type":"code","source":["dbutils.widgets.text(\"partitionKey\", \"0\", \"Partition Key\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":4},{"cell_type":"markdown","source":["Define variables used thoughout the script. Azure Key Value has been used to securely store sensitive data. More info here: [Create an Azure Key Vault-backed secret scope](https://docs.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes#--create-an-azure-key-vault-backed-secret-scope)"],"metadata":{}},{"cell_type":"code","source":["val partitionKey = dbutils.widgets.get(\"partitionKey\").toInt\nval prevPartitionKey = partitionKey\n\nval scope = \"key-vault-secrets\"\n\nval storageAccount = \"dmstore2\";\nval storageKey = dbutils.secrets.get(scope, \"dmstore2-2\");\n\nval server = dbutils.secrets.get(scope, \"srv001\").concat(\".database.windows.net\");\nval database = \"ApacheSpark\";\nval user = dbutils.secrets.get(scope, \"dbuser001\");\nval password = dbutils.secrets.get(scope, \"dbpwd001\");\nval table = \"dbo.LINEITEM_LOADTEST\"\n\nval url = s\"jdbc:sqlserver://$server;databaseName=$database;\""],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
partitionKey: Int = 199810\nprevPartitionKey: Int = 199810\nscope: String = key-vault-secrets\nstorageAccount: String = dmstore2\nstorageKey: String = [REDACTED]\nserver: String = [REDACTED].database.windows.net\ndatabase: String = ApacheSpark\nuser: String = [REDACTED]\npassword: String = [REDACTED]\ntable: String = dbo.LINEITEM_LOADTEST\nurl: String = jdbc:sqlserver://[REDACTED].database.windows.net;databaseName=ApacheSpark;\n
"]}}],"execution_count":6},{"cell_type":"markdown","source":["Configure Spark to access Azure Blob Store"],"metadata":{}},{"cell_type":"code","source":["spark.conf.set(s\"fs.azure.account.key.$storageAccount.blob.core.windows.net\", storageKey);"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":8},{"cell_type":"markdown","source":["Load the Parquet file generated in `00-create-parquet-file` notebook that contains LINEITEM data partitioned by Year and Month. Make sure only the specified partion is loaded"],"metadata":{}},{"cell_type":"code","source":["val li = spark\n .read\n .parquet(s\"wasbs://tpch@$storageAccount.blob.core.windows.net/10GB/parquet/lineitem\")\n .filter($\"L_PARTITION_KEY\" === partitionKey)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
li: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [L_ORDERKEY: int, L_PARTKEY: int ... 15 more fields]\n
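To double-check that only the requested partition's data will be read (rather than the full 10GB LINEITEM dataset), the physical plan can be inspected. A minimal sketch; whether the filter shows up under `PartitionFilters` or `PushedFilters` depends on how the Parquet data was written in `00-create-parquet-file`:

```scala
// Minimal sketch: verify that the filter on L_PARTITION_KEY is applied at scan time,
// so only the requested partition key is read from the Parquet source.
li.explain(true)

// Optional sanity check before the bulk load (this triggers a Spark job):
// println(s"Rows to load for partition $partitionKey: ${li.count()}")
```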
"]}}],"execution_count":10},{"cell_type":"markdown","source":["Create the T-SQL script need to extract information on the partition that will be loaded into Azure SQL"],"metadata":{}},{"cell_type":"code","source":["val sqlPartitionValueInfo = \ns\"\"\"\nSELECT\n\t*\nFROM\n(\n\tSELECT\n\t\tprv.[boundary_id] AS partitionId,\n\t\tCAST(prv.[value] AS INT) AS [value],\n\t\tCAST(LAG(prv.[value]) OVER (ORDER BY prv.[boundary_id]) AS INT) AS [prevValue],\n\t\tCAST(LEAD(prv.[value]) OVER (ORDER BY prv.[boundary_id]) AS INT) AS [nextValue]\n\tFROM\n\t\tsys.[indexes] i\n\tINNER JOIN\n\t\tsys.[data_spaces] dp ON i.[data_space_id] = dp.[data_space_id]\n\tINNER JOIN\n\t\tsys.[partition_schemes] ps ON dp.[data_space_id] = ps.[data_space_id]\n\tINNER JOIN\n\t\tsys.[partition_range_values] prv ON [prv].[function_id] = [ps].[function_id]\n\tWHERE\n\t\ti.[object_id] = OBJECT_ID('${table}')\n\tAND\n\t\ti.[index_id] IN (0,1)\n) AS [pi]\nWHERE\n\t[value] = ${partitionKey}\n\"\"\""],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
sqlPartitionValueInfo: String =\n"\nSELECT\n\t*\nFROM\n(\n\tSELECT\n\t\tprv.[boundary_id] AS partitionId,\n\t\tCAST(prv.[value] AS INT) AS [value],\n\t\tCAST(LAG(prv.[value]) OVER (ORDER BY prv.[boundary_id]) AS INT) AS [prevValue],\n\t\tCAST(LEAD(prv.[value]) OVER (ORDER BY prv.[boundary_id]) AS INT) AS [nextValue]\n\tFROM\n\t\tsys.[indexes] i\n\tINNER JOIN\n\t\tsys.[data_spaces] dp ON i.[data_space_id] = dp.[data_space_id]\n\tINNER JOIN\n\t\tsys.[partition_schemes] ps ON dp.[data_space_id] = ps.[data_space_id]\n\tINNER JOIN\n\t\tsys.[partition_range_values] prv ON [prv].[function_id] = [ps].[function_id]\n\tWHERE\n\t\ti.[object_id] = OBJECT_ID('dbo.LINEITEM_LOADTEST')\n\tAND\n\t\ti.[index_id] IN (0,1)\n) AS [pi]\nWHERE\n\t[value] = 199810\n"\n
"]}}],"execution_count":12},{"cell_type":"markdown","source":["Setup JDBC connection, needed to execute ad-hoc T-SQL statement on Azure SQL"],"metadata":{}},{"cell_type":"code","source":["val connectionProperties = new java.util.Properties()\nconnectionProperties.put(\"user\", user)\nconnectionProperties.put(\"password\", password)\nconnectionProperties.setProperty(\"Driver\", \"com.microsoft.sqlserver.jdbc.SQLServerDriver\")\nval conn = java.sql.DriverManager.getConnection(url, connectionProperties)\nval st = conn.createStatement()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
connectionProperties: java.util.Properties = {user=[REDACTED], password=[REDACTED], Driver=com.microsoft.sqlserver.jdbc.SQLServerDriver}\nconn: java.sql.Connection = ConnectionID:18 ClientConnectionId: cba0a4b8-ec8c-419a-9158-f970f6cf3bb4\nst: java.sql.Statement = SQLServerStatement:35\n
"]}}],"execution_count":14},{"cell_type":"markdown","source":["Load Azure SQL partition metadata"],"metadata":{}},{"cell_type":"code","source":["case class PartitionInfo(partitionId: Int, value: Int, prevValue: Option[Int], nextValue: Option[Int]);\nval piDF = spark.read.jdbc(url, s\"($sqlPartitionValueInfo) AS t\", connectionProperties)\ndisplay(piDF)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["
partitionId | value | prevValue | nextValue
82 | 199810 | 199809 | null
"]}}],"execution_count":16},{"cell_type":"code","source":["val pi = piDF.as[PartitionInfo].collect()(0)\nprint(pi)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
PartitionInfo(82,199810,Some(199809),None)pi: PartitionInfo = PartitionInfo(82,199810,Some(199809),None)\n
"]}}],"execution_count":17},{"cell_type":"markdown","source":["Create on Azure SQL a staging table where data will be bulk loaded"],"metadata":{}},{"cell_type":"code","source":["st.execute(s\"DROP TABLE IF EXISTS ${table}_STG_${partitionKey}\")\nst.execute(s\"SELECT TOP (0) * INTO ${table}_STG_${partitionKey} FROM ${table}\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res24: Boolean = false\n
"]}}],"execution_count":19},{"cell_type":"markdown","source":["Load the staging table"],"metadata":{}},{"cell_type":"code","source":["li.write \n .format(\"com.microsoft.sqlserver.jdbc.spark\") \n .mode(\"overwrite\") \n .option(\"truncate\", \"true\") \n .option(\"url\", url) \n .option(\"dbtable\", s\"${table}_STG_${partitionKey}\") \n .option(\"user\", user) \n .option(\"password\", password) \n .option(\"reliabilityLevel\", \"BEST_EFFORT\") \n .option(\"tableLock\", \"false\") \n .option(\"batchsize\", \"100000\") \n .option(\"schemaCheckEnabled\", \"false\")\n .save()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":21},{"cell_type":"markdown","source":["Create the same indexes that the target table has, in order to allow switch-in"],"metadata":{}},{"cell_type":"code","source":["st.execute(s\"CREATE CLUSTERED INDEX IXC ON ${table}_STG_${partitionKey} ([L_COMMITDATE], [L_PARTITION_KEY])\")\nst.execute(s\"CREATE UNIQUE NONCLUSTERED INDEX IX1 ON ${table}_STG_${partitionKey} ([L_ORDERKEY], [L_LINENUMBER], [L_PARTITION_KEY])\")\nst.execute(s\"CREATE NONCLUSTERED INDEX IX2 ON ${table}_STG_${partitionKey} ([L_PARTKEY], [L_PARTITION_KEY])\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res26: Boolean = false\n
"]}}],"execution_count":23},{"cell_type":"markdown","source":["Add a check constraint on the table to allow switch-in"],"metadata":{}},{"cell_type":"code","source":["if (pi.prevValue == None) {\n st.execute(s\"ALTER TABLE ${table}_STG_${partitionKey} ADD CONSTRAINT ck_partition_${partitionKey} CHECK (L_PARTITION_KEY <= ${pi.value})\")\n} else {\n st.execute(s\"ALTER TABLE ${table}_STG_${partitionKey} ADD CONSTRAINT ck_partition_${partitionKey} CHECK (L_PARTITION_KEY > ${pi.prevValue.get} AND L_PARTITION_KEY <= ${pi.value})\")\n}"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res27: Boolean = false\n
"]}}],"execution_count":25},{"cell_type":"markdown","source":["Delete data in existing partition of target table, execute the switch-in and drop the staging table"],"metadata":{}},{"cell_type":"code","source":["st.execute(s\"TRUNCATE TABLE ${table} WITH (PARTITIONS (${pi.partitionId}))\")\nst.execute(s\"ALTER TABLE ${table}_STG_${partitionKey} SWITCH TO ${table} PARTITION ${pi.partitionId}\")\nst.execute(s\"DROP TABLE ${table}_STG_${partitionKey}\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
res122: Boolean = false\n
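After the switch-in it can be useful to confirm that the rows actually landed in the expected partition of the target table. A minimal sketch, reusing the open JDBC statement `st` and the `pi` metadata loaded above; `$PARTITION.pf_LINEITEM` refers to the partition function created earlier:

```scala
// Minimal sketch: count the rows now sitting in the target partition after the switch-in.
// $PARTITION.pf_LINEITEM maps a partition key to its partition number.
val rs = st.executeQuery(
  s"SELECT COUNT(*) AS cnt FROM ${table} WHERE $$PARTITION.pf_LINEITEM(L_PARTITION_KEY) = ${pi.partitionId}"
)
rs.next()
println(s"Rows in partition ${pi.partitionId}: ${rs.getInt("cnt")}")
rs.close()
```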
"]}}],"execution_count":27},{"cell_type":"markdown","source":["Done!"],"metadata":{}},{"cell_type":"code","source":["dbutils.notebook.exit(partitionKey.toString)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/plain":["199810"]}}],"execution_count":29}],"metadata":{"name":"03b-parallel-switch-in-load-into-partitioned-table-single","notebookId":964636935775860},"nbformat":4,"nbformat_minor":0} -------------------------------------------------------------------------------- /notebooks/read-from-azure-sql/push-down-queries.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# Push Down Queries\n\nSample that shows how to push queries to Azure SQL"],"metadata":{}},{"cell_type":"markdown","source":["Define variables used thoughout the script. Azure Key Value has been used to securely store sensitive data. More info here: [Create an Azure Key Vault-backed secret scope](https://docs.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes#--create-an-azure-key-vault-backed-secret-scope)"],"metadata":{}},{"cell_type":"code","source":["val scope = \"key-vault-secrets\"\n\nval server = dbutils.secrets.get(scope, \"srv001\")\nval database = \"ApacheSpark\"\n\nval jdbcUrl = s\"jdbc:sqlserver://$server.database.windows.net;database=$database;\"\n\nval connectionProperties = new java.util.Properties()\nconnectionProperties.put(\"user\", dbutils.secrets.get(scope, \"dbuser001\"))\nconnectionProperties.put(\"password\", dbutils.secrets.get(scope, \"dbpwd001\"))\nconnectionProperties.setProperty(\"Driver\", \"com.microsoft.sqlserver.jdbc.SQLServerDriver\")"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
scope: String = key-vault-secrets\nserver: String = [REDACTED]\ndatabase: String = ApacheSpark\njdbcUrl: String = jdbc:sqlserver://[REDACTED].database.windows.net;database=ApacheSpark;\nconnectionProperties: java.util.Properties = {user=[REDACTED], password=[REDACTED], Driver=com.microsoft.sqlserver.jdbc.SQLServerDriver}\nres0: Object = null\n
"]}}],"execution_count":3},{"cell_type":"markdown","source":["A pushdown query is executed as a subquery by Azure SQL, so you *MUST* alias the subquery and put it in parenthesis: `(<...>) AS subquery`"],"metadata":{}},{"cell_type":"code","source":["val pushDown = \"\"\"(\nSELECT\n\tL_COMMITDATE,\n\tCOUNT(*) AS TotalOrders\nFROM\n\tdbo.LINEITEM\nGROUP BY\n\tL_COMMITDATE\n) AS SubQuery\"\"\"\n\nval li = spark.read.jdbc(jdbcUrl, pushDown, connectionProperties)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
pushDown: String =\n(\nSELECT\n\tL_COMMITDATE,\n\tCOUNT(*) AS TotalOrders\nFROM\n\tdbo.LINEITEM\nGROUP BY\n\tL_COMMITDATE\n) AS SubQuery\nli: org.apache.spark.sql.DataFrame = [L_COMMITDATE: date, TotalOrders: int]\n
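The same pattern can be parameterized from Scala, so that both the filter and the aggregation are executed by Azure SQL before any data reaches Spark. A minimal sketch (the date range is illustrative):

```scala
// Minimal sketch: push the filter and the aggregation down to Azure SQL
// by building the subquery string from Scala values (dates are illustrative).
val fromDate = "1995-01-01"
val toDate   = "1995-12-31"

val pushDownFiltered = s"""(
SELECT L_COMMITDATE, COUNT(*) AS TotalOrders
FROM dbo.LINEITEM
WHERE L_COMMITDATE BETWEEN '$fromDate' AND '$toDate'
GROUP BY L_COMMITDATE
) AS SubQuery"""

val li1995 = spark.read.jdbc(jdbcUrl, pushDownFiltered, connectionProperties)
display(li1995.orderBy("L_COMMITDATE"))
```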
"]}}],"execution_count":5},{"cell_type":"code","source":["li.printSchema"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
root\n |-- L_COMMITDATE: date (nullable = true)\n |-- TotalOrders: integer (nullable = true)\n\n
"]}}],"execution_count":6},{"cell_type":"code","source":["display(li.where(\"L_COMMITDATE='1995-01-01'\"))"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["
L_COMMITDATE | TotalOrders
1995-01-01 | 24781
"]}}],"execution_count":7}],"metadata":{"name":"push-down-queries","notebookId":2236457168077307},"nbformat":4,"nbformat_minor":0} --------------------------------------------------------------------------------