├── .github
│   └── ISSUE_TEMPLATE.md
├── .gitignore
├── LICENSE
├── README.md
├── adf
│   ├── _scripts
│   │   └── deploymentadf.ps1
│   ├── arm-template-parameters-definition.json
│   ├── dataset
│   │   ├── Ds_AdlsGen2_MelbParkingData.json
│   │   └── Ds_REST_MelbParkingData.json
│   ├── linkedService
│   │   ├── Ls_AdlsGen2_01.json
│   │   ├── Ls_AzureSQLDW_01.json
│   │   ├── Ls_KeyVault.json
│   │   ├── Ls_Rest_MelParkSensors_01.json
│   │   └── Ls_adb_01.json
│   ├── pipeline
│   │   └── P_Ingest_MelbParkingData.json
│   └── trigger
│       └── T_Sched.json
├── clean_up.sh
├── data
│   ├── raw_data
│   │   ├── On-street_Parking_Bay_Sensors
│   │   │   ├── On-street_Parking_Bay_Sensors.csv
│   │   │   ├── On-street_Parking_Bay_Sensors.json
│   │   │   └── On-street_Parking_Bay_Sensors_baylist.csv
│   │   └── README.md
│   └── seed
│       ├── DimDate.csv
│       └── DimTime.csv
├── databricks
│   ├── config
│   │   ├── cluster.config.json
│   │   ├── cluster.config.template.json
│   │   └── run.setup.config.json
│   ├── configure_databricks.sh
│   ├── create_secrets.sh
│   ├── deploy_app.sh
│   ├── libs
│   │   └── azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar
│   └── notebooks
│       ├── 00_setup.py
│       ├── 01_explore.py
│       ├── 02_standardize.py
│       └── 03_transform.py
├── deploy.sh
├── docs
│   ├── CI_CD.md
│   └── NDCSydney2019-DataDevOps.pdf
├── images
│   ├── CI_CD_process.PNG
│   ├── Release_1_Agent_DeployToDatabricks.PNG
│   └── architecture.PNG
├── infrastructure
│   ├── README.md
│   ├── azuredeploy.json
│   ├── azuredeploy.parameters.dev.json
│   ├── azuredeploy.parameters.prod.json
│   ├── azuredeploy.parameters.stg.json
│   ├── configure_adlagen2.sh
│   └── deploy_infrastructure.sh
├── samples
│   ├── azuresql
│   │   ├── README.md
│   │   ├── azure-pipelines-ci.yml
│   │   └── ddo_samples_azuresql
│   │       ├── ddo_samples_azuresql.sln
│   │       └── ddo_samples_azuresql
│   │           ├── SalesLT
│   │           │   ├── Sequences
│   │           │   │   └── SalesOrderNumber.sql
│   │           │   ├── Tables
│   │           │   │   ├── Address.sql
│   │           │   │   ├── Customer.sql
│   │           │   │   ├── CustomerAddress.sql
│   │           │   │   ├── Product.sql
│   │           │   │   ├── ProductCategory.sql
│   │           │   │   ├── ProductDescription.sql
│   │           │   │   ├── ProductModel.sql
│   │           │   │   ├── ProductModelProductDescription.sql
│   │           │   │   ├── SalesOrderDetail.sql
│   │           │   │   └── SalesOrderHeader.sql
│   │           │   └── Views
│   │           │       ├── vGetAllCategories.sql
│   │           │       ├── vProductAndDescription.sql
│   │           │       └── vProductModelCatalogDescription.sql
│   │           ├── Security
│   │           │   └── SalesLT.sql
│   │           ├── dbo
│   │           │   ├── Functions
│   │           │   │   ├── ufnGetAllCategories.sql
│   │           │   │   ├── ufnGetCustomerInformation.sql
│   │           │   │   └── ufnGetSalesOrderStatusText.sql
│   │           │   ├── Stored Procedures
│   │           │   │   ├── uspLogError.sql
│   │           │   │   └── uspPrintError.sql
│   │           │   ├── Tables
│   │           │   │   ├── BuildVersion.sql
│   │           │   │   └── ErrorLog.sql
│   │           │   └── User Defined Types
│   │           │       ├── AccountNumber.sql
│   │           │       ├── Flag.sql
│   │           │       ├── Name.sql
│   │           │       ├── NameStyle.sql
│   │           │       ├── OrderNumber.sql
│   │           │       └── Phone.sql
│   │           └── ddo_samples_azuresql.sqlproj
│   └── databricks
│       └── README.md
├── sql
│   └── ddo_azuresqldw_dw
│       ├── ddo_azuresqldw_dw.sln
│       └── ddo_azuresqldw_dw
│           ├── External Resources
│           │   ├── AzureDataLakeStorage.sql
│           │   └── ParquetFileFormat.sql
│           ├── Script.PostDeployment1.sql
│           ├── Security
│           │   ├── ADLSCredentialKey.sql
│           │   ├── MasterKeys.sql
│           │   └── ext.sql
│           ├── dbo
│           │   ├── Stored Procedures
│           │   │   └── load_dw.sql
│           │   └── Tables
│           │       ├── dim_location.sql
│           │       ├── dim_parking_bay.sql
│           │       ├── dim_st_marker.sql
│           │       └── fact_parking.sql
│           ├── ddo_azuresqldw_dw.sqlproj
│           └── ext
│               └── External Tables
│                   ├── dim_location.sql
│                   ├── dim_parking_bay.sql
│                   ├── dim_st_marker.sql
│                   └── fact_parking.sql
└── src
    └── ddo_transform
        ├── .editorconfig
        ├── AUTHORS.rst
        ├── CONTRIBUTING.rst
        ├── Dockerfile
        ├── HISTORY.rst
        ├── MANIFEST.in
        ├── Makefile
        ├── README.rst
        ├── azure-pipelines-ci-artifacts.yml
        ├── azure-pipelines-ci-qa.yml
        ├── data
        │   ├── MelbParkingBayData.json
        │   ├── MelbParkingSensorData.json
        │   ├── dim_location.json
        │   ├── dim_parking_bay.json
        │   ├── dim_st_marker.json
        │   ├── interim_parking_bay.json
        │   └── interim_sensor.json
        ├── ddo_transform
        │   ├── __init__.py
        │   ├── standardize.py
        │   ├── transform.py
        │   └── util.py
        ├── docs
        │   ├── Makefile
        │   ├── authors.rst
        │   ├── conf.py
        │   ├── contributing.rst
        │   ├── history.rst
        │   ├── index.rst
        │   ├── installation.rst
        │   ├── make.bat
        │   ├── readme.rst
        │   └── usage.rst
        ├── requirements.txt
        ├── requirements_dev.txt
        ├── setup.cfg
        ├── setup.py
        ├── tests
        │   ├── test_standardize.py
        │   └── test_transform.py
        └── tox.ini
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | * ddo_transform version:
2 | * Python version:
3 | * Operating System:
4 |
5 | ### Description
6 |
7 | Describe what you were trying to get done.
8 | Tell us what happened, what went wrong, and what you expected to happen.
9 |
10 | ### What I Did
11 |
12 | ```
13 | Paste the command(s) you ran and the output.
14 | If there was a crash, please include the traceback here.
15 | ```
16 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 |
58 | # Flask stuff:
59 | instance/
60 | .webassets-cache
61 |
62 | # Scrapy stuff:
63 | .scrapy
64 |
65 | # Sphinx documentation
66 | docs/_build/
67 |
68 | # PyBuilder
69 | target/
70 |
71 | # Jupyter Notebook
72 | .ipynb_checkpoints
73 |
74 | # pyenv
75 | .python-version
76 |
77 | # celery beat schedule file
78 | celerybeat-schedule
79 |
80 | # SageMath parsed files
81 | *.sage.py
82 |
83 | # dotenv
84 | .env
85 | .env.*
86 |
87 | # virtualenv
88 | .venv
89 | venv/
90 | ENV/
91 |
92 | # Spyder project settings
93 | .spyderproject
94 | .spyproject
95 |
96 | # Rope project settings
97 | .ropeproject
98 |
99 | # mkdocs documentation
100 | /site
101 |
102 | # mypy
103 | .mypy_cache/
104 |
105 | ## Ignore Visual Studio temporary files, build results, and
106 | ## files generated by popular Visual Studio add-ons.
107 | ##
108 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
109 |
110 | # User-specific files
111 | *.rsuser
112 | *.suo
113 | *.user
114 | *.userosscache
115 | *.sln.docstates
116 |
117 | # User-specific files (MonoDevelop/Xamarin Studio)
118 | *.userprefs
119 |
120 | # Mono auto generated files
121 | mono_crash.*
122 |
123 | # Build results
124 | [Dd]ebug/
125 | [Dd]ebugPublic/
126 | [Rr]elease/
127 | [Rr]eleases/
128 | x64/
129 | x86/
130 | [Aa][Rr][Mm]/
131 | [Aa][Rr][Mm]64/
132 | bld/
133 | [Bb]in/
134 | [Oo]bj/
135 | [Ll]og/
136 |
137 | # Visual Studio 2015/2017 cache/options directory
138 | .vs/
139 | # Uncomment if you have tasks that create the project's static files in wwwroot
140 | #wwwroot/
141 |
142 | # Visual Studio 2017 auto generated files
143 | Generated\ Files/
144 |
145 | # MSTest test Results
146 | [Tt]est[Rr]esult*/
147 | [Bb]uild[Ll]og.*
148 |
149 | # NUnit
150 | *.VisualState.xml
151 | TestResult.xml
152 | nunit-*.xml
153 |
154 | # Build Results of an ATL Project
155 | [Dd]ebugPS/
156 | [Rr]eleasePS/
157 | dlldata.c
158 |
159 | # Benchmark Results
160 | BenchmarkDotNet.Artifacts/
161 |
162 | # .NET Core
163 | project.lock.json
164 | project.fragment.lock.json
165 | artifacts/
166 |
167 | # StyleCop
168 | StyleCopReport.xml
169 |
170 | # Files built by Visual Studio
171 | *_i.c
172 | *_p.c
173 | *_h.h
174 | *.ilk
175 | *.meta
176 | *.obj
177 | *.iobj
178 | *.pch
179 | *.pdb
180 | *.ipdb
181 | *.pgc
182 | *.pgd
183 | *.rsp
184 | *.sbr
185 | *.tlb
186 | *.tli
187 | *.tlh
188 | *.tmp
189 | *.tmp_proj
190 | *_wpftmp.csproj
191 | *.log
192 | *.vspscc
193 | *.vssscc
194 | .builds
195 | *.pidb
196 | *.svclog
197 | *.scc
198 |
199 | # Chutzpah Test files
200 | _Chutzpah*
201 |
202 | # Visual C++ cache files
203 | ipch/
204 | *.aps
205 | *.ncb
206 | *.opendb
207 | *.opensdf
208 | *.sdf
209 | *.cachefile
210 | *.VC.db
211 | *.VC.VC.opendb
212 |
213 | # Visual Studio profiler
214 | *.psess
215 | *.vsp
216 | *.vspx
217 | *.sap
218 |
219 | # Visual Studio Trace Files
220 | *.e2e
221 |
222 | # TFS 2012 Local Workspace
223 | $tf/
224 |
225 | # Guidance Automation Toolkit
226 | *.gpState
227 |
228 | # ReSharper is a .NET coding add-in
229 | _ReSharper*/
230 | *.[Rr]e[Ss]harper
231 | *.DotSettings.user
232 |
233 | # JustCode is a .NET coding add-in
234 | .JustCode
235 |
236 | # TeamCity is a build add-in
237 | _TeamCity*
238 |
239 | # DotCover is a Code Coverage Tool
240 | *.dotCover
241 |
242 | # AxoCover is a Code Coverage Tool
243 | .axoCover/*
244 | !.axoCover/settings.json
245 |
246 | # Visual Studio code coverage results
247 | *.coverage
248 | *.coveragexml
249 |
250 | # NCrunch
251 | _NCrunch_*
252 | .*crunch*.local.xml
253 | nCrunchTemp_*
254 |
255 | # MightyMoose
256 | *.mm.*
257 | AutoTest.Net/
258 |
259 | # Web workbench (sass)
260 | .sass-cache/
261 |
262 | # Installshield output folder
263 | [Ee]xpress/
264 |
265 | # DocProject is a documentation generator add-in
266 | DocProject/buildhelp/
267 | DocProject/Help/*.HxT
268 | DocProject/Help/*.HxC
269 | DocProject/Help/*.hhc
270 | DocProject/Help/*.hhk
271 | DocProject/Help/*.hhp
272 | DocProject/Help/Html2
273 | DocProject/Help/html
274 |
275 | # Click-Once directory
276 | publish/
277 |
278 | # Publish Web Output
279 | *.[Pp]ublish.xml
280 | *.azurePubxml
281 | # Note: Comment the next line if you want to checkin your web deploy settings,
282 | # but database connection strings (with potential passwords) will be unencrypted
283 | *.pubxml
284 | *.publishproj
285 |
286 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
287 | # checkin your Azure Web App publish settings, but sensitive information contained
288 | # in these scripts will be unencrypted
289 | PublishScripts/
290 |
291 | # NuGet Packages
292 | *.nupkg
293 | # NuGet Symbol Packages
294 | *.snupkg
295 | # The packages folder can be ignored because of Package Restore
296 | **/[Pp]ackages/*
297 | # except build/, which is used as an MSBuild target.
298 | !**/[Pp]ackages/build/
299 | # Uncomment if necessary however generally it will be regenerated when needed
300 | #!**/[Pp]ackages/repositories.config
301 | # NuGet v3's project.json files produces more ignorable files
302 | *.nuget.props
303 | *.nuget.targets
304 |
305 | # Microsoft Azure Build Output
306 | csx/
307 | *.build.csdef
308 |
309 | # Microsoft Azure Emulator
310 | ecf/
311 | rcf/
312 |
313 | # Windows Store app package directories and files
314 | AppPackages/
315 | BundleArtifacts/
316 | Package.StoreAssociation.xml
317 | _pkginfo.txt
318 | *.appx
319 | *.appxbundle
320 | *.appxupload
321 |
322 | # Visual Studio cache files
323 | # files ending in .cache can be ignored
324 | *.[Cc]ache
325 | # but keep track of directories ending in .cache
326 | !?*.[Cc]ache/
327 |
328 | # Others
329 | ClientBin/
330 | ~$*
331 | *~
332 | *.dbmdl
333 | *.dbproj.schemaview
334 | *.jfm
335 | *.pfx
336 | *.publishsettings
337 | orleans.codegen.cs
338 |
339 | # Including strong name files can present a security risk
340 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
341 | #*.snk
342 |
343 | # Since there are multiple workflows, uncomment next line to ignore bower_components
344 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
345 | #bower_components/
346 |
347 | # RIA/Silverlight projects
348 | Generated_Code/
349 |
350 | # Backup & report files from converting an old project file
351 | # to a newer Visual Studio version. Backup files are not needed,
352 | # because we have git ;-)
353 | _UpgradeReport_Files/
354 | Backup*/
355 | UpgradeLog*.XML
356 | UpgradeLog*.htm
357 | ServiceFabricBackup/
358 | *.rptproj.bak
359 |
360 | # SQL Server files
361 | *.mdf
362 | *.ldf
363 | *.ndf
364 |
365 | # Business Intelligence projects
366 | *.rdl.data
367 | *.bim.layout
368 | *.bim_*.settings
369 | *.rptproj.rsuser
370 | *- [Bb]ackup.rdl
371 | *- [Bb]ackup ([0-9]).rdl
372 | *- [Bb]ackup ([0-9][0-9]).rdl
373 |
374 | # Microsoft Fakes
375 | FakesAssemblies/
376 |
377 | # GhostDoc plugin setting file
378 | *.GhostDoc.xml
379 |
380 | # Node.js Tools for Visual Studio
381 | .ntvs_analysis.dat
382 | node_modules/
383 |
384 | # Visual Studio 6 build log
385 | *.plg
386 |
387 | # Visual Studio 6 workspace options file
388 | *.opt
389 |
390 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
391 | *.vbw
392 |
393 | # Visual Studio LightSwitch build output
394 | **/*.HTMLClient/GeneratedArtifacts
395 | **/*.DesktopClient/GeneratedArtifacts
396 | **/*.DesktopClient/ModelManifest.xml
397 | **/*.Server/GeneratedArtifacts
398 | **/*.Server/ModelManifest.xml
399 | _Pvt_Extensions
400 |
401 | # Paket dependency manager
402 | .paket/paket.exe
403 | paket-files/
404 |
405 | # FAKE - F# Make
406 | .fake/
407 |
408 | # CodeRush personal settings
409 | .cr/personal
410 |
411 | # Python Tools for Visual Studio (PTVS)
412 | __pycache__/
413 | *.pyc
414 |
415 | # Cake - Uncomment if you are using it
416 | # tools/**
417 | # !tools/packages.config
418 |
419 | # Tabs Studio
420 | *.tss
421 |
422 | # Telerik's JustMock configuration file
423 | *.jmconfig
424 |
425 | # BizTalk build output
426 | *.btp.cs
427 | *.btm.cs
428 | *.odx.cs
429 | *.xsd.cs
430 |
431 | # OpenCover UI analysis results
432 | OpenCover/
433 |
434 | # Azure Stream Analytics local run output
435 | ASALocalRun/
436 |
437 | # MSBuild Binary and Structured Log
438 | *.binlog
439 |
440 | # NVidia Nsight GPU debugger configuration file
441 | *.nvuser
442 |
443 | # MFractors (Xamarin productivity tool) working folder
444 | .mfractor/
445 |
446 | # Local History for Visual Studio
447 | .localhistory/
448 |
449 | # BeatPulse healthcheck temp database
450 | healthchecksdb
451 |
452 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
453 | MigrationBackup/
454 |
455 | # Devcontainer
456 | .devcontainer/
457 |
458 | .vscode/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019, Lace Lofranco
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # We've moved!
2 |
3 | This repository has moved to the official Azure-Samples GitHub organization:
4 |
5 | **https://github.com/Azure-Samples/modern-data-warehouse-dataops**
6 |
7 |
8 | -------------------
9 |
10 | # DataDevOps
11 |
12 | The purpose of this repository is to demonstrate how DevOps principles can be applied to a Data Pipeline solution.
13 |
14 | [](https://www.youtube.com/watch?v=Xs1-OU5cmsw)
15 |
16 | ## Architecture
17 |
18 | The following shows the overall architecture of the solution.
19 |
20 | 
21 |
22 | ### Design Considerations
23 |
24 | - **Data Transformation logic belongs in packages, not Notebooks**
25 |     - All main data transformation code should be packaged up within a Python package/JAR/etc. These packages are then uploaded to DBFS and installed on a specifically configured cluster, along with all other third-party dependencies (e.g. the azure-cosmosdb-spark JAR). Notebooks then simply import the package(s) and call the relevant functions; effectively, notebooks become a lightweight wrapper around the packages (see the sketch after this list). This ensures separation of concerns and promotes code reuse, testability, and code quality.
26 | - **Data should be tested**
27 | - Two different tests should be performed:
28 | - **Structure** (Is the data in the expected shape / schema?)
29 | - **Content** (Are there unexpected nulls? Are the summary statistics in expected ranges?)
30 | - **Data should have lineage**
31 | - Just as application deployments should have lineage in order to track which code commit produced which artifacts and deployments, each final loaded data record should be tagged with the appropriate ETL pipeline run id. Not only does this ensure traceability, it also helps with recovery from any potential failed / half-run data loads.
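
As an illustration, a notebook wrapper can be as thin as the following sketch. This is a minimal sketch only: the function name `standardize_sensor_data` and its signature are hypothetical stand-ins for the real API in `ddo_transform/standardize.py`, while `dbutils` and `spark` are provided by the Databricks runtime.

```python
# Databricks notebook sketch (02_standardize-style); names below are illustrative, not the package's actual API.
import datetime

from ddo_transform import standardize  # package installed on the cluster from dbfs:/mnt/datalake/libs/

# Parameters passed in by the ADF DatabricksNotebook activity
infilefolder = dbutils.widgets.get("infilefolder")
loadid = dbutils.widgets.get("loadid")

# Read the landed file, delegate all transformation logic to the package,
# and tag every output record with the pipeline run id for lineage.
raw_df = spark.read.json(f"/mnt/datalake/data/lnd/{infilefolder}/MelbParkingSensorData.json")
standardized_df = standardize.standardize_sensor_data(raw_df, load_id=loadid, loaded_on=datetime.datetime.utcnow())
standardized_df.write.mode("append").parquet("/mnt/datalake/data/interim/parking_sensor")
```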
32 |
33 | ## Build and Release Pipeline
34 |
35 | The following shows the overall CI/CD process end to end.
36 |
37 | 
38 |
39 | Both Build and Release Pipelines are built using [AzureDevOps](https://dev.azure.com/) (Public instance) and can be viewed using the following links:
40 | - [Build Pipelines](https://dev.azure.com/devlacepub/DataDevOps/_build)
41 | - [Release Pipeline](https://dev.azure.com/devlacepub/DataDevOps/_release)
42 |
43 | More information [here](docs/CI_CD.md).
44 | ### Environments
45 |
46 | - **Dev** - Development collaboration branch
47 | - **QA** - Environment where all integration tests are run (*not yet implemented*)
48 | - **Staging/UAT** - A mirror of the production job, along with state and data. Deploying to staging first gives the ability to "mock" a realistic release into production.
49 | - **Production**
50 |
51 | In addition to these environments, each developer may choose to have their own development environment for their individual use.
52 |
53 | ## Testing
54 |
55 | - Unit Testing - Standard unit tests which test small pieces of functionality within your code. Data transformation code should have unit tests.
56 |
57 | - Integration Testing - This includes end-to-end testing of the ETL pipeline.
58 |
59 | - Data Testing
60 | 1. Structure - Test for correct schema, expected structure.
61 |   2. Content - Can be tested through quantitative summary statistics and qualitative data quality graphs within the notebook (a minimal sketch follows below).
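
For example, a structure test can assert the expected schema and a content test can bound null rates and summary statistics. The following pytest sketch is illustrative only; the column names and parquet path are assumptions, not the repo's actual test suite (see `src/ddo_transform/tests` for the real unit tests).

```python
# Illustrative data tests only -- column names and paths are assumptions.
import pytest
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


@pytest.fixture(scope="session")
def spark():
    return SparkSession.builder.master("local[1]").appName("data-tests").getOrCreate()


def test_structure_has_expected_columns(spark):
    df = spark.read.parquet("data/interim/parking_sensor")  # hypothetical output location
    expected = {"bay_id", "st_marker_id", "status", "load_id"}
    assert expected.issubset(set(df.columns))


def test_content_null_rate_within_bounds(spark):
    df = spark.read.parquet("data/interim/parking_sensor")
    total = df.count()
    null_bay_ids = df.filter(F.col("bay_id").isNull()).count()
    assert total > 0
    assert null_bay_ids / total < 0.01  # tolerate at most 1% null bay ids
```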
62 |
63 | ## Monitoring
64 |
65 | ### Databricks
66 | - [Monitoring Azure Databricks with Azure Monitor](https://docs.microsoft.com/en-us/azure/architecture/databricks-monitoring/)
67 | - [Monitoring Azure Databricks Jobs with Application Insights](https://msdn.microsoft.com/en-us/magazine/mt846727.aspx)
68 |
69 | ### Data Factory
70 | - [Monitor Azure Data Factory with Azure Monitor](https://docs.microsoft.com/en-us/azure/data-factory/monitor-using-azure-monitor)
71 | - [Alerting in Azure Data Factory](https://azure.microsoft.com/en-in/blog/create-alerts-to-proactively-monitor-your-data-factory-pipelines/)
72 |
73 | ## Deploy the solution
74 |
75 | ### Pre-requisites:
76 | 1. Github Account
77 | 2. Azure DevOps Account + Project
78 | 3. Azure Account
79 |
80 | ### Software pre-requisites:
81 | 1. For Windows users, [Windows Subsystem For Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10)
82 | 2. [az cli 2.x](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest)
83 | 3. [Python 3+](https://www.python.org/)
84 | 4. [databricks-cli](https://docs.azuredatabricks.net/dev-tools/databricks-cli.html)
85 | 5. [jq](https://stedolan.github.io/jq/)
86 |
87 | NOTE: This deployment was tested using WSL (Ubuntu 16.04) and Debian GNU/Linux 9.9 (stretch)
88 |
89 | ### Deployment Instructions
90 |
91 | 1. Fork this repository. Forking is necessary if you want to set up git integration with Azure Data Factory.
92 | 2. **Deploy Azure resources.**
93 | 1. Clone the forked repository and `cd` into the root of the repo
94 | 2. Run `./deploy.sh`.
95 | - This will deploy three Resource Groups (one per environment) each with the following Azure resources.
96 | - Data Factory (empty) - *next steps will deploy actual data pipelines*.
97 | - Data Lake Store Gen2 and Service Principal with Storage Contributor rights assigned.
98 | - Databricks workspace - notebooks uploaded, SparkSQL tables created, and ADLS Gen2 mounted using SP.
99 | - KeyVault with all secrets stored.
100 |     - This will create local `.env.{environment_name}` files containing essential configuration information.
101 |     - All Azure resources are tagged with the correct environment.
102 |     - IMPORTANT: Because Databricks PAT tokens cannot currently be generated automatically, you will be prompted to generate and enter one per environment. See [here](https://docs.azuredatabricks.net/dev-tools/databricks-cli.html#set-up-authentication) for more information.
103 |     - The solution is designed such that **all** starting environment deployment configuration is specified in the ARM parameter files (`azuredeploy.parameters.{env}.json`), to centralize configuration.
104 |
105 | 3. **Setup ADF git integration in DEV Data Factory**
106 | 1. In the Azure Portal, navigate to the Data Factory in the **DEV** environment.
107 | 2. Click "Author & Monitor" to launch the Data Factory portal.
108 | 3. On the landing page, select "Set up code repository". For more information, see [here](https://docs.microsoft.com/en-us/azure/data-factory/source-control).
109 | 4. Fill in the repository settings with the following:
110 | - Repository type: **Github**
111 | - Github Account: ***your_Github_account***
112 | - Git repository name: **forked Github repository**
113 | - Collaboration branch: **master**
114 | - Root folder: **/adf**
115 |       - Import Existing Data Factory resource to repository: **Unselected**
116 |    5. Navigate to the "Author" tab; you should see all the deployed pipelines.
117 | 6. Select `Connections` > `Ls_KeyVault`. Update the Base Url to the KeyVault Url of your DEV environment.
118 | 7. Select `Connections` > `Ls_AdlsGen2_01`. Update URL to the ADLS Gen2 Url of your DEV environment.
119 | 8. Click `Publish` to publish changes.
120 |
121 | 4. **Setup Build Pipelines.** You will create two build pipelines: one that triggers on every pull request and runs unit testing + linting, and a second that triggers on every commit to master and creates the actual build artifacts for release.
122 | 1. In Azure DevOps, navigate to `Pipelines`. Select "Create Pipeline".
123 | 2. Under "Where is your code?", select Github (YAML).
124 |       - If you have not already done so, you may be prompted to connect your Github account. See [here](https://docs.microsoft.com/en-us/azure/devops/pipelines/repos/github?view=azure-devops&tabs=yaml#grant-access-to-your-github-repositories) for more information.
125 | 3. Under "Select a repository", select your forked repo.
126 |    4. Under "Configure your pipeline", select "Existing Azure Pipelines YAML file".
127 |       - Branch: master
128 |       - Path: `/src/ddo_transform/azure-pipelines-ci-qa.yml`
129 |    5. Select `Run`.
130 |    6. Repeat the steps above, but this time select `/src/ddo_transform/azure-pipelines-ci-artifacts.yml` as the path.
131 |
132 | 5. **Setup Release Pipelines**
133 |
134 | **WORK IN PROGRESS**
135 | 1. In Azure DevOps, navigate to `Release`. Select "New pipeline".
136 | 2. Under "Select a template", select "Empty job".
137 | 3. Under "Stage", set Stage name to "Deploy to STG".
138 | 4. Under Agent job, fill in information as shown:
139 |
140 | 
141 |
142 |    5. Add a step to the Agent job by selecting the "+" icon.
143 |
144 |
145 |
146 | ## Known Issues, Limitations and Workarounds
147 | - Currently, ADLS Gen2 cannot be managed via the az cli 2.0.
148 |   - **Workaround**: Use the REST API to automate creation of the File System (see the sketch after this list).
149 | - Databricks KeyVault-backed secret scopes can only be created via the UI, and thus cannot be created programmatically; they were not incorporated into the automated deployment of the solution.
150 | - **Workaround**: Use normal Databricks secrets with the downside of duplicated information.
151 | - Databricks Personal Access Tokens can only be created via the UI.
152 |   - **Workaround**: The user is asked to supply the tokens during deployment, which is unfortunately cumbersome.
153 | - The Data Factory Databricks Linked Service does not support dynamic configuration, so a manual step is needed to point it to the new cluster when deploying the pipeline to a new environment.
154 |   - **Workaround**: An alternative is to create an on-demand cluster; however, this may introduce latency due to cluster spin-up time. Optionally, the user can manually update the Linked Service to point to the correct cluster.
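
For reference, the File System creation workaround can be scripted against the ADLS Gen2 REST API roughly as follows. This is a hedged sketch: the storage account name and AAD bearer token are placeholders supplied by the caller, and newer az cli / SDK releases may make this unnecessary.

```python
# Sketch: create an ADLS Gen2 filesystem via the REST API (placeholders supplied via environment variables).
import os

import requests

account = os.environ["STORAGE_ACCOUNT"]  # e.g. the account deployed by azuredeploy.json
token = os.environ["ACCESS_TOKEN"]       # AAD bearer token for the https://storage.azure.com/ resource
filesystem = "datalake"

resp = requests.put(
    f"https://{account}.dfs.core.windows.net/{filesystem}",
    params={"resource": "filesystem"},
    headers={"Authorization": f"Bearer {token}", "x-ms-version": "2018-11-09"},
)
resp.raise_for_status()  # 201 Created on success; a 409 means the filesystem already exists
```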
155 |
156 | ## Data
157 |
158 | ### Physical layout
159 |
160 | ADLS Gen2 is structured as follows:
161 | ------------
162 |
163 | datalake <- filesystem
164 | /libs <- contains all libs, jars, wheels needed for processing
165 | /data
166 |             /lnd        <- landing folder into which all data files are ingested
167 |             /interim    <- interim (cleansed) tables
168 | /dw <- final tables
169 |
170 |
171 | ------------
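
Notebooks reach this layout through a DBFS mount; the setup notebook (run during deployment) handles the mount using the service principal. The sketch below shows the same pattern for reference, using the secrets created by `databricks/create_secrets.sh` (scope `storage_scope`) and the `datalake` container and `/mnt/datalake` mount point from the cluster config. It is illustrative only, not the repo's exact setup code.

```python
# Sketch: mount the ADLS Gen2 "datalake" filesystem at /mnt/datalake using the stored SP secrets.
storage_account = dbutils.secrets.get(scope="storage_scope", key="storage_account")
client_id = dbutils.secrets.get(scope="storage_scope", key="storage_sp_id")
client_secret = dbutils.secrets.get(scope="storage_scope", key="storage_sp_key")
tenant_id = dbutils.secrets.get(scope="storage_scope", key="storage_sp_tenant")

configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

dbutils.fs.mount(
    source=f"abfss://datalake@{storage_account}.dfs.core.windows.net/",
    mount_point="/mnt/datalake",
    extra_configs=configs,
)

display(dbutils.fs.ls("/mnt/datalake/data/lnd"))  # landing folder from the layout above
```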
172 |
173 |
174 | All data procured here: https://www.melbourne.vic.gov.au/about-council/governance-transparency/open-data/Pages/on-street-parking-data.aspx
175 |
--------------------------------------------------------------------------------
/adf/_scripts/deploymentadf.ps1:
--------------------------------------------------------------------------------
1 | param
2 | (
3 | [parameter(Mandatory = $false)] [String] $rootFolder,
4 | [parameter(Mandatory = $false)] [String] $armTemplate,
5 | [parameter(Mandatory = $false)] [String] $ResourceGroupName,
6 | [parameter(Mandatory = $false)] [String] $DataFactoryName,
7 | [parameter(Mandatory = $false)] [Bool] $predeployment=$true,
8 | [parameter(Mandatory = $false)] [Bool] $deleteDeployment=$false
9 | )
10 |
11 | $templateJson = Get-Content $armTemplate | ConvertFrom-Json
12 | $resources = $templateJson.resources
13 |
14 | #Triggers
15 | Write-Host "Getting triggers"
16 | $triggersADF = Get-AzDataFactoryV2Trigger -DataFactoryName $DataFactoryName -ResourceGroupName $ResourceGroupName
17 | $triggersTemplate = $resources | Where-Object { $_.type -eq "Microsoft.DataFactory/factories/triggers" }
18 | $triggerNames = $triggersTemplate | ForEach-Object {$_.name.Substring(37, $_.name.Length-40)} # strip the 37-char "[concat(parameters('factoryName'), '/" prefix and the trailing "')]" to get the bare trigger name
19 | $activeTriggerNames = $triggersTemplate | Where-Object { $_.properties.runtimeState -eq "Started" -and ($_.properties.pipelines.Count -gt 0 -or $_.properties.pipeline.pipelineReference -ne $null)} | ForEach-Object {$_.name.Substring(37, $_.name.Length-40)}
20 | $deletedtriggers = $triggersADF | Where-Object { $triggerNames -notcontains $_.Name }
21 | $triggerstostop = $triggerNames | where { ($triggersADF | Select-Object name).name -contains $_ }
22 |
23 | if ($predeployment -eq $true) {
24 | #Stop all triggers
25 | Write-Host "Stopping deployed triggers"
26 | $triggerstostop | ForEach-Object {
27 | Write-host "Disabling trigger " $_
28 | Stop-AzDataFactoryV2Trigger -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Name $_ -Force
29 | }
30 | }
31 | else {
32 | #Deleted resources
33 | #pipelines
34 | Write-Host "Getting pipelines"
35 | $pipelinesADF = Get-AzDataFactoryV2Pipeline -DataFactoryName $DataFactoryName -ResourceGroupName $ResourceGroupName
36 | $pipelinesTemplate = $resources | Where-Object { $_.type -eq "Microsoft.DataFactory/factories/pipelines" }
37 | $pipelinesNames = $pipelinesTemplate | ForEach-Object {$_.name.Substring(37, $_.name.Length-40)}
38 | $deletedpipelines = $pipelinesADF | Where-Object { $pipelinesNames -notcontains $_.Name }
39 | #datasets
40 | Write-Host "Getting datasets"
41 | $datasetsADF = Get-AzDataFactoryV2Dataset -DataFactoryName $DataFactoryName -ResourceGroupName $ResourceGroupName
42 | $datasetsTemplate = $resources | Where-Object { $_.type -eq "Microsoft.DataFactory/factories/datasets" }
43 | $datasetsNames = $datasetsTemplate | ForEach-Object {$_.name.Substring(37, $_.name.Length-40) }
44 | $deleteddataset = $datasetsADF | Where-Object { $datasetsNames -notcontains $_.Name }
45 | #linkedservices
46 | Write-Host "Getting linked services"
47 | $linkedservicesADF = Get-AzDataFactoryV2LinkedService -DataFactoryName $DataFactoryName -ResourceGroupName $ResourceGroupName
48 | $linkedservicesTemplate = $resources | Where-Object { $_.type -eq "Microsoft.DataFactory/factories/linkedservices" }
49 | $linkedservicesNames = $linkedservicesTemplate | ForEach-Object {$_.name.Substring(37, $_.name.Length-40)}
50 | $deletedlinkedservices = $linkedservicesADF | Where-Object { $linkedservicesNames -notcontains $_.Name }
51 | #Integrationruntimes
52 | Write-Host "Getting integration runtimes"
53 | $integrationruntimesADF = Get-AzDataFactoryV2IntegrationRuntime -DataFactoryName $DataFactoryName -ResourceGroupName $ResourceGroupName
54 | $integrationruntimesTemplate = $resources | Where-Object { $_.type -eq "Microsoft.DataFactory/factories/integrationruntimes" }
55 | $integrationruntimesNames = $integrationruntimesTemplate | ForEach-Object {$_.name.Substring(37, $_.name.Length-40)}
56 | $deletedintegrationruntimes = $integrationruntimesADF | Where-Object { $integrationruntimesNames -notcontains $_.Name }
57 |
58 | #Delete resources
59 | Write-Host "Deleting triggers"
60 | $deletedtriggers | ForEach-Object {
61 | Write-Host "Deleting trigger " $_.Name
62 | $trig = Get-AzDataFactoryV2Trigger -name $_.Name -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName
63 | if ($trig.RuntimeState -eq "Started") {
64 | Stop-AzDataFactoryV2Trigger -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Name $_.Name -Force
65 | }
66 | Remove-AzDataFactoryV2Trigger -Name $_.Name -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Force
67 | }
68 | Write-Host "Deleting pipelines"
69 | $deletedpipelines | ForEach-Object {
70 | Write-Host "Deleting pipeline " $_.Name
71 | Remove-AzDataFactoryV2Pipeline -Name $_.Name -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Force
72 | }
73 | Write-Host "Deleting datasets"
74 | $deleteddataset | ForEach-Object {
75 | Write-Host "Deleting dataset " $_.Name
76 | Remove-AzDataFactoryV2Dataset -Name $_.Name -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Force
77 | }
78 | Write-Host "Deleting linked services"
79 | $deletedlinkedservices | ForEach-Object {
80 | Write-Host "Deleting Linked Service " $_.Name
81 | Remove-AzDataFactoryV2LinkedService -Name $_.Name -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Force
82 | }
83 | Write-Host "Deleting integration runtimes"
84 | $deletedintegrationruntimes | ForEach-Object {
85 | Write-Host "Deleting integration runtime " $_.Name
86 | Remove-AzDataFactoryV2IntegrationRuntime -Name $_.Name -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Force
87 | }
88 |
89 | if ($deleteDeployment -eq $true) {
90 | Write-Host "Deleting ARM deployment ... under resource group: " $ResourceGroupName
91 | $deployments = Get-AzResourceGroupDeployment -ResourceGroupName $ResourceGroupName
92 | $deploymentsToConsider = $deployments | Where { $_.DeploymentName -like "ArmTemplate_master*" -or $_.DeploymentName -like "ArmTemplateForFactory*" } | Sort-Object -Property Timestamp -Descending
93 | $deploymentName = $deploymentsToConsider[0].DeploymentName
94 |
95 | Write-Host "Deployment to be deleted: " $deploymentName
96 | $deploymentOperations = Get-AzResourceGroupDeploymentOperation -DeploymentName $deploymentName -ResourceGroupName $ResourceGroupName
97 | $deploymentsToDelete = $deploymentOperations | Where { $_.properties.targetResource.id -like "*Microsoft.Resources/deployments*" }
98 |
99 | $deploymentsToDelete | ForEach-Object {
100 | Write-host "Deleting inner deployment: " $_.properties.targetResource.id
101 | Remove-AzResourceGroupDeployment -Id $_.properties.targetResource.id
102 | }
103 | Write-Host "Deleting deployment: " $deploymentName
104 | Remove-AzResourceGroupDeployment -ResourceGroupName $ResourceGroupName -Name $deploymentName
105 | }
106 |
107 | #Start Active triggers - After cleanup efforts
108 | Write-Host "Starting active triggers"
109 | $activeTriggerNames | ForEach-Object {
110 | Write-host "Enabling trigger " $_
111 | Start-AzDataFactoryV2Trigger -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Name $_ -Force
112 | }
113 | }
--------------------------------------------------------------------------------
/adf/arm-template-parameters-definition.json:
--------------------------------------------------------------------------------
1 | {
2 | "Microsoft.DataFactory/factories/pipelines": {
3 | "properties": {
4 | "activities": [{
5 | "typeProperties": {
6 | "notebookPath": "=",
7 | "libraries": [{
8 | "egg": "="
9 | }]
10 | }
11 | }]
12 | }
13 | },
14 | "Microsoft.DataFactory/factories/integrationRuntimes":{
15 | "properties": {
16 | "typeProperties": {
17 | "ssisProperties": {
18 | "catalogInfo": {
19 | "catalogServerEndpoint": "=",
20 | "catalogAdminUserName": "=",
21 | "catalogAdminPassword": {
22 | "value": "-::secureString"
23 | }
24 | },
25 | "customSetupScriptProperties": {
26 | "sasToken": {
27 | "value": "-::secureString"
28 | }
29 | }
30 | },
31 | "linkedInfo": {
32 | "key": {
33 | "value": "-::secureString"
34 | },
35 | "resourceId": "="
36 | }
37 | }
38 | }
39 | },
40 | "Microsoft.DataFactory/factories/triggers": {
41 | "properties": {
42 | "pipelines": [{
43 | "parameters": {
44 | "*": "="
45 | }
46 | },
47 | "pipelineReference.referenceName"
48 | ],
49 | "pipeline": {
50 | "parameters": {
51 | "*": "="
52 | }
53 | },
54 | "typeProperties": {
55 | "scope": "="
56 | }
57 |
58 | }
59 | },
60 | "Microsoft.DataFactory/factories/linkedServices": {
61 | "*": {
62 | "properties": {
63 | "typeProperties": {
64 | "accountName": "=",
65 | "username": "=",
66 | "userName": "=",
67 | "accessKeyId": "=",
68 | "servicePrincipalId": "=",
69 | "userId": "=",
70 | "clientId": "=",
71 | "clusterUserName": "=",
72 | "clusterSshUserName": "=",
73 | "hostSubscriptionId": "=",
74 | "clusterResourceGroup": "=",
75 | "subscriptionId": "=",
76 | "resourceGroupName": "=",
77 | "tenant": "=",
78 | "dataLakeStoreUri": "=",
79 | "baseUrl": "=",
80 | "database": "=",
81 | "serviceEndpoint": "=",
82 | "batchUri": "=",
83 | "databaseName": "=",
84 | "systemNumber": "=",
85 | "server": "=",
86 | "url":"=",
87 | "aadResourceId": "=",
88 | "connectionString": "|:-connectionString:secureString"
89 | }
90 | }
91 | },
92 | "Odbc": {
93 | "properties": {
94 | "typeProperties": {
95 | "userName": "=",
96 | "connectionString": {
97 | "secretName": "="
98 | }
99 | }
100 | }
101 | }
102 | },
103 | "Microsoft.DataFactory/factories/datasets": {
104 | "*": {
105 | "properties": {
106 | "typeProperties": {
107 | "folderPath": "=",
108 | "fileName": "="
109 | }
110 | }
111 | }}
112 | }
--------------------------------------------------------------------------------
/adf/dataset/Ds_AdlsGen2_MelbParkingData.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Ds_AdlsGen2_MelbParkingData",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "Ls_AdlsGen2_01",
6 | "type": "LinkedServiceReference"
7 | },
8 | "parameters": {
9 | "infilefolder": {
10 | "type": "String"
11 | },
12 | "infilename": {
13 | "type": "String"
14 | },
15 | "container": {
16 | "type": "String",
17 | "defaultValue": "datalake/data/lnd"
18 | }
19 | },
20 | "type": "AzureBlobFSFile",
21 | "typeProperties": {
22 | "format": {
23 | "type": "JsonFormat",
24 | "filePattern": "arrayOfObjects"
25 | },
26 | "fileName": {
27 | "value": "@dataset().infilename",
28 | "type": "Expression"
29 | },
30 | "folderPath": {
31 | "value": "@concat(dataset().container, '/', dataset().infilefolder)",
32 | "type": "Expression"
33 | }
34 | }
35 | }
36 | }
--------------------------------------------------------------------------------
/adf/dataset/Ds_REST_MelbParkingData.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Ds_REST_MelbParkingData",
3 | "properties": {
4 | "linkedServiceName": {
5 | "referenceName": "Ls_Rest_MelParkSensors_01",
6 | "type": "LinkedServiceReference"
7 | },
8 | "parameters": {
9 | "relativeurl": {
10 | "type": "String"
11 | }
12 | },
13 | "annotations": [],
14 | "type": "RestResource",
15 | "typeProperties": {
16 | "relativeUrl": {
17 | "value": "@dataset().relativeurl",
18 | "type": "Expression"
19 | },
20 | "requestMethod": "GET"
21 | }
22 | }
23 | }
--------------------------------------------------------------------------------
/adf/linkedService/Ls_AdlsGen2_01.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Ls_AdlsGen2_01",
3 | "properties": {
4 | "annotations": [],
5 | "type": "AzureBlobFS",
6 | "typeProperties": {
7 | "url": "https://ddostordevvnhf6tvx.dfs.core.windows.net/",
8 | "accountKey": {
9 | "type": "AzureKeyVaultSecret",
10 | "store": {
11 | "referenceName": "Ls_KeyVault",
12 | "type": "LinkedServiceReference"
13 | },
14 | "secretName": "storageKey"
15 | }
16 | }
17 | }
18 | }
--------------------------------------------------------------------------------
/adf/linkedService/Ls_AzureSQLDW_01.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Ls_AzureSQLDW_01",
3 | "properties": {
4 | "annotations": [],
5 | "type": "AzureSqlDW",
6 | "typeProperties": {
7 | "connectionString": {
8 | "type": "AzureKeyVaultSecret",
9 | "store": {
10 | "referenceName": "Ls_KeyVault",
11 | "type": "LinkedServiceReference"
12 | },
13 | "secretName": "sqldwConnectionString"
14 | }
15 | }
16 | }
17 | }
--------------------------------------------------------------------------------
/adf/linkedService/Ls_KeyVault.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Ls_KeyVault",
3 | "properties": {
4 | "annotations": [],
5 | "type": "AzureKeyVault",
6 | "typeProperties": {
7 | "baseUrl": "https://ddokvdevvnhf6tvx.vault.azure.net/"
8 | }
9 | }
10 | }
--------------------------------------------------------------------------------
/adf/linkedService/Ls_Rest_MelParkSensors_01.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Ls_Rest_MelParkSensors_01",
3 | "type": "Microsoft.DataFactory/factories/linkedservices",
4 | "properties": {
5 | "annotations": [],
6 | "type": "RestService",
7 | "typeProperties": {
8 | "url": "https://data.melbourne.vic.gov.au/resource/",
9 | "enableServerCertificateValidation": true,
10 | "authenticationType": "Anonymous"
11 | }
12 | }
13 | }
--------------------------------------------------------------------------------
/adf/linkedService/Ls_adb_01.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Ls_adb_01",
3 | "properties": {
4 | "type": "AzureDatabricks",
5 | "typeProperties": {
6 | "domain": "https://australiaeast.azuredatabricks.net",
7 | "accessToken": {
8 | "type": "AzureKeyVaultSecret",
9 | "store": {
10 | "referenceName": "Ls_KeyVault",
11 | "type": "LinkedServiceReference"
12 | },
13 | "secretName": "dbricksToken"
14 | },
15 | "newClusterNodeType": "Standard_DS3_v2",
16 | "newClusterNumOfWorker": "1",
17 | "newClusterSparkEnvVars": {
18 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3"
19 | },
20 | "newClusterVersion": "5.4.x-scala2.11"
21 | }
22 | },
23 | "type": "Microsoft.DataFactory/factories/linkedservices"
24 | }
--------------------------------------------------------------------------------
/adf/pipeline/P_Ingest_MelbParkingData.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "P_Ingest_MelbParkingData",
3 | "properties": {
4 | "description": "Hello from NDC Sydney!!!",
5 | "activities": [
6 | {
7 | "name": "StandardizeData",
8 | "description": "",
9 | "type": "DatabricksNotebook",
10 | "dependsOn": [
11 | {
12 | "activity": "DownloadSensorData",
13 | "dependencyConditions": [
14 | "Succeeded"
15 | ]
16 | },
17 | {
18 | "activity": "DownloadBayData",
19 | "dependencyConditions": [
20 | "Succeeded"
21 | ]
22 | }
23 | ],
24 | "policy": {
25 | "timeout": "7.00:00:00",
26 | "retry": 0,
27 | "retryIntervalInSeconds": 30,
28 | "secureOutput": false,
29 | "secureInput": false
30 | },
31 | "userProperties": [],
32 | "typeProperties": {
33 | "notebookPath": "/notebooks/02_standardize",
34 | "baseParameters": {
35 | "infilefolder": {
36 | "value": "@variables('infilefolder')",
37 | "type": "Expression"
38 | },
39 | "loadid": {
40 | "value": "@pipeline().RunId",
41 | "type": "Expression"
42 | }
43 | },
44 | "libraries": [
45 | {
46 | "egg": "dbfs:/mnt/datalake/libs/ddo_transform-1.0.0-py2.py3-none-any.whl"
47 | },
48 | {
49 | "pypi": {
50 | "package": "applicationinsights"
51 | }
52 | }
53 | ]
54 | },
55 | "linkedServiceName": {
56 | "referenceName": "Ls_adb_01",
57 | "type": "LinkedServiceReference"
58 | }
59 | },
60 | {
61 | "name": "Set infilefolder",
62 | "type": "SetVariable",
63 | "dependsOn": [],
64 | "userProperties": [],
65 | "typeProperties": {
66 | "variableName": "infilefolder",
67 | "value": {
68 | "value": "@utcnow('yyyy_MM_dd_hh_mm_ss')",
69 | "type": "Expression"
70 | }
71 | }
72 | },
73 | {
74 | "name": "DownloadSensorData",
75 | "type": "Copy",
76 | "dependsOn": [
77 | {
78 | "activity": "Set infilefolder",
79 | "dependencyConditions": [
80 | "Succeeded"
81 | ]
82 | }
83 | ],
84 | "policy": {
85 | "timeout": "7.00:00:00",
86 | "retry": 0,
87 | "retryIntervalInSeconds": 30,
88 | "secureOutput": false,
89 | "secureInput": false
90 | },
91 | "userProperties": [],
92 | "typeProperties": {
93 | "source": {
94 | "type": "RestSource",
95 | "httpRequestTimeout": "00:01:40",
96 | "requestInterval": "00.00:00:00.010"
97 | },
98 | "sink": {
99 | "type": "AzureBlobFSSink"
100 | },
101 | "enableStaging": false
102 | },
103 | "inputs": [
104 | {
105 | "referenceName": "Ds_REST_MelbParkingData",
106 | "type": "DatasetReference",
107 | "parameters": {
108 | "relativeurl": "dtpv-d4pf.json"
109 | }
110 | }
111 | ],
112 | "outputs": [
113 | {
114 | "referenceName": "Ds_AdlsGen2_MelbParkingData",
115 | "type": "DatasetReference",
116 | "parameters": {
117 | "infilefolder": {
118 | "value": "@variables('infilefolder')",
119 | "type": "Expression"
120 | },
121 | "infilename": "MelbParkingSensorData.json",
122 | "container": "datalake/data/lnd"
123 | }
124 | }
125 | ]
126 | },
127 | {
128 | "name": "DownloadBayData",
129 | "type": "Copy",
130 | "dependsOn": [
131 | {
132 | "activity": "Set infilefolder",
133 | "dependencyConditions": [
134 | "Succeeded"
135 | ]
136 | }
137 | ],
138 | "policy": {
139 | "timeout": "7.00:00:00",
140 | "retry": 0,
141 | "retryIntervalInSeconds": 30,
142 | "secureOutput": false,
143 | "secureInput": false
144 | },
145 | "userProperties": [],
146 | "typeProperties": {
147 | "source": {
148 | "type": "RestSource",
149 | "httpRequestTimeout": "00:01:40",
150 | "requestInterval": "00.00:00:00.010"
151 | },
152 | "sink": {
153 | "type": "AzureBlobFSSink"
154 | },
155 | "enableStaging": false
156 | },
157 | "inputs": [
158 | {
159 | "referenceName": "Ds_REST_MelbParkingData",
160 | "type": "DatasetReference",
161 | "parameters": {
162 | "relativeurl": "wuf8-susg.json"
163 | }
164 | }
165 | ],
166 | "outputs": [
167 | {
168 | "referenceName": "Ds_AdlsGen2_MelbParkingData",
169 | "type": "DatasetReference",
170 | "parameters": {
171 | "infilefolder": {
172 | "value": "@variables('infilefolder')",
173 | "type": "Expression"
174 | },
175 | "infilename": "MelbParkingBayData.json",
176 | "container": "datalake/data/lnd"
177 | }
178 | }
179 | ]
180 | },
181 | {
182 | "name": "TransformData",
183 | "type": "DatabricksNotebook",
184 | "dependsOn": [
185 | {
186 | "activity": "StandardizeData",
187 | "dependencyConditions": [
188 | "Succeeded"
189 | ]
190 | }
191 | ],
192 | "policy": {
193 | "timeout": "7.00:00:00",
194 | "retry": 0,
195 | "retryIntervalInSeconds": 30,
196 | "secureOutput": false,
197 | "secureInput": false
198 | },
199 | "userProperties": [],
200 | "typeProperties": {
201 | "notebookPath": "/notebooks/03_transform",
202 | "baseParameters": {
203 | "loadid": {
204 | "value": "@pipeline().RunId",
205 | "type": "Expression"
206 | }
207 | },
208 | "libraries": [
209 | {
210 | "egg": "dbfs:/mnt/datalake/libs/ddo_transform-1.0.0-py2.py3-none-any.whl"
211 | },
212 | {
213 | "pypi": {
214 | "package": "applicationinsights"
215 | }
216 | }
217 | ]
218 | },
219 | "linkedServiceName": {
220 | "referenceName": "Ls_adb_01",
221 | "type": "LinkedServiceReference"
222 | }
223 | },
224 | {
225 | "name": "Load SQLDW",
226 | "type": "SqlServerStoredProcedure",
227 | "dependsOn": [
228 | {
229 | "activity": "TransformData",
230 | "dependencyConditions": [
231 | "Succeeded"
232 | ]
233 | }
234 | ],
235 | "policy": {
236 | "timeout": "7.00:00:00",
237 | "retry": 0,
238 | "retryIntervalInSeconds": 30,
239 | "secureOutput": false,
240 | "secureInput": false
241 | },
242 | "userProperties": [],
243 | "typeProperties": {
244 | "storedProcedureName": "[dbo].[load_dw]",
245 | "storedProcedureParameters": {
246 | "load_id": {
247 | "value": {
248 | "value": "@pipeline().RunId",
249 | "type": "Expression"
250 | },
251 | "type": "String"
252 | }
253 | }
254 | },
255 | "linkedServiceName": {
256 | "referenceName": "Ls_AzureSQLDW_01",
257 | "type": "LinkedServiceReference"
258 | }
259 | }
260 | ],
261 | "variables": {
262 | "infilefolder": {
263 | "type": "String",
264 | "defaultValue": "lnd"
265 | }
266 | },
267 | "annotations": []
268 | }
269 | }
--------------------------------------------------------------------------------
/adf/trigger/T_Sched.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "T_Sched",
3 | "properties": {
4 | "annotations": [],
5 | "runtimeState": "Stopped",
6 | "pipelines": [
7 | {
8 | "pipelineReference": {
9 | "referenceName": "P_Ingest_MelbParkingData",
10 | "type": "PipelineReference"
11 | }
12 | }
13 | ],
14 | "type": "ScheduleTrigger",
15 | "typeProperties": {
16 | "recurrence": {
17 | "frequency": "Hour",
18 | "interval": 1,
19 | "startTime": "2019-02-01T05:28:00.000Z",
20 | "timeZone": "UTC"
21 | }
22 | }
23 | }
24 | }
--------------------------------------------------------------------------------
/clean_up.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation
7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software,
8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
9 | #
10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions
11 | # of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
17 | # DEALINGS IN THE SOFTWARE.
18 |
19 | set -o errexit
20 | set -o pipefail
21 | set -o nounset
22 | # set -o xtrace # For debugging
23 |
24 | ###################
25 | # PARAMETERS
26 |
27 | env_name="${1-}"
28 |
29 | # Import correct .env file
30 | set -o allexport
31 | env_file=".env.$env_name"
32 | if [[ -e $env_file ]]
33 | then
34 | source $env_file
35 | fi
36 | set +o allexport
37 |
38 | az group delete -g $RESOURCE_GROUP -y --no-wait
39 | az ad sp delete --id $SP_STOR_ID
--------------------------------------------------------------------------------
/data/raw_data/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/data/raw_data/README.md
--------------------------------------------------------------------------------
/databricks/config/cluster.config.json:
--------------------------------------------------------------------------------
1 | {
2 | "cluster_name": "ddo_cluster",
3 | "autoscale": { "min_workers": 1, "max_workers": 2 },
4 | "spark_version": "5.5.x-scala2.11",
5 | "autotermination_minutes": 30,
6 | "node_type_id": "Standard_DS3_v2",
7 | "driver_node_type_id": "Standard_DS3_v2",
8 | "spark_env_vars": {
9 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3",
10 | "MOUNT_DATA_PATH": "/mnt/datalake",
11 | "MOUNT_DATA_CONTAINER": "datalake",
12 | "DATABASE": "datalake"
13 | }
14 | }
--------------------------------------------------------------------------------
/databricks/config/cluster.config.template.json:
--------------------------------------------------------------------------------
1 | {
2 | "cluster_name": "__REPLACE_CLUSTER_NAME__",
3 | "autoscale": { "min_workers": 1, "max_workers": 2 },
4 | "spark_version": "5.5.x-scala2.11",
5 | "autotermination_minutes": 120,
6 | "node_type_id": "Standard_DS3_v2",
7 | "driver_node_type_id": "Standard_DS3_v2",
8 | "spark_env_vars": {
9 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3",
10 | "MOUNT_DATA_PATH": "__REPLACE_MOUNT_DATA_PATH__",
11 | "MOUNT_DATA_CONTAINER": "__REPLACE_MOUNT_DATA_CONTAINER__",
12 | "DATABASE": "__REPLACE_DATABASE__"
13 | }
14 | }
--------------------------------------------------------------------------------
/databricks/config/run.setup.config.json:
--------------------------------------------------------------------------------
1 | {
2 | "run_name": "Setup workspace",
3 | "new_cluster": {
4 | "spark_version": "5.5.x-scala2.11",
5 | "node_type_id": "Standard_DS3_v2",
6 | "num_workers": 1,
7 | "spark_env_vars": {
8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3",
9 | "MOUNT_DATA_PATH": "/mnt/datalake",
10 | "MOUNT_DATA_CONTAINER": "datalake",
11 | "DATABASE": "datalake"
12 | }
13 | },
14 | "libraries": [],
15 | "timeout_seconds": 3600,
16 | "notebook_task": {
17 | "notebook_path": "/notebooks/00_setup"
18 | }
19 | }
--------------------------------------------------------------------------------
/databricks/configure_databricks.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation
7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software,
8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
9 | #
10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions
11 | # of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
17 | # DEALINGS IN THE SOFTWARE.
18 |
19 |
20 | set -o errexit
21 | set -o pipefail
22 | set -o nounset
23 | # set -o xtrace # For debugging
24 |
25 | # Set path
26 | parent_dir=$(pwd -P)
27 | dir_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ); cd "$dir_path"
28 |
29 | # Constants
30 | RED='\033[0;31m'
31 | ORANGE='\033[0;33m'
32 | NC='\033[0m'
33 |
34 | CLUSTER_CONFIG="./config/cluster.config.json"
35 | MOUNT_DATA_PATH="/mnt/datalake"
36 |
37 | ###################
38 | # USER PARAMETERS
39 | env_name="${1-}"
40 |
41 | # Import correct .env file
42 | set -o allexport
43 | env_file="../.env.$env_name"
44 | if [[ -e $env_file ]]
45 | then
46 | source $env_file
47 | fi
48 | set +o allexport
49 |
50 |
51 | wait_for_run () {
52 | # See here: https://docs.azuredatabricks.net/api/latest/jobs.html#jobsrunresultstate
53 | declare mount_run_id=$1
54 | while : ; do
55 | life_cycle_status=$(databricks runs get --run-id $mount_run_id | jq -r ".state.life_cycle_state")
56 | result_state=$(databricks runs get --run-id $mount_run_id | jq -r ".state.result_state")
57 | if [[ $result_state == "SUCCESS" || $result_state == "SKIPPED" ]]; then
58 | break;
59 | elif [[ $life_cycle_status == "INTERNAL_ERROR" || $result_state == "FAILED" ]]; then
60 | state_message=$(databricks runs get --run-id $mount_run_id | jq -r ".state.state_message")
61 | echo -e "${RED}Error while running ${mount_run_id}: ${state_message} ${NC}"
62 | exit 1
63 | else
64 | echo "Waiting for run ${mount_run_id} to finish..."
65 | sleep 2m
66 | fi
67 | done
68 | }
69 |
70 |
71 | cluster_exists () {
72 | declare cluster_name="$1"
73 | declare cluster=$(databricks clusters list | tr -s " " | cut -d" " -f2 | grep ^${cluster_name}$)
74 | if [[ -n $cluster ]]; then
75 | return 0; # cluster exists
76 | else
77 |     return 1; # cluster does not exist
78 | fi
79 | }
80 |
81 |
82 | _main() {
83 | # Upload notebooks
84 | echo "Uploading notebooks..."
85 | databricks workspace import_dir "notebooks" "/notebooks" --overwrite
86 |
87 | # Setup workspace
88 | echo "Setting up workspace and tables. This may take a while as cluster spins up..."
89 | wait_for_run $(databricks runs submit --json-file "./config/run.setup.config.json" | jq -r ".run_id" )
90 |
91 | # Create initial cluster, if not yet exists
92 | echo "Creating an interactive cluster..."
93 | cluster_name=$(cat $CLUSTER_CONFIG | jq -r ".cluster_name")
94 | if cluster_exists $cluster_name; then
95 | echo "Cluster ${cluster_name} already exists!"
96 | else
97 | echo "Creating cluster ${cluster_name}..."
98 | databricks clusters create --json-file $CLUSTER_CONFIG
99 | fi
100 |
101 | # Upload dependencies
102 | echo "Uploading libraries dependencies..."
103 | databricks fs cp ./libs/ "dbfs:${MOUNT_DATA_PATH}/libs/" --recursive --overwrite
104 |
105 | # Install Library dependencies
106 |     echo "Installing library dependencies..."
107 | cluster_id=$(databricks clusters list | awk '/'$cluster_name'/ {print $1}')
108 | databricks libraries install \
109 | --jar "dbfs:${MOUNT_DATA_PATH}/libs/azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar" \
110 | --cluster-id $cluster_id
111 |
112 | }
113 |
114 | _main
115 |
116 |
117 | echo "Return to parent script dir: $parent_dir"
118 | cd "$parent_dir"
--------------------------------------------------------------------------------
/databricks/create_secrets.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation
7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software,
8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
9 | #
10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions
11 | # of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
17 | # DEALINGS IN THE SOFTWARE.
18 |
19 |
20 | set -o errexit
21 | set -o pipefail
22 | set -o nounset
23 | # set -o xtrace # For debugging
24 |
25 | # Set path
26 | parent_dir=$(pwd -P)
27 | dir_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ); cd "$dir_path"
28 |
29 | # Set constants
30 | scope_name="storage_scope" # fixed # TODO pass via arm template
31 |
32 | ###################
33 | # Requires the following to be set:
34 | #
35 | # BLOB_STORAGE_ACCOUNT=
36 | # SP_STOR_ID=
37 | # SP_STOR_PASS=
38 | # SP_STOR_TENANT=
39 |
40 |
41 | ###################
42 | # USER PARAMETERS
43 | env_name="${1-}"
44 |
45 | # Import correct .env file
46 | set -o allexport
47 | env_file="../.env.$env_name"
48 | if [[ -e $env_file ]]
49 | then
50 | source $env_file
51 | fi
52 | set +o allexport
53 |
54 | # Create scope, if not exists
55 | if [[ -z $(databricks secrets list-scopes | grep "$scope_name") ]]; then
56 | echo "Creating secrets scope: $scope_name"
57 | databricks secrets create-scope --scope "$scope_name"
58 | fi
59 |
60 | # Create secrets
61 | echo "Creating secrets within scope $scope_name..."
62 |
63 | databricks secrets write --scope "$scope_name" --key "storage_account" --string-value "$BLOB_STORAGE_ACCOUNT"
64 | databricks secrets write --scope "$scope_name" --key "storage_sp_id" --string-value "$SP_STOR_ID"
65 | databricks secrets write --scope "$scope_name" --key "storage_sp_key" --string-value "$SP_STOR_PASS"
66 | databricks secrets write --scope "$scope_name" --key "storage_sp_tenant" --string-value "$SP_STOR_TENANT"
67 |
68 |
69 | echo "Return to parent script dir: $parent_dir"
70 | cd "$parent_dir"
--------------------------------------------------------------------------------
/databricks/deploy_app.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 |
4 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
7 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation
8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
9 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions
12 | # of the Software.
13 | #
14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
15 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
17 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
18 | # DEALINGS IN THE SOFTWARE.
19 |
20 |
21 | set -o errexit
22 | set -o pipefail
23 | set -o nounset
24 | set -o xtrace # For debugging
25 |
26 | # Set path
27 | dir_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
28 | cd "$dir_path"
29 |
30 |
31 | ###################
32 | # Requires the following to be set:
33 | #
34 | # RELEASE_ID=
35 | # MOUNT_DATA_PATH=
36 | # WHEEL_FILE_PATH=
37 |
38 | # Set DBFS libs path
39 | dbfs_libs_path=dbfs:${MOUNT_DATA_PATH}/libs/release_${RELEASE_ID}
40 |
41 | # Upload dependencies
42 | echo "Uploading libraries dependencies to DBFS..."
43 | databricks fs cp ./libs/ "${dbfs_libs_path}" --recursive
44 |
45 | echo "Uploading app libraries to DBFS..."
46 | databricks fs cp $WHEEL_FILE_PATH "${dbfs_libs_path}"
47 |
48 | # Upload notebooks to workspace
49 | echo "Uploading notebooks to workspace..."
50 | databricks workspace import_dir "notebooks" "/releases/release_${RELEASE_ID}/"
51 |
--------------------------------------------------------------------------------
/databricks/libs/azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/databricks/libs/azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar
--------------------------------------------------------------------------------
/databricks/notebooks/00_setup.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %md
3 | # MAGIC ## Mount ADLS Gen2
4 |
5 | # COMMAND ----------
6 |
7 | import os
8 |
9 | # Set mount path
10 | storage_mount_data_path = os.environ['MOUNT_DATA_PATH']
11 | storage_mount_container = os.environ['MOUNT_DATA_CONTAINER']
12 |
13 | # Unmount if existing
14 | for mp in dbutils.fs.mounts():
15 | if mp.mountPoint == storage_mount_data_path:
16 | dbutils.fs.unmount(storage_mount_data_path)
17 |
18 | # Refresh mounts
19 | dbutils.fs.refreshMounts()
20 |
21 | # COMMAND ----------
22 |
23 | # Retrieve storage credentials
24 | storage_account = dbutils.secrets.get(scope = "storage_scope", key = "storage_account")
25 | storage_sp_id = dbutils.secrets.get(scope = "storage_scope", key = "storage_sp_id")
26 | storage_sp_key = dbutils.secrets.get(scope = "storage_scope", key = "storage_sp_key")
27 | storage_sp_tenant = dbutils.secrets.get(scope = "storage_scope", key = "storage_sp_tenant")
28 |
29 | # Mount
30 | configs = {"fs.azure.account.auth.type": "OAuth",
31 | "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
32 | "fs.azure.account.oauth2.client.id": storage_sp_id,
33 | "fs.azure.account.oauth2.client.secret": storage_sp_key,
34 | "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/" + storage_sp_tenant + "/oauth2/token"}
35 |
36 | # Optionally, you can add a directory path to the source URI of your mount point.
37 | dbutils.fs.mount(
38 | source = "abfss://" + storage_mount_container + "@" + storage_account + ".dfs.core.windows.net/",
39 | mount_point = storage_mount_data_path,
40 | extra_configs = configs)
41 |
42 | # Refresh mounts
43 | dbutils.fs.refreshMounts()
44 |
45 | # COMMAND ----------
46 |
47 | # MAGIC %md
48 | # MAGIC ## Create Tables
49 |
50 | # COMMAND ----------
51 |
52 | # MAGIC %sql
53 | # MAGIC CREATE SCHEMA IF NOT EXISTS dw;
54 | # MAGIC CREATE SCHEMA IF NOT EXISTS lnd;
55 | # MAGIC CREATE SCHEMA IF NOT EXISTS interim;
56 | # MAGIC CREATE SCHEMA IF NOT EXISTS malformed;
57 |
58 | # COMMAND ----------
59 |
60 | # MAGIC %sql
61 | # MAGIC -- FACT tables
62 | # MAGIC DROP TABLE IF EXISTS dw.fact_parking;
63 | # MAGIC CREATE TABLE dw.fact_parking (
64 | # MAGIC dim_date_id STRING,
65 | # MAGIC dim_time_id STRING,
66 | # MAGIC dim_parking_bay_id STRING,
67 | # MAGIC dim_location_id STRING,
68 | # MAGIC dim_st_marker_id STRING,
69 | # MAGIC status STRING,
70 | # MAGIC load_id STRING,
71 | # MAGIC loaded_on TIMESTAMP
72 | # MAGIC )
73 | # MAGIC USING parquet
74 | # MAGIC LOCATION '/mnt/datalake/data/dw/fact_parking/';
75 | # MAGIC
76 | # MAGIC REFRESH TABLE dw.fact_parking;
77 |
78 | # COMMAND ----------
79 |
80 | # MAGIC %sql
81 | # MAGIC -- DIMENSION tables
82 | # MAGIC DROP TABLE IF EXISTS dw.dim_st_marker;
83 | # MAGIC CREATE TABLE dw.dim_st_marker (
84 | # MAGIC dim_st_marker_id STRING,
85 | # MAGIC st_marker_id STRING,
86 | # MAGIC load_id STRING,
87 | # MAGIC loaded_on TIMESTAMP
88 | # MAGIC )
89 | # MAGIC USING parquet
90 | # MAGIC LOCATION '/mnt/datalake/data/dw/dim_st_marker/';
91 | # MAGIC
92 | # MAGIC REFRESH TABLE dw.dim_st_marker;
93 | # MAGIC
94 | # MAGIC --
95 | # MAGIC DROP TABLE IF EXISTS dw.dim_location;
96 | # MAGIC CREATE TABLE dw.dim_location (
97 | # MAGIC dim_location_id STRING,
98 | # MAGIC lat FLOAT,
99 | # MAGIC lon FLOAT,
100 | # MAGIC load_id STRING,
101 | # MAGIC loaded_on TIMESTAMP
102 | # MAGIC )
103 | # MAGIC USING parquet
104 | # MAGIC LOCATION '/mnt/datalake/data/dw/dim_location/';
105 | # MAGIC
106 | # MAGIC REFRESH TABLE dw.dim_location;
107 | # MAGIC
108 | # MAGIC --
109 | # MAGIC DROP TABLE IF EXISTS dw.dim_parking_bay;
110 | # MAGIC CREATE TABLE dw.dim_parking_bay (
111 | # MAGIC dim_parking_bay_id STRING,
112 | # MAGIC bay_id INT,
113 | # MAGIC `marker_id` STRING,
114 | # MAGIC `meter_id` STRING,
115 | # MAGIC `rd_seg_dsc` STRING,
116 | # MAGIC `rd_seg_id` STRING,
117 | # MAGIC load_id STRING,
118 | # MAGIC loaded_on TIMESTAMP
119 | # MAGIC )
120 | # MAGIC USING parquet
121 | # MAGIC LOCATION '/mnt/datalake/data/dw/dim_parking_bay/';
122 | # MAGIC
123 | # MAGIC REFRESH TABLE dw.dim_parking_bay;
124 |
125 | # COMMAND ----------
126 |
127 | # MAGIC %sql
128 | # MAGIC DROP TABLE IF EXISTS dw.dim_date;
129 | # MAGIC DROP TABLE IF EXISTS dw.dim_time;
130 |
131 | # COMMAND ----------
132 |
133 | from pyspark.sql.functions import col
134 | import os
135 | from urllib.request import urlretrieve
136 |
137 | def download_url(url, filename):
138 | # Create dir if not exist
139 | dir_path = os.path.dirname(filename)
140 | if not os.path.exists(dir_path):
141 | os.makedirs(dir_path)
142 | urlretrieve(url, filename)
143 |
144 | # Download data
145 | download_url("https://lacedemodata.blob.core.windows.net/data/DimDate.csv", "/dbfs/mnt/datalake/data/seed/DimDate.csv")
146 | download_url("https://lacedemodata.blob.core.windows.net/data/DimTime.csv", "/dbfs/mnt/datalake/data/seed/DimTime.csv")
147 |
148 | # DimDate
149 | dimdate = spark.read.csv("dbfs:/mnt/datalake/data/seed/DimDate.csv", header=True)
150 | dimdate.write.saveAsTable("dw.dim_date")
151 |
152 | # DimTime
153 | dimtime = spark.read.csv("dbfs:/mnt/datalake/data/seed/DimTime.csv", header=True)
154 | dimtime = dimtime.select(dimtime["second_of_day"].alias("dim_time_id"), col("*"))
155 | dimtime.write.saveAsTable("dw.dim_time")
156 |
157 | # COMMAND ----------
158 |
159 | # MAGIC %sql
160 | # MAGIC -- INTERIM tables
161 | # MAGIC DROP TABLE IF EXISTS interim.parking_bay;
162 | # MAGIC CREATE TABLE interim.parking_bay (
163 | # MAGIC bay_id INT,
164 | # MAGIC `last_edit` TIMESTAMP,
165 | # MAGIC `marker_id` STRING,
166 | # MAGIC `meter_id` STRING,
167 | # MAGIC `rd_seg_dsc` STRING,
168 | # MAGIC `rd_seg_id` STRING,
169 | # MAGIC `the_geom` STRUCT<`coordinates`: ARRAY<ARRAY<ARRAY<ARRAY<DOUBLE>>>>, `type`: STRING>,
170 | # MAGIC load_id STRING,
171 | # MAGIC loaded_on TIMESTAMP
172 | # MAGIC )
173 | # MAGIC USING parquet
174 | # MAGIC LOCATION '/mnt/datalake/data/interim/parking_bay/';
175 | # MAGIC
176 | # MAGIC REFRESH TABLE interim.parking_bay;
177 | # MAGIC
178 | # MAGIC --
179 | # MAGIC DROP TABLE IF EXISTS interim.sensor;
180 | # MAGIC CREATE TABLE interim.sensor (
181 | # MAGIC bay_id INT,
182 | # MAGIC `st_marker_id` STRING,
183 | # MAGIC `lat` FLOAT,
184 | # MAGIC `lon` FLOAT,
185 | # MAGIC `location` STRUCT<`coordinates`: ARRAY<DOUBLE>, `type`: STRING>,
186 | # MAGIC `status` STRING,
187 | # MAGIC load_id STRING,
188 | # MAGIC loaded_on TIMESTAMP
189 | # MAGIC )
190 | # MAGIC USING parquet
191 | # MAGIC LOCATION '/mnt/datalake/data/interim/sensors/';
192 | # MAGIC
193 | # MAGIC REFRESH TABLE interim.sensor;
194 |
195 | # COMMAND ----------
196 |
197 | # MAGIC %sql
198 | # MAGIC -- ERROR tables
199 | # MAGIC DROP TABLE IF EXISTS malformed.parking_bay;
200 | # MAGIC CREATE TABLE malformed.parking_bay (
201 | # MAGIC bay_id INT,
202 | # MAGIC `last_edit` TIMESTAMP,
203 | # MAGIC `marker_id` STRING,
204 | # MAGIC `meter_id` STRING,
205 | # MAGIC `rd_seg_dsc` STRING,
206 | # MAGIC `rd_seg_id` STRING,
207 | # MAGIC `the_geom` STRUCT<`coordinates`: ARRAY<ARRAY<ARRAY<ARRAY<DOUBLE>>>>, `type`: STRING>,
208 | # MAGIC load_id STRING,
209 | # MAGIC loaded_on TIMESTAMP
210 | # MAGIC )
211 | # MAGIC USING parquet
212 | # MAGIC LOCATION '/mnt/datalake/data/malformed/parking_bay/';
213 | # MAGIC
214 | # MAGIC REFRESH TABLE malformed.parking_bay;
215 | # MAGIC
216 | # MAGIC --
217 | # MAGIC DROP TABLE IF EXISTS malformed.sensor;
218 | # MAGIC CREATE TABLE malformed.sensor (
219 | # MAGIC bay_id INT,
220 | # MAGIC `st_marker_id` STRING,
221 | # MAGIC `lat` FLOAT,
222 | # MAGIC `lon` FLOAT,
223 | # MAGIC `location` STRUCT<`coordinates`: ARRAY<DOUBLE>, `type`: STRING>,
224 | # MAGIC `status` STRING,
225 | # MAGIC load_id STRING,
226 | # MAGIC loaded_on TIMESTAMP
227 | # MAGIC )
228 | # MAGIC USING parquet
229 | # MAGIC LOCATION '/mnt/datalake/data/malformed/sensors/';
230 | # MAGIC
231 | # MAGIC REFRESH TABLE malformed.sensor;
232 |
233 | # COMMAND ----------
234 |
235 |
236 |
--------------------------------------------------------------------------------
/databricks/notebooks/01_explore.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | import os
3 | import datetime
4 |
5 | # For testing
6 | base_path = 'dbfs:/mnt/datalake/data/lnd/2019_10_06_05_54_25'
7 | parkingbay_filepath = os.path.join(base_path, "MelbParkingBayData.json")
8 | sensors_filepath = os.path.join(base_path, "MelbParkingSensorData.json")
9 |
10 | # COMMAND ----------
11 |
12 | parkingbay_sdf = spark.read\
13 | .option("multiLine", True)\
14 | .json(parkingbay_filepath)
15 | sensordata_sdf = spark.read\
16 | .option("multiLine", True)\
17 | .json(sensors_filepath)
18 |
19 | # COMMAND ----------
20 |
21 | display(parkingbay_sdf)
22 |
23 | # COMMAND ----------
24 |
25 | display(sensordata_sdf)
26 |
27 | # COMMAND ----------
28 |
29 | display(sensordata_sdf)
30 |
--------------------------------------------------------------------------------
/databricks/notebooks/02_standardize.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | dbutils.widgets.text("infilefolder", "", "In - Folder Path")
3 | infilefolder = dbutils.widgets.get("infilefolder")
4 |
5 | dbutils.widgets.text("loadid", "", "Load Id")
6 | loadid = dbutils.widgets.get("loadid")
7 |
8 | # COMMAND ----------
9 |
10 | from applicationinsights import TelemetryClient
11 | tc = TelemetryClient(dbutils.secrets.get(scope = "storage_scope", key = "appinsights_key"))
12 |
13 | # COMMAND ----------
14 |
15 | import os
16 | import datetime
17 |
18 | # For testing
19 | # infilefolder = 'datalake/data/lnd/2019_03_11_01_38_00/'
20 | load_id = loadid
21 | loaded_on = datetime.datetime.now()
22 | base_path = os.path.join('dbfs:/mnt/datalake/data/lnd/', infilefolder)
23 | parkingbay_filepath = os.path.join(base_path, "MelbParkingBayData.json")
24 | sensors_filepath = os.path.join(base_path, "MelbParkingSensorData.json")
25 |
26 |
27 | # COMMAND ----------
28 |
29 | import ddo_transform.standardize as s
30 |
31 | # Retrieve schema
32 | parkingbay_schema = s.get_schema("in_parkingbay_schema")
33 | sensordata_schema = s.get_schema("in_sensordata_schema")
34 |
35 | # Read data
36 | parkingbay_sdf = spark.read\
37 | .schema(parkingbay_schema)\
38 | .option("badRecordsPath", os.path.join(base_path, "__corrupt", "MelbParkingBayData"))\
39 | .option("multiLine", True)\
40 | .json(parkingbay_filepath)
41 | sensordata_sdf = spark.read\
42 | .schema(sensordata_schema)\
43 | .option("badRecordsPath", os.path.join(base_path, "__corrupt", "MelbParkingSensorData"))\
44 | .option("multiLine", True)\
45 | .json(sensors_filepath)
46 |
47 |
48 | # Standardize
49 | t_parkingbay_sdf, t_parkingbay_malformed_sdf = s.standardize_parking_bay(parkingbay_sdf, load_id, loaded_on)
50 | t_sensordata_sdf, t_sensordata_malformed_sdf = s.standardize_sensordata(sensordata_sdf, load_id, loaded_on)
51 |
52 | # Insert new rows
53 | t_parkingbay_sdf.write.mode("append").insertInto("interim.parking_bay")
54 | t_sensordata_sdf.write.mode("append").insertInto("interim.sensor")
55 |
56 | # Insert bad rows
57 | t_parkingbay_malformed_sdf.write.mode("append").insertInto("malformed.parking_bay")
58 | t_sensordata_malformed_sdf.write.mode("append").insertInto("malformed.sensor")
59 |
60 |
61 | # COMMAND ----------
62 |
63 | # MAGIC %md
64 | # MAGIC ### Metrics
65 |
66 | # COMMAND ----------
67 |
68 | parkingbay_count = t_parkingbay_sdf.count()
69 | sensordata_count = t_sensordata_sdf.count()
70 | parkingbay_malformed_count = t_parkingbay_malformed_sdf.count()
71 | sensordata_malformed_count = t_sensordata_malformed_sdf.count()
72 |
73 | tc.track_event('Standardize : Completed load',
74 | properties={'parkingbay_filepath': parkingbay_filepath,
75 | 'sensors_filepath': sensors_filepath,
76 | 'load_id': load_id
77 | },
78 | measurements={'parkingbay_count': parkingbay_count,
79 | 'sensordata_count': sensordata_count,
80 | 'parkingbay_malformed_count': parkingbay_malformed_count,
81 | 'sensordata_malformed_count': sensordata_malformed_count
82 | })
83 | tc.flush()
84 |
85 | # COMMAND ----------
86 |
87 | dbutils.notebook.exit("success")
88 |
--------------------------------------------------------------------------------
/databricks/notebooks/03_transform.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | dbutils.widgets.text("loadid", "", "Load Id")
3 | loadid = dbutils.widgets.get("loadid")
4 |
5 | # COMMAND ----------
6 |
7 | from applicationinsights import TelemetryClient
8 | tc = TelemetryClient(dbutils.secrets.get(scope = "storage_scope", key = "appinsights_key"))
9 |
10 | # COMMAND ----------
11 |
12 | import datetime
13 | import os
14 | from pyspark.sql.functions import col, lit
15 | import ddo_transform.transform as t
16 | import ddo_transform.util as util
17 |
18 | load_id = loadid
19 | loaded_on = datetime.datetime.now()
20 | base_path = 'dbfs:/mnt/datalake/data/dw/'
21 |
22 | # Read interim cleansed data
23 | parkingbay_sdf = spark.read.table("interim.parking_bay").filter(col('load_id') == lit(load_id))
24 | sensordata_sdf = spark.read.table("interim.sensor").filter(col('load_id') == lit(load_id))
25 |
26 | # COMMAND ----------
27 |
28 | # MAGIC %md
29 | # MAGIC ### Transform and load Dimension tables
30 |
31 | # COMMAND ----------
32 |
33 | # Read existing Dimensions
34 | dim_parkingbay_sdf = spark.read.table("dw.dim_parking_bay")
35 | dim_location_sdf = spark.read.table("dw.dim_location")
36 | dim_st_marker = spark.read.table("dw.dim_st_marker")
37 |
38 | # Transform
39 | new_dim_parkingbay_sdf = t.process_dim_parking_bay(parkingbay_sdf, dim_parkingbay_sdf, load_id, loaded_on).cache()
40 | new_dim_location_sdf = t.process_dim_location(sensordata_sdf, dim_location_sdf, load_id, loaded_on).cache()
41 | new_dim_st_marker_sdf = t.process_dim_st_marker(sensordata_sdf, dim_st_marker, load_id, loaded_on).cache()
42 |
43 | # Load
44 | util.save_overwrite_unmanaged_table(spark, new_dim_parkingbay_sdf, table_name="dw.dim_parking_bay", path=os.path.join(base_path, "dim_parking_bay"))
45 | util.save_overwrite_unmanaged_table(spark, new_dim_location_sdf, table_name="dw.dim_location", path=os.path.join(base_path, "dim_location"))
46 | util.save_overwrite_unmanaged_table(spark, new_dim_st_marker_sdf, table_name="dw.dim_st_marker", path=os.path.join(base_path, "dim_st_marker"))
47 |
48 | # COMMAND ----------
49 |
50 | # MAGIC %md
51 | # MAGIC ### Transform and load Fact tables
52 |
53 | # COMMAND ----------
54 |
55 | # Read existing Dimensions
56 | dim_parkingbay_sdf = spark.read.table("dw.dim_parking_bay")
57 | dim_location_sdf = spark.read.table("dw.dim_location")
58 | dim_st_marker = spark.read.table("dw.dim_st_marker")
59 |
60 | # Process
61 | nr_fact_parking = t.process_fact_parking(sensordata_sdf, dim_parkingbay_sdf, dim_location_sdf, dim_st_marker, load_id, loaded_on)
62 |
63 | # Insert new rows
64 | nr_fact_parking.write.mode("append").insertInto("dw.fact_parking")
65 |
66 | # COMMAND ----------
67 |
68 | # MAGIC %md
69 | # MAGIC ### Metrics
70 |
71 | # COMMAND ----------
72 |
73 | new_dim_parkingbay_count = spark.read.table("dw.dim_parking_bay").count()
74 | new_dim_location_count = spark.read.table("dw.dim_location").count()
75 | new_dim_st_marker_count = spark.read.table("dw.dim_st_marker").count()
76 | nr_fact_parking_count = nr_fact_parking.count()
77 |
78 |
79 | tc.track_event('Transform : Completed load',
80 | properties={'load_id': load_id},
81 | measurements={'new_dim_parkingbay_count': new_dim_parkingbay_count,
82 | 'new_dim_location_count': new_dim_location_count,
83 | 'new_dim_st_marker_count': new_dim_st_marker_count,
84 | 'newrecords_fact_parking_count': nr_fact_parking_count
85 | })
86 | tc.flush()
87 |
88 | # COMMAND ----------
89 |
90 | dbutils.notebook.exit("success")
91 |
--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation
7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
9 | #
10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions
11 | # of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
17 | # DEALINGS IN THE SOFTWARE.
18 |
19 | set -o errexit
20 | set -o pipefail
21 | set -o nounset
22 | # set -o xtrace # For debugging
23 |
24 | # Check if required utilities are installed
25 | command -v jq >/dev/null 2>&1 || { echo >&2 "I require jq but it's not installed. See https://stedolan.github.io/jq/. Aborting."; exit 1; }
26 | command -v az >/dev/null 2>&1 || { echo >&2 "I require azure cli but it's not installed. See https://bit.ly/2Gc8IsS. Aborting."; exit 1; }
27 |
28 | # Check if user is logged in
29 | [[ -n $(az account show 2> /dev/null) ]] || { echo "Please login via the Azure CLI: "; az login; }
30 |
31 | # Globals and constants
32 | TIMESTAMP=$(date +%s)
33 | RED='\033[0;31m'
34 | ORANGE='\033[0;33m'
35 | NC='\033[0m'
36 |
37 | # Set path
38 | dir_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ); cd "$dir_path"
39 |
40 |
41 | ###################
42 | # USER PARAMETERS
43 |
44 | rg_name_pre="${1-}"
45 | rg_location="${2-}"
46 | sub_id="${3-}"
47 |
48 | # while [[ -z $env_name ]]; do
49 | # read -rp "$(echo -e ${ORANGE}"Enter environment (dev, stg or prod): "${NC})" env_name
50 | # # TODO validate if dev, stg, prod
51 | # done
52 |
53 | while [[ -z $rg_name_pre ]]; do
54 | read -rp "$(echo -e ${ORANGE}"Enter Resource Group name: "${NC})" rg_name_pre
55 | done
56 |
57 | while [[ -z $rg_location ]]; do
58 | read -rp "$(echo -e ${ORANGE}"Enter Azure Location (e.g. EAST US 2): "${NC})" rg_location
59 | done
60 |
61 | while [[ -z $sub_id ]]; do
62 | # Check if user only has one sub
63 | sub_count=$(az account list --output json | jq '. | length')
64 | if (( $sub_count != 1 )); then
65 | az account list --output table
66 | read -rp "$(echo -e ${ORANGE}"Enter Azure Subscription Id you wish to deploy to (enter to use Default): "${NC})" sub_id
67 | fi
68 | # If still empty then user selected IsDefault
69 | if [[ -z $sub_id ]]; then
70 | sub_id=$(az account show --output json | jq -r '.id')
71 | fi
72 | done
73 |
74 | # By default, grant all KeyVault permissions to the deployer
75 | # Retrieve KeyVault User Id
76 | userId=$(az account show --output json | jq -r '.user.name')
77 | kvOwnerObjectId=$(az ad user show --id $userId \
78 | --output json | jq -r '.objectId')
79 |
80 |
81 | ###################
82 | # DEPLOY ALL
83 |
84 | for env_name in dev stg prod; do
85 | # Azure infrastructure
86 | . ./infrastructure/deploy_infrastructure.sh "$env_name" "$rg_name_pre-$env_name" $rg_location $sub_id $kvOwnerObjectId
87 |
88 | # Databricks
89 | . ./databricks/create_secrets.sh "$env_name"
90 | . ./databricks/configure_databricks.sh "$env_name"
91 | done
92 |
--------------------------------------------------------------------------------
/docs/CI_CD.md:
--------------------------------------------------------------------------------
1 | ### Build Pipelines
2 |
3 | 1. **Build - Quality Assurance**
4 | - Purpose: Ensure code quality and integrity
5 | - Trigger: Pull Request to Master
6 | - Steps:
7 | 1. Build Python packages
8 | 2. Run unit tests
9 | 3. Code Coverage
10 | 4. Linting
11 | 2. **Build - Artifacts**
12 | - Purpose: To produce necessary artifacts for Release
13 | - Trigger: Commit to Master
14 | - Steps:
15 | 1. Build and create Python Wheel
16 | 2. Publish artifacts:
17 | - Python Wheel
18 | - Databricks Notebooks and cluster configuration
19 | - Data Factory pipeline definitions
20 | - IaC - ARM templates, Bash scripts
21 | - 3rd party library dependencies (JARs, etc)
22 |
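As a rough, command-level sketch of the Quality Assurance steps above: the tools shown here (`pytest`, `coverage`, `flake8`) and the paths are assumptions for illustration only; the authoritative definition is the pipeline YAML in the repository.

```bash
# Minimal sketch of the QA build steps, run against the src/ddo_transform package.
# pytest, coverage and flake8 are assumed tools, not necessarily what the pipeline uses.
cd src/ddo_transform

# 1. Build the Python package (wheel lands under dist/)
python setup.py bdist_wheel

# 2. Run unit tests
pytest tests/

# 3. Code coverage
coverage run -m pytest tests/
coverage report

# 4. Linting
flake8 ddo_transform tests
```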
23 | ### Release Pipelines
24 |
25 | Currently, there is one multi-stage release pipeline with the following stages. Each stage deploys to a different environment.
26 |
27 | 1. **On-demand Integration Testing (QA) environment** - **TODO**
28 | 1. Deploy Azure resources with ARM templates + Bash scripts
29 | 2. Store sensitive configuration information in shared QA KeyVault
30 | 3. Download integration test data from shared Storage to the newly deployed ADLS Gen2.
31 | 4. Configure Databricks workspace
32 | - Setup Data mount
33 | - Create Databricks secrets
34 | 5. Deploy Data Application to Databricks
35 | - Deploy cluster given configuration
36 | - Upload Jars, Python wheels to DBFS
37 | - Install libraries on cluster
38 | 6. Deploy ADF pipeline
39 | 7. Run integration tests
40 | - Trigger ADF Pipeline
41 | - Databricks job to run integration test notebook
42 |
43 | 2. **Deploy to Staging**
44 | - NOTE: *The Staging environment should be a mirror of Production and thus should already have a configured Databricks workspace (secrets, data mount, etc.), ADLS Gen2, ADF Pipeline, KeyVault, etc.*
45 | 1. Hydrate data with latest production data
46 | 2. Deploy Data Application to Databricks
47 | - Deploy cluster given configuration
48 | - Upload Jars, Python wheels to DBFS
49 | - Install libraries on cluster
50 | 3. Deploy ADF Pipeline and activate triggers
51 | 4. Run integration tests
52 |
53 | 3. **Deploy to Production**
54 | 1. Deploy Data Application to Databricks (see the CLI sketch at the end of this document)
55 | - Deploy cluster given configuration
56 | - Upload Jars, Python wheels to DBFS
57 | - Install libraries on cluster
58 | 2. Deploy ADF Pipeline
59 | 3. Swap between existing deployment and newly released deployment
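
The "Deploy Data Application to Databricks" steps that recur in each stage map directly onto Databricks CLI calls, in the same way the repository's `databricks/deploy_app.sh` and `databricks/configure_databricks.sh` scripts do. A condensed sketch, with placeholder values for the release id, wheel path and cluster id:

```bash
# Condensed sketch of the "Deploy Data Application to Databricks" steps.
# RELEASE_ID, MOUNT_DATA_PATH, WHEEL_FILE_PATH and the cluster id are placeholders.
RELEASE_ID=42
MOUNT_DATA_PATH=/mnt/datalake
WHEEL_FILE_PATH=./dist/ddo_transform-py3-none-any.whl
dbfs_libs_path="dbfs:${MOUNT_DATA_PATH}/libs/release_${RELEASE_ID}"

# Deploy cluster given configuration
databricks clusters create --json-file ./config/cluster.config.json

# Upload JARs and Python wheels to DBFS
databricks fs cp ./libs/ "${dbfs_libs_path}" --recursive
databricks fs cp "${WHEEL_FILE_PATH}" "${dbfs_libs_path}"

# Install libraries on the cluster
databricks libraries install \
    --jar "${dbfs_libs_path}/azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar" \
    --cluster-id "<cluster-id>"
```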
--------------------------------------------------------------------------------
/docs/NDCSydney2019-DataDevOps.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/docs/NDCSydney2019-DataDevOps.pdf
--------------------------------------------------------------------------------
/images/CI_CD_process.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/images/CI_CD_process.PNG
--------------------------------------------------------------------------------
/images/Release_1_Agent_DeployToDatabricks.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/images/Release_1_Agent_DeployToDatabricks.PNG
--------------------------------------------------------------------------------
/images/architecture.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/images/architecture.PNG
--------------------------------------------------------------------------------
/infrastructure/README.md:
--------------------------------------------------------------------------------
1 | All parameters are set in the azuredeploy.parameters.<env>.json files.
2 |
3 | A .env.<env> file is produced after every deployment.
4 |
5 | ## Scripts
6 |
7 | deploy.sh
8 | └── deploy_infrastructure.sh <- deploys resources to a specific environment (see the example invocation below)
9 |     └── configure_adlagen2.sh <- configures the newly deployed ADLS Gen2 storage account
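
A minimal example of invoking deploy_infrastructure.sh directly from the repository root. All argument values below are placeholders; in practice deploy.sh sources this script once per environment and passes the values through.

```bash
# Arguments, in order: environment name, resource group name, Azure location,
# subscription id, and the AAD objectId to be granted full Key Vault access.
# Every value shown here is a placeholder.
./infrastructure/deploy_infrastructure.sh \
    "dev" \
    "my-datadevops-rg-dev" \
    "EAST US 2" \
    "00000000-0000-0000-0000-000000000000" \
    "<kv-owner-object-id>"
```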
--------------------------------------------------------------------------------
/infrastructure/azuredeploy.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
3 | "contentVersion": "1.0.0.0",
4 | "parameters": {
5 | "deployNs": {
6 | "defaultValue": "ddo",
7 | "type": "string"
8 | },
9 | "env": {
10 | "defaultValue": "dev",
11 | "type": "string",
12 | "allowedValues": [
13 | "dev",
14 | "stg",
15 | "prod"
16 | ]
17 | },
18 | "dbricksName": {
19 | "defaultValue": "[concat(parameters('deployNs'), 'dbricks', parameters('env'))]",
20 | "type": "String"
21 | },
22 | "kvName": {
23 | "defaultValue": "[concat(parameters('deployNs'), 'kv', parameters('env'), substring(uniqueString(resourceGroup().id), 0, 8))]",
24 | "type": "String",
25 | "metadata": {
26 | "description": "Key Vault Name"
27 | }
28 | },
29 | "kvOwnerObjectId": {
30 | "type": "String",
31 | "metadata": {
32 | "description": "Active Directory ObjectId to be granted full rights to KV"
33 | }
34 | },
35 | "storName": {
36 | "defaultValue": "[concat(parameters('deployNs'), 'stor', parameters('env'), substring(uniqueString(resourceGroup().id), 0, 8))]",
37 | "type": "String",
38 | "metadata": {
39 | "description": "Storage account - ADLA Gen2"
40 | }
41 | },
42 | "spStorName": {
43 | "defaultValue": "[concat(parameters('deployNs'), 'sp', parameters('env'), substring(uniqueString(resourceGroup().id), 0, 8))]",
44 | "type": "String",
45 | "metadata": {
46 | "description": "Service Principal to be granted access to Storage Account - ADLA Gen2"
47 | }
48 | },
49 | "adfName": {
50 | "defaultValue": "[concat(parameters('deployNs'), 'adf', parameters('env'), substring(uniqueString(resourceGroup().id), 0, 8))]",
51 | "type": "string",
52 | "metadata": {
53 | "description": "Data Factory Name"
54 | }
55 | }
56 | },
57 | "variables": {
58 | "managedResourceGroupId": "[concat(subscription().id, '/resourceGroups/', variables('managedResourceGroupName'))]",
59 | "managedResourceGroupName": "[concat('databricks-rg-', parameters('dbricksName'), '-', uniqueString(parameters('dbricksName'), resourceGroup().id))]"
60 | },
61 | "resources": [
62 | {
63 | "apiVersion": "2018-04-01",
64 | "location": "[resourceGroup().location]",
65 | "name": "[parameters('dbricksName')]",
66 | "tags": {
67 | "displayName": "Databricks Workspace",
68 | "Environment": "[parameters('env')]"
69 | },
70 | "sku": {
71 | "name": "premium"
72 | },
73 | "properties": {
74 | "ManagedResourceGroupId": "[variables('managedResourceGroupId')]"
75 | },
76 | "type": "Microsoft.Databricks/workspaces"
77 | },
78 | {
79 | "type": "Microsoft.KeyVault/vaults",
80 | "name": "[parameters('kvName')]",
81 | "apiVersion": "2015-06-01",
82 | "location": "[resourceGroup().location]",
83 | "tags": {
84 | "displayName": "Key Vault",
85 | "Environment": "[parameters('env')]"
86 | },
87 | "properties": {
88 | "enabledForDeployment": false,
89 | "enabledForTemplateDeployment": true,
90 | "enabledForVolumeEncryption": false,
91 | "tenantId": "[subscription().tenantId]",
92 | "accessPolicies": [{
93 | "tenantId": "[subscription().tenantId]",
94 | "objectId": "[parameters('kvOwnerObjectId')]",
95 | "permissions": {
96 | "keys": [
97 | "All"
98 | ],
99 | "secrets": [
100 | "All"
101 | ]
102 | }
103 | }, {
104 | "tenantId": "[subscription().tenantId]",
105 | "objectId": "[reference(parameters('adfName'), '2018-06-01', 'Full').identity.principalId]",
106 | "permissions": {
107 | "secrets": [
108 | "get", "list"
109 | ]
110 | }
111 | }],
112 | "sku": {
113 | "family": "A",
114 | "name": "Standard"
115 | }
116 | }
117 | },
118 | {
119 | "type": "Microsoft.Storage/storageAccounts",
120 | "sku": {
121 | "name": "Standard_LRS",
122 | "tier": "Standard"
123 | },
124 | "kind": "StorageV2",
125 | "name": "[parameters('storName')]",
126 | "apiVersion": "2018-07-01",
127 | "location": "[resourceGroup().location]",
128 | "tags": {
129 | "displayName": "Data Lake",
130 | "Environment": "[parameters('env')]"
131 | },
132 | "scale": null,
133 | "properties": {
134 | "isHnsEnabled": true,
135 | "networkAcls": {
136 | "bypass": "AzureServices",
137 | "virtualNetworkRules": [],
138 | "ipRules": [],
139 | "defaultAction": "Allow"
140 | },
141 | "supportsHttpsTrafficOnly": true,
142 | "encryption": {
143 | "services": {
144 | "file": {
145 | "enabled": true
146 | },
147 | "blob": {
148 | "enabled": true
149 | }
150 | },
151 | "keySource": "Microsoft.Storage"
152 | },
153 | "accessTier": "Hot"
154 | },
155 | "dependsOn": []
156 | },
157 | {
158 | "apiVersion": "2018-06-01",
159 | "name": "[parameters('adfName')]",
160 | "location": "[resourceGroup().location]",
161 | "tags": {
162 | "displayName": "DataFactory",
163 | "Environment": "[parameters('env')]"
164 | },
165 | "type": "Microsoft.DataFactory/factories",
166 | "identity": {
167 | "type": "SystemAssigned"
168 | },
169 | "properties": {}
170 | }
171 | ],
172 | "outputs": {
173 | "dbricksName": {
174 | "value": "[parameters('dbricksName')]",
175 | "type": "string"
176 | },
177 | "dbricksLocation": {
178 | "value": "[resourceGroup().location]",
179 | "type": "string"
180 | },
181 | "kvName": {
182 | "value": "[parameters('kvName')]",
183 | "type": "string"
184 | },
185 | "storName": {
186 | "value": "[parameters('storName')]",
187 | "type": "string"
188 | },
189 | "spStorName": {
190 | "value": "[parameters('spStorName')]",
191 | "type": "string"
192 | }
193 | }
194 | }
--------------------------------------------------------------------------------
/infrastructure/azuredeploy.parameters.dev.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#",
3 | "contentVersion": "1.0.0.0",
4 | "parameters": {
5 | "env": {
6 | "value": "dev"
7 | }
8 | }
9 | }
--------------------------------------------------------------------------------
/infrastructure/azuredeploy.parameters.prod.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#",
3 | "contentVersion": "1.0.0.0",
4 | "parameters": {
5 | "env": {
6 | "value": "prod"
7 | }
8 | }
9 | }
--------------------------------------------------------------------------------
/infrastructure/azuredeploy.parameters.stg.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#",
3 | "contentVersion": "1.0.0.0",
4 | "parameters": {
5 | "env": {
6 | "value": "stg"
7 | }
8 | }
9 | }
--------------------------------------------------------------------------------
/infrastructure/configure_adlagen2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation
7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
9 | #
10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions
11 | # of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
17 | # DEALINGS IN THE SOFTWARE.
18 |
19 | #######################################################
20 | # Configure ADLS Gen2 Service Principal permissions
21 | #
22 | # This script performs the following:
23 | # 1. Create Service Principal for ADLS Gen2
24 | # 2. Grant correct RBAC role to the SP
25 | # 3. Create File System using REST API
26 | #
27 | # Prerequisites:
28 | # - User is logged in to the azure cli
29 | # - Correct Azure subscription is selected
30 | #######################################################
31 |
32 | set -o errexit
33 | set -o pipefail
34 | set -o nounset
35 | # set -o xtrace # For debugging
36 |
37 | ################
38 | # PARAMETERS
39 | ################
40 | rg_name="${1-}"
41 | storage_account="${2-}"
42 | sp_stor_id="${3-}"
43 | sp_stor_pass="${4-}"
44 | sp_stor_tenantid="${5-}"
45 |
46 | storage_fs=datalake # Constant
47 |
48 | # Retrieve full storage account azure id
49 | storage_account_id=$(az storage account show \
50 | --name "$storage_account" \
51 | --resource-group "$rg_name" \
52 | --output json |
53 | jq -r '.id')
54 |
55 | # See this issue: https://github.com/Azure/azure-powershell/issues/2286
56 | # TODO: make more robust
57 | sleep 1m
58 |
59 | # Grant "Storage Blob Data Contributor" role to the SP
60 | echo "Granting 'Storage Blob Data Contributor' for '$storage_account' to SP"
61 | az role assignment create --assignee "$sp_stor_id" \
62 | --role "Storage Blob Data Contributor" \
63 | --scope "$storage_account_id"
64 |
65 | # Because ADLS Gen2 is not yet supported by the az cli 2.0 as of 2019/02/04
66 | # we resort to calling the REST API directly:
67 | # https://docs.microsoft.com/en-us/rest/api/storageservices/datalakestoragegen2/filesystem
68 | #
69 | # For information on calling Azure REST API, see here:
70 | # https://docs.microsoft.com/en-us/rest/api/azure/
71 |
72 | # It takes time for AD permissions to propagate
73 | # TODO: make more robust
74 | sleep 2m
75 |
76 | # Use the service principal to generate a bearer token
77 | bearer_token=$(curl -X POST -d "grant_type=client_credentials&client_id=${sp_stor_id}&client_secret=${sp_stor_pass}&resource=https%3A%2F%2Fstorage.azure.com%2F" \
78 | https://login.microsoftonline.com/${sp_stor_tenantid}/oauth2/token |
79 | jq -r '.access_token')
80 |
81 | # Use bearer token to create file system
82 | echo "Creating ADLA Gen2 File System '$storage_fs' in storage account: '$storage_account'"
83 | curl -X PUT -d '' -H 'Content-Type: application/json' -H "Authorization: Bearer ${bearer_token}" \
84 | https://${storage_account}.dfs.core.windows.net/${storage_fs}?resource=filesystem
85 |
86 | echo "Completed configuring ADLA Gen2."
--------------------------------------------------------------------------------
/infrastructure/deploy_infrastructure.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation
7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
9 | #
10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions
11 | # of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
17 | # DEALINGS IN THE SOFTWARE.
18 |
19 | #######################################################
20 | # Deploys all necessary azure resources and stores
21 | # configuration information in an .ENV file
22 | #
23 | # Prerequisites:
24 | # - User is logged in to the azure cli
25 | # - Correct Azure subscription is selected
26 | #######################################################
27 |
28 | set -o errexit
29 | set -o pipefail
30 | set -o nounset
31 | # set -o xtrace # For debugging
32 |
33 | ###################
34 | # PARAMETERS
35 |
36 | env_name="${1-}"
37 | rg_name="${2-}"
38 | rg_location="${3-}"
39 | sub_id="${4-}"
40 | kvOwnerObjectId="${5-}"
41 |
42 | env_file="../.env.${env_name}"
43 |
44 | # Set path
45 | parent_dir=$(pwd -P)
46 | dir_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ); cd "$dir_path"
47 |
48 |
49 | #####################
50 | # DEPLOY ARM TEMPLATE
51 |
52 | # Set account to where ARM template will be deployed to
53 | echo "Deploying to Subscription: $sub_id"
54 | az account set --subscription $sub_id
55 |
56 | # Create resource group
57 | echo "Creating resource group: $rg_name"
58 | az group create --name "$rg_name" --location "$rg_location"
59 |
60 | # Deploy arm template
61 | echo "Deploying resources into $rg_name"
62 | arm_output=$(az group deployment create \
63 | --resource-group "$rg_name" \
64 | --template-file "./azuredeploy.json" \
65 | --parameters @"./azuredeploy.parameters.${env_name}.json" \
66 | --parameters "kvOwnerObjectId=${kvOwnerObjectId}" \
67 | --output json)
68 |
69 | if [[ -z $arm_output ]]; then
70 | echo >&2 "ARM deployment failed."
71 | exit 1
72 | fi
73 |
74 | ###########################
75 | # RETRIEVE DATABRICKS INFORMATION
76 |
77 | # Ask user to configure databricks cli
78 | # TODO: see if this can be automated
79 | dbricks_name=$(echo $arm_output | jq -r '.properties.outputs.dbricksName.value')
80 | echo -e "${ORANGE}"
81 | echo "Configure your databricks cli to connect to the newly created Databricks workspace: ${dbricks_name}. See here for more info: https://bit.ly/2GUwHcw."
82 | databricks configure --token
83 | echo -e "${NC}"
84 |
85 | # Databricks token and details
86 | dbricks_location=$(echo $arm_output | jq -r '.properties.outputs.dbricksLocation.value')
87 | dbi_token=$(awk '/token/ && NR==3 {print $0;exit;}' ~/.databrickscfg | cut -d' ' -f3)
88 | [[ -n $dbi_token ]] || { echo >&2 "Databricks cli not configured correctly. Please run databricks configure --token. Aborting."; exit 1; }
89 |
90 |
91 | #########################
92 | # RETRIEVE CONFIG INFORMATION
93 |
94 | # Retrieve KeyVault details
95 | kv_name=$(echo $arm_output | jq -r '.properties.outputs.kvName.value')
96 |
97 | # Retrieve storage account (ADLS Gen2) details
98 | storage_account=$(echo $arm_output | jq -r '.properties.outputs.storName.value')
99 | storage_account_key=$(az storage account keys list \
100 | --account-name $storage_account \
101 | --resource-group $rg_name \
102 | --output json |
103 | jq -r '.[0].value')
104 |
105 | # Retrieve SP name for ADLS Gen2 from arm output
106 | sp_stor_name=$(echo $arm_output | jq -r '.properties.outputs.spStorName.value')
107 |
108 |
109 | #########################
110 | # CREATE AND CONFIGURE SERVICE PRINCIPAL FOR ADLS GEN2
111 |
112 | echo "Creating Service Principal (SP) for access to ADLS Gen2: '$sp_stor_name'"
113 | sp_stor_out=$(az ad sp create-for-rbac --name $sp_stor_name \
114 | --skip-assignment \
115 | --output json)
116 | sp_stor_id=$(echo $sp_stor_out | jq -r '.appId')
117 | sp_stor_pass=$(echo $sp_stor_out | jq -r '.password')
118 | sp_stor_tenantid=$(echo $sp_stor_out | jq -r '.tenant')
119 |
120 | . ./configure_adlagen2.sh "$rg_name" "$storage_account" "$sp_stor_id" "$sp_stor_pass" "$sp_stor_tenantid"
121 |
122 |
123 | ####################
124 | # SAVE RELEVANT SECRETS IN KEYVAULT
125 |
126 | az keyvault secret set --vault-name $kv_name --name "storageAccount" --value $storage_account
127 | az keyvault secret set --vault-name $kv_name --name "storageKey" --value $storage_account_key
128 | az keyvault secret set --vault-name $kv_name --name "spStorName" --value $sp_stor_name
129 | az keyvault secret set --vault-name $kv_name --name "spStorId" --value $sp_stor_id
130 | az keyvault secret set --vault-name $kv_name --name "spStorPass" --value $sp_stor_pass
131 | az keyvault secret set --vault-name $kv_name --name "spStorTenantId" --value $sp_stor_tenantid
132 | az keyvault secret set --vault-name $kv_name --name "dbricksDomain" --value https://${dbricks_location}.azuredatabricks.net
133 | az keyvault secret set --vault-name $kv_name --name "dbricksToken" --value $dbi_token
134 |
135 |
136 | ####################
137 | # BUILD ENV FILE FROM CONFIG INFORMATION
138 |
139 | echo "Appending configuration to .env file."
140 | cat << EOF >> $env_file
141 |
142 | # ------ Configuration from deployment on ${TIMESTAMP} -----------
143 | RESOURCE_GROUP=${rg_name}
144 | BLOB_STORAGE_ACCOUNT=${storage_account}
145 | BLOB_STORAGE_KEY=${storage_account_key}
146 | SP_STOR_NAME=${sp_stor_name}
147 | SP_STOR_ID=${sp_stor_id}
148 | SP_STOR_PASS=${sp_stor_pass}
149 | SP_STOR_TENANT=${sp_stor_tenantid}
150 | KV_NAME=${kv_name}
151 | DATABRICKS_HOST=https://${dbricks_location}.azuredatabricks.net
152 | DATABRICKS_TOKEN=${dbi_token}
153 |
154 | EOF
155 | echo "Completed deploying Azure resources $rg_name ($env_name)"
156 |
157 |
158 | echo "Return to parent script dir: $parent_dir"
159 | cd "$parent_dir"
--------------------------------------------------------------------------------
/samples/azuresql/README.md:
--------------------------------------------------------------------------------
1 | ## Deploying Azure SQL Database
2 |
3 | ### Build Pipeline
4 |
5 | ### Release Pipeline
--------------------------------------------------------------------------------
/samples/azuresql/azure-pipelines-ci.yml:
--------------------------------------------------------------------------------
1 | # Starter pipeline
2 | # Start with a minimal pipeline that you can customize to build and deploy your code.
3 | # Add steps that build, run tests, deploy, and more:
4 | # https://aka.ms/yaml
5 |
6 | trigger:
7 | - master
8 |
9 | pool:
10 | vmImage: 'vs2017-win2016'
11 |
12 | variables:
13 | SLN_DIR: 'samples\azuresql\ddo_samples_azuresql'
14 | BUILD_PLATFORM: 'any cpu'
15 | BUILD_CONFIGURATION: 'release'
16 |
17 | steps:
18 | - task: VSBuild@1
19 | displayName: 'Build solution sln'
20 | inputs:
21 | solution: '$(SLN_DIR)\ddo_samples_azuresql.sln'
22 | platform: '$(BUILD_PLATFORM)'
23 | configuration: '$(BUILD_CONFIGURATION)'
24 |
25 | - task: VSTest@2
26 | displayName: 'VsTest - testAssemblies'
27 | inputs:
28 | testAssemblyVer2: |
29 | $(SLN_DIR)\**\$(BUILD_CONFIGURATION)\*test*.dll
30 | !**\obj\**
31 | platform: '$(BUILD_PLATFORM)'
32 | configuration: '$(BUILD_CONFIGURATION)'
33 |
34 | - task: PublishSymbols@2
35 | displayName: 'Publish symbols path'
36 | inputs:
37 | SearchPattern: '$(SLN_DIR)\**\bin\**\*.pdb'
38 | PublishSymbols: false
39 | continueOnError: true
40 |
41 | - task: CopyFiles@2
42 | displayName: 'Copy Files to: $(build.artifactstagingdirectory)'
43 | inputs:
44 | SourceFolder: '$(system.defaultworkingdirectory)'
45 | Contents: '$(SLN_DIR)\**\bin\$(BUILD_CONFIGURATION)\**'
46 | TargetFolder: '$(build.artifactstagingdirectory)'
47 | condition: succeededOrFailed()
48 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 15
4 | VisualStudioVersion = 15.0.28307.645
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{00D1A9C2-B5F0-4AF3-8072-F6C62B433612}") = "ddo_samples_azuresql", "ddo_samples_azuresql\ddo_samples_azuresql.sqlproj", "{387EBFC5-ACB1-4445-A25F-D70D18D34C30}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Any CPU = Debug|Any CPU
11 | Release|Any CPU = Release|Any CPU
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {387EBFC5-ACB1-4445-A25F-D70D18D34C30}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | {387EBFC5-ACB1-4445-A25F-D70D18D34C30}.Debug|Any CPU.Build.0 = Debug|Any CPU
16 | {387EBFC5-ACB1-4445-A25F-D70D18D34C30}.Debug|Any CPU.Deploy.0 = Debug|Any CPU
17 | {387EBFC5-ACB1-4445-A25F-D70D18D34C30}.Release|Any CPU.ActiveCfg = Release|Any CPU
18 | {387EBFC5-ACB1-4445-A25F-D70D18D34C30}.Release|Any CPU.Build.0 = Release|Any CPU
19 | {387EBFC5-ACB1-4445-A25F-D70D18D34C30}.Release|Any CPU.Deploy.0 = Release|Any CPU
20 | EndGlobalSection
21 | GlobalSection(SolutionProperties) = preSolution
22 | HideSolutionNode = FALSE
23 | EndGlobalSection
24 | GlobalSection(ExtensibilityGlobals) = postSolution
25 | SolutionGuid = {D547B017-8D99-4D13-8CA6-E7BC0430B4E5}
26 | EndGlobalSection
27 | EndGlobal
28 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Sequences/SalesOrderNumber.sql:
--------------------------------------------------------------------------------
1 | CREATE SEQUENCE [SalesLT].[SalesOrderNumber]
2 | AS INT
3 | START WITH 1
4 | INCREMENT BY 1;
5 |
6 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/Address.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [SalesLT].[Address] (
2 | [AddressID] INT IDENTITY (1, 1) NOT NULL,
3 | [AddressLine1] NVARCHAR (60) NOT NULL,
4 | [AddressLine2] NVARCHAR (60) NULL,
5 | [City] NVARCHAR (30) NOT NULL,
6 | [StateProvince] [dbo].[Name] NOT NULL,
7 | [CountryRegion] [dbo].[Name] NOT NULL,
8 | [PostalCode] NVARCHAR (15) NOT NULL,
9 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_Address_rowguid] DEFAULT (newid()) NOT NULL,
10 | [ModifiedDate] DATETIME CONSTRAINT [DF_Address_ModifiedDate] DEFAULT (getdate()) NOT NULL,
11 | CONSTRAINT [PK_Address_AddressID] PRIMARY KEY CLUSTERED ([AddressID] ASC),
12 | CONSTRAINT [AK_Address_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC)
13 | );
14 |
15 |
16 | GO
17 | CREATE NONCLUSTERED INDEX [IX_Address_StateProvince]
18 | ON [SalesLT].[Address]([StateProvince] ASC);
19 |
20 |
21 | GO
22 | CREATE NONCLUSTERED INDEX [IX_Address_AddressLine1_AddressLine2_City_StateProvince_PostalCode_CountryRegion]
23 | ON [SalesLT].[Address]([AddressLine1] ASC, [AddressLine2] ASC, [City] ASC, [StateProvince] ASC, [PostalCode] ASC, [CountryRegion] ASC);
24 |
25 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/Customer.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [SalesLT].[Customer] (
2 | [CustomerID] INT IDENTITY (1, 1) NOT NULL,
3 | [NameStyle] [dbo].[NameStyle] CONSTRAINT [DF_Customer_NameStyle] DEFAULT ((0)) NOT NULL,
4 | [Title] NVARCHAR (8) NULL,
5 | [FirstName] [dbo].[Name] NOT NULL,
6 | [MiddleName] [dbo].[Name] NULL,
7 | [LastName] [dbo].[Name] NOT NULL,
8 | [Suffix] NVARCHAR (10) NULL,
9 | [CompanyName] NVARCHAR (128) NULL,
10 | [SalesPerson] NVARCHAR (256) NULL,
11 | [EmailAddress] NVARCHAR (50) NULL,
12 | [Phone] [dbo].[Phone] NULL,
13 | [PasswordHash] VARCHAR (128) NOT NULL,
14 | [PasswordSalt] VARCHAR (10) NOT NULL,
15 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_Customer_rowguid] DEFAULT (newid()) NOT NULL,
16 | [ModifiedDate] DATETIME CONSTRAINT [DF_Customer_ModifiedDate] DEFAULT (getdate()) NOT NULL,
17 | CONSTRAINT [PK_Customer_CustomerID] PRIMARY KEY CLUSTERED ([CustomerID] ASC),
18 | CONSTRAINT [AK_Customer_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC)
19 | );
20 |
21 |
22 | GO
23 | CREATE NONCLUSTERED INDEX [IX_Customer_EmailAddress]
24 | ON [SalesLT].[Customer]([EmailAddress] ASC);
25 |
26 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/CustomerAddress.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [SalesLT].[CustomerAddress] (
2 | [CustomerID] INT NOT NULL,
3 | [AddressID] INT NOT NULL,
4 | [AddressType] [dbo].[Name] NOT NULL,
5 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_CustomerAddress_rowguid] DEFAULT (newid()) NOT NULL,
6 | [ModifiedDate] DATETIME CONSTRAINT [DF_CustomerAddress_ModifiedDate] DEFAULT (getdate()) NOT NULL,
7 | CONSTRAINT [PK_CustomerAddress_CustomerID_AddressID] PRIMARY KEY CLUSTERED ([CustomerID] ASC, [AddressID] ASC),
8 | CONSTRAINT [FK_CustomerAddress_Address_AddressID] FOREIGN KEY ([AddressID]) REFERENCES [SalesLT].[Address] ([AddressID]),
9 | CONSTRAINT [FK_CustomerAddress_Customer_CustomerID] FOREIGN KEY ([CustomerID]) REFERENCES [SalesLT].[Customer] ([CustomerID]),
10 | CONSTRAINT [AK_CustomerAddress_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC)
11 | );
12 |
13 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/Product.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [SalesLT].[Product] (
2 | [ProductID] INT IDENTITY (1, 1) NOT NULL,
3 | [Name] [dbo].[Name] NOT NULL,
4 | [ProductNumber] NVARCHAR (25) NOT NULL,
5 | [Color] NVARCHAR (15) NULL,
6 | [StandardCost] MONEY NOT NULL,
7 | [ListPrice] MONEY NOT NULL,
8 | [Size] NVARCHAR (5) NULL,
9 | [Weight] DECIMAL (8, 2) NULL,
10 | [ProductCategoryID] INT NULL,
11 | [ProductModelID] INT NULL,
12 | [SellStartDate] DATETIME NOT NULL,
13 | [SellEndDate] DATETIME NULL,
14 | [DiscontinuedDate] DATETIME NULL,
15 | [ThumbNailPhoto] VARBINARY (MAX) NULL,
16 | [ThumbnailPhotoFileName] NVARCHAR (50) NULL,
17 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_Product_rowguid] DEFAULT (newid()) NOT NULL,
18 | [ModifiedDate] DATETIME CONSTRAINT [DF_Product_ModifiedDate] DEFAULT (getdate()) NOT NULL,
19 | CONSTRAINT [PK_Product_ProductID] PRIMARY KEY CLUSTERED ([ProductID] ASC),
20 | CONSTRAINT [CK_Product_ListPrice] CHECK ([ListPrice]>=(0.00)),
21 | CONSTRAINT [CK_Product_SellEndDate] CHECK ([SellEndDate]>=[SellStartDate] OR [SellEndDate] IS NULL),
22 | CONSTRAINT [CK_Product_StandardCost] CHECK ([StandardCost]>=(0.00)),
23 | CONSTRAINT [CK_Product_Weight] CHECK ([Weight]>(0.00)),
24 | CONSTRAINT [FK_Product_ProductCategory_ProductCategoryID] FOREIGN KEY ([ProductCategoryID]) REFERENCES [SalesLT].[ProductCategory] ([ProductCategoryID]),
25 | CONSTRAINT [FK_Product_ProductModel_ProductModelID] FOREIGN KEY ([ProductModelID]) REFERENCES [SalesLT].[ProductModel] ([ProductModelID]),
26 | CONSTRAINT [AK_Product_Name] UNIQUE NONCLUSTERED ([Name] ASC),
27 | CONSTRAINT [AK_Product_ProductNumber] UNIQUE NONCLUSTERED ([ProductNumber] ASC),
28 | CONSTRAINT [AK_Product_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC)
29 | );
30 |
31 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/ProductCategory.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [SalesLT].[ProductCategory] (
2 | [ProductCategoryID] INT IDENTITY (1, 1) NOT NULL,
3 | [ParentProductCategoryID] INT NULL,
4 | [Name] [dbo].[Name] NOT NULL,
5 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_ProductCategory_rowguid] DEFAULT (newid()) NOT NULL,
6 | [ModifiedDate] DATETIME CONSTRAINT [DF_ProductCategory_ModifiedDate] DEFAULT (getdate()) NOT NULL,
7 | CONSTRAINT [PK_ProductCategory_ProductCategoryID] PRIMARY KEY CLUSTERED ([ProductCategoryID] ASC),
8 | CONSTRAINT [FK_ProductCategory_ProductCategory_ParentProductCategoryID_ProductCategoryID] FOREIGN KEY ([ParentProductCategoryID]) REFERENCES [SalesLT].[ProductCategory] ([ProductCategoryID]),
9 | CONSTRAINT [AK_ProductCategory_Name] UNIQUE NONCLUSTERED ([Name] ASC),
10 | CONSTRAINT [AK_ProductCategory_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC)
11 | );
12 |
13 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/ProductDescription.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [SalesLT].[ProductDescription] (
2 | [ProductDescriptionID] INT IDENTITY (1, 1) NOT NULL,
3 | [Description] NVARCHAR (400) NOT NULL,
4 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_ProductDescription_rowguid] DEFAULT (newid()) NOT NULL,
5 | [ModifiedDate] DATETIME CONSTRAINT [DF_ProductDescription_ModifiedDate] DEFAULT (getdate()) NOT NULL,
6 | CONSTRAINT [PK_ProductDescription_ProductDescriptionID] PRIMARY KEY CLUSTERED ([ProductDescriptionID] ASC),
7 | CONSTRAINT [AK_ProductDescription_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC)
8 | );
9 |
10 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/ProductModel.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [SalesLT].[ProductModel] (
2 | [ProductModelID] INT IDENTITY (1, 1) NOT NULL,
3 | [Name] [dbo].[Name] NOT NULL,
4 | [CatalogDescription] XML NULL,
5 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_ProductModel_rowguid] DEFAULT (newid()) NOT NULL,
6 | [ModifiedDate] DATETIME CONSTRAINT [DF_ProductModel_ModifiedDate] DEFAULT (getdate()) NOT NULL,
7 | CONSTRAINT [PK_ProductModel_ProductModelID] PRIMARY KEY CLUSTERED ([ProductModelID] ASC),
8 | CONSTRAINT [AK_ProductModel_Name] UNIQUE NONCLUSTERED ([Name] ASC),
9 | CONSTRAINT [AK_ProductModel_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC)
10 | );
11 |
12 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/ProductModelProductDescription.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [SalesLT].[ProductModelProductDescription] (
2 | [ProductModelID] INT NOT NULL,
3 | [ProductDescriptionID] INT NOT NULL,
4 | [Culture] NCHAR (6) NOT NULL,
5 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_ProductModelProductDescription_rowguid] DEFAULT (newid()) NOT NULL,
6 | [ModifiedDate] DATETIME CONSTRAINT [DF_ProductModelProductDescription_ModifiedDate] DEFAULT (getdate()) NOT NULL,
7 | CONSTRAINT [PK_ProductModelProductDescription_ProductModelID_ProductDescriptionID_Culture] PRIMARY KEY CLUSTERED ([ProductModelID] ASC, [ProductDescriptionID] ASC, [Culture] ASC),
8 | CONSTRAINT [FK_ProductModelProductDescription_ProductDescription_ProductDescriptionID] FOREIGN KEY ([ProductDescriptionID]) REFERENCES [SalesLT].[ProductDescription] ([ProductDescriptionID]),
9 | CONSTRAINT [FK_ProductModelProductDescription_ProductModel_ProductModelID] FOREIGN KEY ([ProductModelID]) REFERENCES [SalesLT].[ProductModel] ([ProductModelID]),
10 | CONSTRAINT [AK_ProductModelProductDescription_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC)
11 | );
12 |
13 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/SalesOrderDetail.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [SalesLT].[SalesOrderDetail] (
2 | [SalesOrderID] INT NOT NULL,
3 | [SalesOrderDetailID] INT IDENTITY (1, 1) NOT NULL,
4 | [OrderQty] SMALLINT NOT NULL,
5 | [ProductID] INT NOT NULL,
6 | [UnitPrice] MONEY NOT NULL,
7 | [UnitPriceDiscount] MONEY CONSTRAINT [DF_SalesOrderDetail_UnitPriceDiscount] DEFAULT ((0.0)) NOT NULL,
8 | [LineTotal] AS (isnull(([UnitPrice]*((1.0)-[UnitPriceDiscount]))*[OrderQty],(0.0))),
9 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_SalesOrderDetail_rowguid] DEFAULT (newid()) NOT NULL,
10 | [ModifiedDate] DATETIME CONSTRAINT [DF_SalesOrderDetail_ModifiedDate] DEFAULT (getdate()) NOT NULL,
11 | CONSTRAINT [PK_SalesOrderDetail_SalesOrderID_SalesOrderDetailID] PRIMARY KEY CLUSTERED ([SalesOrderID] ASC, [SalesOrderDetailID] ASC),
12 | CONSTRAINT [CK_SalesOrderDetail_OrderQty] CHECK ([OrderQty]>(0)),
13 | CONSTRAINT [CK_SalesOrderDetail_UnitPrice] CHECK ([UnitPrice]>=(0.00)),
14 | CONSTRAINT [CK_SalesOrderDetail_UnitPriceDiscount] CHECK ([UnitPriceDiscount]>=(0.00)),
15 | CONSTRAINT [FK_SalesOrderDetail_Product_ProductID] FOREIGN KEY ([ProductID]) REFERENCES [SalesLT].[Product] ([ProductID]),
16 | CONSTRAINT [FK_SalesOrderDetail_SalesOrderHeader_SalesOrderID] FOREIGN KEY ([SalesOrderID]) REFERENCES [SalesLT].[SalesOrderHeader] ([SalesOrderID]) ON DELETE CASCADE,
17 | CONSTRAINT [AK_SalesOrderDetail_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC)
18 | );
19 |
20 |
21 | GO
22 | CREATE NONCLUSTERED INDEX [IX_SalesOrderDetail_ProductID]
23 | ON [SalesLT].[SalesOrderDetail]([ProductID] ASC);
24 |
25 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/SalesOrderHeader.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [SalesLT].[SalesOrderHeader] (
2 | [SalesOrderID] INT CONSTRAINT [DF_SalesOrderHeader_OrderID] DEFAULT (NEXT VALUE FOR [SalesLT].[SalesOrderNumber]) NOT NULL,
3 | [RevisionNumber] TINYINT CONSTRAINT [DF_SalesOrderHeader_RevisionNumber] DEFAULT ((0)) NOT NULL,
4 | [OrderDate] DATETIME CONSTRAINT [DF_SalesOrderHeader_OrderDate] DEFAULT (getdate()) NOT NULL,
5 | [DueDate] DATETIME NOT NULL,
6 | [ShipDate] DATETIME NULL,
7 | [Status] TINYINT CONSTRAINT [DF_SalesOrderHeader_Status] DEFAULT ((1)) NOT NULL,
8 | [OnlineOrderFlag] [dbo].[Flag] CONSTRAINT [DF_SalesOrderHeader_OnlineOrderFlag] DEFAULT ((1)) NOT NULL,
9 | [SalesOrderNumber] AS (isnull(N'SO'+CONVERT([nvarchar](23),[SalesOrderID],(0)),N'*** ERROR ***')),
10 | [PurchaseOrderNumber] [dbo].[OrderNumber] NULL,
11 | [AccountNumber] [dbo].[AccountNumber] NULL,
12 | [CustomerID] INT NOT NULL,
13 | [ShipToAddressID] INT NULL,
14 | [BillToAddressID] INT NULL,
15 | [ShipMethod] NVARCHAR (50) NOT NULL,
16 | [CreditCardApprovalCode] VARCHAR (15) NULL,
17 | [SubTotal] MONEY CONSTRAINT [DF_SalesOrderHeader_SubTotal] DEFAULT ((0.00)) NOT NULL,
18 | [TaxAmt] MONEY CONSTRAINT [DF_SalesOrderHeader_TaxAmt] DEFAULT ((0.00)) NOT NULL,
19 | [Freight] MONEY CONSTRAINT [DF_SalesOrderHeader_Freight] DEFAULT ((0.00)) NOT NULL,
20 | [TotalDue] AS (isnull(([SubTotal]+[TaxAmt])+[Freight],(0))),
21 | [Comment] NVARCHAR (MAX) NULL,
22 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_SalesOrderHeader_rowguid] DEFAULT (newid()) NOT NULL,
23 | [ModifiedDate] DATETIME CONSTRAINT [DF_SalesOrderHeader_ModifiedDate] DEFAULT (getdate()) NOT NULL,
24 | CONSTRAINT [PK_SalesOrderHeader_SalesOrderID] PRIMARY KEY CLUSTERED ([SalesOrderID] ASC),
25 | CONSTRAINT [CK_SalesOrderHeader_DueDate] CHECK ([DueDate]>=[OrderDate]),
26 | CONSTRAINT [CK_SalesOrderHeader_Freight] CHECK ([Freight]>=(0.00)),
27 | CONSTRAINT [CK_SalesOrderHeader_ShipDate] CHECK ([ShipDate]>=[OrderDate] OR [ShipDate] IS NULL),
28 | CONSTRAINT [CK_SalesOrderHeader_Status] CHECK ([Status]>=(0) AND [Status]<=(8)),
29 | CONSTRAINT [CK_SalesOrderHeader_SubTotal] CHECK ([SubTotal]>=(0.00)),
30 | CONSTRAINT [CK_SalesOrderHeader_TaxAmt] CHECK ([TaxAmt]>=(0.00)),
31 | CONSTRAINT [FK_SalesOrderHeader_Address_BillTo_AddressID] FOREIGN KEY ([BillToAddressID]) REFERENCES [SalesLT].[Address] ([AddressID]),
32 | CONSTRAINT [FK_SalesOrderHeader_Address_ShipTo_AddressID] FOREIGN KEY ([ShipToAddressID]) REFERENCES [SalesLT].[Address] ([AddressID]),
33 | CONSTRAINT [FK_SalesOrderHeader_Customer_CustomerID] FOREIGN KEY ([CustomerID]) REFERENCES [SalesLT].[Customer] ([CustomerID]),
34 | CONSTRAINT [AK_SalesOrderHeader_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC),
35 | CONSTRAINT [AK_SalesOrderHeader_SalesOrderNumber] UNIQUE NONCLUSTERED ([SalesOrderNumber] ASC)
36 | );
37 |
38 |
39 | GO
40 | CREATE NONCLUSTERED INDEX [IX_SalesOrderHeader_CustomerID]
41 | ON [SalesLT].[SalesOrderHeader]([CustomerID] ASC);
42 |
43 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Views/vGetAllCategories.sql:
--------------------------------------------------------------------------------
1 | CREATE VIEW [SalesLT].[vGetAllCategories]
2 | WITH SCHEMABINDING
3 | AS
4 | -- Returns the parent product category name, product category name, and product category ID for all product categories.
5 | WITH CategoryCTE([ParentProductCategoryID], [ProductCategoryID], [Name]) AS
6 | (
7 | SELECT [ParentProductCategoryID], [ProductCategoryID], [Name]
8 | FROM SalesLT.ProductCategory
9 | WHERE ParentProductCategoryID IS NULL
10 |
11 | UNION ALL
12 |
13 | SELECT C.[ParentProductCategoryID], C.[ProductCategoryID], C.[Name]
14 | FROM SalesLT.ProductCategory AS C
15 | INNER JOIN CategoryCTE AS BC ON BC.ProductCategoryID = C.ParentProductCategoryID
16 | )
17 |
18 | SELECT PC.[Name] AS [ParentProductCategoryName], CCTE.[Name] as [ProductCategoryName], CCTE.[ProductCategoryID]
19 | FROM CategoryCTE AS CCTE
20 | JOIN SalesLT.ProductCategory AS PC
21 | ON PC.[ProductCategoryID] = CCTE.[ParentProductCategoryID]
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Views/vProductAndDescription.sql:
--------------------------------------------------------------------------------
1 | CREATE VIEW [SalesLT].[vProductAndDescription]
2 | WITH SCHEMABINDING
3 | AS
4 | -- View (indexed or standard) to display products and product descriptions by language.
5 | SELECT
6 | p.[ProductID]
7 | ,p.[Name]
8 | ,pm.[Name] AS [ProductModel]
9 | ,pmx.[Culture]
10 | ,pd.[Description]
11 | FROM [SalesLT].[Product] p
12 | INNER JOIN [SalesLT].[ProductModel] pm
13 | ON p.[ProductModelID] = pm.[ProductModelID]
14 | INNER JOIN [SalesLT].[ProductModelProductDescription] pmx
15 | ON pm.[ProductModelID] = pmx.[ProductModelID]
16 | INNER JOIN [SalesLT].[ProductDescription] pd
17 | ON pmx.[ProductDescriptionID] = pd.[ProductDescriptionID];
18 | GO
19 | CREATE UNIQUE CLUSTERED INDEX [IX_vProductAndDescription]
20 | ON [SalesLT].[vProductAndDescription]([Culture] ASC, [ProductID] ASC);
21 |
22 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Views/vProductModelCatalogDescription.sql:
--------------------------------------------------------------------------------
1 | CREATE VIEW [SalesLT].[vProductModelCatalogDescription]
2 | AS
3 | SELECT
4 | [ProductModelID]
5 | ,[Name]
6 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
7 | declare namespace html="http://www.w3.org/1999/xhtml";
8 | (/p1:ProductDescription/p1:Summary/html:p)[1]', 'nvarchar(max)') AS [Summary]
9 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
10 | (/p1:ProductDescription/p1:Manufacturer/p1:Name)[1]', 'nvarchar(max)') AS [Manufacturer]
11 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
12 | (/p1:ProductDescription/p1:Manufacturer/p1:Copyright)[1]', 'nvarchar(30)') AS [Copyright]
13 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
14 | (/p1:ProductDescription/p1:Manufacturer/p1:ProductURL)[1]', 'nvarchar(256)') AS [ProductURL]
15 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
16 | declare namespace wm="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelWarrAndMain";
17 | (/p1:ProductDescription/p1:Features/wm:Warranty/wm:WarrantyPeriod)[1]', 'nvarchar(256)') AS [WarrantyPeriod]
18 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
19 | declare namespace wm="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelWarrAndMain";
20 | (/p1:ProductDescription/p1:Features/wm:Warranty/wm:Description)[1]', 'nvarchar(256)') AS [WarrantyDescription]
21 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
22 | declare namespace wm="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelWarrAndMain";
23 | (/p1:ProductDescription/p1:Features/wm:Maintenance/wm:NoOfYears)[1]', 'nvarchar(256)') AS [NoOfYears]
24 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
25 | declare namespace wm="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelWarrAndMain";
26 | (/p1:ProductDescription/p1:Features/wm:Maintenance/wm:Description)[1]', 'nvarchar(256)') AS [MaintenanceDescription]
27 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
28 | declare namespace wf="http://www.adventure-works.com/schemas/OtherFeatures";
29 | (/p1:ProductDescription/p1:Features/wf:wheel)[1]', 'nvarchar(256)') AS [Wheel]
30 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
31 | declare namespace wf="http://www.adventure-works.com/schemas/OtherFeatures";
32 | (/p1:ProductDescription/p1:Features/wf:saddle)[1]', 'nvarchar(256)') AS [Saddle]
33 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
34 | declare namespace wf="http://www.adventure-works.com/schemas/OtherFeatures";
35 | (/p1:ProductDescription/p1:Features/wf:pedal)[1]', 'nvarchar(256)') AS [Pedal]
36 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
37 | declare namespace wf="http://www.adventure-works.com/schemas/OtherFeatures";
38 | (/p1:ProductDescription/p1:Features/wf:BikeFrame)[1]', 'nvarchar(max)') AS [BikeFrame]
39 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
40 | declare namespace wf="http://www.adventure-works.com/schemas/OtherFeatures";
41 | (/p1:ProductDescription/p1:Features/wf:crankset)[1]', 'nvarchar(256)') AS [Crankset]
42 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
43 | (/p1:ProductDescription/p1:Picture/p1:Angle)[1]', 'nvarchar(256)') AS [PictureAngle]
44 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
45 | (/p1:ProductDescription/p1:Picture/p1:Size)[1]', 'nvarchar(256)') AS [PictureSize]
46 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
47 | (/p1:ProductDescription/p1:Picture/p1:ProductPhotoID)[1]', 'nvarchar(256)') AS [ProductPhotoID]
48 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
49 | (/p1:ProductDescription/p1:Specifications/Material)[1]', 'nvarchar(256)') AS [Material]
50 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
51 | (/p1:ProductDescription/p1:Specifications/Color)[1]', 'nvarchar(256)') AS [Color]
52 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
53 | (/p1:ProductDescription/p1:Specifications/ProductLine)[1]', 'nvarchar(256)') AS [ProductLine]
54 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
55 | (/p1:ProductDescription/p1:Specifications/Style)[1]', 'nvarchar(256)') AS [Style]
56 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription";
57 | (/p1:ProductDescription/p1:Specifications/RiderExperience)[1]', 'nvarchar(1024)') AS [RiderExperience]
58 | ,[rowguid]
59 | ,[ModifiedDate]
60 | FROM [SalesLT].[ProductModel]
61 | WHERE [CatalogDescription] IS NOT NULL;
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/Security/SalesLT.sql:
--------------------------------------------------------------------------------
1 | CREATE SCHEMA [SalesLT]
2 | AUTHORIZATION [dbo];
3 |
4 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Functions/ufnGetAllCategories.sql:
--------------------------------------------------------------------------------
1 | CREATE FUNCTION [dbo].[ufnGetAllCategories]()
2 | RETURNS @retCategoryInformation TABLE
3 | (
4 | -- Columns returned by the function
5 | [ParentProductCategoryName] nvarchar(50) NULL,
6 | [ProductCategoryName] nvarchar(50) NOT NULL,
7 | [ProductCategoryID] int NOT NULL
8 | )
9 | AS
10 | -- Returns the parent product category name, product category name, and product category ID for all product categories.
11 | BEGIN
12 | WITH CategoryCTE([ParentProductCategoryID], [ProductCategoryID], [Name]) AS
13 | (
14 | SELECT [ParentProductCategoryID], [ProductCategoryID], [Name]
15 | FROM SalesLT.ProductCategory
16 | WHERE ParentProductCategoryID IS NULL
17 |
18 | UNION ALL
19 |
20 | SELECT C.[ParentProductCategoryID], C.[ProductCategoryID], C.[Name]
21 | FROM SalesLT.ProductCategory AS C
22 | INNER JOIN CategoryCTE AS BC ON BC.ProductCategoryID = C.ParentProductCategoryID
23 | )
24 |
25 | INSERT INTO @retCategoryInformation
26 | SELECT PC.[Name] AS [ParentProductCategoryName], CCTE.[Name] as [ProductCategoryName], CCTE.[ProductCategoryID]
27 | FROM CategoryCTE AS CCTE
28 | JOIN SalesLT.ProductCategory AS PC
29 | ON PC.[ProductCategoryID] = CCTE.[ParentProductCategoryID];
30 | RETURN;
31 | END;
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Functions/ufnGetCustomerInformation.sql:
--------------------------------------------------------------------------------
1 | CREATE FUNCTION [dbo].[ufnGetCustomerInformation](@CustomerID int)
2 | RETURNS TABLE
3 | AS
4 | -- Returns the CustomerID, first name, and last name for the specified customer.
5 | RETURN (
6 | SELECT
7 | CustomerID,
8 | FirstName,
9 | LastName
10 | FROM [SalesLT].[Customer]
11 | WHERE [CustomerID] = @CustomerID
12 | );
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Functions/ufnGetSalesOrderStatusText.sql:
--------------------------------------------------------------------------------
1 | CREATE FUNCTION [dbo].[ufnGetSalesOrderStatusText](@Status tinyint)
2 | RETURNS nvarchar(15)
3 | AS
4 | -- Returns the sales order status text representation for the status value.
5 | BEGIN
6 | DECLARE @ret nvarchar(15);
7 |
8 | SET @ret =
9 | CASE @Status
10 | WHEN 1 THEN 'In process'
11 | WHEN 2 THEN 'Approved'
12 | WHEN 3 THEN 'Backordered'
13 | WHEN 4 THEN 'Rejected'
14 | WHEN 5 THEN 'Shipped'
15 | WHEN 6 THEN 'Cancelled'
16 | ELSE '** Invalid **'
17 | END;
18 |
19 | RETURN @ret
20 | END;
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Stored Procedures/uspLogError.sql:
--------------------------------------------------------------------------------
1 |
2 | -- uspLogError logs error information in the ErrorLog table about the
3 | -- error that caused execution to jump to the CATCH block of a
4 | -- TRY...CATCH construct. This should be executed from within the scope
5 | -- of a CATCH block otherwise it will return without inserting error
6 | -- information.
7 | CREATE PROCEDURE [dbo].[uspLogError]
8 | @ErrorLogID int = 0 OUTPUT -- contains the ErrorLogID of the row inserted
9 | AS -- by uspLogError in the ErrorLog table
10 | BEGIN
11 | SET NOCOUNT ON;
12 |
13 | -- Output parameter value of 0 indicates that error
14 | -- information was not logged
15 | SET @ErrorLogID = 0;
16 |
17 | BEGIN TRY
18 | -- Return if there is no error information to log
19 | IF ERROR_NUMBER() IS NULL
20 | RETURN;
21 |
22 | -- Return if inside an uncommittable transaction.
23 | -- Data insertion/modification is not allowed when
24 | -- a transaction is in an uncommittable state.
25 | IF XACT_STATE() = -1
26 | BEGIN
27 | PRINT 'Cannot log error since the current transaction is in an uncommittable state. '
28 | + 'Rollback the transaction before executing uspLogError in order to successfully log error information.';
29 | RETURN;
30 | END
31 |
32 | INSERT [dbo].[ErrorLog]
33 | (
34 | [UserName],
35 | [ErrorNumber],
36 | [ErrorSeverity],
37 | [ErrorState],
38 | [ErrorProcedure],
39 | [ErrorLine],
40 | [ErrorMessage]
41 | )
42 | VALUES
43 | (
44 | CONVERT(sysname, CURRENT_USER),
45 | ERROR_NUMBER(),
46 | ERROR_SEVERITY(),
47 | ERROR_STATE(),
48 | ERROR_PROCEDURE(),
49 | ERROR_LINE(),
50 | ERROR_MESSAGE()
51 | );
52 |
53 | -- Pass back the ErrorLogID of the row inserted
54 | SET @ErrorLogID = @@IDENTITY;
55 | END TRY
56 | BEGIN CATCH
57 | PRINT 'An error occurred in stored procedure uspLogError: ';
58 | EXECUTE [dbo].[uspPrintError];
59 | RETURN -1;
60 | END CATCH
61 | END;
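62 | 
63 | /*
64 | A minimal usage sketch (illustrative only; the DELETE inside TRY is a hypothetical statement,
65 | not part of this project). uspLogError is meant to run inside the CATCH block of a
66 | TRY...CATCH construct, for example:
67 | 
68 |     BEGIN TRY
69 |         -- Some data modification that may fail
70 |         DELETE FROM [SalesLT].[Product] WHERE [ProductID] = 0;
71 |     END TRY
72 |     BEGIN CATCH
73 |         DECLARE @ErrorLogID int;
74 |         -- Log the error; @ErrorLogID receives the id of the inserted ErrorLog row
75 |         EXECUTE [dbo].[uspLogError] @ErrorLogID OUTPUT;
76 |     END CATCH;
77 | */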
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Stored Procedures/uspPrintError.sql:
--------------------------------------------------------------------------------
1 |
2 | -- uspPrintError prints error information about the error that caused
3 | -- execution to jump to the CATCH block of a TRY...CATCH construct.
4 | -- Should be executed from within the scope of a CATCH block otherwise
5 | -- it will return without printing any error information.
6 | CREATE PROCEDURE [dbo].[uspPrintError]
7 | AS
8 | BEGIN
9 | SET NOCOUNT ON;
10 |
11 | -- Print error information.
12 | PRINT 'Error ' + CONVERT(varchar(50), ERROR_NUMBER()) +
13 | ', Severity ' + CONVERT(varchar(5), ERROR_SEVERITY()) +
14 | ', State ' + CONVERT(varchar(5), ERROR_STATE()) +
15 | ', Procedure ' + ISNULL(ERROR_PROCEDURE(), '-') +
16 | ', Line ' + CONVERT(varchar(5), ERROR_LINE());
17 | PRINT ERROR_MESSAGE();
18 | END;
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Tables/BuildVersion.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [dbo].[BuildVersion] (
2 | [SystemInformationID] TINYINT IDENTITY (1, 1) NOT NULL,
3 | [Database Version] NVARCHAR (25) NOT NULL,
4 | [VersionDate] DATETIME NOT NULL,
5 | [ModifiedDate] DATETIME CONSTRAINT [DF_BuildVersion_ModifiedDate] DEFAULT (getdate()) NOT NULL,
6 | PRIMARY KEY CLUSTERED ([SystemInformationID] ASC)
7 | );
8 |
9 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Tables/ErrorLog.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [dbo].[ErrorLog] (
2 | [ErrorLogID] INT IDENTITY (1, 1) NOT NULL,
3 | [ErrorTime] DATETIME CONSTRAINT [DF_ErrorLog_ErrorTime] DEFAULT (getdate()) NOT NULL,
4 | [UserName] [sysname] NOT NULL,
5 | [ErrorNumber] INT NOT NULL,
6 | [ErrorSeverity] INT NULL,
7 | [ErrorState] INT NULL,
8 | [ErrorProcedure] NVARCHAR (126) NULL,
9 | [ErrorLine] INT NULL,
10 | [ErrorMessage] NVARCHAR (4000) NOT NULL,
11 | CONSTRAINT [PK_ErrorLog_ErrorLogID] PRIMARY KEY CLUSTERED ([ErrorLogID] ASC)
12 | );
13 |
14 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/User Defined Types/AccountNumber.sql:
--------------------------------------------------------------------------------
1 | CREATE TYPE [dbo].[AccountNumber]
2 | FROM NVARCHAR (15) NULL;
3 |
4 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/User Defined Types/Flag.sql:
--------------------------------------------------------------------------------
1 | CREATE TYPE [dbo].[Flag]
2 | FROM BIT NOT NULL;
3 |
4 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/User Defined Types/Name.sql:
--------------------------------------------------------------------------------
1 | CREATE TYPE [dbo].[Name]
2 | FROM NVARCHAR (50) NULL;
3 |
4 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/User Defined Types/NameStyle.sql:
--------------------------------------------------------------------------------
1 | CREATE TYPE [dbo].[NameStyle]
2 | FROM BIT NOT NULL;
3 |
4 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/User Defined Types/OrderNumber.sql:
--------------------------------------------------------------------------------
1 | CREATE TYPE [dbo].[OrderNumber]
2 | FROM NVARCHAR (25) NULL;
3 |
4 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/User Defined Types/Phone.sql:
--------------------------------------------------------------------------------
1 | CREATE TYPE [dbo].[Phone]
2 | FROM NVARCHAR (25) NULL;
3 |
4 |
--------------------------------------------------------------------------------
/samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/ddo_samples_azuresql.sqlproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Debug
5 | AnyCPU
6 | ddo_samples_azuresql
7 | 2.0
8 | 4.1
9 | {387ebfc5-acb1-4445-a25f-d70d18d34c30}
10 | Microsoft.Data.Tools.Schema.Sql.SqlAzureV12DatabaseSchemaProvider
11 | Database
12 |
13 |
14 | ddo_samples_azuresql
15 | ddo_samples_azuresql
16 | 1033, CI
17 | BySchemaAndSchemaType
18 | True
19 | v4.5
20 | CS
21 | Properties
22 | False
23 | True
24 | True
25 |
26 |
27 | bin\Release\
28 | $(MSBuildProjectName).sql
29 | False
30 | pdbonly
31 | true
32 | false
33 | true
34 | prompt
35 | 4
36 |
37 |
38 | bin\Debug\
39 | $(MSBuildProjectName).sql
40 | false
41 | true
42 | full
43 | false
44 | true
45 | true
46 | prompt
47 | 4
48 |
49 |
50 | 11.0
51 |
52 | True
53 | 11.0
54 |
55 |
--------------------------------------------------------------------------------
/samples/databricks/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/samples/databricks/README.md
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 16
4 | VisualStudioVersion = 16.0.29326.143
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{00D1A9C2-B5F0-4AF3-8072-F6C62B433612}") = "ddo_azuresqldw_dw", "ddo_azuresqldw_dw\ddo_azuresqldw_dw.sqlproj", "{AA416CF5-F184-4573-B591-7ED42A294421}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Any CPU = Debug|Any CPU
11 | Release|Any CPU = Release|Any CPU
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {AA416CF5-F184-4573-B591-7ED42A294421}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | {AA416CF5-F184-4573-B591-7ED42A294421}.Debug|Any CPU.Build.0 = Debug|Any CPU
16 | {AA416CF5-F184-4573-B591-7ED42A294421}.Debug|Any CPU.Deploy.0 = Debug|Any CPU
17 | {AA416CF5-F184-4573-B591-7ED42A294421}.Release|Any CPU.ActiveCfg = Release|Any CPU
18 | {AA416CF5-F184-4573-B591-7ED42A294421}.Release|Any CPU.Build.0 = Release|Any CPU
19 | {AA416CF5-F184-4573-B591-7ED42A294421}.Release|Any CPU.Deploy.0 = Release|Any CPU
20 | EndGlobalSection
21 | GlobalSection(SolutionProperties) = preSolution
22 | HideSolutionNode = FALSE
23 | EndGlobalSection
24 | GlobalSection(ExtensibilityGlobals) = postSolution
25 | SolutionGuid = {1B991B8B-9C61-481C-8B3C-CC2958974BA2}
26 | EndGlobalSection
27 | EndGlobal
28 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/External Resources/AzureDataLakeStorage.sql:
--------------------------------------------------------------------------------
1 | CREATE EXTERNAL DATA SOURCE [AzureDataLakeStorage]
2 | WITH (
3 | TYPE = HADOOP,
4 | LOCATION = N'$(ADLSLocation)',
5 | CREDENTIAL = [ADLSCredentialKey]
6 | );
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/External Resources/ParquetFileFormat.sql:
--------------------------------------------------------------------------------
1 | CREATE EXTERNAL FILE FORMAT [ParquetFileFormat]
2 | WITH (
3 | FORMAT_TYPE = PARQUET
4 | );
5 |
6 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/Script.PostDeployment1.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Post-Deployment Script Template
3 | --------------------------------------------------------------------------------------
4 | This file contains SQL statements that will be appended to the build script.
5 | Use SQLCMD syntax to include a file in the post-deployment script.
6 | Example: :r .\myfile.sql
7 | Use SQLCMD syntax to reference a variable in the post-deployment script.
8 | Example: :setvar TableName MyTable
9 | SELECT * FROM [$(TableName)]
10 | --------------------------------------------------------------------------------------
11 | */
12 |
13 | :setvar ADLSLocation ADLSLocation
14 | :setvar ADLSCredentialKey ADLSCredentialKey
15 |
16 | ALTER EXTERNAL DATA SOURCE [AzureDataLakeStorage] SET LOCATION = '$(ADLSLocation)';
17 | GO
18 |
19 | ALTER DATABASE SCOPED CREDENTIAL [ADLSCredentialKey] WITH IDENTITY = N'user', SECRET = '$(ADLSCredentialKey)';
20 | GO
21 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/Security/ADLSCredentialKey.sql:
--------------------------------------------------------------------------------
1 | CREATE DATABASE SCOPED CREDENTIAL [ADLSCredentialKey] WITH IDENTITY = N'user', SECRET = '$(ADLSCredentialKey)';
2 |
3 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/Security/MasterKeys.sql:
--------------------------------------------------------------------------------
1 | CREATE MASTER KEY;
2 |
3 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/Security/ext.sql:
--------------------------------------------------------------------------------
1 | CREATE SCHEMA [ext]
2 | AUTHORIZATION [dbo];
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/dbo/Stored Procedures/load_dw.sql:
--------------------------------------------------------------------------------
1 | CREATE PROC [dbo].[load_dw] @load_id [VARCHAR](50) AS
2 | BEGIN
3 | -- SET NOCOUNT ON added to prevent extra result sets from
4 | -- interfering with SELECT statements.
5 | SET NOCOUNT ON
6 |
7 | -- DIM TABLES
8 |
9 | TRUNCATE TABLE dbo.[dim_parking_bay];
10 | INSERT INTO dbo.[dim_parking_bay]
11 | SELECT
12 | CAST([dim_parking_bay_id] AS UNIQUEIDENTIFIER),
13 | [bay_id],
14 | [marker_id],
15 | [meter_id],
16 | [rd_seg_id],
17 | [rd_seg_dsc],
18 | [load_id],
19 | [loaded_on]
20 | FROM ext.[dim_parking_bay];
21 |
22 | --
23 | TRUNCATE TABLE dbo.[dim_location];
24 | INSERT INTO dbo.[dim_location]
25 | SELECT
26 | CAST([dim_location_id] AS UNIQUEIDENTIFIER),
27 | [lat],
28 | [lon],
29 | [load_id],
30 | [loaded_on]
31 | FROM ext.[dim_location];
32 |
33 | --
34 | TRUNCATE TABLE dbo.[dim_st_marker];
35 | INSERT INTO dbo.[dim_st_marker]
36 | SELECT
37 | CAST([dim_st_marker_id] AS UNIQUEIDENTIFIER),
38 | [st_marker_id],
39 | [load_id],
40 | [loaded_on]
41 | FROM ext.[dim_st_marker];
42 |
43 |
44 | -- FACT TABLES
45 | DELETE FROM dbo.[fact_parking] WHERE load_id=@load_id;
46 | INSERT INTO dbo.[fact_parking]
47 | SELECT
48 | [dim_date_id],
49 | [dim_time_id],
50 | CAST([dim_parking_bay_id] AS UNIQUEIDENTIFIER),
51 | CAST([dim_location_id] AS UNIQUEIDENTIFIER),
52 | CAST([dim_st_marker_id] AS UNIQUEIDENTIFIER),
53 | [status],
54 | [load_id],
55 | [loaded_on]
56 | FROM ext.[fact_parking]
57 | WHERE load_id=@load_id;
58 | END
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/dbo/Tables/dim_location.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [dbo].[dim_location] (
2 | [dim_location_id] UNIQUEIDENTIFIER NOT NULL,
3 | [lat] REAL NULL,
4 | [lon] REAL NULL,
5 | [load_id] NVARCHAR (50) NULL,
6 | [loaded_on] DATETIME NULL
7 | )
8 | WITH (CLUSTERED COLUMNSTORE INDEX, DISTRIBUTION = REPLICATE);
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/dbo/Tables/dim_parking_bay.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [dbo].[dim_parking_bay] (
2 | [dim_parking_bay_id] UNIQUEIDENTIFIER NOT NULL,
3 | [bay_id] INT NULL,
4 | [marker_id] NVARCHAR (50) NULL,
5 | [meter_id] NVARCHAR (50) NULL,
6 | [rd_seg_id] NVARCHAR (50) NULL,
7 | [rd_seg_dsc] NVARCHAR (500) NULL,
8 | [load_id] NVARCHAR (50) NULL,
9 | [loaded_on] DATETIME NULL
10 | )
11 | WITH (CLUSTERED COLUMNSTORE INDEX, DISTRIBUTION = REPLICATE);
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/dbo/Tables/dim_st_marker.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [dbo].[dim_st_marker] (
2 | [dim_st_marker_id] UNIQUEIDENTIFIER NULL,
3 | [st_marker_id] NVARCHAR (50) NULL,
4 | [load_id] NVARCHAR (50) NULL,
5 | [loaded_on] DATETIME NULL
6 | )
7 | WITH (CLUSTERED COLUMNSTORE INDEX, DISTRIBUTION = REPLICATE);
8 |
9 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/dbo/Tables/fact_parking.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE [dbo].[fact_parking] (
2 | [dim_date_id] NVARCHAR (50) NULL,
3 | [dim_time_id] NVARCHAR (50) NULL,
4 | [dim_parking_bay_id] UNIQUEIDENTIFIER NULL,
5 | [dim_location_id] UNIQUEIDENTIFIER NULL,
6 | [dim_st_marker_id] UNIQUEIDENTIFIER NULL,
7 | [status] NVARCHAR (50) NULL,
8 | [load_id] NVARCHAR (50) NULL,
9 | [loaded_on] DATETIME NULL
10 | )
11 | WITH (CLUSTERED COLUMNSTORE INDEX, DISTRIBUTION = HASH([dim_parking_bay_id]));
12 |
13 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/ddo_azuresqldw_dw.sqlproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Debug
5 | AnyCPU
6 | ddo_azuresqldw_dw
7 | 2.0
8 | 4.1
9 | {aa416cf5-f184-4573-b591-7ed42a294421}
10 | Microsoft.Data.Tools.Schema.Sql.SqlDwDatabaseSchemaProvider
11 | Database
12 |
13 |
14 | ddo_azuresqldw_dw
15 | ddo_azuresqldw_dw
16 | 1033, CI
17 | BySchemaAndSchemaType
18 | True
19 | v4.5
20 | CS
21 | Properties
22 | False
23 | True
24 | True
25 |
26 |
27 | bin\Release\
28 | $(MSBuildProjectName).sql
29 | False
30 | pdbonly
31 | true
32 | false
33 | true
34 | prompt
35 | 4
36 |
37 |
38 | bin\Debug\
39 | $(MSBuildProjectName).sql
40 | false
41 | true
42 | full
43 | false
44 | true
45 | true
46 | prompt
47 | 4
48 |
49 |
50 | 11.0
51 |
52 | True
53 | 11.0
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 | Off
72 |
73 |
74 |
75 |
76 |
77 |
78 | Off
79 |
80 |
81 | Off
82 |
83 |
84 |
85 | Off
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 | $(SqlCmdVar__2)
96 |
97 |
98 |
99 |
100 | $(SqlCmdVar__1)
101 |
102 |
103 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/ext/External Tables/dim_location.sql:
--------------------------------------------------------------------------------
1 | CREATE EXTERNAL TABLE [ext].[dim_location] (
2 | [dim_location_id] NVARCHAR (50) NULL,
3 | [lat] REAL NULL,
4 | [lon] REAL NULL,
5 | [load_id] NVARCHAR (50) NULL,
6 | [loaded_on] DATETIME NULL
7 | )
8 | WITH (
9 | DATA_SOURCE = [AzureDataLakeStorage],
10 | LOCATION = N'data/dw/dim_location/',
11 | FILE_FORMAT = [ParquetFileFormat],
12 | REJECT_TYPE = VALUE,
13 | REJECT_VALUE = 0
14 | );
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/ext/External Tables/dim_parking_bay.sql:
--------------------------------------------------------------------------------
1 | CREATE EXTERNAL TABLE [ext].[dim_parking_bay] (
2 | [dim_parking_bay_id] NVARCHAR (50) NULL,
3 | [bay_id] INT NULL,
4 | [marker_id] NVARCHAR (50) NULL,
5 | [meter_id] NVARCHAR (50) NULL,
6 | [rd_seg_dsc] NVARCHAR (MAX) NULL,
7 | [rd_seg_id] NVARCHAR (50) NULL,
8 | [load_id] NVARCHAR (50) NULL,
9 | [loaded_on] DATETIME NULL
10 | )
11 | WITH (
12 | DATA_SOURCE = [AzureDataLakeStorage],
13 | LOCATION = N'data/dw/dim_parking_bay/',
14 | FILE_FORMAT = [ParquetFileFormat],
15 | REJECT_TYPE = VALUE,
16 | REJECT_VALUE = 0
17 | );
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/ext/External Tables/dim_st_marker.sql:
--------------------------------------------------------------------------------
1 | CREATE EXTERNAL TABLE [ext].[dim_st_marker] (
2 | [dim_st_marker_id] NVARCHAR (50) NULL,
3 | [st_marker_id] NVARCHAR (50) NULL,
4 | [load_id] NVARCHAR (50) NULL,
5 | [loaded_on] DATETIME NULL
6 | )
7 | WITH (
8 | DATA_SOURCE = [AzureDataLakeStorage],
9 | LOCATION = N'data/dw/dim_st_marker/',
10 | FILE_FORMAT = [ParquetFileFormat],
11 | REJECT_TYPE = VALUE,
12 | REJECT_VALUE = 0
13 | );
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/ext/External Tables/fact_parking.sql:
--------------------------------------------------------------------------------
1 | CREATE EXTERNAL TABLE [ext].[fact_parking] (
2 | [dim_date_id] NVARCHAR (50) NULL,
3 | [dim_time_id] NVARCHAR (50) NULL,
4 | [dim_parking_bay_id] NVARCHAR (50) NULL,
5 | [dim_location_id] NVARCHAR (50) NULL,
6 | [dim_st_marker_id] NVARCHAR (50) NULL,
7 | [status] NVARCHAR (50) NULL,
8 | [load_id] NVARCHAR (50) NULL,
9 | [loaded_on] DATETIME NULL
10 | )
11 | WITH (
12 | DATA_SOURCE = [AzureDataLakeStorage],
13 | LOCATION = N'data/dw/fact_parking/',
14 | FILE_FORMAT = [ParquetFileFormat],
15 | REJECT_TYPE = VALUE,
16 | REJECT_VALUE = 0
17 | );
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/src/ddo_transform/.editorconfig:
--------------------------------------------------------------------------------
1 | # http://editorconfig.org
2 |
3 | root = true
4 |
5 | [*]
6 | indent_style = space
7 | indent_size = 4
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
10 | charset = utf-8
11 | end_of_line = lf
12 |
13 | [*.bat]
14 | indent_style = tab
15 | end_of_line = crlf
16 |
17 | [LICENSE]
18 | insert_final_newline = false
19 |
20 | [Makefile]
21 | indent_style = tab
22 |
--------------------------------------------------------------------------------
/src/ddo_transform/AUTHORS.rst:
--------------------------------------------------------------------------------
1 | =======
2 | Credits
3 | =======
4 |
5 | Development Lead
6 | ----------------
7 |
8 | * Lace Lofranco
9 |
10 | Contributors
11 | ------------
12 |
13 | None yet. Why not be the first?
14 |
--------------------------------------------------------------------------------
/src/ddo_transform/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
1 | .. highlight:: shell
2 |
3 | ============
4 | Contributing
5 | ============
6 |
7 | Contributions are welcome, and they are greatly appreciated! Every little bit
8 | helps, and credit will always be given.
9 |
10 | You can contribute in many ways:
11 |
12 | Types of Contributions
13 | ----------------------
14 |
15 | Report Bugs
16 | ~~~~~~~~~~~
17 |
18 | Report bugs at https://github.com/devlace/ddo_transform/issues.
19 |
20 | If you are reporting a bug, please include:
21 |
22 | * Your operating system name and version.
23 | * Any details about your local setup that might be helpful in troubleshooting.
24 | * Detailed steps to reproduce the bug.
25 |
26 | Fix Bugs
27 | ~~~~~~~~
28 |
29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
30 | wanted" is open to whoever wants to implement it.
31 |
32 | Implement Features
33 | ~~~~~~~~~~~~~~~~~~
34 |
35 | Look through the GitHub issues for features. Anything tagged with "enhancement"
36 | and "help wanted" is open to whoever wants to implement it.
37 |
38 | Write Documentation
39 | ~~~~~~~~~~~~~~~~~~~
40 |
41 | ddo_transform could always use more documentation, whether as part of the
42 | official ddo_transform docs, in docstrings, or even on the web in blog posts,
43 | articles, and such.
44 |
45 | Submit Feedback
46 | ~~~~~~~~~~~~~~~
47 |
48 | The best way to send feedback is to file an issue at https://github.com/devlace/ddo_transform/issues.
49 |
50 | If you are proposing a feature:
51 |
52 | * Explain in detail how it would work.
53 | * Keep the scope as narrow as possible, to make it easier to implement.
54 | * Remember that this is a volunteer-driven project, and that contributions
55 | are welcome :)
56 |
57 | Get Started!
58 | ------------
59 |
60 | Ready to contribute? Here's how to set up `ddo_transform` for local development.
61 |
62 | 1. Fork the `ddo_transform` repo on GitHub.
63 | 2. Clone your fork locally::
64 |
65 | $ git clone git@github.com:your_name_here/ddo_transform.git
66 |
67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
68 |
69 | $ mkvirtualenv ddo_transform
70 | $ cd ddo_transform/
71 | $ python setup.py develop
72 |
73 | 4. Create a branch for local development::
74 |
75 | $ git checkout -b name-of-your-bugfix-or-feature
76 |
77 | Now you can make your changes locally.
78 |
79 | 5. When you're done making changes, check that your changes pass flake8 and the
80 | tests, including testing other Python versions with tox::
81 |
82 | $ flake8 ddo_transform tests
83 | $ python setup.py test or py.test
84 | $ tox
85 |
86 | To get flake8 and tox, just pip install them into your virtualenv.
87 |
88 | 6. Commit your changes and push your branch to GitHub::
89 |
90 | $ git add .
91 | $ git commit -m "Your detailed description of your changes."
92 | $ git push origin name-of-your-bugfix-or-feature
93 |
94 | 7. Submit a pull request through the GitHub website.
95 |
96 | Pull Request Guidelines
97 | -----------------------
98 |
99 | Before you submit a pull request, check that it meets these guidelines:
100 |
101 | 1. The pull request should include tests.
102 | 2. If the pull request adds functionality, the docs should be updated. Put
103 | your new functionality into a function with a docstring, and add the
104 | feature to the list in README.rst.
105 | 3. The pull request should work for Python 2.7, 3.4, 3.5 and 3.6, and for PyPy. Check
106 | https://travis-ci.org/devlace/ddo_transform/pull_requests
107 | and make sure that the tests pass for all supported Python versions.
108 |
109 | Tips
110 | ----
111 |
112 | To run a subset of tests::
113 |
114 | $ py.test tests.test_ddo_transform
115 |
116 |
117 | Deploying
118 | ---------
119 |
120 | A reminder for the maintainers on how to deploy.
121 | Make sure all your changes are committed (including an entry in HISTORY.rst).
122 | Then run::
123 |
124 | $ bumpversion patch # possible: major / minor / patch
125 | $ git push
126 | $ git push --tags
127 |
128 | Travis will then deploy to PyPI if tests pass.
129 |
--------------------------------------------------------------------------------
/src/ddo_transform/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7.3
2 |
3 | # Install OpenJDK 8 (Python is already provided by the base image)
4 | RUN \
5 | apt-get update && \
6 | apt-get install -y openjdk-8-jdk && \
7 | rm -rf /var/lib/apt/lists/*
8 |
9 | WORKDIR /usr/ddo_transform
10 |
11 | COPY . .
12 |
13 | RUN pip install --no-cache-dir -r requirements_dev.txt && \
14 | make clean && \
15 | make lint && \
16 | make test && \
17 | make docs && \
18 | make dist
19 |
20 |
--------------------------------------------------------------------------------
/src/ddo_transform/HISTORY.rst:
--------------------------------------------------------------------------------
1 | =======
2 | History
3 | =======
4 |
5 | 0.1.0 (2019-01-29)
6 | ------------------
7 |
8 | * First release on PyPI.
9 |
--------------------------------------------------------------------------------
/src/ddo_transform/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include AUTHORS.rst
2 | include CONTRIBUTING.rst
3 | include HISTORY.rst
4 | include LICENSE
5 | include README.rst
6 |
7 | include data/On-street_Parking_Bay_Sensors.csv
8 | recursive-include tests *
9 | recursive-exclude * __pycache__
10 | recursive-exclude * *.py[co]
11 |
12 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
13 |
--------------------------------------------------------------------------------
/src/ddo_transform/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: clean clean-test clean-pyc clean-build docs help
2 | .DEFAULT_GOAL := help
3 |
4 | define BROWSER_PYSCRIPT
5 | import os, webbrowser, sys
6 |
7 | try:
8 | from urllib import pathname2url
9 | except:
10 | from urllib.request import pathname2url
11 |
12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
13 | endef
14 | export BROWSER_PYSCRIPT
15 |
16 | define PRINT_HELP_PYSCRIPT
17 | import re, sys
18 |
19 | for line in sys.stdin:
20 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
21 | if match:
22 | target, help = match.groups()
23 | print("%-20s %s" % (target, help))
24 | endef
25 | export PRINT_HELP_PYSCRIPT
26 |
27 | BROWSER := python -c "$$BROWSER_PYSCRIPT"
28 |
29 | help:
30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
31 |
32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
33 |
34 | clean-build: ## remove build artifacts
35 | rm -fr build/
36 | rm -fr dist/
37 | rm -fr .eggs/
38 | find . -name '*.egg-info' -exec rm -fr {} +
39 | find . -name '*.egg' -exec rm -f {} +
40 |
41 | clean-pyc: ## remove Python file artifacts
42 | find . -name '*.pyc' -exec rm -f {} +
43 | find . -name '*.pyo' -exec rm -f {} +
44 | find . -name '*~' -exec rm -f {} +
45 | find . -name '__pycache__' -exec rm -fr {} +
46 |
47 | clean-test: ## remove test and coverage artifacts
48 | rm -fr .tox/
49 | rm -f .coverage
50 | rm -fr htmlcov/
51 | rm -fr .pytest_cache
52 |
53 | lint: ## check style with flake8
54 | flake8 ddo_transform tests
55 |
56 | test: ## run tests quickly with the default Python
57 | PYTHONPATH=`pwd` py.test
58 |
59 | test-all: ## run tests on every Python version with tox
60 | tox
61 |
62 | coverage: ## check code coverage quickly with the default Python
63 | coverage run --source ddo_transform -m pytest
64 | coverage report -m
65 | coverage html
66 | $(BROWSER) htmlcov/index.html
67 |
68 | docs: ## generate Sphinx HTML documentation, including API docs
69 | rm -f docs/ddo_transform.rst
70 | rm -f docs/modules.rst
71 | sphinx-apidoc -o docs/ ddo_transform
72 | $(MAKE) -C docs clean
73 | $(MAKE) -C docs html
74 | $(BROWSER) docs/_build/html/index.html
75 |
76 | servedocs: docs ## compile the docs watching for changes
77 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
78 |
79 | release: dist ## package and upload a release
80 | twine upload dist/*
81 |
82 | dist: clean ## builds source and wheel package
83 | sed -i "s/{{version}}/$(package_version)/g" ddo_transform/__init__.py
84 | python setup.py sdist
85 | python setup.py bdist_wheel
86 | ls -l dist
87 |
88 | install: clean ## install the package to the active Python's site-packages
89 | python setup.py install
90 |
91 | installedit: clean ## install the package while dynamically picking up changes to source files
92 | pip install --editable .
93 |
94 | uploaddatabricks: dist
95 | package_name="$$(find dist/*.whl -printf "%f\n")"; \
96 | databricks fs cp --overwrite dist/"$$package_name" "$(DATABRICKS_DBFS_UPLOAD_PATH)/libs/$$package_name";\
97 |
98 | installdatabricks: dist uploaddatabricks ## install the package in databricks
99 | package_name="$$(find dist/*.whl -printf "%f\n")"; \
100 | databricks libraries install --cluster-id $(DATABRICKS_CLUSTER_ID) --whl "$(DATABRICKS_DBFS_UPLOAD_PATH)/libs/$$package_name"
101 | databricks clusters restart --cluster-id $(DATABRICKS_CLUSTER_ID)
--------------------------------------------------------------------------------
/src/ddo_transform/README.rst:
--------------------------------------------------------------------------------
1 | =============
2 | ddo_transform
3 | =============
4 |
5 |
6 | .. image:: https://dev.azure.com/msdevlace/DataDevOps/_apis/build/status/DDO-Python-CI-Artifacts
7 | :target: https://dev.azure.com/msdevlace/DataDevOps/_build/latest?definitionId=23
8 | :alt: Build Status
9 |
10 |
11 | This package contains all business/data transformation logic for the ETL pipeline.
12 |
13 | * Free software: MIT license
14 | * Documentation: https://ddo-transform.readthedocs.io.
15 |
16 |
17 | Credits
18 | -------
19 |
20 | This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypackage`_ project template.
21 |
22 | .. _Cookiecutter: https://github.com/audreyr/cookiecutter
23 | .. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage
24 |
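25 | Usage
26 | -----
27 | 
28 | A minimal sketch of calling the standardization logic from a PySpark session. The input path
29 | and load id below are illustrative only; the schema name and functions come from
30 | ``ddo_transform.standardize``::
31 | 
32 |     from datetime import datetime
33 |     from pyspark.sql import SparkSession
34 | 
35 |     from ddo_transform import standardize
36 | 
37 |     spark = SparkSession.builder.getOrCreate()
38 | 
39 |     # Read raw parking bay data using the packaged schema
40 |     schema = standardize.get_schema("in_parkingbay_schema")
41 |     parkingbay_sdf = spark.read.json("data/MelbParkingBayData.json", schema=schema)
42 | 
43 |     # Standardize the raw records, stamping them with a load id and load timestamp
44 |     standardized = standardize.standardize_parking_bay(
45 |         parkingbay_sdf, load_id="example-load-id", loaded_on=datetime.now())
46 | 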
--------------------------------------------------------------------------------
/src/ddo_transform/azure-pipelines-ci-artifacts.yml:
--------------------------------------------------------------------------------
1 | # Starter pipeline
2 | # Start with a minimal pipeline that you can customize to build and deploy your code.
3 | # Add steps that build, run tests, deploy, and more:
4 | # https://aka.ms/yaml
5 |
6 | trigger:
7 | branches:
8 | include:
9 | - master
10 |
11 | variables:
12 | WORKING_DIR: 'src/ddo_transform'
13 | PACKAGE_MAJOR_VERSION: 1
14 | PACKAGE_MINOR_VERSION: 1
15 | PACKAGE_PATCH_VERSION: $(Build.BuildId)
16 | SQL_DW_PATH: 'sql/ddo_azuresqldw_dw'
17 | SQL_DW_SOLUTION_NAME: 'ddo_azuresqldw_dw'
18 | SQL_DW_SOLUTION: '$(SQL_DW_PATH)/$(SQL_DW_SOLUTION_NAME).sln'
19 | BUILD_PLATFORM: 'Any CPU'
20 | BUILD_CONFIGURATION: 'Release'
21 |
22 | stages:
23 | - stage: 'validate_pr'
24 | displayName: 'Validate PR'
25 | condition: and(always(), eq(variables['Build.Reason'], 'PullRequest'))
26 | jobs:
27 | - job: 'validate_python_packages'
28 | displayName: 'Validate Python Packages'
29 | pool:
30 | vmImage: 'Ubuntu-16.04'
31 | steps:
32 | - task: UsePythonVersion@0
33 | inputs:
34 | versionSpec: '3.6'
35 | architecture: 'x64'
36 |
37 | - script: pip install -r requirements_dev.txt && pip install -r requirements.txt
38 | workingDirectory: $(WORKING_DIR)
39 | displayName: 'Install requirements'
40 |
41 | - script: make lint
42 | workingDirectory: $(WORKING_DIR)
43 | displayName: 'Run lint'
44 |
45 | - script: make test
46 | workingDirectory: $(WORKING_DIR)
47 | displayName: 'Run tests'
48 |
49 | - job: 'validate_sql_packages'
50 | displayName: 'Validate SQL Packages'
51 | pool:
52 | vmImage: 'windows-latest'
53 | steps:
54 | - task: NuGetToolInstaller@1
55 |
56 | - task: NuGetCommand@2
57 | inputs:
58 | restoreSolution: '$(SQL_DW_SOLUTION)'
59 |
60 | - task: VSBuild@1
61 | inputs:
62 | solution: '$(SQL_DW_SOLUTION)'
63 | platform: '$(BUILD_PLATFORM)'
64 | configuration: '$(BUILD_CONFIGURATION)'
65 | - task: VSTest@2
66 | inputs:
67 | platform: '$(BUILD_PLATFORM)'
68 | configuration: '$(BUILD_CONFIGURATION)'
69 |
70 | - stage: 'publish_artifacts'
71 | displayName: 'Publish Build Artifacts'
72 | condition: and(always(), contains(variables['Build.SourceBranch'], 'refs/heads/master'))
73 | jobs:
74 | - job: 'publish_python_packages'
75 | displayName: 'Publish Python Packages'
76 | pool:
77 | vmImage: 'Ubuntu-16.04'
78 | steps:
79 | - task: UsePythonVersion@0
80 | inputs:
81 | versionSpec: '3.6'
82 | architecture: 'x64'
83 |
84 | - script: pip install -r requirements_dev.txt && pip install -r requirements.txt
85 | workingDirectory: $(WORKING_DIR)
86 | displayName: 'Install requirements'
87 |
88 | - script: make dist
89 | env:
90 | package_version: $(PACKAGE_MAJOR_VERSION).$(PACKAGE_MINOR_VERSION).$(PACKAGE_PATCH_VERSION)
91 | workingDirectory: $(WORKING_DIR)
92 | displayName: 'Create wheel package'
93 |
94 | - task: PublishBuildArtifacts@1
95 | inputs:
96 | PathtoPublish: '$(WORKING_DIR)/dist'
97 | ArtifactName: 'dist'
98 | displayName: 'Publish Dist Artifacts'
99 |
100 | - job: 'publish_static_artifacts'
101 | displayName: 'Publish Static Artifacts'
102 | pool:
103 | vmImage: 'Ubuntu-16.04'
104 | steps:
105 | - task: PublishBuildArtifacts@1
106 | inputs:
107 | PathtoPublish: 'databricks'
108 | ArtifactName: 'databricks'
109 | displayName: 'Publish Databricks Artifacts'
110 |
111 | - task: PublishBuildArtifacts@1
112 | inputs:
113 | PathtoPublish: 'adf/_scripts/deploymentadf.ps1'
114 | ArtifactName: 'adf_scripts'
115 | displayName: 'Publish ADF Scripts'
116 |
117 | - job: 'publish_sql_packages'
118 | displayName: 'Publish SQL Packages'
119 | pool:
120 | vmImage: 'windows-latest'
121 | steps:
122 | - task: NuGetToolInstaller@1
123 |
124 | - task: NuGetCommand@2
125 | inputs:
126 | restoreSolution: '$(SQL_DW_SOLUTION)'
127 |
128 | - task: VSBuild@1
129 | inputs:
130 | solution: '$(SQL_DW_SOLUTION)'
131 | platform: '$(BUILD_PLATFORM)'
132 | configuration: '$(BUILD_CONFIGURATION)'
133 | - task: VSTest@2
134 | inputs:
135 | platform: '$(BUILD_PLATFORM)'
136 | configuration: '$(BUILD_CONFIGURATION)'
137 |
138 | - task: PublishBuildArtifacts@1
139 | inputs:
140 | PathtoPublish: '$(SQL_DW_PATH)/$(SQL_DW_SOLUTION_NAME)/bin/$(BUILD_CONFIGURATION)/ddo_azuresqldw_dw.dacpac'
141 | ArtifactName: 'sql_dw_dacpac'
142 | displayName: 'Publish SQL DACPAC'
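
Note: the "make dist" step above passes a package_version environment variable (composed from the PACKAGE_MAJOR/MINOR/PATCH_VERSION pipeline variables) down to setup.py, which reads it via os.environ['package_version']. A minimal local sketch of that flow, with a placeholder version standing in for the pipeline variables:

    import os
    import subprocess

    # Placeholder for $(PACKAGE_MAJOR_VERSION).$(PACKAGE_MINOR_VERSION).$(PACKAGE_PATCH_VERSION)
    package_version = "1.0.0"

    # setup.py (shown later in this repo) picks the version up from the environment.
    env = dict(os.environ, package_version=package_version)
    subprocess.run(["make", "dist"], cwd="src/ddo_transform", env=env, check=True)
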
--------------------------------------------------------------------------------
/src/ddo_transform/azure-pipelines-ci-qa.yml:
--------------------------------------------------------------------------------
1 | # Starter pipeline
2 | # Start with a minimal pipeline that you can customize to build and deploy your code.
3 | # Add steps that build, run tests, deploy, and more:
4 | # https://aka.ms/yaml
5 |
6 | pr:
7 | branches:
8 | include:
9 | - master
10 | - releases/*
11 | paths:
12 | include:
13 | - src/ddo_transform/*
14 |
15 | variables:
16 | WORKING_DIR: 'src/ddo_transform'
17 |
18 | pool:
19 | vmImage: 'Ubuntu-16.04'
20 |
21 | steps:
22 | - task: UsePythonVersion@0
23 | inputs:
24 | versionSpec: '3.6'
25 | architecture: 'x64'
26 |
27 | - script: pip install -r requirements_dev.txt && pip install -r requirements.txt
28 | workingDirectory: $(WORKING_DIR)
29 | displayName: 'Install requirements'
30 |
31 | - script: make lint && make test
32 | workingDirectory: $(WORKING_DIR)
33 |   displayName: 'Run lint and tests'
--------------------------------------------------------------------------------
/src/ddo_transform/ddo_transform/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Top-level package for ddo_transform."""
4 |
5 | __author__ = """Lace Lofranco"""
6 | __email__ = 'lace.lofranco@microsoft.com'
7 | __version__ = '1.0.0'
8 |
--------------------------------------------------------------------------------
/src/ddo_transform/ddo_transform/standardize.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Main module."""
4 |
5 |
6 | from pyspark.sql import DataFrame
7 | from pyspark.sql.functions import lit, col, to_timestamp
8 | from pyspark.sql.types import (
9 | ArrayType, StructType, StructField, StringType, DoubleType) # noqa: E501
10 |
11 |
12 | def get_schema(schema_name):
13 | if schema_name == 'in_parkingbay_schema':
14 | schema = StructType([
15 | StructField('the_geom', StructType([
16 | StructField('coordinates', ArrayType(
17 | ArrayType(ArrayType(ArrayType(DoubleType())))
18 | )),
19 | StructField('type', StringType())
20 | ])),
21 | StructField('marker_id', StringType()),
22 | StructField('meter_id', StringType()),
23 | StructField('bay_id', StringType(), False),
24 | StructField('last_edit', StringType()),
25 | StructField('rd_seg_id', StringType()),
26 | StructField('rd_seg_dsc', StringType()),
27 | ])
28 | elif schema_name == 'in_sensordata_schema':
29 | schema = StructType([
30 | StructField('bay_id', StringType(), False),
31 | StructField('st_marker_id', StringType()),
32 | StructField('status', StringType()),
33 | StructField('location', StructType([
34 | StructField('coordinates', ArrayType(DoubleType())),
35 | StructField('type', StringType())
36 | ])),
37 | StructField('lat', StringType()),
38 | StructField('lon', StringType())
39 | ])
40 | return schema
41 |
42 |
43 | def standardize_parking_bay(parkingbay_sdf: DataFrame, load_id, loaded_on):
44 | t_parkingbay_sdf = (
45 | parkingbay_sdf
46 |         .withColumn("last_edit", to_timestamp("last_edit", "yyyyMMddHHmmss"))
47 | .select(
48 | col("bay_id").cast("int").alias("bay_id"),
49 | "last_edit",
50 | "marker_id",
51 | "meter_id",
52 | "rd_seg_dsc",
53 | col("rd_seg_id").cast("int").alias("rd_seg_id"),
54 | "the_geom",
55 | lit(load_id).alias("load_id"),
56 | lit(loaded_on.isoformat()).alias("loaded_on")
57 | )
58 | ).cache()
59 | # Data Validation
60 | good_records = t_parkingbay_sdf.filter(col("bay_id").isNotNull())
61 | bad_records = t_parkingbay_sdf.filter(col("bay_id").isNull())
62 | return good_records, bad_records
63 |
64 |
65 | def standardize_sensordata(sensordata_sdf: DataFrame, load_id, loaded_on):
66 | t_sensordata_sdf = (
67 | sensordata_sdf
68 | .select(
69 | col("bay_id").cast("int").alias("bay_id"),
70 | "st_marker_id",
71 | col("lat").cast("float").alias("lat"),
72 | col("lon").cast("float").alias("lon"),
73 | "location",
74 | "status",
75 | lit(load_id).alias("load_id"),
76 | lit(loaded_on.isoformat()).alias("loaded_on")
77 | )
78 | ).cache()
79 | # Data Validation
80 | good_records = t_sensordata_sdf.filter(col("bay_id").isNotNull())
81 | bad_records = t_sensordata_sdf.filter(col("bay_id").isNull())
82 | return good_records, bad_records
83 |
84 |
85 | if __name__ == "__main__":
86 | from pyspark.sql import SparkSession
87 | import datetime
88 | import os
89 |
90 | spark = SparkSession.builder\
91 | .master("local[2]")\
92 | .appName("standardize.py")\
93 | .getOrCreate()
94 | spark.sparkContext.setLogLevel("ERROR")
95 |
96 | THIS_DIR = os.path.dirname(os.path.abspath(__file__))
97 |
98 | schema = get_schema("in_parkingbay_schema")
99 | parkingbay_sdf = spark.read.json(os.path.join(THIS_DIR, "../data/MelbParkingBayData.json"),
100 | multiLine=True,
101 | schema=schema)
102 | load_id = 1
103 | loaded_on = datetime.datetime.now()
104 | t_parkingbay_sdf, t_parkingbay_malformed_sdf = standardize_parking_bay(parkingbay_sdf, load_id, loaded_on)
105 | t_parkingbay_sdf.write.json('./out/parkingbay_sdf')
106 |
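
A minimal sketch of the validation split performed by standardize_sensordata above, using a small in-memory DataFrame instead of the raw JSON file (the two sample rows are made up for illustration):

    import datetime
    from pyspark.sql import SparkSession
    from ddo_transform.standardize import get_schema, standardize_sensordata

    spark = SparkSession.builder.master("local[2]").appName("standardize-demo").getOrCreate()

    # One well-formed row and one with a null bay_id that should be routed to
    # the malformed output. verifySchema is disabled because the input schema
    # marks bay_id as non-nullable, while a malformed source record lacks it.
    rows = [
        ("1001", "C1", "Present", None, "-37.8136", "144.9631"),
        (None, "C2", "Unoccupied", None, "-37.8136", "144.9631"),
    ]
    sensordata_sdf = spark.createDataFrame(
        rows, schema=get_schema("in_sensordata_schema"), verifySchema=False)

    good, malformed = standardize_sensordata(sensordata_sdf, load_id=1, loaded_on=datetime.datetime.now())
    assert good.count() == 1 and malformed.count() == 1
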
--------------------------------------------------------------------------------
/src/ddo_transform/ddo_transform/transform.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Main module."""
4 |
5 |
6 | import uuid
7 | from pyspark.sql import DataFrame
8 | from pyspark.sql.functions import lit, udf, col, when
9 | from pyspark.sql.types import (
10 | ArrayType, StructType, StructField, StringType, TimestampType, DoubleType, IntegerType, FloatType) # noqa: E501
11 |
12 | uuidUdf = udf(lambda: str(uuid.uuid4()), StringType())
13 | EMPTY_UUID = '00000000-0000-0000-0000-000000000000'
14 |
15 |
16 | def get_schema(schema_name):
17 | schema = None
18 | if schema_name == 'interim_parkingbay_schema':
19 | schema = StructType([
20 | StructField('bay_id', IntegerType(), False),
21 | StructField('last_edit', StringType()),
22 | StructField('marker_id', StringType()),
23 | StructField('meter_id', StringType()),
24 | StructField('rd_seg_id', StringType()),
25 | StructField('rd_seg_dsc', StringType()),
26 | StructField('the_geom', StructType([
27 | StructField('coordinates', ArrayType(
28 | ArrayType(ArrayType(ArrayType(DoubleType())))
29 | )),
30 | StructField('type', StringType())
31 | ])),
32 | StructField('load_id', StringType()),
33 | StructField('loaded_on', TimestampType())
34 | ])
35 | elif schema_name == 'interim_sensor':
36 | schema = StructType([
37 | StructField('bay_id', IntegerType(), False),
38 | StructField('st_marker_id', StringType()),
39 | StructField('lat', FloatType()),
40 | StructField('lon', FloatType()),
41 | StructField('location', StructType([
42 | StructField('coordinates', ArrayType(DoubleType())),
43 | StructField('type', StringType())
44 | ]), False),
45 | StructField('status', StringType()),
46 | StructField('load_id', StringType()),
47 | StructField('loaded_on', TimestampType())
48 | ])
49 | elif schema_name == 'dw_dim_parking_bay':
50 | schema = StructType([
51 | StructField('dim_parking_bay_id', StringType(), False),
52 | StructField('bay_id', IntegerType(), False),
53 | StructField('marker_id', StringType()),
54 | StructField('meter_id', StringType()),
55 | StructField('rd_seg_id', StringType()),
56 | StructField('rd_seg_dsc', StringType()),
57 | StructField('the_geom', StructType([
58 | StructField('coordinates', ArrayType(
59 | ArrayType(ArrayType(ArrayType(DoubleType())))
60 | )),
61 | StructField('type', StringType())
62 | ])),
63 | StructField('load_id', StringType()),
64 | StructField('loaded_on', TimestampType())
65 | ])
66 | elif schema_name == 'dw_dim_location':
67 | schema = StructType([
68 | StructField('dim_location_id', StringType(), False),
69 | StructField('location', StructType([
70 | StructField('coordinates', ArrayType(DoubleType())),
71 | StructField('type', StringType())
72 | ]), False),
73 | StructField('lat', FloatType()),
74 | StructField('lon', FloatType()),
75 | StructField('load_id', StringType()),
76 | StructField('loaded_on', TimestampType())
77 | ])
78 | elif schema_name == 'dw_dim_st_marker':
79 | schema = StructType([
80 | StructField('dim_st_marker_id', StringType(), False),
81 | StructField('st_marker_id', StringType()),
82 | StructField('load_id', StringType()),
83 | StructField('loaded_on', TimestampType())
84 | ])
85 | return schema
86 |
87 |
88 | def process_dim_parking_bay(parkingbay_sdf: DataFrame,
89 | dim_parkingbay_sdf: DataFrame,
90 | load_id, loaded_on):
91 | """Transform incoming parkingbay_sdf data and existing dim_parking_bay
92 | into the latest version of records of dim_parking_bay data.
93 | """
94 |     # Get distinct rows from the landing data
95 | parkingbay_sdf = parkingbay_sdf\
96 | .select([
97 | "bay_id",
98 | "marker_id",
99 | "meter_id",
100 | "rd_seg_dsc",
101 | "rd_seg_id"])\
102 | .distinct()
103 |
104 | # Using a left_outer join on the business key (bay_id),
105 |     # identify rows that exist in the Dimension table but NOT in the landing data
106 | oldrows_parkingbay_sdf = dim_parkingbay_sdf.alias("dim")\
107 | .join(parkingbay_sdf, "bay_id", "left_outer")\
108 | .where(parkingbay_sdf["bay_id"].isNull())\
109 | .select(col("dim.*"))
110 |
111 | # Using a left_outer join on the business key (bay_id),
112 |     # identify rows that exist in BOTH the incoming landing data and the Dimension table,
113 |     # and take the values from the incoming landing data. That is, we update existing records.
114 | existingrows_parkingbay_sdf = parkingbay_sdf.alias("pb")\
115 | .join(dim_parkingbay_sdf.alias("dim"), "bay_id", "left_outer")\
116 | .where(dim_parkingbay_sdf["bay_id"].isNotNull())\
117 | .select(
118 | col("dim.dim_parking_bay_id"),
119 | col("pb.bay_id"),
120 | col("pb.marker_id"),
121 | col("pb.meter_id"),
122 | col("pb.rd_seg_dsc"),
123 | col("pb.rd_seg_id")
124 | )
125 |
126 | # Using a left_outer join on the business key (bay_id),
127 |     # identify rows that exist in the landing data but NOT in the Dimension table
128 | newrows_parkingbay_sdf = parkingbay_sdf.alias("pb")\
129 | .join(dim_parkingbay_sdf, "bay_id", "left_outer")\
130 | .where(dim_parkingbay_sdf["bay_id"].isNull())\
131 | .select(col("pb.*"))
132 |
133 |     # Add load_id, loaded_on and dim_parking_bay_id
134 | existingrows_parkingbay_sdf = existingrows_parkingbay_sdf.withColumn("load_id", lit(load_id))\
135 | .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))
136 | newrows_parkingbay_sdf = newrows_parkingbay_sdf.withColumn("load_id", lit(load_id))\
137 | .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))\
138 | .withColumn("dim_parking_bay_id", uuidUdf())
139 |
140 | # Select relevant columns
141 | relevant_cols = [
142 | "dim_parking_bay_id",
143 | "bay_id",
144 | "marker_id",
145 | "meter_id",
146 | "rd_seg_dsc",
147 | "rd_seg_id",
148 | "load_id",
149 | "loaded_on"
150 | ]
151 | oldrows_parkingbay_sdf = oldrows_parkingbay_sdf.select(relevant_cols)
152 | existingrows_parkingbay_sdf = existingrows_parkingbay_sdf.select(relevant_cols)
153 | newrows_parkingbay_sdf = newrows_parkingbay_sdf.select(relevant_cols)
154 |
155 | allrows_parkingbay_sdf = oldrows_parkingbay_sdf\
156 | .union(existingrows_parkingbay_sdf)\
157 | .union(newrows_parkingbay_sdf)
158 |
159 | return allrows_parkingbay_sdf
160 |
161 |
162 | def process_dim_location(sensordata_sdf: DataFrame, dim_location: DataFrame,
163 | load_id, loaded_on):
164 | """Transform sensordata into dim_location"""
165 |
166 |     # Get distinct rows from the landing data
167 | sensordata_sdf = sensordata_sdf\
168 | .select(["lat", "lon"]).distinct()
169 |
170 | # Using a left_outer join
171 |     # identify rows that exist in the Dimension table but NOT in the landing data
172 | oldrows_sdf = dim_location.alias("dim")\
173 | .join(sensordata_sdf, ["lat", "lon"], "left_outer")\
174 | .where(sensordata_sdf["lat"].isNull() & sensordata_sdf["lon"].isNull())\
175 | .select(col("dim.*"))
176 |
177 | # Using a left_outer join
178 |     # identify rows that exist in BOTH the incoming landing data and the Dimension table,
179 |     # and take the values from the incoming landing data. That is, we update existing records.
180 | existingrows_sdf = sensordata_sdf.alias("in")\
181 | .join(dim_location.alias("dim"), ["lat", "lon"], "left_outer")\
182 | .where(dim_location["lat"].isNotNull() & dim_location["lon"].isNotNull())\
183 | .select(
184 | col("dim.dim_location_id"),
185 | col("in.lat"),
186 | col("in.lon")
187 | )
188 |
189 | # Using a left_outer join
190 |     # identify rows that exist in the landing data but NOT in the Dimension table
191 | newrows_sdf = sensordata_sdf.alias("in")\
192 | .join(dim_location, ["lat", "lon"], "left_outer")\
193 | .where(dim_location["lat"].isNull() & dim_location["lon"].isNull())\
194 | .select(col("in.*"))
195 |
196 |     # Add load_id, loaded_on and dim_location_id
197 | existingrows_sdf = existingrows_sdf.withColumn("load_id", lit(load_id))\
198 | .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))
199 | newrows_sdf = newrows_sdf.withColumn("load_id", lit(load_id))\
200 | .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))\
201 | .withColumn("dim_location_id", uuidUdf())
202 |
203 | # Select relevant columns
204 | relevant_cols = [
205 | "dim_location_id",
206 | "lat",
207 | "lon",
208 | "load_id",
209 | "loaded_on"
210 | ]
211 | oldrows_sdf = oldrows_sdf.select(relevant_cols)
212 | existingrows_sdf = existingrows_sdf.select(relevant_cols)
213 | newrows_sdf = newrows_sdf.select(relevant_cols)
214 |
215 | allrows_sdf = oldrows_sdf\
216 | .union(existingrows_sdf)\
217 | .union(newrows_sdf)
218 |
219 | return allrows_sdf
220 |
221 |
222 | def process_dim_st_marker(sensordata_sdf: DataFrame,
223 | dim_st_marker: DataFrame,
224 | load_id, loaded_on):
225 | """Transform sensordata into dim_st_marker"""
226 |
227 |     # Get distinct rows from the landing data
228 | sensordata_sdf = sensordata_sdf.select(["st_marker_id"]).distinct()
229 |
230 | # Using a left_outer join
231 |     # identify rows that exist in the Dimension table but NOT in the landing data
232 | oldrows_sdf = dim_st_marker.alias("dim")\
233 | .join(sensordata_sdf, ["st_marker_id"], "left_outer")\
234 | .where(sensordata_sdf["st_marker_id"].isNull())\
235 | .select(col("dim.*"))
236 |
237 | # Using a left_outer join
238 |     # identify rows that exist in BOTH the incoming landing data and the Dimension table,
239 |     # and take the values from the incoming landing data. That is, we update existing records.
240 | existingrows_sdf = sensordata_sdf.alias("in")\
241 | .join(dim_st_marker.alias("dim"), ["st_marker_id"], "left_outer")\
242 | .where(dim_st_marker["st_marker_id"].isNotNull())\
243 | .select(col("dim.dim_st_marker_id"), col("in.st_marker_id"))
244 |
245 | # Using a left_outer join
246 |     # identify rows that exist in the landing data but NOT in the Dimension table
247 | newrows_sdf = sensordata_sdf.alias("in")\
248 | .join(dim_st_marker, ["st_marker_id"], "left_outer")\
249 | .where(dim_st_marker["st_marker_id"].isNull())\
250 | .select(col("in.*"))
251 |
252 |     # Add load_id, loaded_on and dim_st_marker_id
253 | existingrows_sdf = existingrows_sdf.withColumn("load_id", lit(load_id))\
254 | .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))
255 | newrows_sdf = newrows_sdf.withColumn("load_id", lit(load_id))\
256 | .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))\
257 | .withColumn("dim_st_marker_id", uuidUdf())
258 |
259 | # Select relevant columns
260 | relevant_cols = [
261 | "dim_st_marker_id",
262 | "st_marker_id",
263 | "load_id",
264 | "loaded_on"
265 | ]
266 | oldrows_sdf = oldrows_sdf.select(relevant_cols)
267 | existingrows_sdf = existingrows_sdf.select(relevant_cols)
268 | newrows_sdf = newrows_sdf.select(relevant_cols)
269 |
270 | allrows_sdf = oldrows_sdf\
271 | .union(existingrows_sdf)\
272 | .union(newrows_sdf)
273 |
274 | return allrows_sdf
275 |
276 |
277 | def process_fact_parking(sensordata_sdf: DataFrame,
278 | dim_parkingbay_sdf: DataFrame,
279 | dim_location_sdf: DataFrame,
280 | dim_st_marker_sdf: DataFrame,
281 | load_id, loaded_on):
282 | """Transform sensordata into fact_parking"""
283 |
284 |     dim_date_id = loaded_on.strftime("%Y%m%d")
285 | midnight = loaded_on.replace(hour=0, minute=0, second=0, microsecond=0)
286 |     dim_time_id = (loaded_on - midnight).seconds
287 |
288 | # Build fact
289 | fact_parking = sensordata_sdf\
290 | .join(dim_parkingbay_sdf.alias("pb"), "bay_id", "left_outer")\
291 | .join(dim_location_sdf.alias("l"), ["lat", "lon"], "left_outer")\
292 | .join(dim_st_marker_sdf.alias("st"), "st_marker_id", "left_outer")\
293 | .select(
294 | lit(dim_date_id).alias("dim_date_id"),
295 | lit(dim_time_id).alias("dim_time_id"),
296 | when(col("pb.dim_parking_bay_id").isNull(), lit(EMPTY_UUID))
297 | .otherwise(col("pb.dim_parking_bay_id")).alias("dim_parking_bay_id"),
298 | when(col("l.dim_location_id").isNull(), lit(EMPTY_UUID))
299 | .otherwise(col("l.dim_location_id")).alias("dim_location_id"),
300 | when(col("st.dim_st_marker_id").isNull(), lit(EMPTY_UUID))
301 | .otherwise(col("st.dim_st_marker_id")).alias("dim_st_marker_id"),
302 | "status",
303 | lit(load_id).alias("load_id"),
304 | lit(loaded_on.isoformat()).cast("timestamp").alias("loaded_on")
305 | )
306 | return fact_parking
307 |
308 |
309 | if __name__ == "__main__":
310 | from pyspark.sql import SparkSession
311 | import datetime
312 | import os
313 |
314 | spark = SparkSession.builder\
315 | .master("local[2]")\
316 | .appName("transform.py")\
317 | .getOrCreate()
318 | spark.sparkContext.setLogLevel("ERROR")
319 |
320 | THIS_DIR = os.path.dirname(os.path.abspath(__file__))
321 | load_id = 1
322 | loaded_on = datetime.datetime.now()
323 |
324 | def _run_process_dim_parking_bay():
325 | parkingbay_sdf = spark.read\
326 | .schema(get_schema("interim_parkingbay_schema"))\
327 | .json(os.path.join(THIS_DIR, "../data/interim_parking_bay.json"))
328 | dim_parkingbay_sdf = spark.read\
329 | .schema(schema=get_schema("dw_dim_parking_bay"))\
330 | .json(os.path.join(THIS_DIR, "../data/dim_parking_bay.json"))
331 | new_dim_parkingbay_sdf = process_dim_parking_bay(parkingbay_sdf, dim_parkingbay_sdf, load_id, loaded_on)
332 | return new_dim_parkingbay_sdf
333 |
334 | def _run_process_dim_location():
335 | sensor_sdf = spark.read\
336 | .schema(get_schema("interim_sensor"))\
337 | .json(os.path.join(THIS_DIR, "../data/interim_sensor.json"))
338 | dim_location_sdf = spark.read\
339 | .schema(schema=get_schema("dw_dim_location"))\
340 | .json(os.path.join(THIS_DIR, "../data/dim_location.json"))
341 | new_dim_location_sdf = process_dim_location(sensor_sdf, dim_location_sdf, load_id, loaded_on)
342 | return new_dim_location_sdf
343 |
344 | def _run_process_dim_st_marker():
345 | sensor_sdf = spark.read\
346 | .schema(get_schema("interim_sensor"))\
347 | .json(os.path.join(THIS_DIR, "../data/interim_sensor.json"))
348 | dim_st_marker_sdf = spark.read\
349 | .schema(schema=get_schema("dw_dim_st_marker"))\
350 | .json(os.path.join(THIS_DIR, "../data/dim_st_marker.json"))
351 | new_dim_st_marker_sdf = process_dim_st_marker(sensor_sdf, dim_st_marker_sdf, load_id, loaded_on)
352 | return new_dim_st_marker_sdf
353 |
354 | def _run_process_fact_parking():
355 | sensor_sdf = spark.read\
356 | .schema(get_schema("interim_sensor"))\
357 | .json(os.path.join(THIS_DIR, "../data/interim_sensor.json"))
358 | dim_parking_bay_sdf = spark.read\
359 | .schema(schema=get_schema("dw_dim_parking_bay"))\
360 | .json(os.path.join(THIS_DIR, "../data/dim_parking_bay.json"))
361 | dim_location_sdf = spark.read\
362 | .schema(schema=get_schema("dw_dim_location"))\
363 | .json(os.path.join(THIS_DIR, "../data/dim_location.json"))
364 | dim_st_marker_sdf = spark.read\
365 | .schema(schema=get_schema("dw_dim_st_marker"))\
366 | .json(os.path.join(THIS_DIR, "../data/dim_st_marker.json"))
367 | new_fact_parking = process_fact_parking(sensor_sdf,
368 | dim_parking_bay_sdf,
369 | dim_location_sdf,
370 | dim_st_marker_sdf,
371 | load_id, loaded_on)
372 | return new_fact_parking
373 |
374 | def _inspect_df(df: DataFrame):
375 | df.show()
376 | df.printSchema()
377 | print(df.count())
378 |
379 | # _inspect_df(_run_process_dim_parking_bay())
380 | # _inspect_df(_run_process_dim_location())
381 | # _inspect_df(_run_process_dim_st_marker())
382 | _inspect_df(_run_process_fact_parking())
383 |
384 | print("done!")
385 |
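
For reference, the fact build above derives its surrogate date and time keys directly from loaded_on: dim_date_id is the calendar date formatted as yyyyMMdd, and dim_time_id is the number of seconds elapsed since midnight. A small worked example in plain Python:

    import datetime

    loaded_on = datetime.datetime(2019, 10, 21, 14, 30, 15)

    dim_date_id = loaded_on.strftime("%Y%m%d")      # '20191021'
    midnight = loaded_on.replace(hour=0, minute=0, second=0, microsecond=0)
    dim_time_id = (loaded_on - midnight).seconds    # 14*3600 + 30*60 + 15 = 52215

    assert dim_date_id == "20191021"
    assert dim_time_id == 52215
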
--------------------------------------------------------------------------------
/src/ddo_transform/ddo_transform/util.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Util module."""
4 |
5 | from pyspark.sql import DataFrame, SparkSession
6 |
7 |
8 | def save_overwrite_unmanaged_table(spark: SparkSession, dataframe: DataFrame, table_name: str, path: str):
9 | """When trying to read and overwrite the same table, you get this error:
10 | 'Cannot overwrite table dw.dim_parking_bay that is also being read from;'
11 |     This utility function works around that by saving to a temporary table first, prior to overwriting.
12 | """
13 | temp_table_name = table_name + "___temp"
14 | spark.sql("DROP TABLE IF EXISTS " + temp_table_name).collect()
15 | # Save temp table
16 | dataframe.write.saveAsTable(temp_table_name)
17 | # Read temp table and overwrite original table
18 | spark.read.table(temp_table_name)\
19 | .write.mode("overwrite")\
20 | .option("path", path)\
21 | .saveAsTable(table_name)
22 | # Drop temp table
23 | spark.sql("DROP TABLE " + temp_table_name).collect()
24 |
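
A brief usage sketch of save_overwrite_unmanaged_table, assuming a Hive-enabled Spark session and an unmanaged dw.dim_parking_bay table; the table name and data lake path below are illustrative, not taken from this repo's configuration:

    from pyspark.sql import SparkSession
    from ddo_transform.util import save_overwrite_unmanaged_table

    spark = SparkSession.builder.enableHiveSupport().getOrCreate()

    # Read the current dimension and (for the sake of the sketch) pretend it was
    # just rebuilt by transform.process_dim_parking_bay.
    new_dim_parkingbay_sdf = spark.read.table("dw.dim_parking_bay")

    save_overwrite_unmanaged_table(
        spark,
        new_dim_parkingbay_sdf,
        table_name="dw.dim_parking_bay",
        path="/mnt/datalake/data/dw/dim_parking_bay",  # illustrative path
    )
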
--------------------------------------------------------------------------------
/src/ddo_transform/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = python -msphinx
7 | SPHINXPROJ = ddo_transform
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/src/ddo_transform/docs/authors.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../AUTHORS.rst
2 |
--------------------------------------------------------------------------------
/src/ddo_transform/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # ddo_transform documentation build configuration file, created by
5 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another
17 | # directory, add these directories to sys.path here. If the directory is
18 | # relative to the documentation root, use os.path.abspath to make it
19 | # absolute, like shown here.
20 | #
21 | import os
22 | import sys
23 | sys.path.insert(0, os.path.abspath('..'))
24 |
25 | import ddo_transform
26 |
27 | # -- General configuration ---------------------------------------------
28 |
29 | # If your documentation needs a minimal Sphinx version, state it here.
30 | #
31 | # needs_sphinx = '1.0'
32 |
33 | # Add any Sphinx extension module names here, as strings. They can be
34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
35 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']
36 |
37 | # Add any paths that contain templates here, relative to this directory.
38 | templates_path = ['_templates']
39 |
40 | # The suffix(es) of source filenames.
41 | # You can specify multiple suffix as a list of string:
42 | #
43 | # source_suffix = ['.rst', '.md']
44 | source_suffix = '.rst'
45 |
46 | # The master toctree document.
47 | master_doc = 'index'
48 |
49 | # General information about the project.
50 | project = u'ddo_transform'
51 | copyright = u"2019, Lace Lofranco"
52 | author = u"Lace Lofranco"
53 |
54 | # The version info for the project you're documenting, acts as replacement
55 | # for |version| and |release|, also used in various other places throughout
56 | # the built documents.
57 | #
58 | # The short X.Y version.
59 | version = ddo_transform.__version__
60 | # The full version, including alpha/beta/rc tags.
61 | release = ddo_transform.__version__
62 |
63 | # The language for content autogenerated by Sphinx. Refer to documentation
64 | # for a list of supported languages.
65 | #
66 | # This is also used if you do content translation via gettext catalogs.
67 | # Usually you set "language" from the command line for these cases.
68 | language = None
69 |
70 | # List of patterns, relative to source directory, that match files and
71 | # directories to ignore when looking for source files.
72 | # This patterns also effect to html_static_path and html_extra_path
73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
74 |
75 | # The name of the Pygments (syntax highlighting) style to use.
76 | pygments_style = 'sphinx'
77 |
78 | # If true, `todo` and `todoList` produce output, else they produce nothing.
79 | todo_include_todos = False
80 |
81 |
82 | # -- Options for HTML output -------------------------------------------
83 |
84 | # The theme to use for HTML and HTML Help pages. See the documentation for
85 | # a list of builtin themes.
86 | #
87 | html_theme = 'alabaster'
88 |
89 | # Theme options are theme-specific and customize the look and feel of a
90 | # theme further. For a list of options available for each theme, see the
91 | # documentation.
92 | #
93 | # html_theme_options = {}
94 |
95 | # Add any paths that contain custom static files (such as style sheets) here,
96 | # relative to this directory. They are copied after the builtin static files,
97 | # so a file named "default.css" will overwrite the builtin "default.css".
98 | html_static_path = ['_static']
99 |
100 |
101 | # -- Options for HTMLHelp output ---------------------------------------
102 |
103 | # Output file base name for HTML help builder.
104 | htmlhelp_basename = 'ddo_transformdoc'
105 |
106 |
107 | # -- Options for LaTeX output ------------------------------------------
108 |
109 | latex_elements = {
110 | # The paper size ('letterpaper' or 'a4paper').
111 | #
112 | # 'papersize': 'letterpaper',
113 |
114 | # The font size ('10pt', '11pt' or '12pt').
115 | #
116 | # 'pointsize': '10pt',
117 |
118 | # Additional stuff for the LaTeX preamble.
119 | #
120 | # 'preamble': '',
121 |
122 | # Latex figure (float) alignment
123 | #
124 | # 'figure_align': 'htbp',
125 | }
126 |
127 | # Grouping the document tree into LaTeX files. List of tuples
128 | # (source start file, target name, title, author, documentclass
129 | # [howto, manual, or own class]).
130 | latex_documents = [
131 | (master_doc, 'ddo_transform.tex',
132 | u'ddo_transform Documentation',
133 | u'Lace Lofranco', 'manual'),
134 | ]
135 |
136 |
137 | # -- Options for manual page output ------------------------------------
138 |
139 | # One entry per manual page. List of tuples
140 | # (source start file, name, description, authors, manual section).
141 | man_pages = [
142 | (master_doc, 'ddo_transform',
143 | u'ddo_transform Documentation',
144 | [author], 1)
145 | ]
146 |
147 |
148 | # -- Options for Texinfo output ----------------------------------------
149 |
150 | # Grouping the document tree into Texinfo files. List of tuples
151 | # (source start file, target name, title, author,
152 | # dir menu entry, description, category)
153 | texinfo_documents = [
154 | (master_doc, 'ddo_transform',
155 | u'ddo_transform Documentation',
156 | author,
157 | 'ddo_transform',
158 | 'One line description of project.',
159 | 'Miscellaneous'),
160 | ]
161 |
162 |
163 |
164 |
--------------------------------------------------------------------------------
/src/ddo_transform/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../CONTRIBUTING.rst
2 |
--------------------------------------------------------------------------------
/src/ddo_transform/docs/history.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../HISTORY.rst
2 |
--------------------------------------------------------------------------------
/src/ddo_transform/docs/index.rst:
--------------------------------------------------------------------------------
1 | Welcome to ddo_transform's documentation!
2 | =========================================
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 | :caption: Contents:
7 |
8 | readme
9 | installation
10 | usage
11 | modules
12 | contributing
13 | authors
14 | history
15 |
16 | Indices and tables
17 | ==================
18 | * :ref:`genindex`
19 | * :ref:`modindex`
20 | * :ref:`search`
21 |
--------------------------------------------------------------------------------
/src/ddo_transform/docs/installation.rst:
--------------------------------------------------------------------------------
1 | .. highlight:: shell
2 |
3 | ============
4 | Installation
5 | ============
6 |
7 |
8 | Stable release
9 | --------------
10 |
11 | To install ddo_transform, run this command in your terminal:
12 |
13 | .. code-block:: console
14 |
15 | $ pip install ddo_transform
16 |
17 | This is the preferred method to install ddo_transform, as it will always install the most recent stable release.
18 |
19 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide
20 | you through the process.
21 |
22 | .. _pip: https://pip.pypa.io
23 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
24 |
25 |
26 | From sources
27 | ------------
28 |
29 | The sources for ddo_transform can be downloaded from the `Github repo`_.
30 |
31 | You can either clone the public repository:
32 |
33 | .. code-block:: console
34 |
35 | $ git clone git://github.com/devlace/ddo_transform
36 |
37 | Or download the `tarball`_:
38 |
39 | .. code-block:: console
40 |
41 | $ curl -OL https://github.com/devlace/ddo_transform/tarball/master
42 |
43 | Once you have a copy of the source, you can install it with:
44 |
45 | .. code-block:: console
46 |
47 | $ python setup.py install
48 |
49 |
50 | .. _Github repo: https://github.com/devlace/ddo_transform
51 | .. _tarball: https://github.com/devlace/ddo_transform/tarball/master
52 |
--------------------------------------------------------------------------------
/src/ddo_transform/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=python -msphinx
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=ddo_transform
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed,
20 | echo.then set the SPHINXBUILD environment variable to point to the full
21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the
22 | echo.Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/src/ddo_transform/docs/readme.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../README.rst
2 |
--------------------------------------------------------------------------------
/src/ddo_transform/docs/usage.rst:
--------------------------------------------------------------------------------
1 | =====
2 | Usage
3 | =====
4 |
5 | To use ddo_transform in a project::
6 |
7 | import ddo_transform
8 |
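
A slightly fuller sketch of typical usage, based on the standardize module shown earlier (the input path is illustrative):

    import datetime
    from pyspark.sql import SparkSession
    from ddo_transform import standardize

    spark = SparkSession.builder.master("local[2]").appName("ddo_transform-usage").getOrCreate()

    # Read raw parking bay data with the package's input schema.
    parkingbay_sdf = spark.read.json(
        "data/MelbParkingBayData.json",  # illustrative path
        multiLine=True,
        schema=standardize.get_schema("in_parkingbay_schema"),
    )

    good_df, malformed_df = standardize.standardize_parking_bay(
        parkingbay_sdf, load_id=1, loaded_on=datetime.datetime.now())
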
--------------------------------------------------------------------------------
/src/ddo_transform/requirements.txt:
--------------------------------------------------------------------------------
1 | pyspark==2.4.4
--------------------------------------------------------------------------------
/src/ddo_transform/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | pip
2 | bumpversion
3 | wheel
4 | watchdog
5 | flake8
6 | tox
7 | coverage
8 | Sphinx
9 | twine
10 | click
11 | pytest
12 | pytest-runner
13 |
--------------------------------------------------------------------------------
/src/ddo_transform/setup.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | current_version = 1.0.0
3 | commit = True
4 | tag = True
5 |
6 | [bumpversion:file:setup.py]
7 | search = version='{current_version}'
8 | replace = version='{new_version}'
9 |
10 | [bumpversion:file:ddo_transform/__init__.py]
11 | search = __version__ = '{current_version}'
12 | replace = __version__ = '{new_version}'
13 |
14 | [bdist_wheel]
15 | universal = 1
16 |
17 | [flake8]
18 | exclude = docs
19 | max-line-length = 120
20 |
21 | [aliases]
22 | # Define setup.py command aliases here
23 | test = pytest
24 |
25 | [tool:pytest]
26 | collect_ignore = ['setup.py']
27 |
28 |
--------------------------------------------------------------------------------
/src/ddo_transform/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """The setup script."""
5 |
6 | import os
7 | from setuptools import setup, find_packages
8 |
9 | version = os.environ['package_version']
10 |
11 | with open('README.rst') as readme_file:
12 | readme = readme_file.read()
13 |
14 | with open('HISTORY.rst') as history_file:
15 | history = history_file.read()
16 |
17 | requirements = ['Click>=6.0', ]
18 |
19 | setup_requirements = ['pytest-runner', ]
20 |
21 | test_requirements = ['pytest', ]
22 |
23 | setup(
24 | author="Lace Lofranco",
25 | author_email='lace.lofranco@microsoft.com',
26 | classifiers=[
27 | 'Development Status :: 2 - Pre-Alpha',
28 | 'Intended Audience :: Developers',
29 | 'License :: OSI Approved :: MIT License',
30 | 'Natural Language :: English',
31 | "Programming Language :: Python :: 2",
32 | 'Programming Language :: Python :: 2.7',
33 | 'Programming Language :: Python :: 3',
34 | 'Programming Language :: Python :: 3.4',
35 | 'Programming Language :: Python :: 3.5',
36 | 'Programming Language :: Python :: 3.6',
37 | 'Programming Language :: Python :: 3.7',
38 | ],
39 |     description="Data standardization and transformation package for the Melbourne parking sensor DataOps sample.",
40 | entry_points={
41 | 'console_scripts': [
42 | 'ddo_transform=ddo_transform.cli:main',
43 | ],
44 | },
45 | install_requires=requirements,
46 | license="MIT license",
47 | long_description=readme + '\n\n' + history,
48 | include_package_data=True,
49 | keywords='ddo_transform',
50 | name='ddo_transform',
51 | packages=find_packages(include=['ddo_transform']),
52 | setup_requires=setup_requirements,
53 | test_suite='tests',
54 | tests_require=test_requirements,
55 | url='https://github.com/devlace/datadevops',
56 | version=version,
57 | zip_safe=False,
58 | )
59 |
--------------------------------------------------------------------------------
/src/ddo_transform/tests/test_standardize.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """Tests for `ddo_transform` package."""
5 |
6 | import os
7 | import pytest
8 | import datetime
9 | from pyspark.sql.functions import isnull
10 |
11 | from ddo_transform import standardize
12 |
13 | THIS_DIR = os.path.dirname(os.path.abspath(__file__))
14 |
15 |
16 | @pytest.fixture
17 | def spark():
18 | """Spark Session fixture
19 | """
20 | from pyspark.sql import SparkSession
21 |
22 | spark = SparkSession.builder\
23 | .master("local[2]")\
24 | .appName("Unit Testing")\
25 | .getOrCreate()
26 | spark.sparkContext.setLogLevel("ERROR")
27 | return spark
28 |
29 |
30 | def test_standardize_parking_bay(spark):
31 | """Test data transform"""
32 | # Arrange
33 | schema = standardize.get_schema("in_parkingbay_schema")
34 |     parkingbay_sdf = spark.read.json(os.path.join(THIS_DIR, "../data/MelbParkingBayData.json"), multiLine=True, schema=schema)  # noqa: E501
35 | load_id = 1
36 | loaded_on = datetime.datetime.now()
37 | # Act
38 | t_parkingbay_sdf, t_parkingbay_malformed_sdf = standardize.standardize_parking_bay(parkingbay_sdf, load_id, loaded_on) # noqa: E501
39 | # Assert
40 | assert t_parkingbay_sdf.count() != 0
41 | assert t_parkingbay_malformed_sdf.count() == 0
42 | assert t_parkingbay_sdf.filter(isnull("bay_id")).count() == 0
43 |
44 |
45 | def test_standardize_sensordata(spark):
46 | """Test data transform"""
47 | # Arrange
48 | schema = standardize.get_schema("in_sensordata_schema")
49 |     sensordata_sdf = spark.read.json(os.path.join(THIS_DIR, "../data/MelbParkingSensorData.json"), multiLine=True, schema=schema)  # noqa: E501
50 | load_id = 1
51 | loaded_on = datetime.datetime.now()
52 | # Act
53 | t_sensordata_sdf, t_sensordata_malformed_sdf = standardize.standardize_sensordata(sensordata_sdf, load_id, loaded_on) # noqa: E501
54 | # Assert
55 | assert t_sensordata_sdf.count() != 0
56 | assert t_sensordata_malformed_sdf.count() == 0
57 | assert t_sensordata_sdf.filter(isnull("bay_id")).count() == 0
58 |
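
One possible refinement, not part of the original tests: scoping the Spark fixture to the whole test session so the JVM is started only once per run. A sketch:

    import pytest
    from pyspark.sql import SparkSession


    @pytest.fixture(scope="session")
    def spark():
        """Session-scoped Spark Session fixture."""
        spark = SparkSession.builder\
            .master("local[2]")\
            .appName("Unit Testing")\
            .getOrCreate()
        spark.sparkContext.setLogLevel("ERROR")
        yield spark
        spark.stop()
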
--------------------------------------------------------------------------------
/src/ddo_transform/tests/test_transform.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """Tests for `ddo_transform` package."""
5 |
6 | import os
7 | import pytest
8 | import datetime
9 |
10 | from ddo_transform import transform
11 |
12 | THIS_DIR = os.path.dirname(os.path.abspath(__file__))
13 |
14 |
15 | @pytest.fixture
16 | def spark():
17 | """Spark Session fixture
18 | """
19 | from pyspark.sql import SparkSession
20 |
21 | spark = SparkSession.builder\
22 | .master("local[2]")\
23 | .appName("Unit Testing")\
24 | .getOrCreate()
25 | spark.sparkContext.setLogLevel("ERROR")
26 | return spark
27 |
28 |
29 | def test_process_dim_parking_bay(spark):
30 | """Test data transform"""
31 | parkingbay_sdf = spark.read\
32 | .schema(transform.get_schema("interim_parkingbay_schema"))\
33 | .json(os.path.join(THIS_DIR, "../data/interim_parking_bay.json"))
34 | dim_parkingbay_sdf = spark.read\
35 | .schema(schema=transform.get_schema("dw_dim_parking_bay"))\
36 | .json(os.path.join(THIS_DIR, "../data/dim_parking_bay.json"))
37 |
38 | load_id = 1
39 | loaded_on = datetime.datetime.now()
40 | results_df = transform.process_dim_parking_bay(parkingbay_sdf, dim_parkingbay_sdf, load_id, loaded_on)
41 |
42 | # TODO add more asserts
43 | assert results_df.count() != 0
44 |
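
A hedged sketch of assertions the TODO above might add, using only columns defined by the dw_dim_parking_bay schema:

    from pyspark.sql.functions import col

    # Every merged row should carry a surrogate key.
    assert results_df.filter(col("dim_parking_bay_id").isNull()).count() == 0

    # The merged dimension should expose exactly the expected columns.
    expected_cols = {
        "dim_parking_bay_id", "bay_id", "marker_id", "meter_id",
        "rd_seg_dsc", "rd_seg_id", "load_id", "loaded_on"}
    assert set(results_df.columns) == expected_cols
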
--------------------------------------------------------------------------------
/src/ddo_transform/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27, py34, py35, py36, flake8
3 |
4 | [travis]
5 | python =
6 | 3.6: py36
7 | 3.5: py35
8 | 3.4: py34
9 | 2.7: py27
10 |
11 | [testenv:flake8]
12 | basepython = python
13 | deps = flake8
14 | commands = flake8 ddo_transform
15 |
16 | [testenv]
17 | setenv =
18 | PYTHONPATH = {toxinidir}
19 | deps =
20 | -r{toxinidir}/requirements_dev.txt
21 | ; If you want to make tox run the tests with the same versions, create a
22 | ; requirements.txt with the pinned versions and uncomment the following line:
23 | ; -r{toxinidir}/requirements.txt
24 | commands =
25 | pip install -U pip
26 | py.test --basetemp={envtmpdir}
27 |
28 |
29 |
--------------------------------------------------------------------------------