├── .github └── ISSUE_TEMPLATE.md ├── .gitignore ├── LICENSE ├── README.md ├── adf ├── _scripts │ └── deploymentadf.ps1 ├── arm-template-parameters-definition.json ├── dataset │ ├── Ds_AdlsGen2_MelbParkingData.json │ └── Ds_REST_MelbParkingData.json ├── linkedService │ ├── Ls_AdlsGen2_01.json │ ├── Ls_AzureSQLDW_01.json │ ├── Ls_KeyVault.json │ ├── Ls_Rest_MelParkSensors_01.json │ └── Ls_adb_01.json ├── pipeline │ └── P_Ingest_MelbParkingData.json └── trigger │ └── T_Sched.json ├── clean_up.sh ├── data ├── raw_data │ ├── On-street_Parking_Bay_Sensors │ │ ├── On-street_Parking_Bay_Sensors.csv │ │ ├── On-street_Parking_Bay_Sensors.json │ │ └── On-street_Parking_Bay_Sensors_baylist.csv │ └── README.md └── seed │ ├── DimDate.csv │ └── DimTime.csv ├── databricks ├── config │ ├── cluster.config.json │ ├── cluster.config.template.json │ └── run.setup.config.json ├── configure_databricks.sh ├── create_secrets.sh ├── deploy_app.sh ├── libs │ └── azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar └── notebooks │ ├── 00_setup.py │ ├── 01_explore.py │ ├── 02_standardize.py │ └── 03_transform.py ├── deploy.sh ├── docs ├── CI_CD.md └── NDCSydney2019-DataDevOps.pdf ├── images ├── CI_CD_process.PNG ├── Release_1_Agent_DeployToDatabricks.PNG └── architecture.PNG ├── infrastructure ├── README.md ├── azuredeploy.json ├── azuredeploy.parameters.dev.json ├── azuredeploy.parameters.prod.json ├── azuredeploy.parameters.stg.json ├── configure_adlagen2.sh └── deploy_infrastructure.sh ├── samples ├── azuresql │ ├── README.md │ ├── azure-pipelines-ci.yml │ └── ddo_samples_azuresql │ │ ├── ddo_samples_azuresql.sln │ │ └── ddo_samples_azuresql │ │ ├── SalesLT │ │ ├── Sequences │ │ │ └── SalesOrderNumber.sql │ │ ├── Tables │ │ │ ├── Address.sql │ │ │ ├── Customer.sql │ │ │ ├── CustomerAddress.sql │ │ │ ├── Product.sql │ │ │ ├── ProductCategory.sql │ │ │ ├── ProductDescription.sql │ │ │ ├── ProductModel.sql │ │ │ ├── ProductModelProductDescription.sql │ │ │ ├── SalesOrderDetail.sql │ │ │ └── SalesOrderHeader.sql │ │ └── Views │ │ │ ├── vGetAllCategories.sql │ │ │ ├── vProductAndDescription.sql │ │ │ └── vProductModelCatalogDescription.sql │ │ ├── Security │ │ └── SalesLT.sql │ │ ├── dbo │ │ ├── Functions │ │ │ ├── ufnGetAllCategories.sql │ │ │ ├── ufnGetCustomerInformation.sql │ │ │ └── ufnGetSalesOrderStatusText.sql │ │ ├── Stored Procedures │ │ │ ├── uspLogError.sql │ │ │ └── uspPrintError.sql │ │ ├── Tables │ │ │ ├── BuildVersion.sql │ │ │ └── ErrorLog.sql │ │ └── User Defined Types │ │ │ ├── AccountNumber.sql │ │ │ ├── Flag.sql │ │ │ ├── Name.sql │ │ │ ├── NameStyle.sql │ │ │ ├── OrderNumber.sql │ │ │ └── Phone.sql │ │ └── ddo_samples_azuresql.sqlproj └── databricks │ └── README.md ├── sql └── ddo_azuresqldw_dw │ ├── ddo_azuresqldw_dw.sln │ └── ddo_azuresqldw_dw │ ├── External Resources │ ├── AzureDataLakeStorage.sql │ └── ParquetFileFormat.sql │ ├── Script.PostDeployment1.sql │ ├── Security │ ├── ADLSCredentialKey.sql │ ├── MasterKeys.sql │ └── ext.sql │ ├── dbo │ ├── Stored Procedures │ │ └── load_dw.sql │ └── Tables │ │ ├── dim_location.sql │ │ ├── dim_parking_bay.sql │ │ ├── dim_st_marker.sql │ │ └── fact_parking.sql │ ├── ddo_azuresqldw_dw.sqlproj │ └── ext │ └── External Tables │ ├── dim_location.sql │ ├── dim_parking_bay.sql │ ├── dim_st_marker.sql │ └── fact_parking.sql └── src └── ddo_transform ├── .editorconfig ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── Dockerfile ├── HISTORY.rst ├── MANIFEST.in ├── Makefile ├── README.rst ├── azure-pipelines-ci-artifacts.yml ├── azure-pipelines-ci-qa.yml ├── data ├── 
MelbParkingBayData.json ├── MelbParkingSensorData.json ├── dim_location.json ├── dim_parking_bay.json ├── dim_st_marker.json ├── interim_parking_bay.json └── interim_sensor.json ├── ddo_transform ├── __init__.py ├── standardize.py ├── transform.py └── util.py ├── docs ├── Makefile ├── authors.rst ├── conf.py ├── contributing.rst ├── history.rst ├── index.rst ├── installation.rst ├── make.bat ├── readme.rst └── usage.rst ├── requirements.txt ├── requirements_dev.txt ├── setup.cfg ├── setup.py ├── tests ├── test_standardize.py └── test_transform.py └── tox.ini /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * ddo_transform version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 15 | ``` 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | .env.* 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | ## Ignore Visual Studio temporary files, build results, and 106 | ## files generated by popular Visual Studio add-ons. 
107 | ## 108 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 109 | 110 | # User-specific files 111 | *.rsuser 112 | *.suo 113 | *.user 114 | *.userosscache 115 | *.sln.docstates 116 | 117 | # User-specific files (MonoDevelop/Xamarin Studio) 118 | *.userprefs 119 | 120 | # Mono auto generated files 121 | mono_crash.* 122 | 123 | # Build results 124 | [Dd]ebug/ 125 | [Dd]ebugPublic/ 126 | [Rr]elease/ 127 | [Rr]eleases/ 128 | x64/ 129 | x86/ 130 | [Aa][Rr][Mm]/ 131 | [Aa][Rr][Mm]64/ 132 | bld/ 133 | [Bb]in/ 134 | [Oo]bj/ 135 | [Ll]og/ 136 | 137 | # Visual Studio 2015/2017 cache/options directory 138 | .vs/ 139 | # Uncomment if you have tasks that create the project's static files in wwwroot 140 | #wwwroot/ 141 | 142 | # Visual Studio 2017 auto generated files 143 | Generated\ Files/ 144 | 145 | # MSTest test Results 146 | [Tt]est[Rr]esult*/ 147 | [Bb]uild[Ll]og.* 148 | 149 | # NUnit 150 | *.VisualState.xml 151 | TestResult.xml 152 | nunit-*.xml 153 | 154 | # Build Results of an ATL Project 155 | [Dd]ebugPS/ 156 | [Rr]eleasePS/ 157 | dlldata.c 158 | 159 | # Benchmark Results 160 | BenchmarkDotNet.Artifacts/ 161 | 162 | # .NET Core 163 | project.lock.json 164 | project.fragment.lock.json 165 | artifacts/ 166 | 167 | # StyleCop 168 | StyleCopReport.xml 169 | 170 | # Files built by Visual Studio 171 | *_i.c 172 | *_p.c 173 | *_h.h 174 | *.ilk 175 | *.meta 176 | *.obj 177 | *.iobj 178 | *.pch 179 | *.pdb 180 | *.ipdb 181 | *.pgc 182 | *.pgd 183 | *.rsp 184 | *.sbr 185 | *.tlb 186 | *.tli 187 | *.tlh 188 | *.tmp 189 | *.tmp_proj 190 | *_wpftmp.csproj 191 | *.log 192 | *.vspscc 193 | *.vssscc 194 | .builds 195 | *.pidb 196 | *.svclog 197 | *.scc 198 | 199 | # Chutzpah Test files 200 | _Chutzpah* 201 | 202 | # Visual C++ cache files 203 | ipch/ 204 | *.aps 205 | *.ncb 206 | *.opendb 207 | *.opensdf 208 | *.sdf 209 | *.cachefile 210 | *.VC.db 211 | *.VC.VC.opendb 212 | 213 | # Visual Studio profiler 214 | *.psess 215 | *.vsp 216 | *.vspx 217 | *.sap 218 | 219 | # Visual Studio Trace Files 220 | *.e2e 221 | 222 | # TFS 2012 Local Workspace 223 | $tf/ 224 | 225 | # Guidance Automation Toolkit 226 | *.gpState 227 | 228 | # ReSharper is a .NET coding add-in 229 | _ReSharper*/ 230 | *.[Rr]e[Ss]harper 231 | *.DotSettings.user 232 | 233 | # JustCode is a .NET coding add-in 234 | .JustCode 235 | 236 | # TeamCity is a build add-in 237 | _TeamCity* 238 | 239 | # DotCover is a Code Coverage Tool 240 | *.dotCover 241 | 242 | # AxoCover is a Code Coverage Tool 243 | .axoCover/* 244 | !.axoCover/settings.json 245 | 246 | # Visual Studio code coverage results 247 | *.coverage 248 | *.coveragexml 249 | 250 | # NCrunch 251 | _NCrunch_* 252 | .*crunch*.local.xml 253 | nCrunchTemp_* 254 | 255 | # MightyMoose 256 | *.mm.* 257 | AutoTest.Net/ 258 | 259 | # Web workbench (sass) 260 | .sass-cache/ 261 | 262 | # Installshield output folder 263 | [Ee]xpress/ 264 | 265 | # DocProject is a documentation generator add-in 266 | DocProject/buildhelp/ 267 | DocProject/Help/*.HxT 268 | DocProject/Help/*.HxC 269 | DocProject/Help/*.hhc 270 | DocProject/Help/*.hhk 271 | DocProject/Help/*.hhp 272 | DocProject/Help/Html2 273 | DocProject/Help/html 274 | 275 | # Click-Once directory 276 | publish/ 277 | 278 | # Publish Web Output 279 | *.[Pp]ublish.xml 280 | *.azurePubxml 281 | # Note: Comment the next line if you want to checkin your web deploy settings, 282 | # but database connection strings (with potential passwords) will be unencrypted 283 | *.pubxml 284 | *.publishproj 285 | 286 | # 
Microsoft Azure Web App publish settings. Comment the next line if you want to 287 | # checkin your Azure Web App publish settings, but sensitive information contained 288 | # in these scripts will be unencrypted 289 | PublishScripts/ 290 | 291 | # NuGet Packages 292 | *.nupkg 293 | # NuGet Symbol Packages 294 | *.snupkg 295 | # The packages folder can be ignored because of Package Restore 296 | **/[Pp]ackages/* 297 | # except build/, which is used as an MSBuild target. 298 | !**/[Pp]ackages/build/ 299 | # Uncomment if necessary however generally it will be regenerated when needed 300 | #!**/[Pp]ackages/repositories.config 301 | # NuGet v3's project.json files produces more ignorable files 302 | *.nuget.props 303 | *.nuget.targets 304 | 305 | # Microsoft Azure Build Output 306 | csx/ 307 | *.build.csdef 308 | 309 | # Microsoft Azure Emulator 310 | ecf/ 311 | rcf/ 312 | 313 | # Windows Store app package directories and files 314 | AppPackages/ 315 | BundleArtifacts/ 316 | Package.StoreAssociation.xml 317 | _pkginfo.txt 318 | *.appx 319 | *.appxbundle 320 | *.appxupload 321 | 322 | # Visual Studio cache files 323 | # files ending in .cache can be ignored 324 | *.[Cc]ache 325 | # but keep track of directories ending in .cache 326 | !?*.[Cc]ache/ 327 | 328 | # Others 329 | ClientBin/ 330 | ~$* 331 | *~ 332 | *.dbmdl 333 | *.dbproj.schemaview 334 | *.jfm 335 | *.pfx 336 | *.publishsettings 337 | orleans.codegen.cs 338 | 339 | # Including strong name files can present a security risk 340 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 341 | #*.snk 342 | 343 | # Since there are multiple workflows, uncomment next line to ignore bower_components 344 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 345 | #bower_components/ 346 | 347 | # RIA/Silverlight projects 348 | Generated_Code/ 349 | 350 | # Backup & report files from converting an old project file 351 | # to a newer Visual Studio version. Backup files are not needed, 352 | # because we have git ;-) 353 | _UpgradeReport_Files/ 354 | Backup*/ 355 | UpgradeLog*.XML 356 | UpgradeLog*.htm 357 | ServiceFabricBackup/ 358 | *.rptproj.bak 359 | 360 | # SQL Server files 361 | *.mdf 362 | *.ldf 363 | *.ndf 364 | 365 | # Business Intelligence projects 366 | *.rdl.data 367 | *.bim.layout 368 | *.bim_*.settings 369 | *.rptproj.rsuser 370 | *- [Bb]ackup.rdl 371 | *- [Bb]ackup ([0-9]).rdl 372 | *- [Bb]ackup ([0-9][0-9]).rdl 373 | 374 | # Microsoft Fakes 375 | FakesAssemblies/ 376 | 377 | # GhostDoc plugin setting file 378 | *.GhostDoc.xml 379 | 380 | # Node.js Tools for Visual Studio 381 | .ntvs_analysis.dat 382 | node_modules/ 383 | 384 | # Visual Studio 6 build log 385 | *.plg 386 | 387 | # Visual Studio 6 workspace options file 388 | *.opt 389 | 390 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
391 | *.vbw 392 | 393 | # Visual Studio LightSwitch build output 394 | **/*.HTMLClient/GeneratedArtifacts 395 | **/*.DesktopClient/GeneratedArtifacts 396 | **/*.DesktopClient/ModelManifest.xml 397 | **/*.Server/GeneratedArtifacts 398 | **/*.Server/ModelManifest.xml 399 | _Pvt_Extensions 400 | 401 | # Paket dependency manager 402 | .paket/paket.exe 403 | paket-files/ 404 | 405 | # FAKE - F# Make 406 | .fake/ 407 | 408 | # CodeRush personal settings 409 | .cr/personal 410 | 411 | # Python Tools for Visual Studio (PTVS) 412 | __pycache__/ 413 | *.pyc 414 | 415 | # Cake - Uncomment if you are using it 416 | # tools/** 417 | # !tools/packages.config 418 | 419 | # Tabs Studio 420 | *.tss 421 | 422 | # Telerik's JustMock configuration file 423 | *.jmconfig 424 | 425 | # BizTalk build output 426 | *.btp.cs 427 | *.btm.cs 428 | *.odx.cs 429 | *.xsd.cs 430 | 431 | # OpenCover UI analysis results 432 | OpenCover/ 433 | 434 | # Azure Stream Analytics local run output 435 | ASALocalRun/ 436 | 437 | # MSBuild Binary and Structured Log 438 | *.binlog 439 | 440 | # NVidia Nsight GPU debugger configuration file 441 | *.nvuser 442 | 443 | # MFractors (Xamarin productivity tool) working folder 444 | .mfractor/ 445 | 446 | # Local History for Visual Studio 447 | .localhistory/ 448 | 449 | # BeatPulse healthcheck temp database 450 | healthchecksdb 451 | 452 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 453 | MigrationBackup/ 454 | 455 | # Devcontainer 456 | .devcontainer/ 457 | 458 | .vscode/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019, Lace Lofranco 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # We've moved! 2 | 3 | This repository has moved under the official Azure Samples Github organization: 4 | 5 | **https://github.com/Azure-Samples/modern-data-warehouse-dataops** 6 | 7 | 8 | ------------------- 9 | 10 | # DataDevOps 11 | 12 | The purpose of this repository is to demonstrate how DevOps principles can be applied to a Data Pipeline Solution.
13 | 14 | [![DataDevOps video](https://img.youtube.com/vi/Xs1-OU5cmsw/0.jpg)](https://www.youtube.com/watch?v=Xs1-OU5cmsw) 15 | 16 | ## Architecture 17 | 18 | The following shows the overall architecture of the solution. 19 | 20 | ![Architecture](images/architecture.PNG?raw=true "Architecture") 21 | 22 | ### Design Considerations 23 | 24 | - **Data Transformation logic belongs in packages, not Notebooks** 25 | - All main data transformation code should be packaged up within a Python package/JAR/etc. These packages are then uploaded to DBFS and installed on a specifically configured cluster, along with all other third-party dependencies (e.g. the azure-cosmosdb-spark jar). Notebooks then simply import the package(s) and call any relevant functions. Effectively, Notebooks become a lightweight wrapper around the packages. This ensures separation of concerns and promotes code reuse, testability, and code quality. 26 | - **Data should be tested** 27 | - Two different tests should be performed (see the sketch after the Testing section below): 28 | - **Structure** (Is the data in the expected shape / schema?) 29 | - **Content** (Are there unexpected nulls? Are the summary statistics in expected ranges?) 30 | - **Data should have lineage** 31 | - Just as application deployments should have lineage in order to track which code commit produced which artifacts and deployments, each final loaded data record should be tagged with the appropriate ETL pipeline run id. Not only does this ensure traceability, it also helps with recovery from any potential failed / half-run data loads (see the sketch after the Testing section below). 32 | 33 | ## Build and Release Pipeline 34 | 35 | The following shows the overall CI/CD process end to end. 36 | 37 | ![CI/CD](images/CI_CD_process.PNG?raw=true "CI/CD") 38 | 39 | Both Build and Release Pipelines are built using [AzureDevOps](https://dev.azure.com/) (Public instance) and can be viewed using the following links: 40 | - [Build Pipelines](https://dev.azure.com/devlacepub/DataDevOps/_build) 41 | - [Release Pipeline](https://dev.azure.com/devlacepub/DataDevOps/_release) 42 | 43 | More information [here](docs/CI_CD.md). 44 | ### Environments 45 | 46 | - **Dev** - Development collaboration branch 47 | - **QA** - Environment where all integration tests are run (*not yet implemented*) 48 | - **Staging/UAT** - A mirror of the production job, along with state and data. Deploying to staging first gives the ability to "mock" a realistic release into production. 49 | - **Production** 50 | 51 | In addition to these environments, each developer may choose to have their own development environment(s) for their individual use. 52 | 53 | ## Testing 54 | 55 | - Unit Testing - Standard unit tests which test small pieces of functionality within your code. Data transformation code should have unit tests. 56 | 57 | - Integration Testing - This includes end-to-end testing of the ETL pipeline. 58 | 59 | - Data Testing 60 | 1. Structure - Test for correct schema, expected structure. 61 | 2. Content - Can be tested through quantitative summary statistics and qualitative data quality graphs within the notebook.
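A minimal sketch of the **Structure** and **Content** data tests described above is shown below, written as PySpark/pytest-style checks. The table name (`interim.parking_bay`), the expected schema, and the null-rate threshold are illustrative assumptions only; they are not the actual schemas or tests shipped in this repo (see `src/ddo_transform/tests` for those).

```python
# Minimal sketch only: schema, table name and threshold are assumptions for illustration.
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType, TimestampType

spark = SparkSession.builder.getOrCreate()

EXPECTED_SCHEMA = StructType([
    StructField("bay_id", StringType(), True),        # hypothetical column
    StructField("status", StringType(), True),        # hypothetical column
    StructField("loaded_on", TimestampType(), True),  # hypothetical column
])


def test_structure():
    """Structure: is the data in the expected shape / schema?"""
    df = spark.table("interim.parking_bay")  # hypothetical table name
    assert df.schema == EXPECTED_SCHEMA


def test_content():
    """Content: are there unexpected nulls? Are summary statistics in expected ranges?"""
    df = spark.table("interim.parking_bay")
    total = df.count()
    null_ids = df.filter(df["bay_id"].isNull()).count()
    assert total > 0, "table should not be empty"
    assert null_ids / total < 0.01, "unexpectedly high rate of null bay_ids"
```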
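In the same spirit, the lineage principle from the Design Considerations (tagging each loaded record with the ETL pipeline run id) and the "notebooks as lightweight wrappers around packages" principle can be sketched as follows. The helper name `tag_with_lineage` and the column names are hypothetical; the only detail taken from this repo is that ADF passes the pipeline run id to the notebooks as the `loadid` parameter.

```python
# Illustrative sketch only: function and column names are hypothetical.
from datetime import datetime, timezone

from pyspark.sql import DataFrame
from pyspark.sql.functions import lit


def tag_with_lineage(df: DataFrame, load_id: str) -> DataFrame:
    """Tag every record with the pipeline run id so each row is traceable to its load."""
    return (
        df.withColumn("load_id", lit(load_id))
          .withColumn("loaded_on", lit(datetime.now(timezone.utc).isoformat()))
    )


# In a Databricks notebook, the wrapper would simply read the ADF-supplied parameter
# and delegate to the packaged transformation code, for example:
#   load_id = dbutils.widgets.get("loadid")
#   df = standardize_parking_bay(raw_df)              # hypothetical package function
#   tag_with_lineage(df, load_id).write.saveAsTable("interim.parking_bay")
```

Keeping this logic inside the `ddo_transform` package rather than in the notebook itself is what makes it unit-testable in the CI pipeline.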
62 | 63 | ## Monitoring 64 | 65 | ### Databricks 66 | - [Monitoring Azure Databricks with Azure Monitor](https://docs.microsoft.com/en-us/azure/architecture/databricks-monitoring/) 67 | - [Monitoring Azure Databricks Jobs with Application Insights](https://msdn.microsoft.com/en-us/magazine/mt846727.aspx) 68 | 69 | ### Data Factory 70 | - [Monitor Azure Data Factory with Azure Monitor](https://docs.microsoft.com/en-us/azure/data-factory/monitor-using-azure-monitor) 71 | - [Alerting in Azure Data Factory](https://azure.microsoft.com/en-in/blog/create-alerts-to-proactively-monitor-your-data-factory-pipelines/) 72 | 73 | ## Deploy the solution 74 | 75 | ### Pre-requisites: 76 | 1. Github Account 77 | 2. Azure DevOps Account + Project 78 | 3. Azure Account 79 | 80 | ### Software pre-requisites: 81 | 1. For Windows users, [Windows Subsystem For Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10) 82 | 2. [az cli 2.x](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) 83 | 3. [Python 3+](https://www.python.org/) 84 | 4. [databricks-cli](https://docs.azuredatabricks.net/dev-tools/databricks-cli.html) 85 | 5. [jq](https://stedolan.github.io/jq/) 86 | 87 | NOTE: This deployment was tested using WSL (Ubuntu 16.04) and Debian GNU/Linux 9.9 (stretch) 88 | 89 | ### Deployment Instructions 90 | 91 | 1. Fork this repository. Forking is necessary if you want to set up git integration with Azure Data Factory. 92 | 2. **Deploy Azure resources.** 93 | 1. Clone the forked repository and `cd` into the root of the repo. 94 | 2. Run `./deploy.sh`. 95 | - This will deploy three Resource Groups (one per environment), each with the following Azure resources: 96 | - Data Factory (empty) - *next steps will deploy the actual data pipelines*. 97 | - Data Lake Store Gen2 and a Service Principal with Storage Contributor rights assigned. 98 | - Databricks workspace - notebooks uploaded, SparkSQL tables created, and ADLS Gen2 mounted using the SP. 99 | - KeyVault with all secrets stored. 100 | - This will create local `.env.{environment_name}` files (one per environment) containing essential configuration information. 101 | - All Azure resources are tagged with the correct Environment. 102 | - IMPORTANT: Because Databricks PAT tokens cannot currently be generated automatically, you will be prompted to generate and enter one per environment. See [here](https://docs.azuredatabricks.net/dev-tools/databricks-cli.html#set-up-authentication) for more information. 103 | - The solution is designed such that **all** starting environment deployment configuration should be specified in the ARM parameters files. This is to centralize configuration. 104 | 105 | 3. **Setup ADF git integration in DEV Data Factory** 106 | 1. In the Azure Portal, navigate to the Data Factory in the **DEV** environment. 107 | 2. Click "Author & Monitor" to launch the Data Factory portal. 108 | 3. On the landing page, select "Set up code repository". For more information, see [here](https://docs.microsoft.com/en-us/azure/data-factory/source-control). 109 | 4. Fill in the repository settings with the following: 110 | - Repository type: **Github** 111 | - Github Account: ***your_Github_account*** 112 | - Git repository name: **forked Github repository** 113 | - Collaboration branch: **master** 114 | - Root folder: **/adf** 115 | - Import Existing Data Factory resource to repository: **Unselected** 116 | 5. Navigate to the "Author" tab; you should see all the pipelines deployed. 117 | 6. Select `Connections` > `Ls_KeyVault`.
Update the Base Url to the KeyVault Url of your DEV environment. 118 | 7. Select `Connections` > `Ls_AdlsGen2_01`. Update the URL to the ADLS Gen2 Url of your DEV environment. 119 | 8. Click `Publish` to publish changes. 120 | 121 | 4. **Setup Build Pipelines.** You will be creating two build pipelines: one which triggers on every pull request and runs Unit Testing + Linting, and a second which triggers on every commit to master and creates the actual build artifacts for release. 122 | 1. In Azure DevOps, navigate to `Pipelines`. Select "Create Pipeline". 123 | 2. Under "Where is your code?", select Github (YAML). 124 | - If you have not already, you may be prompted to connect your Github account. See [here](https://docs.microsoft.com/en-us/azure/devops/pipelines/repos/github?view=azure-devops&tabs=yaml#grant-access-to-your-github-repositories) for more information. 125 | 3. Under "Select a repository", select your forked repo. 126 | 4. Under "Configure your pipeline", select "Existing Azure Pipelines YAML file". 127 | - Branch: master 128 | - Path: `/src/ddo_transform/azure-pipelines-ci-qa.yml` 129 | 5. Select `Run`. 130 | 6. Repeat steps 1-5, but select `/src/ddo_transform/azure-pipelines-ci-artifacts.yml` as the path. 131 | 132 | 5. **Setup Release Pipelines** 133 | 134 | **WORK IN PROGRESS** 135 | 1. In Azure DevOps, navigate to `Release`. Select "New pipeline". 136 | 2. Under "Select a template", select "Empty job". 137 | 3. Under "Stage", set Stage name to "Deploy to STG". 138 | 4. Under Agent job, fill in the information as shown: 139 | 140 | ![Release_1_AgentJob](images/Release_1_Agent_DeployToDatabricks.PNG?raw=true "CI/CD") 141 | 142 | 5. Add a step to the Agent job by selecting the "+" icon. 143 | 144 | 145 | 146 | ## Known Issues, Limitations and Workarounds 147 | - Currently, ADLS Gen2 cannot be managed via the az cli 2.0. 148 | - **Workaround**: Use the REST API to automate creation of the File System. 149 | - Databricks KeyVault-backed secrets scopes can only be created via the UI, and thus could not be created programmatically nor incorporated into the automated deployment of the solution. 150 | - **Workaround**: Use normal Databricks secrets, with the downside of duplicated information. 151 | - Databricks Personal Access Tokens can only be created via the UI. 152 | - **Workaround**: The user is asked to supply the tokens during deployment, which is unfortunately cumbersome. 153 | - The Data Factory Databricks Linked Service does not support dynamic configuration, so a manual step is needed to point it to the new cluster when deploying the pipeline to a new environment. 154 | - **Workaround**: An alternative is to create an on-demand cluster; however, this may introduce latency issues due to cluster spin-up time. Optionally, the user can manually update the Linked Service to point to the correct cluster. 155 | 156 | ## Data 157 | 158 | ### Physical layout 159 | 160 | ADLS Gen2 is structured as follows: 161 | ------------ 162 | 163 | datalake <- filesystem 164 | /libs <- contains all libs, jars, wheels needed for processing 165 | /data 166 | /lnd <- landing folder where all data files are ingested into.
167 | /interim <- interim (cleansed) tables 168 | /dw <- final tables 169 | 170 | 171 | ------------ 172 | 173 | 174 | All data was procured from: https://www.melbourne.vic.gov.au/about-council/governance-transparency/open-data/Pages/on-street-parking-data.aspx 175 | -------------------------------------------------------------------------------- /adf/_scripts/deploymentadf.ps1: -------------------------------------------------------------------------------- 1 | param 2 | ( 3 | [parameter(Mandatory = $false)] [String] $rootFolder, 4 | [parameter(Mandatory = $false)] [String] $armTemplate, 5 | [parameter(Mandatory = $false)] [String] $ResourceGroupName, 6 | [parameter(Mandatory = $false)] [String] $DataFactoryName, 7 | [parameter(Mandatory = $false)] [Bool] $predeployment=$true, 8 | [parameter(Mandatory = $false)] [Bool] $deleteDeployment=$false 9 | ) 10 | 11 | $templateJson = Get-Content $armTemplate | ConvertFrom-Json 12 | $resources = $templateJson.resources 13 | 14 | #Triggers 15 | Write-Host "Getting triggers" 16 | $triggersADF = Get-AzDataFactoryV2Trigger -DataFactoryName $DataFactoryName -ResourceGroupName $ResourceGroupName 17 | $triggersTemplate = $resources | Where-Object { $_.type -eq "Microsoft.DataFactory/factories/triggers" } 18 | $triggerNames = $triggersTemplate | ForEach-Object {$_.name.Substring(37, $_.name.Length-40)} 19 | $activeTriggerNames = $triggersTemplate | Where-Object { $_.properties.runtimeState -eq "Started" -and ($_.properties.pipelines.Count -gt 0 -or $_.properties.pipeline.pipelineReference -ne $null)} | ForEach-Object {$_.name.Substring(37, $_.name.Length-40)} 20 | $deletedtriggers = $triggersADF | Where-Object { $triggerNames -notcontains $_.Name } 21 | $triggerstostop = $triggerNames | where { ($triggersADF | Select-Object name).name -contains $_ } 22 | 23 | if ($predeployment -eq $true) { 24 | #Stop all triggers 25 | Write-Host "Stopping deployed triggers" 26 | $triggerstostop | ForEach-Object { 27 | Write-host "Disabling trigger " $_ 28 | Stop-AzDataFactoryV2Trigger -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Name $_ -Force 29 | } 30 | } 31 | else { 32 | #Deleted resources 33 | #pipelines 34 | Write-Host "Getting pipelines" 35 | $pipelinesADF = Get-AzDataFactoryV2Pipeline -DataFactoryName $DataFactoryName -ResourceGroupName $ResourceGroupName 36 | $pipelinesTemplate = $resources | Where-Object { $_.type -eq "Microsoft.DataFactory/factories/pipelines" } 37 | $pipelinesNames = $pipelinesTemplate | ForEach-Object {$_.name.Substring(37, $_.name.Length-40)} 38 | $deletedpipelines = $pipelinesADF | Where-Object { $pipelinesNames -notcontains $_.Name } 39 | #datasets 40 | Write-Host "Getting datasets" 41 | $datasetsADF = Get-AzDataFactoryV2Dataset -DataFactoryName $DataFactoryName -ResourceGroupName $ResourceGroupName 42 | $datasetsTemplate = $resources | Where-Object { $_.type -eq "Microsoft.DataFactory/factories/datasets" } 43 | $datasetsNames = $datasetsTemplate | ForEach-Object {$_.name.Substring(37, $_.name.Length-40) } 44 | $deleteddataset = $datasetsADF | Where-Object { $datasetsNames -notcontains $_.Name } 45 | #linkedservices 46 | Write-Host "Getting linked services" 47 | $linkedservicesADF = Get-AzDataFactoryV2LinkedService -DataFactoryName $DataFactoryName -ResourceGroupName $ResourceGroupName 48 | $linkedservicesTemplate = $resources | Where-Object { $_.type -eq "Microsoft.DataFactory/factories/linkedservices" } 49 | $linkedservicesNames = $linkedservicesTemplate | ForEach-Object {$_.name.Substring(37, $_.name.Length-40)}
50 | $deletedlinkedservices = $linkedservicesADF | Where-Object { $linkedservicesNames -notcontains $_.Name } 51 | #Integrationruntimes 52 | Write-Host "Getting integration runtimes" 53 | $integrationruntimesADF = Get-AzDataFactoryV2IntegrationRuntime -DataFactoryName $DataFactoryName -ResourceGroupName $ResourceGroupName 54 | $integrationruntimesTemplate = $resources | Where-Object { $_.type -eq "Microsoft.DataFactory/factories/integrationruntimes" } 55 | $integrationruntimesNames = $integrationruntimesTemplate | ForEach-Object {$_.name.Substring(37, $_.name.Length-40)} 56 | $deletedintegrationruntimes = $integrationruntimesADF | Where-Object { $integrationruntimesNames -notcontains $_.Name } 57 | 58 | #Delete resources 59 | Write-Host "Deleting triggers" 60 | $deletedtriggers | ForEach-Object { 61 | Write-Host "Deleting trigger " $_.Name 62 | $trig = Get-AzDataFactoryV2Trigger -name $_.Name -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName 63 | if ($trig.RuntimeState -eq "Started") { 64 | Stop-AzDataFactoryV2Trigger -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Name $_.Name -Force 65 | } 66 | Remove-AzDataFactoryV2Trigger -Name $_.Name -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Force 67 | } 68 | Write-Host "Deleting pipelines" 69 | $deletedpipelines | ForEach-Object { 70 | Write-Host "Deleting pipeline " $_.Name 71 | Remove-AzDataFactoryV2Pipeline -Name $_.Name -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Force 72 | } 73 | Write-Host "Deleting datasets" 74 | $deleteddataset | ForEach-Object { 75 | Write-Host "Deleting dataset " $_.Name 76 | Remove-AzDataFactoryV2Dataset -Name $_.Name -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Force 77 | } 78 | Write-Host "Deleting linked services" 79 | $deletedlinkedservices | ForEach-Object { 80 | Write-Host "Deleting Linked Service " $_.Name 81 | Remove-AzDataFactoryV2LinkedService -Name $_.Name -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Force 82 | } 83 | Write-Host "Deleting integration runtimes" 84 | $deletedintegrationruntimes | ForEach-Object { 85 | Write-Host "Deleting integration runtime " $_.Name 86 | Remove-AzDataFactoryV2IntegrationRuntime -Name $_.Name -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Force 87 | } 88 | 89 | if ($deleteDeployment -eq $true) { 90 | Write-Host "Deleting ARM deployment ... 
under resource group: " $ResourceGroupName 91 | $deployments = Get-AzResourceGroupDeployment -ResourceGroupName $ResourceGroupName 92 | $deploymentsToConsider = $deployments | Where { $_.DeploymentName -like "ArmTemplate_master*" -or $_.DeploymentName -like "ArmTemplateForFactory*" } | Sort-Object -Property Timestamp -Descending 93 | $deploymentName = $deploymentsToConsider[0].DeploymentName 94 | 95 | Write-Host "Deployment to be deleted: " $deploymentName 96 | $deploymentOperations = Get-AzResourceGroupDeploymentOperation -DeploymentName $deploymentName -ResourceGroupName $ResourceGroupName 97 | $deploymentsToDelete = $deploymentOperations | Where { $_.properties.targetResource.id -like "*Microsoft.Resources/deployments*" } 98 | 99 | $deploymentsToDelete | ForEach-Object { 100 | Write-host "Deleting inner deployment: " $_.properties.targetResource.id 101 | Remove-AzResourceGroupDeployment -Id $_.properties.targetResource.id 102 | } 103 | Write-Host "Deleting deployment: " $deploymentName 104 | Remove-AzResourceGroupDeployment -ResourceGroupName $ResourceGroupName -Name $deploymentName 105 | } 106 | 107 | #Start Active triggers - After cleanup efforts 108 | Write-Host "Starting active triggers" 109 | $activeTriggerNames | ForEach-Object { 110 | Write-host "Enabling trigger " $_ 111 | Start-AzDataFactoryV2Trigger -ResourceGroupName $ResourceGroupName -DataFactoryName $DataFactoryName -Name $_ -Force 112 | } 113 | } -------------------------------------------------------------------------------- /adf/arm-template-parameters-definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "Microsoft.DataFactory/factories/pipelines": { 3 | "properties": { 4 | "activities": [{ 5 | "typeProperties": { 6 | "notebookPath": "=", 7 | "libraries": [{ 8 | "egg": "=" 9 | }] 10 | } 11 | }] 12 | } 13 | }, 14 | "Microsoft.DataFactory/factories/integrationRuntimes":{ 15 | "properties": { 16 | "typeProperties": { 17 | "ssisProperties": { 18 | "catalogInfo": { 19 | "catalogServerEndpoint": "=", 20 | "catalogAdminUserName": "=", 21 | "catalogAdminPassword": { 22 | "value": "-::secureString" 23 | } 24 | }, 25 | "customSetupScriptProperties": { 26 | "sasToken": { 27 | "value": "-::secureString" 28 | } 29 | } 30 | }, 31 | "linkedInfo": { 32 | "key": { 33 | "value": "-::secureString" 34 | }, 35 | "resourceId": "=" 36 | } 37 | } 38 | } 39 | }, 40 | "Microsoft.DataFactory/factories/triggers": { 41 | "properties": { 42 | "pipelines": [{ 43 | "parameters": { 44 | "*": "=" 45 | } 46 | }, 47 | "pipelineReference.referenceName" 48 | ], 49 | "pipeline": { 50 | "parameters": { 51 | "*": "=" 52 | } 53 | }, 54 | "typeProperties": { 55 | "scope": "=" 56 | } 57 | 58 | } 59 | }, 60 | "Microsoft.DataFactory/factories/linkedServices": { 61 | "*": { 62 | "properties": { 63 | "typeProperties": { 64 | "accountName": "=", 65 | "username": "=", 66 | "userName": "=", 67 | "accessKeyId": "=", 68 | "servicePrincipalId": "=", 69 | "userId": "=", 70 | "clientId": "=", 71 | "clusterUserName": "=", 72 | "clusterSshUserName": "=", 73 | "hostSubscriptionId": "=", 74 | "clusterResourceGroup": "=", 75 | "subscriptionId": "=", 76 | "resourceGroupName": "=", 77 | "tenant": "=", 78 | "dataLakeStoreUri": "=", 79 | "baseUrl": "=", 80 | "database": "=", 81 | "serviceEndpoint": "=", 82 | "batchUri": "=", 83 | "databaseName": "=", 84 | "systemNumber": "=", 85 | "server": "=", 86 | "url":"=", 87 | "aadResourceId": "=", 88 | "connectionString": "|:-connectionString:secureString" 89 | } 90 | } 91 | }, 92 | 
"Odbc": { 93 | "properties": { 94 | "typeProperties": { 95 | "userName": "=", 96 | "connectionString": { 97 | "secretName": "=" 98 | } 99 | } 100 | } 101 | } 102 | }, 103 | "Microsoft.DataFactory/factories/datasets": { 104 | "*": { 105 | "properties": { 106 | "typeProperties": { 107 | "folderPath": "=", 108 | "fileName": "=" 109 | } 110 | } 111 | }} 112 | } -------------------------------------------------------------------------------- /adf/dataset/Ds_AdlsGen2_MelbParkingData.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Ds_AdlsGen2_MelbParkingData", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "Ls_AdlsGen2_01", 6 | "type": "LinkedServiceReference" 7 | }, 8 | "parameters": { 9 | "infilefolder": { 10 | "type": "String" 11 | }, 12 | "infilename": { 13 | "type": "String" 14 | }, 15 | "container": { 16 | "type": "String", 17 | "defaultValue": "datalake/data/lnd" 18 | } 19 | }, 20 | "type": "AzureBlobFSFile", 21 | "typeProperties": { 22 | "format": { 23 | "type": "JsonFormat", 24 | "filePattern": "arrayOfObjects" 25 | }, 26 | "fileName": { 27 | "value": "@dataset().infilename", 28 | "type": "Expression" 29 | }, 30 | "folderPath": { 31 | "value": "@concat(dataset().container, '/', dataset().infilefolder)", 32 | "type": "Expression" 33 | } 34 | } 35 | } 36 | } -------------------------------------------------------------------------------- /adf/dataset/Ds_REST_MelbParkingData.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Ds_REST_MelbParkingData", 3 | "properties": { 4 | "linkedServiceName": { 5 | "referenceName": "Ls_Rest_MelParkSensors_01", 6 | "type": "LinkedServiceReference" 7 | }, 8 | "parameters": { 9 | "relativeurl": { 10 | "type": "String" 11 | } 12 | }, 13 | "annotations": [], 14 | "type": "RestResource", 15 | "typeProperties": { 16 | "relativeUrl": { 17 | "value": "@dataset().relativeurl", 18 | "type": "Expression" 19 | }, 20 | "requestMethod": "GET" 21 | } 22 | } 23 | } -------------------------------------------------------------------------------- /adf/linkedService/Ls_AdlsGen2_01.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Ls_AdlsGen2_01", 3 | "properties": { 4 | "annotations": [], 5 | "type": "AzureBlobFS", 6 | "typeProperties": { 7 | "url": "https://ddostordevvnhf6tvx.dfs.core.windows.net/", 8 | "accountKey": { 9 | "type": "AzureKeyVaultSecret", 10 | "store": { 11 | "referenceName": "Ls_KeyVault", 12 | "type": "LinkedServiceReference" 13 | }, 14 | "secretName": "storageKey" 15 | } 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /adf/linkedService/Ls_AzureSQLDW_01.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Ls_AzureSQLDW_01", 3 | "properties": { 4 | "annotations": [], 5 | "type": "AzureSqlDW", 6 | "typeProperties": { 7 | "connectionString": { 8 | "type": "AzureKeyVaultSecret", 9 | "store": { 10 | "referenceName": "Ls_KeyVault", 11 | "type": "LinkedServiceReference" 12 | }, 13 | "secretName": "sqldwConnectionString" 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /adf/linkedService/Ls_KeyVault.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Ls_KeyVault", 3 | "properties": { 4 | "annotations": [], 5 | "type": "AzureKeyVault", 6 | "typeProperties": { 7 
| "baseUrl": "https://ddokvdevvnhf6tvx.vault.azure.net/" 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /adf/linkedService/Ls_Rest_MelParkSensors_01.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Ls_Rest_MelParkSensors_01", 3 | "type": "Microsoft.DataFactory/factories/linkedservices", 4 | "properties": { 5 | "annotations": [], 6 | "type": "RestService", 7 | "typeProperties": { 8 | "url": "https://data.melbourne.vic.gov.au/resource/", 9 | "enableServerCertificateValidation": true, 10 | "authenticationType": "Anonymous" 11 | } 12 | } 13 | } -------------------------------------------------------------------------------- /adf/linkedService/Ls_adb_01.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Ls_adb_01", 3 | "properties": { 4 | "type": "AzureDatabricks", 5 | "typeProperties": { 6 | "domain": "https://australiaeast.azuredatabricks.net", 7 | "accessToken": { 8 | "type": "AzureKeyVaultSecret", 9 | "store": { 10 | "referenceName": "Ls_KeyVault", 11 | "type": "LinkedServiceReference" 12 | }, 13 | "secretName": "dbricksToken" 14 | }, 15 | "newClusterNodeType": "Standard_DS3_v2", 16 | "newClusterNumOfWorker": "1", 17 | "newClusterSparkEnvVars": { 18 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 19 | }, 20 | "newClusterVersion": "5.4.x-scala2.11" 21 | } 22 | }, 23 | "type": "Microsoft.DataFactory/factories/linkedservices" 24 | } -------------------------------------------------------------------------------- /adf/pipeline/P_Ingest_MelbParkingData.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "P_Ingest_MelbParkingData", 3 | "properties": { 4 | "description": "Hello from NDC Sydney!!!", 5 | "activities": [ 6 | { 7 | "name": "StandardizeData", 8 | "description": "", 9 | "type": "DatabricksNotebook", 10 | "dependsOn": [ 11 | { 12 | "activity": "DownloadSensorData", 13 | "dependencyConditions": [ 14 | "Succeeded" 15 | ] 16 | }, 17 | { 18 | "activity": "DownloadBayData", 19 | "dependencyConditions": [ 20 | "Succeeded" 21 | ] 22 | } 23 | ], 24 | "policy": { 25 | "timeout": "7.00:00:00", 26 | "retry": 0, 27 | "retryIntervalInSeconds": 30, 28 | "secureOutput": false, 29 | "secureInput": false 30 | }, 31 | "userProperties": [], 32 | "typeProperties": { 33 | "notebookPath": "/notebooks/02_standardize", 34 | "baseParameters": { 35 | "infilefolder": { 36 | "value": "@variables('infilefolder')", 37 | "type": "Expression" 38 | }, 39 | "loadid": { 40 | "value": "@pipeline().RunId", 41 | "type": "Expression" 42 | } 43 | }, 44 | "libraries": [ 45 | { 46 | "egg": "dbfs:/mnt/datalake/libs/ddo_transform-1.0.0-py2.py3-none-any.whl" 47 | }, 48 | { 49 | "pypi": { 50 | "package": "applicationinsights" 51 | } 52 | } 53 | ] 54 | }, 55 | "linkedServiceName": { 56 | "referenceName": "Ls_adb_01", 57 | "type": "LinkedServiceReference" 58 | } 59 | }, 60 | { 61 | "name": "Set infilefolder", 62 | "type": "SetVariable", 63 | "dependsOn": [], 64 | "userProperties": [], 65 | "typeProperties": { 66 | "variableName": "infilefolder", 67 | "value": { 68 | "value": "@utcnow('yyyy_MM_dd_hh_mm_ss')", 69 | "type": "Expression" 70 | } 71 | } 72 | }, 73 | { 74 | "name": "DownloadSensorData", 75 | "type": "Copy", 76 | "dependsOn": [ 77 | { 78 | "activity": "Set infilefolder", 79 | "dependencyConditions": [ 80 | "Succeeded" 81 | ] 82 | } 83 | ], 84 | "policy": { 85 | "timeout": "7.00:00:00", 86 | "retry": 0, 87 
| "retryIntervalInSeconds": 30, 88 | "secureOutput": false, 89 | "secureInput": false 90 | }, 91 | "userProperties": [], 92 | "typeProperties": { 93 | "source": { 94 | "type": "RestSource", 95 | "httpRequestTimeout": "00:01:40", 96 | "requestInterval": "00.00:00:00.010" 97 | }, 98 | "sink": { 99 | "type": "AzureBlobFSSink" 100 | }, 101 | "enableStaging": false 102 | }, 103 | "inputs": [ 104 | { 105 | "referenceName": "Ds_REST_MelbParkingData", 106 | "type": "DatasetReference", 107 | "parameters": { 108 | "relativeurl": "dtpv-d4pf.json" 109 | } 110 | } 111 | ], 112 | "outputs": [ 113 | { 114 | "referenceName": "Ds_AdlsGen2_MelbParkingData", 115 | "type": "DatasetReference", 116 | "parameters": { 117 | "infilefolder": { 118 | "value": "@variables('infilefolder')", 119 | "type": "Expression" 120 | }, 121 | "infilename": "MelbParkingSensorData.json", 122 | "container": "datalake/data/lnd" 123 | } 124 | } 125 | ] 126 | }, 127 | { 128 | "name": "DownloadBayData", 129 | "type": "Copy", 130 | "dependsOn": [ 131 | { 132 | "activity": "Set infilefolder", 133 | "dependencyConditions": [ 134 | "Succeeded" 135 | ] 136 | } 137 | ], 138 | "policy": { 139 | "timeout": "7.00:00:00", 140 | "retry": 0, 141 | "retryIntervalInSeconds": 30, 142 | "secureOutput": false, 143 | "secureInput": false 144 | }, 145 | "userProperties": [], 146 | "typeProperties": { 147 | "source": { 148 | "type": "RestSource", 149 | "httpRequestTimeout": "00:01:40", 150 | "requestInterval": "00.00:00:00.010" 151 | }, 152 | "sink": { 153 | "type": "AzureBlobFSSink" 154 | }, 155 | "enableStaging": false 156 | }, 157 | "inputs": [ 158 | { 159 | "referenceName": "Ds_REST_MelbParkingData", 160 | "type": "DatasetReference", 161 | "parameters": { 162 | "relativeurl": "wuf8-susg.json" 163 | } 164 | } 165 | ], 166 | "outputs": [ 167 | { 168 | "referenceName": "Ds_AdlsGen2_MelbParkingData", 169 | "type": "DatasetReference", 170 | "parameters": { 171 | "infilefolder": { 172 | "value": "@variables('infilefolder')", 173 | "type": "Expression" 174 | }, 175 | "infilename": "MelbParkingBayData.json", 176 | "container": "datalake/data/lnd" 177 | } 178 | } 179 | ] 180 | }, 181 | { 182 | "name": "TransformData", 183 | "type": "DatabricksNotebook", 184 | "dependsOn": [ 185 | { 186 | "activity": "StandardizeData", 187 | "dependencyConditions": [ 188 | "Succeeded" 189 | ] 190 | } 191 | ], 192 | "policy": { 193 | "timeout": "7.00:00:00", 194 | "retry": 0, 195 | "retryIntervalInSeconds": 30, 196 | "secureOutput": false, 197 | "secureInput": false 198 | }, 199 | "userProperties": [], 200 | "typeProperties": { 201 | "notebookPath": "/notebooks/03_transform", 202 | "baseParameters": { 203 | "loadid": { 204 | "value": "@pipeline().RunId", 205 | "type": "Expression" 206 | } 207 | }, 208 | "libraries": [ 209 | { 210 | "egg": "dbfs:/mnt/datalake/libs/ddo_transform-1.0.0-py2.py3-none-any.whl" 211 | }, 212 | { 213 | "pypi": { 214 | "package": "applicationinsights" 215 | } 216 | } 217 | ] 218 | }, 219 | "linkedServiceName": { 220 | "referenceName": "Ls_adb_01", 221 | "type": "LinkedServiceReference" 222 | } 223 | }, 224 | { 225 | "name": "Load SQLDW", 226 | "type": "SqlServerStoredProcedure", 227 | "dependsOn": [ 228 | { 229 | "activity": "TransformData", 230 | "dependencyConditions": [ 231 | "Succeeded" 232 | ] 233 | } 234 | ], 235 | "policy": { 236 | "timeout": "7.00:00:00", 237 | "retry": 0, 238 | "retryIntervalInSeconds": 30, 239 | "secureOutput": false, 240 | "secureInput": false 241 | }, 242 | "userProperties": [], 243 | "typeProperties": { 244 | 
"storedProcedureName": "[dbo].[load_dw]", 245 | "storedProcedureParameters": { 246 | "load_id": { 247 | "value": { 248 | "value": "@pipeline().RunId", 249 | "type": "Expression" 250 | }, 251 | "type": "String" 252 | } 253 | } 254 | }, 255 | "linkedServiceName": { 256 | "referenceName": "Ls_AzureSQLDW_01", 257 | "type": "LinkedServiceReference" 258 | } 259 | } 260 | ], 261 | "variables": { 262 | "infilefolder": { 263 | "type": "String", 264 | "defaultValue": "lnd" 265 | } 266 | }, 267 | "annotations": [] 268 | } 269 | } -------------------------------------------------------------------------------- /adf/trigger/T_Sched.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "T_Sched", 3 | "properties": { 4 | "annotations": [], 5 | "runtimeState": "Stopped", 6 | "pipelines": [ 7 | { 8 | "pipelineReference": { 9 | "referenceName": "P_Ingest_MelbParkingData", 10 | "type": "PipelineReference" 11 | } 12 | } 13 | ], 14 | "type": "ScheduleTrigger", 15 | "typeProperties": { 16 | "recurrence": { 17 | "frequency": "Hour", 18 | "interval": 1, 19 | "startTime": "2019-02-01T05:28:00.000Z", 20 | "timeZone": "UTC" 21 | } 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /clean_up.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, 8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions 11 | # of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 
18 | 19 | set -o errexit 20 | set -o pipefail 21 | set -o nounset 22 | # set -o xtrace # For debugging 23 | 24 | ################### 25 | # PARAMETERS 26 | 27 | env_name="${1-}" 28 | 29 | # Import correct .env file 30 | set -o allexport 31 | env_file=".env.$env_name" 32 | if [[ -e $env_file ]] 33 | then 34 | source $env_file 35 | fi 36 | set +o allexport 37 | 38 | az group delete -g $RESOURCE_GROUP -y --no-wait 39 | az ad sp delete --id $SP_STOR_ID -------------------------------------------------------------------------------- /data/raw_data/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/data/raw_data/README.md -------------------------------------------------------------------------------- /databricks/config/cluster.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cluster_name": "ddo_cluster", 3 | "autoscale": { "min_workers": 1, "max_workers": 2 }, 4 | "spark_version": "5.5.x-scala2.11", 5 | "autotermination_minutes": 30, 6 | "node_type_id": "Standard_DS3_v2", 7 | "driver_node_type_id": "Standard_DS3_v2", 8 | "spark_env_vars": { 9 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3", 10 | "MOUNT_DATA_PATH": "/mnt/datalake", 11 | "MOUNT_DATA_CONTAINER": "datalake", 12 | "DATABASE": "datalake" 13 | } 14 | } -------------------------------------------------------------------------------- /databricks/config/cluster.config.template.json: -------------------------------------------------------------------------------- 1 | { 2 | "cluster_name": "__REPLACE_CLUSTER_NAME__", 3 | "autoscale": { "min_workers": 1, "max_workers": 2 }, 4 | "spark_version": "5.5.x-scala2.11", 5 | "autotermination_minutes": 120, 6 | "node_type_id": "Standard_DS3_v2", 7 | "driver_node_type_id": "Standard_DS3_v2", 8 | "spark_env_vars": { 9 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3", 10 | "MOUNT_DATA_PATH": "__REPLACE_MOUNT_DATA_PATH__", 11 | "MOUNT_DATA_CONTAINER": "__REPLACE_MOUNT_DATA_CONTAINER__", 12 | "DATABASE": "__REPLACE_DATABASE__" 13 | } 14 | } -------------------------------------------------------------------------------- /databricks/config/run.setup.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "Setup workspace", 3 | "new_cluster": { 4 | "spark_version": "5.5.x-scala2.11", 5 | "node_type_id": "Standard_DS3_v2", 6 | "num_workers": 1, 7 | "spark_env_vars": { 8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3", 9 | "MOUNT_DATA_PATH": "/mnt/datalake", 10 | "MOUNT_DATA_CONTAINER": "datalake", 11 | "DATABASE": "datalake" 12 | } 13 | }, 14 | "libraries": [], 15 | "timeout_seconds": 3600, 16 | "notebook_task": { 17 | "notebook_path": "/notebooks/00_setup" 18 | } 19 | } -------------------------------------------------------------------------------- /databricks/configure_databricks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, 8 | # and to 
permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions 11 | # of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 18 | 19 | 20 | set -o errexit 21 | set -o pipefail 22 | set -o nounset 23 | # set -o xtrace # For debugging 24 | 25 | # Set path 26 | parent_dir=$(pwd -P) 27 | dir_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ); cd "$dir_path" 28 | 29 | # Constants 30 | RED='\033[0;31m' 31 | ORANGE='\033[0;33m' 32 | NC='\033[0m' 33 | 34 | CLUSTER_CONFIG="./config/cluster.config.json" 35 | MOUNT_DATA_PATH="/mnt/datalake" 36 | 37 | ################### 38 | # USER PARAMETERS 39 | env_name="${1-}" 40 | 41 | # Import correct .env file 42 | set -o allexport 43 | env_file="../.env.$env_name" 44 | if [[ -e $env_file ]] 45 | then 46 | source $env_file 47 | fi 48 | set +o allexport 49 | 50 | 51 | wait_for_run () { 52 | # See here: https://docs.azuredatabricks.net/api/latest/jobs.html#jobsrunresultstate 53 | declare mount_run_id=$1 54 | while : ; do 55 | life_cycle_status=$(databricks runs get --run-id $mount_run_id | jq -r ".state.life_cycle_state") 56 | result_state=$(databricks runs get --run-id $mount_run_id | jq -r ".state.result_state") 57 | if [[ $result_state == "SUCCESS" || $result_state == "SKIPPED" ]]; then 58 | break; 59 | elif [[ $life_cycle_status == "INTERNAL_ERROR" || $result_state == "FAILED" ]]; then 60 | state_message=$(databricks runs get --run-id $mount_run_id | jq -r ".state.state_message") 61 | echo -e "${RED}Error while running ${mount_run_id}: ${state_message} ${NC}" 62 | exit 1 63 | else 64 | echo "Waiting for run ${mount_run_id} to finish..." 65 | sleep 2m 66 | fi 67 | done 68 | } 69 | 70 | 71 | cluster_exists () { 72 | declare cluster_name="$1" 73 | declare cluster=$(databricks clusters list | tr -s " " | cut -d" " -f2 | grep ^${cluster_name}$) 74 | if [[ -n $cluster ]]; then 75 | return 0; # cluster exists 76 | else 77 | return 1; # cluster does not exists 78 | fi 79 | } 80 | 81 | 82 | _main() { 83 | # Upload notebooks 84 | echo "Uploading notebooks..." 85 | databricks workspace import_dir "notebooks" "/notebooks" --overwrite 86 | 87 | # Setup workspace 88 | echo "Setting up workspace and tables. This may take a while as cluster spins up..." 89 | wait_for_run $(databricks runs submit --json-file "./config/run.setup.config.json" | jq -r ".run_id" ) 90 | 91 | # Create initial cluster, if not yet exists 92 | echo "Creating an interactive cluster..." 93 | cluster_name=$(cat $CLUSTER_CONFIG | jq -r ".cluster_name") 94 | if cluster_exists $cluster_name; then 95 | echo "Cluster ${cluster_name} already exists!" 96 | else 97 | echo "Creating cluster ${cluster_name}..." 98 | databricks clusters create --json-file $CLUSTER_CONFIG 99 | fi 100 | 101 | # Upload dependencies 102 | echo "Uploading libraries dependencies..." 
103 | databricks fs cp ./libs/ "dbfs:${MOUNT_DATA_PATH}/libs/" --recursive --overwrite 104 | 105 | # Install Library dependencies 106 | echo "Installing library depedencies..." 107 | cluster_id=$(databricks clusters list | awk '/'$cluster_name'/ {print $1}') 108 | databricks libraries install \ 109 | --jar "dbfs:${MOUNT_DATA_PATH}/libs/azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar" \ 110 | --cluster-id $cluster_id 111 | 112 | } 113 | 114 | _main 115 | 116 | 117 | echo "Return to parent script dir: $parent_dir" 118 | cd "$parent_dir" -------------------------------------------------------------------------------- /databricks/create_secrets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, 8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions 11 | # of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 18 | 19 | 20 | set -o errexit 21 | set -o pipefail 22 | set -o nounset 23 | # set -o xtrace # For debugging 24 | 25 | # Set path 26 | parent_dir=$(pwd -P) 27 | dir_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ); cd "$dir_path" 28 | 29 | # Set constants 30 | scope_name="storage_scope" # fixed # TODO pass via arm template 31 | 32 | ################### 33 | # Requires the following to be set: 34 | # 35 | # BLOB_STORAGE_ACCOUNT= 36 | # SP_STOR_ID= 37 | # SP_STOR_PASS= 38 | # SP_STOR_TENANT= 39 | 40 | 41 | ################### 42 | # USER PARAMETERS 43 | env_name="${1-}" 44 | 45 | # Import correct .env file 46 | set -o allexport 47 | env_file="../.env.$env_name" 48 | if [[ -e $env_file ]] 49 | then 50 | source $env_file 51 | fi 52 | set +o allexport 53 | 54 | # Create scope, if not exists 55 | if [[ -z $(databricks secrets list-scopes | grep "$scope_name") ]]; then 56 | echo "Creating secrets scope: $scope_name" 57 | databricks secrets create-scope --scope "$scope_name" 58 | fi 59 | 60 | # Create secrets 61 | echo "Creating secrets within scope $scope_name..." 
62 | 63 | databricks secrets write --scope "$scope_name" --key "storage_account" --string-value "$BLOB_STORAGE_ACCOUNT" 64 | databricks secrets write --scope "$scope_name" --key "storage_sp_id" --string-value "$SP_STOR_ID" 65 | databricks secrets write --scope "$scope_name" --key "storage_sp_key" --string-value "$SP_STOR_PASS" 66 | databricks secrets write --scope "$scope_name" --key "storage_sp_tenant" --string-value "$SP_STOR_TENANT" 67 | 68 | 69 | echo "Return to parent script dir: $parent_dir" 70 | cd "$parent_dir" -------------------------------------------------------------------------------- /databricks/deploy_app.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/bin/bash 3 | 4 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 7 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, 9 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions 12 | # of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 15 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 17 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 18 | # DEALINGS IN THE SOFTWARE. 19 | 20 | 21 | set -o errexit 22 | set -o pipefail 23 | set -o nounset 24 | set -o xtrace # For debugging 25 | 26 | # Set path 27 | dir_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) 28 | cd "$dir_path" 29 | 30 | 31 | ################### 32 | # Requires the following to be set: 33 | # 34 | # RELEASE_ID= 35 | # MOUNT_DATA_PATH= 36 | # WHEEL_FILE_PATH= 37 | 38 | # Set DBFS libs path 39 | dbfs_libs_path=dbfs:${MOUNT_DATA_PATH}/libs/release_${RELEASE_ID} 40 | 41 | # Upload dependencies 42 | echo "Uploading libraries dependencies to DBFS..." 43 | databricks fs cp ./libs/ "${dbfs_libs_path}" --recursive 44 | 45 | echo "Uploading app libraries to DBFS..." 46 | databricks fs cp $WHEEL_FILE_PATH "${dbfs_libs_path}" 47 | 48 | # Upload notebooks to workspace 49 | echo "Uploading notebooks to workspace..." 
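# NOTE: import_dir recursively imports the local "notebooks" folder into a
# release-specific workspace path, so each release lands side by side under /releases/.
# Unlike configure_databricks.sh, no --overwrite flag is passed here. As an optional
# check (assuming RELEASE_ID is set, as required above), the imported notebooks can be
# listed with:
#   databricks workspace ls "/releases/release_${RELEASE_ID}"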
50 | databricks workspace import_dir "notebooks" "/releases/release_${RELEASE_ID}/" 51 | -------------------------------------------------------------------------------- /databricks/libs/azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/databricks/libs/azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar -------------------------------------------------------------------------------- /databricks/notebooks/00_setup.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Mount ADLS Gen2 4 | 5 | # COMMAND ---------- 6 | 7 | import os 8 | 9 | # Set mount path 10 | storage_mount_data_path = os.environ['MOUNT_DATA_PATH'] 11 | storage_mount_container = os.environ['MOUNT_DATA_CONTAINER'] 12 | 13 | # Unmount if existing 14 | for mp in dbutils.fs.mounts(): 15 | if mp.mountPoint == storage_mount_data_path: 16 | dbutils.fs.unmount(storage_mount_data_path) 17 | 18 | # Refresh mounts 19 | dbutils.fs.refreshMounts() 20 | 21 | # COMMAND ---------- 22 | 23 | # Retrieve storage credentials 24 | storage_account = dbutils.secrets.get(scope = "storage_scope", key = "storage_account") 25 | storage_sp_id = dbutils.secrets.get(scope = "storage_scope", key = "storage_sp_id") 26 | storage_sp_key = dbutils.secrets.get(scope = "storage_scope", key = "storage_sp_key") 27 | storage_sp_tenant = dbutils.secrets.get(scope = "storage_scope", key = "storage_sp_tenant") 28 | 29 | # Mount 30 | configs = {"fs.azure.account.auth.type": "OAuth", 31 | "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider", 32 | "fs.azure.account.oauth2.client.id": storage_sp_id, 33 | "fs.azure.account.oauth2.client.secret": storage_sp_key, 34 | "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/" + storage_sp_tenant + "/oauth2/token"} 35 | 36 | # Optionally, you can add to the source URI of your mount point. 
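# (That is, the ABFSS source URI built below may optionally end with a directory path:
#  "abfss://<container>@<account>.dfs.core.windows.net/<directory>" -- here the container
#  root is mounted.)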
37 | dbutils.fs.mount( 38 | source = "abfss://" + storage_mount_container + "@" + storage_account + ".dfs.core.windows.net/", 39 | mount_point = storage_mount_data_path, 40 | extra_configs = configs) 41 | 42 | # Refresh mounts 43 | dbutils.fs.refreshMounts() 44 | 45 | # COMMAND ---------- 46 | 47 | # MAGIC %md 48 | # MAGIC ## Create Tables 49 | 50 | # COMMAND ---------- 51 | 52 | # MAGIC %sql 53 | # MAGIC CREATE SCHEMA IF NOT EXISTS dw; 54 | # MAGIC CREATE SCHEMA IF NOT EXISTS lnd; 55 | # MAGIC CREATE SCHEMA IF NOT EXISTS interim; 56 | # MAGIC CREATE SCHEMA IF NOT EXISTS malformed; 57 | 58 | # COMMAND ---------- 59 | 60 | # MAGIC %sql 61 | # MAGIC -- FACT tables 62 | # MAGIC DROP TABLE IF EXISTS dw.fact_parking; 63 | # MAGIC CREATE TABLE dw.fact_parking ( 64 | # MAGIC dim_date_id STRING, 65 | # MAGIC dim_time_id STRING, 66 | # MAGIC dim_parking_bay_id STRING, 67 | # MAGIC dim_location_id STRING, 68 | # MAGIC dim_st_marker_id STRING, 69 | # MAGIC status STRING, 70 | # MAGIC load_id STRING, 71 | # MAGIC loaded_on TIMESTAMP 72 | # MAGIC ) 73 | # MAGIC USING parquet 74 | # MAGIC LOCATION '/mnt/datalake/data/dw/fact_parking/'; 75 | # MAGIC 76 | # MAGIC REFRESH TABLE dw.fact_parking; 77 | 78 | # COMMAND ---------- 79 | 80 | # MAGIC %sql 81 | # MAGIC -- DIMENSION tables 82 | # MAGIC DROP TABLE IF EXISTS dw.dim_st_marker; 83 | # MAGIC CREATE TABLE dw.dim_st_marker ( 84 | # MAGIC dim_st_marker_id STRING, 85 | # MAGIC st_marker_id STRING, 86 | # MAGIC load_id STRING, 87 | # MAGIC loaded_on TIMESTAMP 88 | # MAGIC ) 89 | # MAGIC USING parquet 90 | # MAGIC LOCATION '/mnt/datalake/data/dw/dim_st_marker/'; 91 | # MAGIC 92 | # MAGIC REFRESH TABLE dw.dim_st_marker; 93 | # MAGIC 94 | # MAGIC -- 95 | # MAGIC DROP TABLE IF EXISTS dw.dim_location; 96 | # MAGIC CREATE TABLE dw.dim_location ( 97 | # MAGIC dim_location_id STRING, 98 | # MAGIC lat FLOAT, 99 | # MAGIC lon FLOAT, 100 | # MAGIC load_id STRING, 101 | # MAGIC loaded_on TIMESTAMP 102 | # MAGIC ) 103 | # MAGIC USING parquet 104 | # MAGIC LOCATION '/mnt/datalake/data/dw/dim_location/'; 105 | # MAGIC 106 | # MAGIC REFRESH TABLE dw.dim_location; 107 | # MAGIC 108 | # MAGIC -- 109 | # MAGIC DROP TABLE IF EXISTS dw.dim_parking_bay; 110 | # MAGIC CREATE TABLE dw.dim_parking_bay ( 111 | # MAGIC dim_parking_bay_id STRING, 112 | # MAGIC bay_id INT, 113 | # MAGIC `marker_id` STRING, 114 | # MAGIC `meter_id` STRING, 115 | # MAGIC `rd_seg_dsc` STRING, 116 | # MAGIC `rd_seg_id` STRING, 117 | # MAGIC load_id STRING, 118 | # MAGIC loaded_on TIMESTAMP 119 | # MAGIC ) 120 | # MAGIC USING parquet 121 | # MAGIC LOCATION '/mnt/datalake/data/dw/dim_parking_bay/'; 122 | # MAGIC 123 | # MAGIC REFRESH TABLE dw.dim_parking_bay; 124 | 125 | # COMMAND ---------- 126 | 127 | # MAGIC %sql 128 | # MAGIC DROP TABLE IF EXISTS dw.dim_date; 129 | # MAGIC DROP TABLE IF EXISTS dw.dim_time; 130 | 131 | # COMMAND ---------- 132 | 133 | from pyspark.sql.functions import col 134 | import os 135 | from urllib.request import urlretrieve 136 | 137 | def download_url(url, filename): 138 | # Create dir if not exist 139 | dir_path = os.path.dirname(filename) 140 | if not os.path.exists(dir_path): 141 | os.makedirs(dir_path) 142 | urlretrieve(url, filename) 143 | 144 | # Download data 145 | download_url("https://lacedemodata.blob.core.windows.net/data/DimDate.csv", "/dbfs/mnt/datalake/data/seed/DimDate.csv") 146 | download_url("https://lacedemodata.blob.core.windows.net/data/DimTime.csv", "/dbfs/mnt/datalake/data/seed/DimTime.csv") 147 | 148 | # DimDate 149 | dimdate = 
spark.read.csv("dbfs:/mnt/datalake/data/seed/DimDate.csv", header=True) 150 | dimdate.write.saveAsTable("dw.dim_date") 151 | 152 | # DimTime 153 | dimtime = spark.read.csv("dbfs:/mnt/datalake/data/seed/DimTime.csv", header=True) 154 | dimtime = dimtime.select(dimtime["second_of_day"].alias("dim_time_id"), col("*")) 155 | dimtime.write.saveAsTable("dw.dim_time") 156 | 157 | # COMMAND ---------- 158 | 159 | # MAGIC %sql 160 | # MAGIC -- INTERIM tables 161 | # MAGIC DROP TABLE IF EXISTS interim.parking_bay; 162 | # MAGIC CREATE TABLE interim.parking_bay ( 163 | # MAGIC bay_id INT, 164 | # MAGIC `last_edit` TIMESTAMP, 165 | # MAGIC `marker_id` STRING, 166 | # MAGIC `meter_id` STRING, 167 | # MAGIC `rd_seg_dsc` STRING, 168 | # MAGIC `rd_seg_id` STRING, 169 | # MAGIC `the_geom` STRUCT<`coordinates`: ARRAY>>>, `type`: STRING>, 170 | # MAGIC load_id STRING, 171 | # MAGIC loaded_on TIMESTAMP 172 | # MAGIC ) 173 | # MAGIC USING parquet 174 | # MAGIC LOCATION '/mnt/datalake/data/interim/parking_bay/'; 175 | # MAGIC 176 | # MAGIC REFRESH TABLE interim.parking_bay; 177 | # MAGIC 178 | # MAGIC -- 179 | # MAGIC DROP TABLE IF EXISTS interim.sensor; 180 | # MAGIC CREATE TABLE interim.sensor ( 181 | # MAGIC bay_id INT, 182 | # MAGIC `st_marker_id` STRING, 183 | # MAGIC `lat` FLOAT, 184 | # MAGIC `lon` FLOAT, 185 | # MAGIC `location` STRUCT<`coordinates`: ARRAY, `type`: STRING>, 186 | # MAGIC `status` STRING, 187 | # MAGIC load_id STRING, 188 | # MAGIC loaded_on TIMESTAMP 189 | # MAGIC ) 190 | # MAGIC USING parquet 191 | # MAGIC LOCATION '/mnt/datalake/data/interim/sensors/'; 192 | # MAGIC 193 | # MAGIC REFRESH TABLE interim.sensor; 194 | 195 | # COMMAND ---------- 196 | 197 | # MAGIC %sql 198 | # MAGIC -- ERROR tables 199 | # MAGIC DROP TABLE IF EXISTS malformed.parking_bay; 200 | # MAGIC CREATE TABLE malformed.parking_bay ( 201 | # MAGIC bay_id INT, 202 | # MAGIC `last_edit` TIMESTAMP, 203 | # MAGIC `marker_id` STRING, 204 | # MAGIC `meter_id` STRING, 205 | # MAGIC `rd_seg_dsc` STRING, 206 | # MAGIC `rd_seg_id` STRING, 207 | # MAGIC `the_geom` STRUCT<`coordinates`: ARRAY>>>, `type`: STRING>, 208 | # MAGIC load_id STRING, 209 | # MAGIC loaded_on TIMESTAMP 210 | # MAGIC ) 211 | # MAGIC USING parquet 212 | # MAGIC LOCATION '/mnt/datalake/data/interim/parking_bay/'; 213 | # MAGIC 214 | # MAGIC REFRESH TABLE interim.parking_bay; 215 | # MAGIC 216 | # MAGIC -- 217 | # MAGIC DROP TABLE IF EXISTS malformed.sensor; 218 | # MAGIC CREATE TABLE malformed.sensor ( 219 | # MAGIC bay_id INT, 220 | # MAGIC `st_marker_id` STRING, 221 | # MAGIC `lat` FLOAT, 222 | # MAGIC `lon` FLOAT, 223 | # MAGIC `location` STRUCT<`coordinates`: ARRAY, `type`: STRING>, 224 | # MAGIC `status` STRING, 225 | # MAGIC load_id STRING, 226 | # MAGIC loaded_on TIMESTAMP 227 | # MAGIC ) 228 | # MAGIC USING parquet 229 | # MAGIC LOCATION '/mnt/datalake/data/interim/sensors/'; 230 | # MAGIC 231 | # MAGIC REFRESH TABLE interim.sensor; 232 | 233 | # COMMAND ---------- 234 | 235 | 236 | -------------------------------------------------------------------------------- /databricks/notebooks/01_explore.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import os 3 | import datetime 4 | 5 | # For testing 6 | base_path = 'dbfs:/mnt/datalake/data/lnd/2019_10_06_05_54_25' 7 | parkingbay_filepath = os.path.join(base_path, "MelbParkingBayData.json") 8 | sensors_filepath = os.path.join(base_path, "MelbParkingSensorData.json") 9 | 10 | # COMMAND ---------- 11 | 12 | parkingbay_sdf = spark.read\ 
13 | .option("multiLine", True)\ 14 | .json(parkingbay_filepath) 15 | sensordata_sdf = spark.read\ 16 | .option("multiLine", True)\ 17 | .json(sensors_filepath) 18 | 19 | # COMMAND ---------- 20 | 21 | display(parkingbay_sdf) 22 | 23 | # COMMAND ---------- 24 | 25 | display(sensordata_sdf) 26 | 27 | # COMMAND ---------- 28 | 29 | display(sensordata_sdf) 30 | -------------------------------------------------------------------------------- /databricks/notebooks/02_standardize.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | dbutils.widgets.text("infilefolder", "", "In - Folder Path") 3 | infilefolder = dbutils.widgets.get("infilefolder") 4 | 5 | dbutils.widgets.text("loadid", "", "Load Id") 6 | loadid = dbutils.widgets.get("loadid") 7 | 8 | # COMMAND ---------- 9 | 10 | from applicationinsights import TelemetryClient 11 | tc = TelemetryClient(dbutils.secrets.get(scope = "storage_scope", key = "appinsights_key")) 12 | 13 | # COMMAND ---------- 14 | 15 | import os 16 | import datetime 17 | 18 | # For testing 19 | # infilefolder = 'datalake/data/lnd/2019_03_11_01_38_00/' 20 | load_id = loadid 21 | loaded_on = datetime.datetime.now() 22 | base_path = os.path.join('dbfs:/mnt/datalake/data/lnd/', infilefolder) 23 | parkingbay_filepath = os.path.join(base_path, "MelbParkingBayData.json") 24 | sensors_filepath = os.path.join(base_path, "MelbParkingSensorData.json") 25 | 26 | 27 | # COMMAND ---------- 28 | 29 | import ddo_transform.standardize as s 30 | 31 | # Retrieve schema 32 | parkingbay_schema = s.get_schema("in_parkingbay_schema") 33 | sensordata_schema = s.get_schema("in_sensordata_schema") 34 | 35 | # Read data 36 | parkingbay_sdf = spark.read\ 37 | .schema(parkingbay_schema)\ 38 | .option("badRecordsPath", os.path.join(base_path, "__corrupt", "MelbParkingBayData"))\ 39 | .option("multiLine", True)\ 40 | .json(parkingbay_filepath) 41 | sensordata_sdf = spark.read\ 42 | .schema(sensordata_schema)\ 43 | .option("badRecordsPath", os.path.join(base_path, "__corrupt", "MelbParkingSensorData"))\ 44 | .option("multiLine", True)\ 45 | .json(sensors_filepath) 46 | 47 | 48 | # Standardize 49 | t_parkingbay_sdf, t_parkingbay_malformed_sdf = s.standardize_parking_bay(parkingbay_sdf, load_id, loaded_on) 50 | t_sensordata_sdf, t_sensordata_malformed_sdf = s.standardize_sensordata(sensordata_sdf, load_id, loaded_on) 51 | 52 | # Insert new rows 53 | t_parkingbay_sdf.write.mode("append").insertInto("interim.parking_bay") 54 | t_sensordata_sdf.write.mode("append").insertInto("interim.sensor") 55 | 56 | # Insert bad rows 57 | t_parkingbay_malformed_sdf.write.mode("append").insertInto("malformed.parking_bay") 58 | t_sensordata_malformed_sdf.write.mode("append").insertInto("malformed.sensor") 59 | 60 | 61 | # COMMAND ---------- 62 | 63 | # MAGIC %md 64 | # MAGIC ### Metrics 65 | 66 | # COMMAND ---------- 67 | 68 | parkingbay_count = t_parkingbay_sdf.count() 69 | sensordata_count = t_sensordata_sdf.count() 70 | parkingbay_malformed_count = t_parkingbay_malformed_sdf.count() 71 | sensordata_malformed_count = t_sensordata_malformed_sdf.count() 72 | 73 | tc.track_event('Standardize : Completed load', 74 | properties={'parkingbay_filepath': parkingbay_filepath, 75 | 'sensors_filepath': sensors_filepath, 76 | 'load_id': load_id 77 | }, 78 | measurements={'parkingbay_count': parkingbay_count, 79 | 'sensordata_count': sensordata_count, 80 | 'parkingbay_malformed_count': parkingbay_malformed_count, 81 | 'sensordata_malformed_count': 
sensordata_malformed_count 82 | }) 83 | tc.flush() 84 | 85 | # COMMAND ---------- 86 | 87 | dbutils.notebook.exit("success") 88 | -------------------------------------------------------------------------------- /databricks/notebooks/03_transform.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | dbutils.widgets.text("loadid", "", "Load Id") 3 | loadid = dbutils.widgets.get("loadid") 4 | 5 | # COMMAND ---------- 6 | 7 | from applicationinsights import TelemetryClient 8 | tc = TelemetryClient(dbutils.secrets.get(scope = "storage_scope", key = "appinsights_key")) 9 | 10 | # COMMAND ---------- 11 | 12 | import datetime 13 | import os 14 | from pyspark.sql.functions import col, lit 15 | import ddo_transform.transform as t 16 | import ddo_transform.util as util 17 | 18 | load_id = loadid 19 | loaded_on = datetime.datetime.now() 20 | base_path = 'dbfs:/mnt/datalake/data/dw/' 21 | 22 | # Read interim cleansed data 23 | parkingbay_sdf = spark.read.table("interim.parking_bay").filter(col('load_id') == lit(load_id)) 24 | sensordata_sdf = spark.read.table("interim.sensor").filter(col('load_id') == lit(load_id)) 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md 29 | # MAGIC ### Transform and load Dimension tables 30 | 31 | # COMMAND ---------- 32 | 33 | # Read existing Dimensions 34 | dim_parkingbay_sdf = spark.read.table("dw.dim_parking_bay") 35 | dim_location_sdf = spark.read.table("dw.dim_location") 36 | dim_st_marker = spark.read.table("dw.dim_st_marker") 37 | 38 | # Transform 39 | new_dim_parkingbay_sdf = t.process_dim_parking_bay(parkingbay_sdf, dim_parkingbay_sdf, load_id, loaded_on).cache() 40 | new_dim_location_sdf = t.process_dim_location(sensordata_sdf, dim_location_sdf, load_id, loaded_on).cache() 41 | new_dim_st_marker_sdf = t.process_dim_st_marker(sensordata_sdf, dim_st_marker, load_id, loaded_on).cache() 42 | 43 | # Load 44 | util.save_overwrite_unmanaged_table(spark, new_dim_parkingbay_sdf, table_name="dw.dim_parking_bay", path=os.path.join(base_path, "dim_parking_bay")) 45 | util.save_overwrite_unmanaged_table(spark, new_dim_location_sdf, table_name="dw.dim_location", path=os.path.join(base_path, "dim_location")) 46 | util.save_overwrite_unmanaged_table(spark, new_dim_st_marker_sdf, table_name="dw.dim_st_marker", path=os.path.join(base_path, "dim_st_marker")) 47 | 48 | # COMMAND ---------- 49 | 50 | # MAGIC %md 51 | # MAGIC ### Transform and load Fact tables 52 | 53 | # COMMAND ---------- 54 | 55 | # Read existing Dimensions 56 | dim_parkingbay_sdf = spark.read.table("dw.dim_parking_bay") 57 | dim_location_sdf = spark.read.table("dw.dim_location") 58 | dim_st_marker = spark.read.table("dw.dim_st_marker") 59 | 60 | # Process 61 | nr_fact_parking = t.process_fact_parking(sensordata_sdf, dim_parkingbay_sdf, dim_location_sdf, dim_st_marker, load_id, loaded_on) 62 | 63 | # Insert new rows 64 | nr_fact_parking.write.mode("append").insertInto("dw.fact_parking") 65 | 66 | # COMMAND ---------- 67 | 68 | # MAGIC %md 69 | # MAGIC ### Metrics 70 | 71 | # COMMAND ---------- 72 | 73 | new_dim_parkingbay_count = spark.read.table("dw.dim_parking_bay").count() 74 | new_dim_location_count = spark.read.table("dw.dim_location").count() 75 | new_dim_st_marker_count = spark.read.table("dw.dim_st_marker").count() 76 | nr_fact_parking_count = nr_fact_parking.count() 77 | 78 | 79 | tc.track_event('Transform : Completed load', 80 | properties={'load_id': load_id}, 81 | measurements={'new_dim_parkingbay_count': new_dim_parkingbay_count, 82 
| 'new_dim_location_count': new_dim_location_count, 83 | 'new_dim_st_marker_count': new_dim_st_marker_count, 84 | 'newrecords_fact_parking_count': nr_fact_parking_count 85 | }) 86 | tc.flush() 87 | 88 | # COMMAND ---------- 89 | 90 | dbutils.notebook.exit("success") 91 | -------------------------------------------------------------------------------- /deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, 8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions 11 | # of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 18 | 19 | set -o errexit 20 | set -o pipefail 21 | set -o nounset 22 | # set -o xtrace # For debugging 23 | 24 | # Check if required utilities are installed 25 | command -v jq >/dev/null 2>&1 || { echo >&2 "I require jq but it's not installed. See https://stedolan.github.io/jq/. Aborting."; exit 1; } 26 | command -v az >/dev/null 2>&1 || { echo >&2 "I require azure cli but it's not installed. See https://bit.ly/2Gc8IsS. Aborting."; exit 1; } 27 | 28 | # Check if user is logged in 29 | [[ -n $(az account show 2> /dev/null) ]] || { echo "Please login via the Azure CLI: "; az login; } 30 | 31 | # Globals and constants 32 | TIMESTAMP=$(date +%s) 33 | RED='\033[0;31m' 34 | ORANGE='\033[0;33m' 35 | NC='\033[0m' 36 | 37 | # Set path 38 | dir_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ); cd "$dir_path" 39 | 40 | 41 | ################### 42 | # USER PARAMETERS 43 | 44 | rg_name_pre="${1-}" 45 | rg_location="${2-}" 46 | sub_id="${3-}" 47 | 48 | # while [[ -z $env_name ]]; do 49 | # read -rp "$(echo -e ${ORANGE}"Enter environment (dev, stg or prod): "${NC})" env_name 50 | # # TODO validate if dev, stg, prod 51 | # done 52 | 53 | while [[ -z $rg_name_pre ]]; do 54 | read -rp "$(echo -e ${ORANGE}"Enter Resource Group name: "${NC})" rg_name_pre 55 | done 56 | 57 | while [[ -z $rg_location ]]; do 58 | read -rp "$(echo -e ${ORANGE}"Enter Azure Location (ei. EAST US 2): "${NC})" rg_location 59 | done 60 | 61 | while [[ -z $sub_id ]]; do 62 | # Check if user only has one sub 63 | sub_count=$(az account list --output json | jq '. 
| length') 64 | if (( $sub_count != 1 )); then 65 | az account list --output table 66 | read -rp "$(echo -e ${ORANGE}"Enter Azure Subscription Id you wish to deploy to (enter to use Default): "${NC})" sub_id 67 | fi 68 | # If still empty then user selected IsDefault 69 | if [[ -z $sub_id ]]; then 70 | sub_id=$(az account show --output json | jq -r '.id') 71 | fi 72 | done 73 | 74 | # By default, set all KeyVault permission to deployer 75 | # Retrieve KeyVault User Id 76 | userId=$(az account show --output json | jq -r '.user.name') 77 | kvOwnerObjectId=$(az ad user show --id $userId \ 78 | --output json | jq -r '.objectId') 79 | 80 | 81 | ################### 82 | # DEPLOY ALL 83 | 84 | for env_name in dev stg prod; do 85 | # Azure infrastructure 86 | . ./infrastructure/deploy_infrastructure.sh "$env_name" "$rg_name_pre-$env_name" $rg_location $sub_id $kvOwnerObjectId 87 | 88 | # Databricks 89 | . ./databricks/create_secrets.sh "$env_name" 90 | . ./databricks/configure_databricks.sh "$env_name" 91 | done 92 | -------------------------------------------------------------------------------- /docs/CI_CD.md: -------------------------------------------------------------------------------- 1 | ### Build Pipelines 2 | 3 | 1. **Build - Quality Assurance** 4 | - Purpose: Ensure code quality and integrity 5 | - Trigger: Pull Request to Master 6 | - Steps: 7 | 1. Build Python packages 8 | 2. Run units tests 9 | 3. Code Coverage 10 | 4. Linting 11 | 2. **Build - Artifacts** 12 | - Purpose: To produce necessary artifacts for Release 13 | - Trigger: Commit to Master 14 | - Steps: 15 | 1. Build and create Python Wheel 16 | 2. Publish artifacts: 17 | - Python Wheel 18 | - Databricks Notebooks and cluster configuration 19 | - Data Factory pipeline definitions 20 | - IaC - ARM templates, Bash scripts 21 | - 3rd party library dependencies (JARs, etc) 22 | 23 | ### Release Pipelines 24 | 25 | Currently, there is one multi-stage release pipeline with the following stages. Each stage deploys to a different environment. 26 | 27 | 1. **On-demand Integration Testing (QA) environment** - **TODO** 28 | 1. Deploy Azure resources with ARM templates + Bash scripts 29 | 2. Store sensitive configuration information in shared QA KeyVault 30 | 3. Download integration test data from shared Storage to newly deployed ADAL Gen2. 31 | 4. Configure Databricks workspace 32 | - Setup Data mount 33 | - Create Databricks secrets 34 | 5. Deploy Data Application to Databricks 35 | - Deploy cluster given configuration 36 | - Upload Jars, Python wheels to DBFS 37 | - Install libraries on cluster 38 | 6. Deploy ADF pipeline 39 | 7. Run integration tests 40 | - Trigger ADF Pipeline 41 | - Databricks job to run integration test notebook 42 | 43 | 2. **Deploy to Staging** 44 | - NOTE: *Staging environment should be a mirror of Production and thus already have a configured Databricks workspace (secrets, data mount, etc), ADAL Gen2, ADF Pipeline, KeyVault, etc.* 45 | 1. Hydrate data with latest production data 46 | 2. Deploy Data Application to Databricks 47 | - Deploy cluster given configuration 48 | - Upload Jars, Python wheels to DBFS 49 | - Install libraries on cluster 50 | 3. Deploy ADF Pipeline and activate triggers 51 | 4. Run integration tests 52 | 53 | 3. **Deploy to Production** 54 | 1. Deploy Data Application to Databricks 55 | - Deploy cluster given configuration 56 | - Upload Jars, Python wheels to DBFS 57 | - Install libraries on cluster 58 | 2. Deploy ADF Pipeline 59 | 3. 
Swap between existing deployment and newly released deployment -------------------------------------------------------------------------------- /docs/NDCSydney2019-DataDevOps.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/docs/NDCSydney2019-DataDevOps.pdf -------------------------------------------------------------------------------- /images/CI_CD_process.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/images/CI_CD_process.PNG -------------------------------------------------------------------------------- /images/Release_1_Agent_DeployToDatabricks.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/images/Release_1_Agent_DeployToDatabricks.PNG -------------------------------------------------------------------------------- /images/architecture.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/images/architecture.PNG -------------------------------------------------------------------------------- /infrastructure/README.md: -------------------------------------------------------------------------------- 1 | All parameters are set in azure.deploy.parameters..json files 2 | 3 | .env. files are produced after every deployment 4 | 5 | ## Scripts 6 | 7 | deploy_all.sh 8 | └── deploy_infrastructure.sh <- deploys resources to a specific Environment 9 | └── configure_adlagen2.sh <- configures the newly deployed ADLA Gen2 -------------------------------------------------------------------------------- /infrastructure/azuredeploy.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "deployNs": { 6 | "defaultValue": "ddo", 7 | "type": "string" 8 | }, 9 | "env": { 10 | "defaultValue": "dev", 11 | "type": "string", 12 | "allowedValues": [ 13 | "dev", 14 | "stg", 15 | "prod" 16 | ] 17 | }, 18 | "dbricksName": { 19 | "defaultValue": "[concat(parameters('deployNs'), 'dbricks', parameters('env'))]", 20 | "type": "String" 21 | }, 22 | "kvName": { 23 | "defaultValue": "[concat(parameters('deployNs'), 'kv', parameters('env'), substring(uniqueString(resourceGroup().id), 0, 8))]", 24 | "type": "String", 25 | "metadata": { 26 | "description": "Key Vault Name" 27 | } 28 | }, 29 | "kvOwnerObjectId": { 30 | "type": "String", 31 | "metadata": { 32 | "description": "Active Directory ObjectId to be granted full rights to KV" 33 | } 34 | }, 35 | "storName": { 36 | "defaultValue": "[concat(parameters('deployNs'), 'stor', parameters('env'), substring(uniqueString(resourceGroup().id), 0, 8))]", 37 | "type": "String", 38 | "metadata": { 39 | "description": "Storage account - ADLA Gen2" 40 | } 41 | }, 42 | "spStorName": { 43 | "defaultValue": "[concat(parameters('deployNs'), 'sp', parameters('env'), substring(uniqueString(resourceGroup().id), 0, 8))]", 44 | "type": "String", 45 | "metadata": { 46 | "description": "Service Principal to be granted access to Storage Account - ADLA Gen2" 47 | } 48 | }, 49 | "adfName": { 50 
| "defaultValue": "[concat(parameters('deployNs'), 'adf', parameters('env'), substring(uniqueString(resourceGroup().id), 0, 8))]", 51 | "type": "string", 52 | "metadata": { 53 | "description": "Data Factory Name" 54 | } 55 | } 56 | }, 57 | "variables": { 58 | "managedResourceGroupId": "[concat(subscription().id, '/resourceGroups/', variables('managedResourceGroupName'))]", 59 | "managedResourceGroupName": "[concat('databricks-rg-', parameters('dbricksName'), '-', uniqueString(parameters('dbricksName'), resourceGroup().id))]" 60 | }, 61 | "resources": [ 62 | { 63 | "apiVersion": "2018-04-01", 64 | "location": "[resourceGroup().location]", 65 | "name": "[parameters('dbricksName')]", 66 | "tags": { 67 | "displayName": "Databricks Workspace", 68 | "Environment": "[parameters('env')]" 69 | }, 70 | "sku": { 71 | "name": "premium" 72 | }, 73 | "properties": { 74 | "ManagedResourceGroupId": "[variables('managedResourceGroupId')]" 75 | }, 76 | "type": "Microsoft.Databricks/workspaces" 77 | }, 78 | { 79 | "type": "Microsoft.KeyVault/vaults", 80 | "name": "[parameters('kvName')]", 81 | "apiVersion": "2015-06-01", 82 | "location": "[resourceGroup().location]", 83 | "tags": { 84 | "displayName": "Key Vault", 85 | "Environment": "[parameters('env')]" 86 | }, 87 | "properties": { 88 | "enabledForDeployment": false, 89 | "enabledForTemplateDeployment": true, 90 | "enabledForVolumeEncryption": false, 91 | "tenantId": "[subscription().tenantId]", 92 | "accessPolicies": [{ 93 | "tenantId": "[subscription().tenantId]", 94 | "objectId": "[parameters('kvOwnerObjectId')]", 95 | "permissions": { 96 | "keys": [ 97 | "All" 98 | ], 99 | "secrets": [ 100 | "All" 101 | ] 102 | } 103 | }, { 104 | "tenantId": "[subscription().tenantId]", 105 | "objectId": "[reference(parameters('adfName'), '2018-06-01', 'Full').identity.principalId]", 106 | "permissions": { 107 | "secrets": [ 108 | "get", "list" 109 | ] 110 | } 111 | }], 112 | "sku": { 113 | "family": "A", 114 | "name": "Standard" 115 | } 116 | } 117 | }, 118 | { 119 | "type": "Microsoft.Storage/storageAccounts", 120 | "sku": { 121 | "name": "Standard_LRS", 122 | "tier": "Standard" 123 | }, 124 | "kind": "StorageV2", 125 | "name": "[parameters('storName')]", 126 | "apiVersion": "2018-07-01", 127 | "location": "[resourceGroup().location]", 128 | "tags": { 129 | "displayName": "Data Lake", 130 | "Environment": "[parameters('env')]" 131 | }, 132 | "scale": null, 133 | "properties": { 134 | "isHnsEnabled": true, 135 | "networkAcls": { 136 | "bypass": "AzureServices", 137 | "virtualNetworkRules": [], 138 | "ipRules": [], 139 | "defaultAction": "Allow" 140 | }, 141 | "supportsHttpsTrafficOnly": true, 142 | "encryption": { 143 | "services": { 144 | "file": { 145 | "enabled": true 146 | }, 147 | "blob": { 148 | "enabled": true 149 | } 150 | }, 151 | "keySource": "Microsoft.Storage" 152 | }, 153 | "accessTier": "Hot" 154 | }, 155 | "dependsOn": [] 156 | }, 157 | { 158 | "apiVersion": "2018-06-01", 159 | "name": "[parameters('adfName')]", 160 | "location": "[resourceGroup().location]", 161 | "tags": { 162 | "displayName": "DataFactory", 163 | "Environment": "[parameters('env')]" 164 | }, 165 | "type": "Microsoft.DataFactory/factories", 166 | "identity": { 167 | "type": "SystemAssigned" 168 | }, 169 | "properties": {} 170 | } 171 | ], 172 | "outputs": { 173 | "dbricksName": { 174 | "value": "[parameters('dbricksName')]", 175 | "type": "string" 176 | }, 177 | "dbricksLocation": { 178 | "value": "[resourceGroup().location]", 179 | "type": "string" 180 | }, 181 | "kvName": { 182 | 
"value": "[parameters('kvName')]", 183 | "type": "string" 184 | }, 185 | "storName": { 186 | "value": "[parameters('storName')]", 187 | "type": "string" 188 | }, 189 | "spStorName": { 190 | "value": "[parameters('spStorName')]", 191 | "type": "string" 192 | } 193 | } 194 | } -------------------------------------------------------------------------------- /infrastructure/azuredeploy.parameters.dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "env": { 6 | "value": "dev" 7 | } 8 | } 9 | } -------------------------------------------------------------------------------- /infrastructure/azuredeploy.parameters.prod.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "env": { 6 | "value": "prod" 7 | } 8 | } 9 | } -------------------------------------------------------------------------------- /infrastructure/azuredeploy.parameters.stg.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "env": { 6 | "value": "stg" 7 | } 8 | } 9 | } -------------------------------------------------------------------------------- /infrastructure/configure_adlagen2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, 8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions 11 | # of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 18 | 19 | ####################################################### 20 | # Configure ADLA Gen2 Service Principal permissions 21 | # 22 | # This script performs the following: 23 | # 1. Create Service Principle for ADLS Gen2 24 | # 2. Grant correct RBAC role to the SP 25 | # 3. 
Create File System using REST API 26 | # 27 | # Prerequisites: 28 | # - User is logged in to the azure cli 29 | # - Correct Azure subscription is selected 30 | ####################################################### 31 | 32 | set -o errexit 33 | set -o pipefail 34 | set -o nounset 35 | # set -o xtrace # For debugging 36 | 37 | ################ 38 | # PARAMETERS 39 | ################ 40 | rg_name="${1-}" 41 | storage_account="${2-}" 42 | sp_stor_id="${3-}" 43 | sp_stor_pass="${4-}" 44 | sp_stor_tenantid="${5-}" 45 | 46 | storage_fs=datalake # Constant 47 | 48 | # Retrieve full storage account azure id 49 | storage_account_id=$(az storage account show \ 50 | --name "$storage_account" \ 51 | --resource-group "$rg_name" \ 52 | --output json | 53 | jq -r '.id') 54 | 55 | # See this issue: https://github.com/Azure/azure-powershell/issues/2286 56 | # TODO: make more robust 57 | sleep 1m 58 | 59 | # Grant "Storage Blob Data Owner (Preview) 60 | echo "Granting 'Storage Blob Data Contributor' for '$storage_account' to SP" 61 | az role assignment create --assignee "$sp_stor_id" \ 62 | --role "Storage Blob Data Contributor" \ 63 | --scope "$storage_account_id" 64 | 65 | # Because ADLA Gen2 is not yet supported by the az cli 2.0 as of 2019/02/04 66 | # we resort to calling the REST API directly: 67 | # https://docs.microsoft.com/en-us/rest/api/storageservices/datalakestoragegen2/filesystem 68 | # 69 | # For information on calling Azure REST API, see here: 70 | # https://docs.microsoft.com/en-us/rest/api/azure/ 71 | 72 | # It takes time for AD permissions to propogate 73 | # TODO: make more robust 74 | sleep 2m 75 | 76 | # Use service principle to generate bearer token 77 | bearer_token=$(curl -X POST -d "grant_type=client_credentials&client_id=${sp_stor_id}&client_secret=${sp_stor_pass}&resource=https%3A%2F%2Fstorage.azure.com%2F" \ 78 | https://login.microsoftonline.com/${sp_stor_tenantid}/oauth2/token | 79 | jq -r '.access_token') 80 | 81 | # Use bearer token to create file system 82 | echo "Creating ADLA Gen2 File System '$storage_fs' in storage account: '$storage_account'" 83 | curl -X PUT -d -H 'Content-Type:application/json' -H "Authorization: Bearer ${bearer_token}" \ 84 | https://${storage_account}.dfs.core.windows.net/${storage_fs}?resource=filesystem 85 | 86 | echo "Completed configuring ADLA Gen2." -------------------------------------------------------------------------------- /infrastructure/deploy_infrastructure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, 8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions 11 | # of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 18 | 19 | ####################################################### 20 | # Deploys all necessary azure resources and stores 21 | # configuration information in an .ENV file 22 | # 23 | # Prerequisites: 24 | # - User is logged in to the azure cli 25 | # - Correct Azure subscription is selected 26 | ####################################################### 27 | 28 | set -o errexit 29 | set -o pipefail 30 | set -o nounset 31 | # set -o xtrace # For debugging 32 | 33 | ################### 34 | # PARAMETERS 35 | 36 | env_name="${1-}" 37 | rg_name="${2-}" 38 | rg_location="${3-}" 39 | sub_id="${4-}" 40 | kvOwnerObjectId="${5-}" 41 | 42 | env_file="../.env.${env_name}" 43 | 44 | # Set path 45 | parent_dir=$(pwd -P) 46 | dir_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ); cd "$dir_path" 47 | 48 | 49 | ##################### 50 | # DEPLOY ARM TEMPLATE 51 | 52 | # Set account to where ARM template will be deployed to 53 | echo "Deploying to Subscription: $sub_id" 54 | az account set --subscription $sub_id 55 | 56 | # Create resource group 57 | echo "Creating resource group: $rg_name" 58 | az group create --name "$rg_name" --location "$rg_location" 59 | 60 | # Deploy arm template 61 | echo "Deploying resources into $rg_name" 62 | arm_output=$(az group deployment create \ 63 | --resource-group "$rg_name" \ 64 | --template-file "./azuredeploy.json" \ 65 | --parameters @"./azuredeploy.parameters.${env_name}.json" \ 66 | --parameters "kvOwnerObjectId=${kvOwnerObjectId}" \ 67 | --output json) 68 | 69 | if [[ -z $arm_output ]]; then 70 | echo >&2 "ARM deployment failed." 71 | exit 1 72 | fi 73 | 74 | ########################### 75 | # RETRIEVE DATABRICKS INFORMATION 76 | 77 | # Ask user to configure databricks cli 78 | # TODO: see if this can be automated 79 | dbricks_name=$(echo $arm_output | jq -r '.properties.outputs.dbricksName.value') 80 | echo -e "${ORANGE}" 81 | echo "Configure your databricks cli to connect to the newly created Databricks workspace: ${dbricks_name}. See here for more info: https://bit.ly/2GUwHcw." 82 | databricks configure --token 83 | echo -e "${NC}" 84 | 85 | # Databricks token and details 86 | dbricks_location=$(echo $arm_output | jq -r '.properties.outputs.dbricksLocation.value') 87 | dbi_token=$(awk '/token/ && NR==3 {print $0;exit;}' ~/.databrickscfg | cut -d' ' -f3) 88 | [[ -n $dbi_token ]] || { echo >&2 "Databricks cli not configured correctly. Please run databricks configure --token. 
Aborting."; exit 1; } 89 | 90 | 91 | ######################### 92 | # RETRIEVE CONFIG INFORMATION 93 | 94 | # Retrieve KeyVault details 95 | kv_name=$(echo $arm_output | jq -r '.properties.outputs.kvName.value') 96 | 97 | # Retrieve storage account (ADLS Gen2) details 98 | storage_account=$(echo $arm_output | jq -r '.properties.outputs.storName.value') 99 | storage_account_key=$(az storage account keys list \ 100 | --account-name $storage_account \ 101 | --resource-group $rg_name \ 102 | --output json | 103 | jq -r '.[0].value') 104 | 105 | # Retrieve SP name for ADLA Gen2 from arm output 106 | sp_stor_name=$(echo $arm_output | jq -r '.properties.outputs.spStorName.value') 107 | 108 | 109 | ######################### 110 | # CREATE AND CONFIGURE SERVICE PRINCIPAL FOR ADLA GEN2 111 | 112 | echo "Creating Service Principal (SP) for access to ADLA Gen2: '$sp_stor_name'" 113 | sp_stor_out=$(az ad sp create-for-rbac --name $sp_stor_name \ 114 | --skip-assignment \ 115 | --output json) 116 | sp_stor_id=$(echo $sp_stor_out | jq -r '.appId') 117 | sp_stor_pass=$(echo $sp_stor_out | jq -r '.password') 118 | sp_stor_tenantid=$(echo $sp_stor_out | jq -r '.tenant') 119 | 120 | . ./configure_adlagen2.sh "$rg_name" "$storage_account" "$sp_stor_id" "$sp_stor_pass" "$sp_stor_tenantid" 121 | 122 | 123 | #################### 124 | # SAVE RELEVANT SECRETS IN KEYVAULT 125 | 126 | az keyvault secret set --vault-name $kv_name --name "storageAccount" --value $storage_account 127 | az keyvault secret set --vault-name $kv_name --name "storageKey" --value $storage_account_key 128 | az keyvault secret set --vault-name $kv_name --name "spStorName" --value $sp_stor_name 129 | az keyvault secret set --vault-name $kv_name --name "spStorId" --value $sp_stor_id 130 | az keyvault secret set --vault-name $kv_name --name "spStorPass" --value $sp_stor_pass 131 | az keyvault secret set --vault-name $kv_name --name "spStorTenantId" --value $sp_stor_tenantid 132 | az keyvault secret set --vault-name $kv_name --name "dbricksDomain" --value https://${dbricks_location}.azuredatabricks.net 133 | az keyvault secret set --vault-name $kv_name --name "dbricksToken" --value $dbi_token 134 | 135 | 136 | #################### 137 | # BUILD ENV FILE FROM CONFIG INFORMATION 138 | 139 | echo "Appending configuration to .env file." 
140 | cat << EOF >> $env_file 141 | 142 | # ------ Configuration from deployment on ${TIMESTAMP} ----------- 143 | RESOURCE_GROUP=${rg_name} 144 | BLOB_STORAGE_ACCOUNT=${storage_account} 145 | BLOB_STORAGE_KEY=${storage_account_key} 146 | SP_STOR_NAME=${sp_stor_name} 147 | SP_STOR_ID=${sp_stor_id} 148 | SP_STOR_PASS=${sp_stor_pass} 149 | SP_STOR_TENANT=${sp_stor_tenantid} 150 | KV_NAME=${kv_name} 151 | DATABRICKS_HOST=https://${dbricks_location}.azuredatabricks.net 152 | DATABRICKS_TOKEN=${dbi_token} 153 | 154 | EOF 155 | echo "Completed deploying Azure resources $rg_name ($env_name)" 156 | 157 | 158 | echo "Return to parent script dir: $parent_dir" 159 | cd "$parent_dir" -------------------------------------------------------------------------------- /samples/azuresql/README.md: -------------------------------------------------------------------------------- 1 | ## Deploying SQL Azure Database 2 | 3 | ### Build Pipeline 4 | 5 | ### Release Pipeline -------------------------------------------------------------------------------- /samples/azuresql/azure-pipelines-ci.yml: -------------------------------------------------------------------------------- 1 | # Starter pipeline 2 | # Start with a minimal pipeline that you can customize to build and deploy your code. 3 | # Add steps that build, run tests, deploy, and more: 4 | # https://aka.ms/yaml 5 | 6 | trigger: 7 | - master 8 | 9 | pool: 10 | vmImage: 'vs2017-win2016' 11 | 12 | variables: 13 | SLN_DIR: 'samples\azuresql\ddo_samples_azuresql' 14 | BUILD_PLATFORM: 'any cpu' 15 | BUILD_CONFIGURATION: 'release' 16 | 17 | steps: 18 | - task: VSBuild@1 19 | displayName: 'Build solution sln' 20 | inputs: 21 | solution: '$(SLN_DIR)\ddo_samples_azuresql.sln' 22 | platform: '$(BUILD_PLATFORM)' 23 | configuration: '$(BUILD_CONFIGURATION)' 24 | 25 | - task: VSTest@2 26 | displayName: 'VsTest - testAssemblies' 27 | inputs: 28 | testAssemblyVer2: | 29 | $(SLN_DIR)\**\$(BUILD_CONFIGURATION)\*test*.dll 30 | !**\obj\** 31 | platform: '$(BUILD_PLATFORM)' 32 | configuration: '$(BUILD_CONFIGURATION)' 33 | 34 | - task: PublishSymbols@2 35 | displayName: 'Publish symbols path' 36 | inputs: 37 | SearchPattern: '$(SLN_DIR)\**\bin\**\*.pdb' 38 | PublishSymbols: false 39 | continueOnError: true 40 | 41 | - task: CopyFiles@2 42 | displayName: 'Copy Files to: $(build.artifactstagingdirectory)' 43 | inputs: 44 | SourceFolder: '$(system.defaultworkingdirectory)' 45 | Contents: '$(SLN_DIR)\**\bin\$(BUILD_CONFIGURATION)\**' 46 | TargetFolder: '$(build.artifactstagingdirectory)' 47 | condition: succeededOrFailed() 48 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.28307.645 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{00D1A9C2-B5F0-4AF3-8072-F6C62B433612}") = "ddo_samples_azuresql", "ddo_samples_azuresql\ddo_samples_azuresql.sqlproj", "{387EBFC5-ACB1-4445-A25F-D70D18D34C30}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Release|Any CPU = Release|Any CPU 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {387EBFC5-ACB1-4445-A25F-D70D18D34C30}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 15 | {387EBFC5-ACB1-4445-A25F-D70D18D34C30}.Debug|Any 
CPU.Build.0 = Debug|Any CPU 16 | {387EBFC5-ACB1-4445-A25F-D70D18D34C30}.Debug|Any CPU.Deploy.0 = Debug|Any CPU 17 | {387EBFC5-ACB1-4445-A25F-D70D18D34C30}.Release|Any CPU.ActiveCfg = Release|Any CPU 18 | {387EBFC5-ACB1-4445-A25F-D70D18D34C30}.Release|Any CPU.Build.0 = Release|Any CPU 19 | {387EBFC5-ACB1-4445-A25F-D70D18D34C30}.Release|Any CPU.Deploy.0 = Release|Any CPU 20 | EndGlobalSection 21 | GlobalSection(SolutionProperties) = preSolution 22 | HideSolutionNode = FALSE 23 | EndGlobalSection 24 | GlobalSection(ExtensibilityGlobals) = postSolution 25 | SolutionGuid = {D547B017-8D99-4D13-8CA6-E7BC0430B4E5} 26 | EndGlobalSection 27 | EndGlobal 28 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Sequences/SalesOrderNumber.sql: -------------------------------------------------------------------------------- 1 | CREATE SEQUENCE [SalesLT].[SalesOrderNumber] 2 | AS INT 3 | START WITH 1 4 | INCREMENT BY 1; 5 | 6 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/Address.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [SalesLT].[Address] ( 2 | [AddressID] INT IDENTITY (1, 1) NOT NULL, 3 | [AddressLine1] NVARCHAR (60) NOT NULL, 4 | [AddressLine2] NVARCHAR (60) NULL, 5 | [City] NVARCHAR (30) NOT NULL, 6 | [StateProvince] [dbo].[Name] NOT NULL, 7 | [CountryRegion] [dbo].[Name] NOT NULL, 8 | [PostalCode] NVARCHAR (15) NOT NULL, 9 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_Address_rowguid] DEFAULT (newid()) NOT NULL, 10 | [ModifiedDate] DATETIME CONSTRAINT [DF_Address_ModifiedDate] DEFAULT (getdate()) NOT NULL, 11 | CONSTRAINT [PK_Address_AddressID] PRIMARY KEY CLUSTERED ([AddressID] ASC), 12 | CONSTRAINT [AK_Address_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC) 13 | ); 14 | 15 | 16 | GO 17 | CREATE NONCLUSTERED INDEX [IX_Address_StateProvince] 18 | ON [SalesLT].[Address]([StateProvince] ASC); 19 | 20 | 21 | GO 22 | CREATE NONCLUSTERED INDEX [IX_Address_AddressLine1_AddressLine2_City_StateProvince_PostalCode_CountryRegion] 23 | ON [SalesLT].[Address]([AddressLine1] ASC, [AddressLine2] ASC, [City] ASC, [StateProvince] ASC, [PostalCode] ASC, [CountryRegion] ASC); 24 | 25 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/Customer.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [SalesLT].[Customer] ( 2 | [CustomerID] INT IDENTITY (1, 1) NOT NULL, 3 | [NameStyle] [dbo].[NameStyle] CONSTRAINT [DF_Customer_NameStyle] DEFAULT ((0)) NOT NULL, 4 | [Title] NVARCHAR (8) NULL, 5 | [FirstName] [dbo].[Name] NOT NULL, 6 | [MiddleName] [dbo].[Name] NULL, 7 | [LastName] [dbo].[Name] NOT NULL, 8 | [Suffix] NVARCHAR (10) NULL, 9 | [CompanyName] NVARCHAR (128) NULL, 10 | [SalesPerson] NVARCHAR (256) NULL, 11 | [EmailAddress] NVARCHAR (50) NULL, 12 | [Phone] [dbo].[Phone] NULL, 13 | [PasswordHash] VARCHAR (128) NOT NULL, 14 | [PasswordSalt] VARCHAR (10) NOT NULL, 15 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_Customer_rowguid] DEFAULT (newid()) NOT NULL, 16 | [ModifiedDate] DATETIME CONSTRAINT [DF_Customer_ModifiedDate] DEFAULT (getdate()) NOT NULL, 17 | CONSTRAINT [PK_Customer_CustomerID] PRIMARY KEY CLUSTERED ([CustomerID] ASC), 18 | CONSTRAINT [AK_Customer_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC) 
19 | ); 20 | 21 | 22 | GO 23 | CREATE NONCLUSTERED INDEX [IX_Customer_EmailAddress] 24 | ON [SalesLT].[Customer]([EmailAddress] ASC); 25 | 26 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/CustomerAddress.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [SalesLT].[CustomerAddress] ( 2 | [CustomerID] INT NOT NULL, 3 | [AddressID] INT NOT NULL, 4 | [AddressType] [dbo].[Name] NOT NULL, 5 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_CustomerAddress_rowguid] DEFAULT (newid()) NOT NULL, 6 | [ModifiedDate] DATETIME CONSTRAINT [DF_CustomerAddress_ModifiedDate] DEFAULT (getdate()) NOT NULL, 7 | CONSTRAINT [PK_CustomerAddress_CustomerID_AddressID] PRIMARY KEY CLUSTERED ([CustomerID] ASC, [AddressID] ASC), 8 | CONSTRAINT [FK_CustomerAddress_Address_AddressID] FOREIGN KEY ([AddressID]) REFERENCES [SalesLT].[Address] ([AddressID]), 9 | CONSTRAINT [FK_CustomerAddress_Customer_CustomerID] FOREIGN KEY ([CustomerID]) REFERENCES [SalesLT].[Customer] ([CustomerID]), 10 | CONSTRAINT [AK_CustomerAddress_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC) 11 | ); 12 | 13 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/Product.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [SalesLT].[Product] ( 2 | [ProductID] INT IDENTITY (1, 1) NOT NULL, 3 | [Name] [dbo].[Name] NOT NULL, 4 | [ProductNumber] NVARCHAR (25) NOT NULL, 5 | [Color] NVARCHAR (15) NULL, 6 | [StandardCost] MONEY NOT NULL, 7 | [ListPrice] MONEY NOT NULL, 8 | [Size] NVARCHAR (5) NULL, 9 | [Weight] DECIMAL (8, 2) NULL, 10 | [ProductCategoryID] INT NULL, 11 | [ProductModelID] INT NULL, 12 | [SellStartDate] DATETIME NOT NULL, 13 | [SellEndDate] DATETIME NULL, 14 | [DiscontinuedDate] DATETIME NULL, 15 | [ThumbNailPhoto] VARBINARY (MAX) NULL, 16 | [ThumbnailPhotoFileName] NVARCHAR (50) NULL, 17 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_Product_rowguid] DEFAULT (newid()) NOT NULL, 18 | [ModifiedDate] DATETIME CONSTRAINT [DF_Product_ModifiedDate] DEFAULT (getdate()) NOT NULL, 19 | CONSTRAINT [PK_Product_ProductID] PRIMARY KEY CLUSTERED ([ProductID] ASC), 20 | CONSTRAINT [CK_Product_ListPrice] CHECK ([ListPrice]>=(0.00)), 21 | CONSTRAINT [CK_Product_SellEndDate] CHECK ([SellEndDate]>=[SellStartDate] OR [SellEndDate] IS NULL), 22 | CONSTRAINT [CK_Product_StandardCost] CHECK ([StandardCost]>=(0.00)), 23 | CONSTRAINT [CK_Product_Weight] CHECK ([Weight]>(0.00)), 24 | CONSTRAINT [FK_Product_ProductCategory_ProductCategoryID] FOREIGN KEY ([ProductCategoryID]) REFERENCES [SalesLT].[ProductCategory] ([ProductCategoryID]), 25 | CONSTRAINT [FK_Product_ProductModel_ProductModelID] FOREIGN KEY ([ProductModelID]) REFERENCES [SalesLT].[ProductModel] ([ProductModelID]), 26 | CONSTRAINT [AK_Product_Name] UNIQUE NONCLUSTERED ([Name] ASC), 27 | CONSTRAINT [AK_Product_ProductNumber] UNIQUE NONCLUSTERED ([ProductNumber] ASC), 28 | CONSTRAINT [AK_Product_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC) 29 | ); 30 | 31 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/ProductCategory.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [SalesLT].[ProductCategory] ( 2 | [ProductCategoryID] INT IDENTITY (1, 1) NOT 
NULL, 3 | [ParentProductCategoryID] INT NULL, 4 | [Name] [dbo].[Name] NOT NULL, 5 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_ProductCategory_rowguid] DEFAULT (newid()) NOT NULL, 6 | [ModifiedDate] DATETIME CONSTRAINT [DF_ProductCategory_ModifiedDate] DEFAULT (getdate()) NOT NULL, 7 | CONSTRAINT [PK_ProductCategory_ProductCategoryID] PRIMARY KEY CLUSTERED ([ProductCategoryID] ASC), 8 | CONSTRAINT [FK_ProductCategory_ProductCategory_ParentProductCategoryID_ProductCategoryID] FOREIGN KEY ([ParentProductCategoryID]) REFERENCES [SalesLT].[ProductCategory] ([ProductCategoryID]), 9 | CONSTRAINT [AK_ProductCategory_Name] UNIQUE NONCLUSTERED ([Name] ASC), 10 | CONSTRAINT [AK_ProductCategory_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC) 11 | ); 12 | 13 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/ProductDescription.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [SalesLT].[ProductDescription] ( 2 | [ProductDescriptionID] INT IDENTITY (1, 1) NOT NULL, 3 | [Description] NVARCHAR (400) NOT NULL, 4 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_ProductDescription_rowguid] DEFAULT (newid()) NOT NULL, 5 | [ModifiedDate] DATETIME CONSTRAINT [DF_ProductDescription_ModifiedDate] DEFAULT (getdate()) NOT NULL, 6 | CONSTRAINT [PK_ProductDescription_ProductDescriptionID] PRIMARY KEY CLUSTERED ([ProductDescriptionID] ASC), 7 | CONSTRAINT [AK_ProductDescription_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC) 8 | ); 9 | 10 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/ProductModel.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [SalesLT].[ProductModel] ( 2 | [ProductModelID] INT IDENTITY (1, 1) NOT NULL, 3 | [Name] [dbo].[Name] NOT NULL, 4 | [CatalogDescription] XML NULL, 5 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_ProductModel_rowguid] DEFAULT (newid()) NOT NULL, 6 | [ModifiedDate] DATETIME CONSTRAINT [DF_ProductModel_ModifiedDate] DEFAULT (getdate()) NOT NULL, 7 | CONSTRAINT [PK_ProductModel_ProductModelID] PRIMARY KEY CLUSTERED ([ProductModelID] ASC), 8 | CONSTRAINT [AK_ProductModel_Name] UNIQUE NONCLUSTERED ([Name] ASC), 9 | CONSTRAINT [AK_ProductModel_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC) 10 | ); 11 | 12 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/ProductModelProductDescription.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [SalesLT].[ProductModelProductDescription] ( 2 | [ProductModelID] INT NOT NULL, 3 | [ProductDescriptionID] INT NOT NULL, 4 | [Culture] NCHAR (6) NOT NULL, 5 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_ProductModelProductDescription_rowguid] DEFAULT (newid()) NOT NULL, 6 | [ModifiedDate] DATETIME CONSTRAINT [DF_ProductModelProductDescription_ModifiedDate] DEFAULT (getdate()) NOT NULL, 7 | CONSTRAINT [PK_ProductModelProductDescription_ProductModelID_ProductDescriptionID_Culture] PRIMARY KEY CLUSTERED ([ProductModelID] ASC, [ProductDescriptionID] ASC, [Culture] ASC), 8 | CONSTRAINT [FK_ProductModelProductDescription_ProductDescription_ProductDescriptionID] FOREIGN KEY ([ProductDescriptionID]) REFERENCES [SalesLT].[ProductDescription] ([ProductDescriptionID]), 9 | CONSTRAINT 
[FK_ProductModelProductDescription_ProductModel_ProductModelID] FOREIGN KEY ([ProductModelID]) REFERENCES [SalesLT].[ProductModel] ([ProductModelID]), 10 | CONSTRAINT [AK_ProductModelProductDescription_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC) 11 | ); 12 | 13 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/SalesOrderDetail.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [SalesLT].[SalesOrderDetail] ( 2 | [SalesOrderID] INT NOT NULL, 3 | [SalesOrderDetailID] INT IDENTITY (1, 1) NOT NULL, 4 | [OrderQty] SMALLINT NOT NULL, 5 | [ProductID] INT NOT NULL, 6 | [UnitPrice] MONEY NOT NULL, 7 | [UnitPriceDiscount] MONEY CONSTRAINT [DF_SalesOrderDetail_UnitPriceDiscount] DEFAULT ((0.0)) NOT NULL, 8 | [LineTotal] AS (isnull(([UnitPrice]*((1.0)-[UnitPriceDiscount]))*[OrderQty],(0.0))), 9 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_SalesOrderDetail_rowguid] DEFAULT (newid()) NOT NULL, 10 | [ModifiedDate] DATETIME CONSTRAINT [DF_SalesOrderDetail_ModifiedDate] DEFAULT (getdate()) NOT NULL, 11 | CONSTRAINT [PK_SalesOrderDetail_SalesOrderID_SalesOrderDetailID] PRIMARY KEY CLUSTERED ([SalesOrderID] ASC, [SalesOrderDetailID] ASC), 12 | CONSTRAINT [CK_SalesOrderDetail_OrderQty] CHECK ([OrderQty]>(0)), 13 | CONSTRAINT [CK_SalesOrderDetail_UnitPrice] CHECK ([UnitPrice]>=(0.00)), 14 | CONSTRAINT [CK_SalesOrderDetail_UnitPriceDiscount] CHECK ([UnitPriceDiscount]>=(0.00)), 15 | CONSTRAINT [FK_SalesOrderDetail_Product_ProductID] FOREIGN KEY ([ProductID]) REFERENCES [SalesLT].[Product] ([ProductID]), 16 | CONSTRAINT [FK_SalesOrderDetail_SalesOrderHeader_SalesOrderID] FOREIGN KEY ([SalesOrderID]) REFERENCES [SalesLT].[SalesOrderHeader] ([SalesOrderID]) ON DELETE CASCADE, 17 | CONSTRAINT [AK_SalesOrderDetail_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC) 18 | ); 19 | 20 | 21 | GO 22 | CREATE NONCLUSTERED INDEX [IX_SalesOrderDetail_ProductID] 23 | ON [SalesLT].[SalesOrderDetail]([ProductID] ASC); 24 | 25 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Tables/SalesOrderHeader.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [SalesLT].[SalesOrderHeader] ( 2 | [SalesOrderID] INT CONSTRAINT [DF_SalesOrderHeader_OrderID] DEFAULT (NEXT VALUE FOR [SalesLT].[SalesOrderNumber]) NOT NULL, 3 | [RevisionNumber] TINYINT CONSTRAINT [DF_SalesOrderHeader_RevisionNumber] DEFAULT ((0)) NOT NULL, 4 | [OrderDate] DATETIME CONSTRAINT [DF_SalesOrderHeader_OrderDate] DEFAULT (getdate()) NOT NULL, 5 | [DueDate] DATETIME NOT NULL, 6 | [ShipDate] DATETIME NULL, 7 | [Status] TINYINT CONSTRAINT [DF_SalesOrderHeader_Status] DEFAULT ((1)) NOT NULL, 8 | [OnlineOrderFlag] [dbo].[Flag] CONSTRAINT [DF_SalesOrderHeader_OnlineOrderFlag] DEFAULT ((1)) NOT NULL, 9 | [SalesOrderNumber] AS (isnull(N'SO'+CONVERT([nvarchar](23),[SalesOrderID],(0)),N'*** ERROR ***')), 10 | [PurchaseOrderNumber] [dbo].[OrderNumber] NULL, 11 | [AccountNumber] [dbo].[AccountNumber] NULL, 12 | [CustomerID] INT NOT NULL, 13 | [ShipToAddressID] INT NULL, 14 | [BillToAddressID] INT NULL, 15 | [ShipMethod] NVARCHAR (50) NOT NULL, 16 | [CreditCardApprovalCode] VARCHAR (15) NULL, 17 | [SubTotal] MONEY CONSTRAINT [DF_SalesOrderHeader_SubTotal] DEFAULT ((0.00)) NOT NULL, 18 | [TaxAmt] MONEY CONSTRAINT [DF_SalesOrderHeader_TaxAmt] DEFAULT ((0.00)) NOT NULL, 19 | [Freight] 
MONEY CONSTRAINT [DF_SalesOrderHeader_Freight] DEFAULT ((0.00)) NOT NULL, 20 | [TotalDue] AS (isnull(([SubTotal]+[TaxAmt])+[Freight],(0))), 21 | [Comment] NVARCHAR (MAX) NULL, 22 | [rowguid] UNIQUEIDENTIFIER CONSTRAINT [DF_SalesOrderHeader_rowguid] DEFAULT (newid()) NOT NULL, 23 | [ModifiedDate] DATETIME CONSTRAINT [DF_SalesOrderHeader_ModifiedDate] DEFAULT (getdate()) NOT NULL, 24 | CONSTRAINT [PK_SalesOrderHeader_SalesOrderID] PRIMARY KEY CLUSTERED ([SalesOrderID] ASC), 25 | CONSTRAINT [CK_SalesOrderHeader_DueDate] CHECK ([DueDate]>=[OrderDate]), 26 | CONSTRAINT [CK_SalesOrderHeader_Freight] CHECK ([Freight]>=(0.00)), 27 | CONSTRAINT [CK_SalesOrderHeader_ShipDate] CHECK ([ShipDate]>=[OrderDate] OR [ShipDate] IS NULL), 28 | CONSTRAINT [CK_SalesOrderHeader_Status] CHECK ([Status]>=(0) AND [Status]<=(8)), 29 | CONSTRAINT [CK_SalesOrderHeader_SubTotal] CHECK ([SubTotal]>=(0.00)), 30 | CONSTRAINT [CK_SalesOrderHeader_TaxAmt] CHECK ([TaxAmt]>=(0.00)), 31 | CONSTRAINT [FK_SalesOrderHeader_Address_BillTo_AddressID] FOREIGN KEY ([BillToAddressID]) REFERENCES [SalesLT].[Address] ([AddressID]), 32 | CONSTRAINT [FK_SalesOrderHeader_Address_ShipTo_AddressID] FOREIGN KEY ([ShipToAddressID]) REFERENCES [SalesLT].[Address] ([AddressID]), 33 | CONSTRAINT [FK_SalesOrderHeader_Customer_CustomerID] FOREIGN KEY ([CustomerID]) REFERENCES [SalesLT].[Customer] ([CustomerID]), 34 | CONSTRAINT [AK_SalesOrderHeader_rowguid] UNIQUE NONCLUSTERED ([rowguid] ASC), 35 | CONSTRAINT [AK_SalesOrderHeader_SalesOrderNumber] UNIQUE NONCLUSTERED ([SalesOrderNumber] ASC) 36 | ); 37 | 38 | 39 | GO 40 | CREATE NONCLUSTERED INDEX [IX_SalesOrderHeader_CustomerID] 41 | ON [SalesLT].[SalesOrderHeader]([CustomerID] ASC); 42 | 43 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Views/vGetAllCategories.sql: -------------------------------------------------------------------------------- 1 | CREATE VIEW [SalesLT].[vGetAllCategories] 2 | WITH SCHEMABINDING 3 | AS 4 | -- Returns the CustomerID, first name, and last name for the specified customer. 5 | WITH CategoryCTE([ParentProductCategoryID], [ProductCategoryID], [Name]) AS 6 | ( 7 | SELECT [ParentProductCategoryID], [ProductCategoryID], [Name] 8 | FROM SalesLT.ProductCategory 9 | WHERE ParentProductCategoryID IS NULL 10 | 11 | UNION ALL 12 | 13 | SELECT C.[ParentProductCategoryID], C.[ProductCategoryID], C.[Name] 14 | FROM SalesLT.ProductCategory AS C 15 | INNER JOIN CategoryCTE AS BC ON BC.ProductCategoryID = C.ParentProductCategoryID 16 | ) 17 | 18 | SELECT PC.[Name] AS [ParentProductCategoryName], CCTE.[Name] as [ProductCategoryName], CCTE.[ProductCategoryID] 19 | FROM CategoryCTE AS CCTE 20 | JOIN SalesLT.ProductCategory AS PC 21 | ON PC.[ProductCategoryID] = CCTE.[ParentProductCategoryID] -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Views/vProductAndDescription.sql: -------------------------------------------------------------------------------- 1 | CREATE VIEW [SalesLT].[vProductAndDescription] 2 | WITH SCHEMABINDING 3 | AS 4 | -- View (indexed or standard) to display products and product descriptions by language. 
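-- Illustrative usage (editorial example, not part of the original view definition):
-- because of the unique clustered index created below, this is an indexed view and can
-- be queried like a table once deployed; 'en' is only a placeholder culture code, e.g.
--   SELECT [ProductID], [Name], [Description]
--   FROM [SalesLT].[vProductAndDescription]
--   WHERE [Culture] = 'en';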
5 | SELECT 6 | p.[ProductID] 7 | ,p.[Name] 8 | ,pm.[Name] AS [ProductModel] 9 | ,pmx.[Culture] 10 | ,pd.[Description] 11 | FROM [SalesLT].[Product] p 12 | INNER JOIN [SalesLT].[ProductModel] pm 13 | ON p.[ProductModelID] = pm.[ProductModelID] 14 | INNER JOIN [SalesLT].[ProductModelProductDescription] pmx 15 | ON pm.[ProductModelID] = pmx.[ProductModelID] 16 | INNER JOIN [SalesLT].[ProductDescription] pd 17 | ON pmx.[ProductDescriptionID] = pd.[ProductDescriptionID]; 18 | GO 19 | CREATE UNIQUE CLUSTERED INDEX [IX_vProductAndDescription] 20 | ON [SalesLT].[vProductAndDescription]([Culture] ASC, [ProductID] ASC); 21 | 22 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/SalesLT/Views/vProductModelCatalogDescription.sql: -------------------------------------------------------------------------------- 1 | CREATE VIEW [SalesLT].[vProductModelCatalogDescription] 2 | AS 3 | SELECT 4 | [ProductModelID] 5 | ,[Name] 6 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 7 | declare namespace html="http://www.w3.org/1999/xhtml"; 8 | (/p1:ProductDescription/p1:Summary/html:p)[1]', 'nvarchar(max)') AS [Summary] 9 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 10 | (/p1:ProductDescription/p1:Manufacturer/p1:Name)[1]', 'nvarchar(max)') AS [Manufacturer] 11 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 12 | (/p1:ProductDescription/p1:Manufacturer/p1:Copyright)[1]', 'nvarchar(30)') AS [Copyright] 13 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 14 | (/p1:ProductDescription/p1:Manufacturer/p1:ProductURL)[1]', 'nvarchar(256)') AS [ProductURL] 15 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 16 | declare namespace wm="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelWarrAndMain"; 17 | (/p1:ProductDescription/p1:Features/wm:Warranty/wm:WarrantyPeriod)[1]', 'nvarchar(256)') AS [WarrantyPeriod] 18 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 19 | declare namespace wm="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelWarrAndMain"; 20 | (/p1:ProductDescription/p1:Features/wm:Warranty/wm:Description)[1]', 'nvarchar(256)') AS [WarrantyDescription] 21 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 22 | declare namespace wm="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelWarrAndMain"; 23 | (/p1:ProductDescription/p1:Features/wm:Maintenance/wm:NoOfYears)[1]', 'nvarchar(256)') AS [NoOfYears] 24 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 25 | declare namespace wm="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelWarrAndMain"; 26 | (/p1:ProductDescription/p1:Features/wm:Maintenance/wm:Description)[1]', 'nvarchar(256)') AS [MaintenanceDescription] 27 | 
,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 28 | declare namespace wf="http://www.adventure-works.com/schemas/OtherFeatures"; 29 | (/p1:ProductDescription/p1:Features/wf:wheel)[1]', 'nvarchar(256)') AS [Wheel] 30 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 31 | declare namespace wf="http://www.adventure-works.com/schemas/OtherFeatures"; 32 | (/p1:ProductDescription/p1:Features/wf:saddle)[1]', 'nvarchar(256)') AS [Saddle] 33 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 34 | declare namespace wf="http://www.adventure-works.com/schemas/OtherFeatures"; 35 | (/p1:ProductDescription/p1:Features/wf:pedal)[1]', 'nvarchar(256)') AS [Pedal] 36 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 37 | declare namespace wf="http://www.adventure-works.com/schemas/OtherFeatures"; 38 | (/p1:ProductDescription/p1:Features/wf:BikeFrame)[1]', 'nvarchar(max)') AS [BikeFrame] 39 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 40 | declare namespace wf="http://www.adventure-works.com/schemas/OtherFeatures"; 41 | (/p1:ProductDescription/p1:Features/wf:crankset)[1]', 'nvarchar(256)') AS [Crankset] 42 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 43 | (/p1:ProductDescription/p1:Picture/p1:Angle)[1]', 'nvarchar(256)') AS [PictureAngle] 44 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 45 | (/p1:ProductDescription/p1:Picture/p1:Size)[1]', 'nvarchar(256)') AS [PictureSize] 46 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 47 | (/p1:ProductDescription/p1:Picture/p1:ProductPhotoID)[1]', 'nvarchar(256)') AS [ProductPhotoID] 48 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 49 | (/p1:ProductDescription/p1:Specifications/Material)[1]', 'nvarchar(256)') AS [Material] 50 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 51 | (/p1:ProductDescription/p1:Specifications/Color)[1]', 'nvarchar(256)') AS [Color] 52 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 53 | (/p1:ProductDescription/p1:Specifications/ProductLine)[1]', 'nvarchar(256)') AS [ProductLine] 54 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 55 | (/p1:ProductDescription/p1:Specifications/Style)[1]', 'nvarchar(256)') AS [Style] 56 | ,[CatalogDescription].value(N'declare namespace p1="http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelDescription"; 57 | (/p1:ProductDescription/p1:Specifications/RiderExperience)[1]', 'nvarchar(1024)') AS [RiderExperience] 58 | ,[rowguid] 59 | ,[ModifiedDate] 60 | FROM 
[SalesLT].[ProductModel] 61 | WHERE [CatalogDescription] IS NOT NULL; -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/Security/SalesLT.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA [SalesLT] 2 | AUTHORIZATION [dbo]; 3 | 4 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Functions/ufnGetAllCategories.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION [dbo].[ufnGetAllCategories]() 2 | RETURNS @retCategoryInformation TABLE 3 | ( 4 | -- Columns returned by the function 5 | [ParentProductCategoryName] nvarchar(50) NULL, 6 | [ProductCategoryName] nvarchar(50) NOT NULL, 7 | [ProductCategoryID] int NOT NULL 8 | ) 9 | AS 10 | -- Returns the CustomerID, first name, and last name for the specified customer. 11 | BEGIN 12 | WITH CategoryCTE([ParentProductCategoryID], [ProductCategoryID], [Name]) AS 13 | ( 14 | SELECT [ParentProductCategoryID], [ProductCategoryID], [Name] 15 | FROM SalesLT.ProductCategory 16 | WHERE ParentProductCategoryID IS NULL 17 | 18 | UNION ALL 19 | 20 | SELECT C.[ParentProductCategoryID], C.[ProductCategoryID], C.[Name] 21 | FROM SalesLT.ProductCategory AS C 22 | INNER JOIN CategoryCTE AS BC ON BC.ProductCategoryID = C.ParentProductCategoryID 23 | ) 24 | 25 | INSERT INTO @retCategoryInformation 26 | SELECT PC.[Name] AS [ParentProductCategoryName], CCTE.[Name] as [ProductCategoryName], CCTE.[ProductCategoryID] 27 | FROM CategoryCTE AS CCTE 28 | JOIN SalesLT.ProductCategory AS PC 29 | ON PC.[ProductCategoryID] = CCTE.[ParentProductCategoryID]; 30 | RETURN; 31 | END; -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Functions/ufnGetCustomerInformation.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION [dbo].[ufnGetCustomerInformation](@CustomerID int) 2 | RETURNS TABLE 3 | AS 4 | -- Returns the CustomerID, first name, and last name for the specified customer. 5 | RETURN ( 6 | SELECT 7 | CustomerID, 8 | FirstName, 9 | LastName 10 | FROM [SalesLT].[Customer] 11 | WHERE [CustomerID] = @CustomerID 12 | ); -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Functions/ufnGetSalesOrderStatusText.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION [dbo].[ufnGetSalesOrderStatusText](@Status tinyint) 2 | RETURNS nvarchar(15) 3 | AS 4 | -- Returns the sales order status text representation for the status value. 
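-- Illustrative usage (editorial example, not part of the original function body): as a
-- scalar UDF it is called with its two-part name, e.g.
--   SELECT [SalesOrderID],
--          [dbo].[ufnGetSalesOrderStatusText]([Status]) AS [StatusText]
--   FROM [SalesLT].[SalesOrderHeader];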
5 | BEGIN 6 | DECLARE @ret nvarchar(15); 7 | 8 | SET @ret = 9 | CASE @Status 10 | WHEN 1 THEN 'In process' 11 | WHEN 2 THEN 'Approved' 12 | WHEN 3 THEN 'Backordered' 13 | WHEN 4 THEN 'Rejected' 14 | WHEN 5 THEN 'Shipped' 15 | WHEN 6 THEN 'Cancelled' 16 | ELSE '** Invalid **' 17 | END; 18 | 19 | RETURN @ret 20 | END; -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Stored Procedures/uspLogError.sql: -------------------------------------------------------------------------------- 1 |  2 | -- uspLogError logs error information in the ErrorLog table about the 3 | -- error that caused execution to jump to the CATCH block of a 4 | -- TRY...CATCH construct. This should be executed from within the scope 5 | -- of a CATCH block otherwise it will return without inserting error 6 | -- information. 7 | CREATE PROCEDURE [dbo].[uspLogError] 8 | @ErrorLogID int = 0 OUTPUT -- contains the ErrorLogID of the row inserted 9 | AS -- by uspLogError in the ErrorLog table 10 | BEGIN 11 | SET NOCOUNT ON; 12 | 13 | -- Output parameter value of 0 indicates that error 14 | -- information was not logged 15 | SET @ErrorLogID = 0; 16 | 17 | BEGIN TRY 18 | -- Return if there is no error information to log 19 | IF ERROR_NUMBER() IS NULL 20 | RETURN; 21 | 22 | -- Return if inside an uncommittable transaction. 23 | -- Data insertion/modification is not allowed when 24 | -- a transaction is in an uncommittable state. 25 | IF XACT_STATE() = -1 26 | BEGIN 27 | PRINT 'Cannot log error since the current transaction is in an uncommittable state. ' 28 | + 'Rollback the transaction before executing uspLogError in order to successfully log error information.'; 29 | RETURN; 30 | END 31 | 32 | INSERT [dbo].[ErrorLog] 33 | ( 34 | [UserName], 35 | [ErrorNumber], 36 | [ErrorSeverity], 37 | [ErrorState], 38 | [ErrorProcedure], 39 | [ErrorLine], 40 | [ErrorMessage] 41 | ) 42 | VALUES 43 | ( 44 | CONVERT(sysname, CURRENT_USER), 45 | ERROR_NUMBER(), 46 | ERROR_SEVERITY(), 47 | ERROR_STATE(), 48 | ERROR_PROCEDURE(), 49 | ERROR_LINE(), 50 | ERROR_MESSAGE() 51 | ); 52 | 53 | -- Pass back the ErrorLogID of the row inserted 54 | SET @ErrorLogID = @@IDENTITY; 55 | END TRY 56 | BEGIN CATCH 57 | PRINT 'An error occurred in stored procedure uspLogError: '; 58 | EXECUTE [dbo].[uspPrintError]; 59 | RETURN -1; 60 | END CATCH 61 | END; -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Stored Procedures/uspPrintError.sql: -------------------------------------------------------------------------------- 1 |  2 | -- uspPrintError prints error information about the error that caused 3 | -- execution to jump to the CATCH block of a TRY...CATCH construct. 4 | -- Should be executed from within the scope of a CATCH block otherwise 5 | -- it will return without printing any error information. 6 | CREATE PROCEDURE [dbo].[uspPrintError] 7 | AS 8 | BEGIN 9 | SET NOCOUNT ON; 10 | 11 | -- Print error information. 
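-- Illustrative calling pattern for uspPrintError and uspLogError (editorial sketch, not
-- part of the original procedure): both are intended to run inside a CATCH block, e.g.
--   BEGIN TRY
--       -- data modification statements
--   END TRY
--   BEGIN CATCH
--       EXECUTE [dbo].[uspPrintError];
--       EXECUTE [dbo].[uspLogError];
--   END CATCH;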
12 | PRINT 'Error ' + CONVERT(varchar(50), ERROR_NUMBER()) + 13 | ', Severity ' + CONVERT(varchar(5), ERROR_SEVERITY()) + 14 | ', State ' + CONVERT(varchar(5), ERROR_STATE()) + 15 | ', Procedure ' + ISNULL(ERROR_PROCEDURE(), '-') + 16 | ', Line ' + CONVERT(varchar(5), ERROR_LINE()); 17 | PRINT ERROR_MESSAGE(); 18 | END; -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Tables/BuildVersion.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [dbo].[BuildVersion] ( 2 | [SystemInformationID] TINYINT IDENTITY (1, 1) NOT NULL, 3 | [Database Version] NVARCHAR (25) NOT NULL, 4 | [VersionDate] DATETIME NOT NULL, 5 | [ModifiedDate] DATETIME CONSTRAINT [DF_BuildVersion_ModifiedDate] DEFAULT (getdate()) NOT NULL, 6 | PRIMARY KEY CLUSTERED ([SystemInformationID] ASC) 7 | ); 8 | 9 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/Tables/ErrorLog.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [dbo].[ErrorLog] ( 2 | [ErrorLogID] INT IDENTITY (1, 1) NOT NULL, 3 | [ErrorTime] DATETIME CONSTRAINT [DF_ErrorLog_ErrorTime] DEFAULT (getdate()) NOT NULL, 4 | [UserName] [sysname] NOT NULL, 5 | [ErrorNumber] INT NOT NULL, 6 | [ErrorSeverity] INT NULL, 7 | [ErrorState] INT NULL, 8 | [ErrorProcedure] NVARCHAR (126) NULL, 9 | [ErrorLine] INT NULL, 10 | [ErrorMessage] NVARCHAR (4000) NOT NULL, 11 | CONSTRAINT [PK_ErrorLog_ErrorLogID] PRIMARY KEY CLUSTERED ([ErrorLogID] ASC) 12 | ); 13 | 14 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/User Defined Types/AccountNumber.sql: -------------------------------------------------------------------------------- 1 | CREATE TYPE [dbo].[AccountNumber] 2 | FROM NVARCHAR (15) NULL; 3 | 4 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/User Defined Types/Flag.sql: -------------------------------------------------------------------------------- 1 | CREATE TYPE [dbo].[Flag] 2 | FROM BIT NOT NULL; 3 | 4 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/User Defined Types/Name.sql: -------------------------------------------------------------------------------- 1 | CREATE TYPE [dbo].[Name] 2 | FROM NVARCHAR (50) NULL; 3 | 4 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/User Defined Types/NameStyle.sql: -------------------------------------------------------------------------------- 1 | CREATE TYPE [dbo].[NameStyle] 2 | FROM BIT NOT NULL; 3 | 4 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/User Defined Types/OrderNumber.sql: -------------------------------------------------------------------------------- 1 | CREATE TYPE [dbo].[OrderNumber] 2 | FROM NVARCHAR (25) NULL; 3 | 4 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/dbo/User Defined Types/Phone.sql: 
-------------------------------------------------------------------------------- 1 | CREATE TYPE [dbo].[Phone] 2 | FROM NVARCHAR (25) NULL; 3 | 4 | -------------------------------------------------------------------------------- /samples/azuresql/ddo_samples_azuresql/ddo_samples_azuresql/ddo_samples_azuresql.sqlproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Debug 5 | AnyCPU 6 | ddo_samples_azuresql 7 | 2.0 8 | 4.1 9 | {387ebfc5-acb1-4445-a25f-d70d18d34c30} 10 | Microsoft.Data.Tools.Schema.Sql.SqlAzureV12DatabaseSchemaProvider 11 | Database 12 | 13 | 14 | ddo_samples_azuresql 15 | ddo_samples_azuresql 16 | 1033, CI 17 | BySchemaAndSchemaType 18 | True 19 | v4.5 20 | CS 21 | Properties 22 | False 23 | True 24 | True 25 | 26 | 27 | bin\Release\ 28 | $(MSBuildProjectName).sql 29 | False 30 | pdbonly 31 | true 32 | false 33 | true 34 | prompt 35 | 4 36 | 37 | 38 | bin\Debug\ 39 | $(MSBuildProjectName).sql 40 | false 41 | true 42 | full 43 | false 44 | true 45 | true 46 | prompt 47 | 4 48 | 49 | 50 | 11.0 51 | 52 | True 53 | 11.0 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /samples/databricks/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/datadevops/e6c564a674a8264eed94fa6a8a8056e3b450525c/samples/databricks/README.md -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.29326.143 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{00D1A9C2-B5F0-4AF3-8072-F6C62B433612}") = "ddo_azuresqldw_dw", "ddo_azuresqldw_dw\ddo_azuresqldw_dw.sqlproj", "{AA416CF5-F184-4573-B591-7ED42A294421}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Release|Any CPU = Release|Any CPU 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {AA416CF5-F184-4573-B591-7ED42A294421}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 15 | {AA416CF5-F184-4573-B591-7ED42A294421}.Debug|Any CPU.Build.0 = Debug|Any CPU 16 | {AA416CF5-F184-4573-B591-7ED42A294421}.Debug|Any CPU.Deploy.0 = Debug|Any CPU 17 | {AA416CF5-F184-4573-B591-7ED42A294421}.Release|Any CPU.ActiveCfg = Release|Any CPU 18 | {AA416CF5-F184-4573-B591-7ED42A294421}.Release|Any CPU.Build.0 = Release|Any CPU 19 | {AA416CF5-F184-4573-B591-7ED42A294421}.Release|Any CPU.Deploy.0 = Release|Any CPU 20 | EndGlobalSection 21 | GlobalSection(SolutionProperties) = preSolution 22 | HideSolutionNode = FALSE 23 | EndGlobalSection 24 | GlobalSection(ExtensibilityGlobals) = postSolution 25 | SolutionGuid = {1B991B8B-9C61-481C-8B3C-CC2958974BA2} 26 | EndGlobalSection 27 | EndGlobal 28 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/External Resources/AzureDataLakeStorage.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL DATA SOURCE 
[AzureDataLakeStorage] 2 | WITH ( 3 | TYPE = HADOOP, 4 | LOCATION = N'$(ADLSLocation)', 5 | CREDENTIAL = [ADLSCredentialKey] 6 | ); 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/External Resources/ParquetFileFormat.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL FILE FORMAT [ParquetFileFormat] 2 | WITH ( 3 | FORMAT_TYPE = PARQUET 4 | ); 5 | 6 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/Script.PostDeployment1.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Post-Deployment Script Template 3 | -------------------------------------------------------------------------------------- 4 | This file contains SQL statements that will be appended to the build script. 5 | Use SQLCMD syntax to include a file in the post-deployment script. 6 | Example: :r .\myfile.sql 7 | Use SQLCMD syntax to reference a variable in the post-deployment script. 8 | Example: :setvar TableName MyTable 9 | SELECT * FROM [$(TableName)] 10 | -------------------------------------------------------------------------------------- 11 | */ 12 | 13 | :setvar ADLSLocation ADLSLocation 14 | :setvar ADLSCredentialKey ADLSCredentialKey 15 | 16 | ALTER EXTERNAL DATA SOURCE [AzureDataLakeStorage] SET LOCATION = '$(ADLSLocation)'; 17 | GO 18 | 19 | ALTER DATABASE SCOPED CREDENTIAL [ADLSCredentialKey] WITH IDENTITY = N'user', SECRET = '$(ADLSCredentialKey)'; 20 | GO 21 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/Security/ADLSCredentialKey.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE SCOPED CREDENTIAL [ADLSCredentialKey] WITH IDENTITY = N'user', SECRET = '$(ADLSCredentialKey)'; 2 | 3 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/Security/MasterKeys.sql: -------------------------------------------------------------------------------- 1 | CREATE MASTER KEY; 2 | 3 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/Security/ext.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA [ext] 2 | AUTHORIZATION [dbo]; 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/dbo/Stored Procedures/load_dw.sql: -------------------------------------------------------------------------------- 1 | CREATE PROC [dbo].[load_dw] @load_id [VARCHAR](50) AS 2 | BEGIN 3 | -- SET NOCOUNT ON added to prevent extra result sets from 4 | -- interfering with SELECT statements. 
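-- Illustrative invocation (editorial example, not part of the original procedure): once
-- the ext.* external tables point at freshly written Parquet output for a given load,
-- the proc is presumably executed with that load's identifier, e.g.
--   EXEC [dbo].[load_dw] @load_id = '<load_id>';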
5 | SET NOCOUNT ON 6 | 7 | -- DIM TABLES 8 | 9 | TRUNCATE TABLE dbo.[dim_parking_bay]; 10 | INSERT INTO dbo.[dim_parking_bay] 11 | SELECT 12 | CAST([dim_parking_bay_id] AS UNIQUEIDENTIFIER), 13 | [bay_id], 14 | [marker_id], 15 | [meter_id], 16 | [rd_seg_id], 17 | [rd_seg_dsc], 18 | [load_id], 19 | [loaded_on] 20 | FROM ext.[dim_parking_bay]; 21 | 22 | -- 23 | TRUNCATE TABLE dbo.[dim_location]; 24 | INSERT INTO dbo.[dim_location] 25 | SELECT 26 | CAST([dim_location_id] AS UNIQUEIDENTIFIER), 27 | [lat], 28 | [lon], 29 | [load_id], 30 | [loaded_on] 31 | FROM ext.[dim_location]; 32 | 33 | -- 34 | TRUNCATE TABLE dbo.[dim_st_marker]; 35 | INSERT INTO dbo.[dim_st_marker] 36 | SELECT 37 | CAST([dim_st_marker_id] AS UNIQUEIDENTIFIER), 38 | [st_marker_id], 39 | [load_id], 40 | [loaded_on] 41 | FROM ext.[dim_st_marker]; 42 | 43 | 44 | -- FACT TABLES 45 | DELETE FROM dbo.[fact_parking] WHERE load_id=@load_id; 46 | INSERT INTO dbo.[fact_parking] 47 | SELECT 48 | [dim_date_id], 49 | [dim_time_id], 50 | CAST([dim_parking_bay_id] AS UNIQUEIDENTIFIER), 51 | CAST([dim_location_id] AS UNIQUEIDENTIFIER), 52 | CAST([dim_st_marker_id] AS UNIQUEIDENTIFIER), 53 | [status], 54 | [load_id], 55 | [loaded_on] 56 | FROM ext.[fact_parking] 57 | WHERE load_id=@load_id; 58 | END -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/dbo/Tables/dim_location.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [dbo].[dim_location] ( 2 | [dim_location_id] UNIQUEIDENTIFIER NOT NULL, 3 | [lat] REAL NULL, 4 | [lon] REAL NULL, 5 | [load_id] NVARCHAR (50) NULL, 6 | [loaded_on] DATETIME NULL 7 | ) 8 | WITH (CLUSTERED COLUMNSTORE INDEX, DISTRIBUTION = REPLICATE); 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/dbo/Tables/dim_parking_bay.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [dbo].[dim_parking_bay] ( 2 | [dim_parking_bay_id] UNIQUEIDENTIFIER NOT NULL, 3 | [bay_id] INT NULL, 4 | [marker_id] NVARCHAR (50) NULL, 5 | [meter_id] NVARCHAR (50) NULL, 6 | [rd_seg_id] NVARCHAR (50) NULL, 7 | [rd_seg_dsc] NVARCHAR (500) NULL, 8 | [load_id] NVARCHAR (50) NULL, 9 | [loaded_on] DATETIME NULL 10 | ) 11 | WITH (CLUSTERED COLUMNSTORE INDEX, DISTRIBUTION = REPLICATE); 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/dbo/Tables/dim_st_marker.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [dbo].[dim_st_marker] ( 2 | [dim_st_marker_id] UNIQUEIDENTIFIER NULL, 3 | [st_marker_id] NVARCHAR (50) NULL, 4 | [load_id] NVARCHAR (50) NULL, 5 | [loaded_on] DATETIME NULL 6 | ) 7 | WITH (CLUSTERED COLUMNSTORE INDEX, DISTRIBUTION = REPLICATE); 8 | 9 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/dbo/Tables/fact_parking.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE [dbo].[fact_parking] ( 2 | [dim_date_id] NVARCHAR (50) NULL, 3 | [dim_time_id] NVARCHAR (50) NULL, 4 | [dim_parking_bay_id] UNIQUEIDENTIFIER NULL, 5 | [dim_location_id] UNIQUEIDENTIFIER NULL, 6 | [dim_st_marker_id] UNIQUEIDENTIFIER NULL, 7 | [status] NVARCHAR (50) NULL, 8 | [load_id] NVARCHAR (50) NULL, 9 | [loaded_on] DATETIME NULL 10 | ) 11 | WITH 
(CLUSTERED COLUMNSTORE INDEX, DISTRIBUTION = HASH([dim_parking_bay_id])); 12 | 13 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/ddo_azuresqldw_dw.sqlproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Debug 5 | AnyCPU 6 | ddo_azuresqldw_dw 7 | 2.0 8 | 4.1 9 | {aa416cf5-f184-4573-b591-7ed42a294421} 10 | Microsoft.Data.Tools.Schema.Sql.SqlDwDatabaseSchemaProvider 11 | Database 12 | 13 | 14 | ddo_azuresqldw_dw 15 | ddo_azuresqldw_dw 16 | 1033, CI 17 | BySchemaAndSchemaType 18 | True 19 | v4.5 20 | CS 21 | Properties 22 | False 23 | True 24 | True 25 | 26 | 27 | bin\Release\ 28 | $(MSBuildProjectName).sql 29 | False 30 | pdbonly 31 | true 32 | false 33 | true 34 | prompt 35 | 4 36 | 37 | 38 | bin\Debug\ 39 | $(MSBuildProjectName).sql 40 | false 41 | true 42 | full 43 | false 44 | true 45 | true 46 | prompt 47 | 4 48 | 49 | 50 | 11.0 51 | 52 | True 53 | 11.0 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | Off 72 | 73 | 74 | 75 | 76 | 77 | 78 | Off 79 | 80 | 81 | Off 82 | 83 | 84 | 85 | Off 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | $(SqlCmdVar__2) 96 | 97 | 98 | 99 | 100 | $(SqlCmdVar__1) 101 | 102 | 103 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/ext/External Tables/dim_location.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE [ext].[dim_location] ( 2 | [dim_location_id] NVARCHAR (50) NULL, 3 | [lat] REAL NULL, 4 | [lon] REAL NULL, 5 | [load_id] NVARCHAR (50) NULL, 6 | [loaded_on] DATETIME NULL 7 | ) 8 | WITH ( 9 | DATA_SOURCE = [AzureDataLakeStorage], 10 | LOCATION = N'data/dw/dim_location/', 11 | FILE_FORMAT = [ParquetFileFormat], 12 | REJECT_TYPE = VALUE, 13 | REJECT_VALUE = 0 14 | ); 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/ext/External Tables/dim_parking_bay.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE [ext].[dim_parking_bay] ( 2 | [dim_parking_bay_id] NVARCHAR (50) NULL, 3 | [bay_id] INT NULL, 4 | [marker_id] NVARCHAR (50) NULL, 5 | [meter_id] NVARCHAR (50) NULL, 6 | [rd_seg_dsc] NVARCHAR (MAX) NULL, 7 | [rd_seg_id] NVARCHAR (50) NULL, 8 | [load_id] NVARCHAR (50) NULL, 9 | [loaded_on] DATETIME NULL 10 | ) 11 | WITH ( 12 | DATA_SOURCE = [AzureDataLakeStorage], 13 | LOCATION = N'data/dw/dim_parking_bay/', 14 | FILE_FORMAT = [ParquetFileFormat], 15 | REJECT_TYPE = VALUE, 16 | REJECT_VALUE = 0 17 | ); 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/ext/External Tables/dim_st_marker.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE [ext].[dim_st_marker] ( 2 | [dim_st_marker_id] NVARCHAR (50) NULL, 3 | [st_marker_id] NVARCHAR (50) NULL, 4 | [load_id] NVARCHAR (50) NULL, 5 | [loaded_on] DATETIME NULL 6 | ) 7 | WITH ( 8 | DATA_SOURCE = [AzureDataLakeStorage], 9 | LOCATION = N'data/dw/dim_st_marker/', 10 | FILE_FORMAT = [ParquetFileFormat], 11 | REJECT_TYPE = VALUE, 12 | REJECT_VALUE = 0 13 | ); 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /sql/ddo_azuresqldw_dw/ddo_azuresqldw_dw/ext/External 
Tables/fact_parking.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE [ext].[fact_parking] ( 2 | [dim_date_id] NVARCHAR (50) NULL, 3 | [dim_time_id] NVARCHAR (50) NULL, 4 | [dim_parking_bay_id] NVARCHAR (50) NULL, 5 | [dim_location_id] NVARCHAR (50) NULL, 6 | [dim_st_marker_id] NVARCHAR (50) NULL, 7 | [status] NVARCHAR (50) NULL, 8 | [load_id] NVARCHAR (50) NULL, 9 | [loaded_on] DATETIME NULL 10 | ) 11 | WITH ( 12 | DATA_SOURCE = [AzureDataLakeStorage], 13 | LOCATION = N'data/dw/fact_parking/', 14 | FILE_FORMAT = [ParquetFileFormat], 15 | REJECT_TYPE = VALUE, 16 | REJECT_VALUE = 0 17 | ); 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /src/ddo_transform/.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /src/ddo_transform/AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Lace Lofranco 9 | 10 | Contributors 11 | ------------ 12 | 13 | None yet. Why not be the first? 14 | -------------------------------------------------------------------------------- /src/ddo_transform/CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Contributing 5 | ============ 6 | 7 | Contributions are welcome, and they are greatly appreciated! Every little bit 8 | helps, and credit will always be given. 9 | 10 | You can contribute in many ways: 11 | 12 | Types of Contributions 13 | ---------------------- 14 | 15 | Report Bugs 16 | ~~~~~~~~~~~ 17 | 18 | Report bugs at https://github.com/devlace/ddo_transform/issues. 19 | 20 | If you are reporting a bug, please include: 21 | 22 | * Your operating system name and version. 23 | * Any details about your local setup that might be helpful in troubleshooting. 24 | * Detailed steps to reproduce the bug. 25 | 26 | Fix Bugs 27 | ~~~~~~~~ 28 | 29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 30 | wanted" is open to whoever wants to implement it. 31 | 32 | Implement Features 33 | ~~~~~~~~~~~~~~~~~~ 34 | 35 | Look through the GitHub issues for features. Anything tagged with "enhancement" 36 | and "help wanted" is open to whoever wants to implement it. 37 | 38 | Write Documentation 39 | ~~~~~~~~~~~~~~~~~~~ 40 | 41 | ddo_transform could always use more documentation, whether as part of the 42 | official ddo_transform docs, in docstrings, or even on the web in blog posts, 43 | articles, and such. 44 | 45 | Submit Feedback 46 | ~~~~~~~~~~~~~~~ 47 | 48 | The best way to send feedback is to file an issue at https://github.com/devlace/ddo_transform/issues. 49 | 50 | If you are proposing a feature: 51 | 52 | * Explain in detail how it would work. 53 | * Keep the scope as narrow as possible, to make it easier to implement. 
54 | * Remember that this is a volunteer-driven project, and that contributions 55 | are welcome :) 56 | 57 | Get Started! 58 | ------------ 59 | 60 | Ready to contribute? Here's how to set up `ddo_transform` for local development. 61 | 62 | 1. Fork the `ddo_transform` repo on GitHub. 63 | 2. Clone your fork locally:: 64 | 65 | $ git clone git@github.com:your_name_here/ddo_transform.git 66 | 67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 68 | 69 | $ mkvirtualenv ddo_transform 70 | $ cd ddo_transform/ 71 | $ python setup.py develop 72 | 73 | 4. Create a branch for local development:: 74 | 75 | $ git checkout -b name-of-your-bugfix-or-feature 76 | 77 | Now you can make your changes locally. 78 | 79 | 5. When you're done making changes, check that your changes pass flake8 and the 80 | tests, including testing other Python versions with tox:: 81 | 82 | $ flake8 ddo_transform tests 83 | $ python setup.py test or py.test 84 | $ tox 85 | 86 | To get flake8 and tox, just pip install them into your virtualenv. 87 | 88 | 6. Commit your changes and push your branch to GitHub:: 89 | 90 | $ git add . 91 | $ git commit -m "Your detailed description of your changes." 92 | $ git push origin name-of-your-bugfix-or-feature 93 | 94 | 7. Submit a pull request through the GitHub website. 95 | 96 | Pull Request Guidelines 97 | ----------------------- 98 | 99 | Before you submit a pull request, check that it meets these guidelines: 100 | 101 | 1. The pull request should include tests. 102 | 2. If the pull request adds functionality, the docs should be updated. Put 103 | your new functionality into a function with a docstring, and add the 104 | feature to the list in README.rst. 105 | 3. The pull request should work for Python 2.7, 3.4, 3.5 and 3.6, and for PyPy. Check 106 | https://travis-ci.org/devlace/ddo_transform/pull_requests 107 | and make sure that the tests pass for all supported Python versions. 108 | 109 | Tips 110 | ---- 111 | 112 | To run a subset of tests:: 113 | 114 | $ py.test tests.test_ddo_transform 115 | 116 | 117 | Deploying 118 | --------- 119 | 120 | A reminder for the maintainers on how to deploy. 121 | Make sure all your changes are committed (including an entry in HISTORY.rst). 122 | Then run:: 123 | 124 | $ bumpversion patch # possible: major / minor / patch 125 | $ git push 126 | $ git push --tags 127 | 128 | Travis will then deploy to PyPI if tests pass. 129 | -------------------------------------------------------------------------------- /src/ddo_transform/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.3 2 | 3 | # Install OpenJDK 8 and Python 4 | RUN \ 5 | apt-get update && \ 6 | apt-get install -y openjdk-8-jdk && \ 7 | rm -rf /var/lib/apt/lists/* 8 | 9 | WORKDIR /usr/ddo_transform 10 | 11 | COPY . . 12 | 13 | RUN pip install --no-cache-dir -r requirements_dev.txt && \ 14 | make clean && \ 15 | make lint && \ 16 | make test && \ 17 | make docs && \ 18 | make dist 19 | 20 | -------------------------------------------------------------------------------- /src/ddo_transform/HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | 0.1.0 (2019-01-29) 6 | ------------------ 7 | 8 | * First release on PyPI. 
9 | -------------------------------------------------------------------------------- /src/ddo_transform/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.rst 6 | 7 | include data/On-street_Parking_Bay_Sensors.csv 8 | recursive-include tests * 9 | recursive-exclude * __pycache__ 10 | recursive-exclude * *.py[co] 11 | 12 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 13 | -------------------------------------------------------------------------------- /src/ddo_transform/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean clean-test clean-pyc clean-build docs help 2 | .DEFAULT_GOAL := help 3 | 4 | define BROWSER_PYSCRIPT 5 | import os, webbrowser, sys 6 | 7 | try: 8 | from urllib import pathname2url 9 | except: 10 | from urllib.request import pathname2url 11 | 12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 13 | endef 14 | export BROWSER_PYSCRIPT 15 | 16 | define PRINT_HELP_PYSCRIPT 17 | import re, sys 18 | 19 | for line in sys.stdin: 20 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 21 | if match: 22 | target, help = match.groups() 23 | print("%-20s %s" % (target, help)) 24 | endef 25 | export PRINT_HELP_PYSCRIPT 26 | 27 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 28 | 29 | help: 30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 31 | 32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 33 | 34 | clean-build: ## remove build artifacts 35 | rm -fr build/ 36 | rm -fr dist/ 37 | rm -fr .eggs/ 38 | find . -name '*.egg-info' -exec rm -fr {} + 39 | find . -name '*.egg' -exec rm -f {} + 40 | 41 | clean-pyc: ## remove Python file artifacts 42 | find . -name '*.pyc' -exec rm -f {} + 43 | find . -name '*.pyo' -exec rm -f {} + 44 | find . -name '*~' -exec rm -f {} + 45 | find . -name '__pycache__' -exec rm -fr {} + 46 | 47 | clean-test: ## remove test and coverage artifacts 48 | rm -fr .tox/ 49 | rm -f .coverage 50 | rm -fr htmlcov/ 51 | rm -fr .pytest_cache 52 | 53 | lint: ## check style with flake8 54 | flake8 ddo_transform tests 55 | 56 | test: ## run tests quickly with the default Python 57 | PYTHONPATH=`pwd` py.test 58 | 59 | test-all: ## run tests on every Python version with tox 60 | tox 61 | 62 | coverage: ## check code coverage quickly with the default Python 63 | coverage run --source ddo_transform -m pytest 64 | coverage report -m 65 | coverage html 66 | $(BROWSER) htmlcov/index.html 67 | 68 | docs: ## generate Sphinx HTML documentation, including API docs 69 | rm -f docs/ddo_transform.rst 70 | rm -f docs/modules.rst 71 | sphinx-apidoc -o docs/ ddo_transform 72 | $(MAKE) -C docs clean 73 | $(MAKE) -C docs html 74 | $(BROWSER) docs/_build/html/index.html 75 | 76 | servedocs: docs ## compile the docs watching for changes 77 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 
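# Illustrative invocations of the packaging/Databricks targets below (editorial sketch;
# the variable values are placeholders, not taken from this repo):
#   make dist package_version=1.0.0
#   make installdatabricks package_version=1.0.0 DATABRICKS_CLUSTER_ID=<cluster-id> DATABRICKS_DBFS_UPLOAD_PATH=dbfs:/ddo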
78 | 79 | release: dist ## package and upload a release 80 | twine upload dist/* 81 | 82 | dist: clean ## builds source and wheel package 83 | sed -i "s/{{version}}/$(package_version)/g" ddo_transform/__init__.py 84 | python setup.py sdist 85 | python setup.py bdist_wheel 86 | ls -l dist 87 | 88 | install: clean ## install the package to the active Python's site-packages 89 | python setup.py install 90 | 91 | installedit: clean ## install the package while dynamically picking up changes to source files 92 | pip install --editable . 93 | 94 | uploaddatabricks: dist 95 | package_name="$$(find dist/*.whl -printf "%f\n")"; \ 96 | databricks fs cp --overwrite dist/"$$package_name" "$(DATABRICKS_DBFS_UPLOAD_PATH)/libs/$$package_name";\ 97 | 98 | installdatabricks: dist uploaddatabricks ## install the package in databricks 99 | package_name="$$(find dist/*.whl -printf "%f\n")"; \ 100 | databricks libraries install --cluster-id $(DATABRICKS_CLUSTER_ID) --whl "$(DATABRICKS_DBFS_UPLOAD_PATH)/libs/$$package_name" 101 | databricks clusters restart --cluster-id $(DATABRICKS_CLUSTER_ID) -------------------------------------------------------------------------------- /src/ddo_transform/README.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | ddo_transform 3 | ============= 4 | 5 | 6 | .. image:: https://dev.azure.com/msdevlace/DataDevOps/_apis/build/status/DDO-Python-CI-Artifacts 7 | :target: https://dev.azure.com/msdevlace/DataDevOps/_build/latest?definitionId=23 8 | :alt: Build Status 9 | 10 | 11 | This package contains all business/data transformation logic for ETL pipeline. 12 | 13 | * Free software: MIT license 14 | * Documentation: https://ddo-transform.readthedocs.io. 15 | 16 | 17 | Credits 18 | ------- 19 | 20 | This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypackage`_ project template. 21 | 22 | .. _Cookiecutter: https://github.com/audreyr/cookiecutter 23 | .. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage 24 | -------------------------------------------------------------------------------- /src/ddo_transform/azure-pipelines-ci-artifacts.yml: -------------------------------------------------------------------------------- 1 | # Starter pipeline 2 | # Start with a minimal pipeline that you can customize to build and deploy your code. 
3 | # Add steps that build, run tests, deploy, and more: 4 | # https://aka.ms/yaml 5 | 6 | trigger: 7 | branches: 8 | include: 9 | - master 10 | 11 | variables: 12 | WORKING_DIR: 'src/ddo_transform' 13 | PACKAGE_MAJOR_VERSION: 1 14 | PACKAGE_MINOR_VERSION: 1 15 | PACKAGE_PATCH_VERSION: $(Build.BuildId) 16 | SQL_DW_PATH: 'sql/ddo_azuresqldw_dw' 17 | SQL_DW_SOLUTION_NAME: 'ddo_azuresqldw_dw' 18 | SQL_DW_SOLUTION: '$(SQL_DW_PATH)/$(SQL_DW_SOLUTION_NAME).sln' 19 | BUILD_PLATFORM: 'Any CPU' 20 | BUILD_CONFIGURATION: 'Release' 21 | 22 | stages: 23 | - stage: 'validate_pr' 24 | displayName: 'Validate PR' 25 | condition: and(always(), eq(variables['Build.Reason'], 'PullRequest')) 26 | jobs: 27 | - job: 'validate_python_packages' 28 | displayName: 'Validate Python Packages' 29 | pool: 30 | vmImage: 'Ubuntu-16.04' 31 | steps: 32 | - task: UsePythonVersion@0 33 | inputs: 34 | versionSpec: '3.6' 35 | architecture: 'x64' 36 | 37 | - script: pip install -r requirements_dev.txt && pip install -r requirements.txt 38 | workingDirectory: $(WORKING_DIR) 39 | displayName: 'Install requirements' 40 | 41 | - script: make lint 42 | workingDirectory: $(WORKING_DIR) 43 | displayName: 'Run lint' 44 | 45 | - script: make test 46 | workingDirectory: $(WORKING_DIR) 47 | displayName: 'Run tests' 48 | 49 | - job: 'validate_sql_packages' 50 | displayName: 'Validate SQL Packages' 51 | pool: 52 | vmImage: 'windows-latest' 53 | steps: 54 | - task: NuGetToolInstaller@1 55 | 56 | - task: NuGetCommand@2 57 | inputs: 58 | restoreSolution: '$(SQL_DW_SOLUTION)' 59 | 60 | - task: VSBuild@1 61 | inputs: 62 | solution: '$(SQL_DW_SOLUTION)' 63 | platform: '$(BUILD_PLATFORM)' 64 | configuration: '$(BUILD_CONFIGURATION)' 65 | - task: VSTest@2 66 | inputs: 67 | platform: '$(BUILD_PLATFORM)' 68 | configuration: '$(BUILD_CONFIGURATION)' 69 | 70 | - stage: 'publish_artifacts' 71 | displayName: 'Publish Build Artifacts' 72 | condition: and(always(), contains(variables['Build.SourceBranch'], 'refs/heads/master')) 73 | jobs: 74 | - job: 'publish_python_packages' 75 | displayName: 'Publish Python Packages' 76 | pool: 77 | vmImage: 'Ubuntu-16.04' 78 | steps: 79 | - task: UsePythonVersion@0 80 | inputs: 81 | versionSpec: '3.6' 82 | architecture: 'x64' 83 | 84 | - script: pip install -r requirements_dev.txt && pip install -r requirements.txt 85 | workingDirectory: $(WORKING_DIR) 86 | displayName: 'Install requirements' 87 | 88 | - script: make dist 89 | env: 90 | package_version: $(PACKAGE_MAJOR_VERSION).$(PACKAGE_MINOR_VERSION).$(PACKAGE_PATCH_VERSION) 91 | workingDirectory: $(WORKING_DIR) 92 | displayName: 'Create wheel package' 93 | 94 | - task: PublishBuildArtifacts@1 95 | inputs: 96 | PathtoPublish: '$(WORKING_DIR)/dist' 97 | ArtifactName: 'dist' 98 | displayName: 'Publish Dist Artifacts' 99 | 100 | - job: 'publish_static_artifacts' 101 | displayName: 'Publish Static Artifacts' 102 | pool: 103 | vmImage: 'Ubuntu-16.04' 104 | steps: 105 | - task: PublishBuildArtifacts@1 106 | inputs: 107 | PathtoPublish: 'databricks' 108 | ArtifactName: 'databricks' 109 | displayName: 'Publish Databricks Artifacts' 110 | 111 | - task: PublishBuildArtifacts@1 112 | inputs: 113 | PathtoPublish: 'adf/_scripts/deploymentadf.ps1' 114 | ArtifactName: 'adf_scripts' 115 | displayName: 'Publish ADF Scripts' 116 | 117 | - job: 'publish_sql_packages' 118 | displayName: 'Publish SQL Packages' 119 | pool: 120 | vmImage: 'windows-latest' 121 | steps: 122 | - task: NuGetToolInstaller@1 123 | 124 | - task: NuGetCommand@2 125 | inputs: 126 | restoreSolution: 
'$(SQL_DW_SOLUTION)' 127 | 128 | - task: VSBuild@1 129 | inputs: 130 | solution: '$(SQL_DW_SOLUTION)' 131 | platform: '$(BUILD_PLATFORM)' 132 | configuration: '$(BUILD_CONFIGURATION)' 133 | - task: VSTest@2 134 | inputs: 135 | platform: '$(BUILD_PLATFORM)' 136 | configuration: '$(BUILD_CONFIGURATION)' 137 | 138 | - task: PublishBuildArtifacts@1 139 | inputs: 140 | PathtoPublish: '$(SQL_DW_PATH)/$(SQL_DW_SOLUTION_NAME)/bin/$(BUILD_CONFIGURATION)/ddo_azuresqldw_dw.dacpac' 141 | ArtifactName: 'sql_dw_dacpac' 142 | displayName: 'Publish SQL DACPAC' -------------------------------------------------------------------------------- /src/ddo_transform/azure-pipelines-ci-qa.yml: -------------------------------------------------------------------------------- 1 | # Starter pipeline 2 | # Start with a minimal pipeline that you can customize to build and deploy your code. 3 | # Add steps that build, run tests, deploy, and more: 4 | # https://aka.ms/yaml 5 | 6 | pr: 7 | branches: 8 | include: 9 | - master 10 | - releases/* 11 | paths: 12 | include: 13 | - src/ddo_transform/* 14 | 15 | variables: 16 | WORKING_DIR: 'src/ddo_transform' 17 | 18 | pool: 19 | vmImage: 'Ubuntu-16.04' 20 | 21 | steps: 22 | - task: UsePythonVersion@0 23 | inputs: 24 | versionSpec: '3.6' 25 | architecture: 'x64' 26 | 27 | - script: pip install -r requirements_dev.txt && pip install -r requirements.txt 28 | workingDirectory: $(WORKING_DIR) 29 | displayName: 'Install requirements' 30 | 31 | - script: make lint && make tests 32 | workingDirectory: $(WORKING_DIR) 33 | displayName: 'Run lint tests' -------------------------------------------------------------------------------- /src/ddo_transform/ddo_transform/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Top-level package for ddo_transform.""" 4 | 5 | __author__ = """Lace Lofranco""" 6 | __email__ = 'lace.lofranco@microsoft.com' 7 | __version__ = '1.0.0' 8 | -------------------------------------------------------------------------------- /src/ddo_transform/ddo_transform/standardize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Main module.""" 4 | 5 | 6 | from pyspark.sql import DataFrame 7 | from pyspark.sql.functions import lit, col, to_timestamp 8 | from pyspark.sql.types import ( 9 | ArrayType, StructType, StructField, StringType, DoubleType) # noqa: E501 10 | 11 | 12 | def get_schema(schema_name): 13 | if schema_name == 'in_parkingbay_schema': 14 | schema = StructType([ 15 | StructField('the_geom', StructType([ 16 | StructField('coordinates', ArrayType( 17 | ArrayType(ArrayType(ArrayType(DoubleType()))) 18 | )), 19 | StructField('type', StringType()) 20 | ])), 21 | StructField('marker_id', StringType()), 22 | StructField('meter_id', StringType()), 23 | StructField('bay_id', StringType(), False), 24 | StructField('last_edit', StringType()), 25 | StructField('rd_seg_id', StringType()), 26 | StructField('rd_seg_dsc', StringType()), 27 | ]) 28 | elif schema_name == 'in_sensordata_schema': 29 | schema = StructType([ 30 | StructField('bay_id', StringType(), False), 31 | StructField('st_marker_id', StringType()), 32 | StructField('status', StringType()), 33 | StructField('location', StructType([ 34 | StructField('coordinates', ArrayType(DoubleType())), 35 | StructField('type', StringType()) 36 | ])), 37 | StructField('lat', StringType()), 38 | StructField('lon', StringType()) 39 | ]) 40 | return schema 41 | 42 | 43 | def 
standardize_parking_bay(parkingbay_sdf: DataFrame, load_id, loaded_on): 44 | t_parkingbay_sdf = ( 45 | parkingbay_sdf 46 | .withColumn("last_edit", to_timestamp("last_edit", "YYYYMMddHHmmss")) 47 | .select( 48 | col("bay_id").cast("int").alias("bay_id"), 49 | "last_edit", 50 | "marker_id", 51 | "meter_id", 52 | "rd_seg_dsc", 53 | col("rd_seg_id").cast("int").alias("rd_seg_id"), 54 | "the_geom", 55 | lit(load_id).alias("load_id"), 56 | lit(loaded_on.isoformat()).alias("loaded_on") 57 | ) 58 | ).cache() 59 | # Data Validation 60 | good_records = t_parkingbay_sdf.filter(col("bay_id").isNotNull()) 61 | bad_records = t_parkingbay_sdf.filter(col("bay_id").isNull()) 62 | return good_records, bad_records 63 | 64 | 65 | def standardize_sensordata(sensordata_sdf: DataFrame, load_id, loaded_on): 66 | t_sensordata_sdf = ( 67 | sensordata_sdf 68 | .select( 69 | col("bay_id").cast("int").alias("bay_id"), 70 | "st_marker_id", 71 | col("lat").cast("float").alias("lat"), 72 | col("lon").cast("float").alias("lon"), 73 | "location", 74 | "status", 75 | lit(load_id).alias("load_id"), 76 | lit(loaded_on.isoformat()).alias("loaded_on") 77 | ) 78 | ).cache() 79 | # Data Validation 80 | good_records = t_sensordata_sdf.filter(col("bay_id").isNotNull()) 81 | bad_records = t_sensordata_sdf.filter(col("bay_id").isNull()) 82 | return good_records, bad_records 83 | 84 | 85 | if __name__ == "__main__": 86 | from pyspark.sql import SparkSession 87 | import datetime 88 | import os 89 | 90 | spark = SparkSession.builder\ 91 | .master("local[2]")\ 92 | .appName("standardize.py")\ 93 | .getOrCreate() 94 | spark.sparkContext.setLogLevel("ERROR") 95 | 96 | THIS_DIR = os.path.dirname(os.path.abspath(__file__)) 97 | 98 | schema = get_schema("in_parkingbay_schema") 99 | parkingbay_sdf = spark.read.json(os.path.join(THIS_DIR, "../data/MelbParkingBayData.json"), 100 | multiLine=True, 101 | schema=schema) 102 | load_id = 1 103 | loaded_on = datetime.datetime.now() 104 | t_parkingbay_sdf, t_parkingbay_malformed_sdf = standardize_parking_bay(parkingbay_sdf, load_id, loaded_on) 105 | t_parkingbay_sdf.write.json('./out/parkingbay_sdf') 106 | -------------------------------------------------------------------------------- /src/ddo_transform/ddo_transform/transform.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Main module.""" 4 | 5 | 6 | import uuid 7 | from pyspark.sql import DataFrame 8 | from pyspark.sql.functions import lit, udf, col, when 9 | from pyspark.sql.types import ( 10 | ArrayType, StructType, StructField, StringType, TimestampType, DoubleType, IntegerType, FloatType) # noqa: E501 11 | 12 | uuidUdf = udf(lambda: str(uuid.uuid4()), StringType()) 13 | EMPTY_UUID = '00000000-0000-0000-0000-000000000000' 14 | 15 | 16 | def get_schema(schema_name): 17 | schema = None 18 | if schema_name == 'interim_parkingbay_schema': 19 | schema = StructType([ 20 | StructField('bay_id', IntegerType(), False), 21 | StructField('last_edit', StringType()), 22 | StructField('marker_id', StringType()), 23 | StructField('meter_id', StringType()), 24 | StructField('rd_seg_id', StringType()), 25 | StructField('rd_seg_dsc', StringType()), 26 | StructField('the_geom', StructType([ 27 | StructField('coordinates', ArrayType( 28 | ArrayType(ArrayType(ArrayType(DoubleType()))) 29 | )), 30 | StructField('type', StringType()) 31 | ])), 32 | StructField('load_id', StringType()), 33 | StructField('loaded_on', TimestampType()) 34 | ]) 35 | elif schema_name == 'interim_sensor': 36 | schema 
= StructType([ 37 | StructField('bay_id', IntegerType(), False), 38 | StructField('st_marker_id', StringType()), 39 | StructField('lat', FloatType()), 40 | StructField('lon', FloatType()), 41 | StructField('location', StructType([ 42 | StructField('coordinates', ArrayType(DoubleType())), 43 | StructField('type', StringType()) 44 | ]), False), 45 | StructField('status', StringType()), 46 | StructField('load_id', StringType()), 47 | StructField('loaded_on', TimestampType()) 48 | ]) 49 | elif schema_name == 'dw_dim_parking_bay': 50 | schema = StructType([ 51 | StructField('dim_parking_bay_id', StringType(), False), 52 | StructField('bay_id', IntegerType(), False), 53 | StructField('marker_id', StringType()), 54 | StructField('meter_id', StringType()), 55 | StructField('rd_seg_id', StringType()), 56 | StructField('rd_seg_dsc', StringType()), 57 | StructField('the_geom', StructType([ 58 | StructField('coordinates', ArrayType( 59 | ArrayType(ArrayType(ArrayType(DoubleType()))) 60 | )), 61 | StructField('type', StringType()) 62 | ])), 63 | StructField('load_id', StringType()), 64 | StructField('loaded_on', TimestampType()) 65 | ]) 66 | elif schema_name == 'dw_dim_location': 67 | schema = StructType([ 68 | StructField('dim_location_id', StringType(), False), 69 | StructField('location', StructType([ 70 | StructField('coordinates', ArrayType(DoubleType())), 71 | StructField('type', StringType()) 72 | ]), False), 73 | StructField('lat', FloatType()), 74 | StructField('lon', FloatType()), 75 | StructField('load_id', StringType()), 76 | StructField('loaded_on', TimestampType()) 77 | ]) 78 | elif schema_name == 'dw_dim_st_marker': 79 | schema = StructType([ 80 | StructField('dim_st_marker_id', StringType(), False), 81 | StructField('st_marker_id', StringType()), 82 | StructField('load_id', StringType()), 83 | StructField('loaded_on', TimestampType()) 84 | ]) 85 | return schema 86 | 87 | 88 | def process_dim_parking_bay(parkingbay_sdf: DataFrame, 89 | dim_parkingbay_sdf: DataFrame, 90 | load_id, loaded_on): 91 | """Transform incoming parkingbay_sdf data and existing dim_parking_bay 92 | into the latest version of the dim_parking_bay records. 93 | """ 94 | # Get landing data distinct rows 95 | parkingbay_sdf = parkingbay_sdf\ 96 | .select([ 97 | "bay_id", 98 | "marker_id", 99 | "meter_id", 100 | "rd_seg_dsc", 101 | "rd_seg_id"])\ 102 | .distinct() 103 | 104 | # Using a left_outer join on the business key (bay_id), 105 | # identify rows that exist in the existing Dimension table but NOT in the incoming landing data 106 | oldrows_parkingbay_sdf = dim_parkingbay_sdf.alias("dim")\ 107 | .join(parkingbay_sdf, "bay_id", "left_outer")\ 108 | .where(parkingbay_sdf["bay_id"].isNull())\ 109 | .select(col("dim.*")) 110 | 111 | # Using a left_outer join on the business key (bay_id), 112 | # identify rows that exist in BOTH the incoming landing data and the existing Dimension table, 113 | # and take the values of the incoming landing data. That is, we update existing table values.
114 | existingrows_parkingbay_sdf = parkingbay_sdf.alias("pb")\ 115 | .join(dim_parkingbay_sdf.alias("dim"), "bay_id", "left_outer")\ 116 | .where(dim_parkingbay_sdf["bay_id"].isNotNull())\ 117 | .select( 118 | col("dim.dim_parking_bay_id"), 119 | col("pb.bay_id"), 120 | col("pb.marker_id"), 121 | col("pb.meter_id"), 122 | col("pb.rd_seg_dsc"), 123 | col("pb.rd_seg_id") 124 | ) 125 | 126 | # Using a left_outer join on the business key (bay_id), 127 | # identify rows that exist in the incoming landing data but NOT in the existing Dimension table 128 | newrows_parkingbay_sdf = parkingbay_sdf.alias("pb")\ 129 | .join(dim_parkingbay_sdf, "bay_id", "left_outer")\ 130 | .where(dim_parkingbay_sdf["bay_id"].isNull())\ 131 | .select(col("pb.*")) 132 | 133 | # Add load_id, loaded_on and dim_parking_bay_id 134 | existingrows_parkingbay_sdf = existingrows_parkingbay_sdf.withColumn("load_id", lit(load_id))\ 135 | .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp")) 136 | newrows_parkingbay_sdf = newrows_parkingbay_sdf.withColumn("load_id", lit(load_id))\ 137 | .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))\ 138 | .withColumn("dim_parking_bay_id", uuidUdf()) 139 | 140 | # Select relevant columns 141 | relevant_cols = [ 142 | "dim_parking_bay_id", 143 | "bay_id", 144 | "marker_id", 145 | "meter_id", 146 | "rd_seg_dsc", 147 | "rd_seg_id", 148 | "load_id", 149 | "loaded_on" 150 | ] 151 | oldrows_parkingbay_sdf = oldrows_parkingbay_sdf.select(relevant_cols) 152 | existingrows_parkingbay_sdf = existingrows_parkingbay_sdf.select(relevant_cols) 153 | newrows_parkingbay_sdf = newrows_parkingbay_sdf.select(relevant_cols) 154 | 155 | allrows_parkingbay_sdf = oldrows_parkingbay_sdf\ 156 | .union(existingrows_parkingbay_sdf)\ 157 | .union(newrows_parkingbay_sdf) 158 | 159 | return allrows_parkingbay_sdf 160 | 161 | 162 | def process_dim_location(sensordata_sdf: DataFrame, dim_location: DataFrame, 163 | load_id, loaded_on): 164 | """Transform sensordata into dim_location""" 165 | 166 | # Get landing data distinct rows 167 | sensordata_sdf = sensordata_sdf\ 168 | .select(["lat", "lon"]).distinct() 169 | 170 | # Using a left_outer join 171 | # identify rows that exist in the existing Dimension table but NOT in the incoming landing data 172 | oldrows_sdf = dim_location.alias("dim")\ 173 | .join(sensordata_sdf, ["lat", "lon"], "left_outer")\ 174 | .where(sensordata_sdf["lat"].isNull() & sensordata_sdf["lon"].isNull())\ 175 | .select(col("dim.*")) 176 | 177 | # Using a left_outer join 178 | # identify rows that exist in BOTH the incoming landing data and the existing Dimension table, 179 | # and take the values of the incoming landing data. That is, we update existing table values.
180 | existingrows_sdf = sensordata_sdf.alias("in")\ 181 | .join(dim_location.alias("dim"), ["lat", "lon"], "left_outer")\ 182 | .where(dim_location["lat"].isNotNull() & dim_location["lon"].isNotNull())\ 183 | .select( 184 | col("dim.dim_location_id"), 185 | col("in.lat"), 186 | col("in.lon") 187 | ) 188 | 189 | # Using a left_outer join 190 | # identify rows that exist in the incoming landing data but NOT in the existing Dimension table 191 | newrows_sdf = sensordata_sdf.alias("in")\ 192 | .join(dim_location, ["lat", "lon"], "left_outer")\ 193 | .where(dim_location["lat"].isNull() & dim_location["lon"].isNull())\ 194 | .select(col("in.*")) 195 | 196 | # Add load_id, loaded_on and dim_location_id 197 | existingrows_sdf = existingrows_sdf.withColumn("load_id", lit(load_id))\ 198 | .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp")) 199 | newrows_sdf = newrows_sdf.withColumn("load_id", lit(load_id))\ 200 | .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))\ 201 | .withColumn("dim_location_id", uuidUdf()) 202 | 203 | # Select relevant columns 204 | relevant_cols = [ 205 | "dim_location_id", 206 | "lat", 207 | "lon", 208 | "load_id", 209 | "loaded_on" 210 | ] 211 | oldrows_sdf = oldrows_sdf.select(relevant_cols) 212 | existingrows_sdf = existingrows_sdf.select(relevant_cols) 213 | newrows_sdf = newrows_sdf.select(relevant_cols) 214 | 215 | allrows_sdf = oldrows_sdf\ 216 | .union(existingrows_sdf)\ 217 | .union(newrows_sdf) 218 | 219 | return allrows_sdf 220 | 221 | 222 | def process_dim_st_marker(sensordata_sdf: DataFrame, 223 | dim_st_marker: DataFrame, 224 | load_id, loaded_on): 225 | """Transform sensordata into dim_st_marker""" 226 | 227 | # Get landing data distinct rows 228 | sensordata_sdf = sensordata_sdf.select(["st_marker_id"]).distinct() 229 | 230 | # Using a left_outer join 231 | # identify rows that exist in the existing Dimension table but NOT in the incoming landing data 232 | oldrows_sdf = dim_st_marker.alias("dim")\ 233 | .join(sensordata_sdf, ["st_marker_id"], "left_outer")\ 234 | .where(sensordata_sdf["st_marker_id"].isNull())\ 235 | .select(col("dim.*")) 236 | 237 | # Using a left_outer join 238 | # identify rows that exist in BOTH the incoming landing data and the existing Dimension table, 239 | # and take the values of the incoming landing data. That is, we update existing table values.
240 | existingrows_sdf = sensordata_sdf.alias("in")\ 241 | .join(dim_st_marker.alias("dim"), ["st_marker_id"], "left_outer")\ 242 | .where(dim_st_marker["st_marker_id"].isNotNull())\ 243 | .select(col("dim.dim_st_marker_id"), col("in.st_marker_id")) 244 | 245 | # Using a left_outer join 246 | # identify rows that exist in the incoming landing data but NOT in the existing Dimension table 247 | newrows_sdf = sensordata_sdf.alias("in")\ 248 | .join(dim_st_marker, ["st_marker_id"], "left_outer")\ 249 | .where(dim_st_marker["st_marker_id"].isNull())\ 250 | .select(col("in.*")) 251 | 252 | # Add load_id, loaded_on and dim_st_marker_id 253 | existingrows_sdf = existingrows_sdf.withColumn("load_id", lit(load_id))\ 254 | .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp")) 255 | newrows_sdf = newrows_sdf.withColumn("load_id", lit(load_id))\ 256 | .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))\ 257 | .withColumn("dim_st_marker_id", uuidUdf()) 258 | 259 | # Select relevant columns 260 | relevant_cols = [ 261 | "dim_st_marker_id", 262 | "st_marker_id", 263 | "load_id", 264 | "loaded_on" 265 | ] 266 | oldrows_sdf = oldrows_sdf.select(relevant_cols) 267 | existingrows_sdf = existingrows_sdf.select(relevant_cols) 268 | newrows_sdf = newrows_sdf.select(relevant_cols) 269 | 270 | allrows_sdf = oldrows_sdf\ 271 | .union(existingrows_sdf)\ 272 | .union(newrows_sdf) 273 | 274 | return allrows_sdf 275 | 276 | 277 | def process_fact_parking(sensordata_sdf: DataFrame, 278 | dim_parkingbay_sdf: DataFrame, 279 | dim_location_sdf: DataFrame, 280 | dim_st_marker_sdf: DataFrame, 281 | load_id, loaded_on): 282 | """Transform sensordata into fact_parking""" 283 | 284 | dim_date_id = loaded_on.strftime("%Y%m%d")  # date key in yyyyMMdd form 285 | midnight = loaded_on.replace(hour=0, minute=0, second=0, microsecond=0) 286 | dim_time_id = (loaded_on - midnight).seconds  # seconds elapsed since midnight 287 | 288 | # Build fact 289 | fact_parking = sensordata_sdf\ 290 | .join(dim_parkingbay_sdf.alias("pb"), "bay_id", "left_outer")\ 291 | .join(dim_location_sdf.alias("l"), ["lat", "lon"], "left_outer")\ 292 | .join(dim_st_marker_sdf.alias("st"), "st_marker_id", "left_outer")\ 293 | .select( 294 | lit(dim_date_id).alias("dim_date_id"), 295 | lit(dim_time_id).alias("dim_time_id"), 296 | when(col("pb.dim_parking_bay_id").isNull(), lit(EMPTY_UUID)) 297 | .otherwise(col("pb.dim_parking_bay_id")).alias("dim_parking_bay_id"), 298 | when(col("l.dim_location_id").isNull(), lit(EMPTY_UUID)) 299 | .otherwise(col("l.dim_location_id")).alias("dim_location_id"), 300 | when(col("st.dim_st_marker_id").isNull(), lit(EMPTY_UUID)) 301 | .otherwise(col("st.dim_st_marker_id")).alias("dim_st_marker_id"), 302 | "status", 303 | lit(load_id).alias("load_id"), 304 | lit(loaded_on.isoformat()).cast("timestamp").alias("loaded_on") 305 | ) 306 | return fact_parking 307 | 308 | 309 | if __name__ == "__main__": 310 | from pyspark.sql import SparkSession 311 | import datetime 312 | import os 313 | 314 | spark = SparkSession.builder\ 315 | .master("local[2]")\ 316 | .appName("transform.py")\ 317 | .getOrCreate() 318 | spark.sparkContext.setLogLevel("ERROR") 319 | 320 | THIS_DIR = os.path.dirname(os.path.abspath(__file__)) 321 | load_id = 1 322 | loaded_on = datetime.datetime.now() 323 | 324 | def _run_process_dim_parking_bay(): 325 | parkingbay_sdf = spark.read\ 326 | .schema(get_schema("interim_parkingbay_schema"))\ 327 | .json(os.path.join(THIS_DIR, "../data/interim_parking_bay.json")) 328 | dim_parkingbay_sdf = spark.read\ 329 |
.schema(schema=get_schema("dw_dim_parking_bay"))\ 330 | .json(os.path.join(THIS_DIR, "../data/dim_parking_bay.json")) 331 | new_dim_parkingbay_sdf = process_dim_parking_bay(parkingbay_sdf, dim_parkingbay_sdf, load_id, loaded_on) 332 | return new_dim_parkingbay_sdf 333 | 334 | def _run_process_dim_location(): 335 | sensor_sdf = spark.read\ 336 | .schema(get_schema("interim_sensor"))\ 337 | .json(os.path.join(THIS_DIR, "../data/interim_sensor.json")) 338 | dim_location_sdf = spark.read\ 339 | .schema(schema=get_schema("dw_dim_location"))\ 340 | .json(os.path.join(THIS_DIR, "../data/dim_location.json")) 341 | new_dim_location_sdf = process_dim_location(sensor_sdf, dim_location_sdf, load_id, loaded_on) 342 | return new_dim_location_sdf 343 | 344 | def _run_process_dim_st_marker(): 345 | sensor_sdf = spark.read\ 346 | .schema(get_schema("interim_sensor"))\ 347 | .json(os.path.join(THIS_DIR, "../data/interim_sensor.json")) 348 | dim_st_marker_sdf = spark.read\ 349 | .schema(schema=get_schema("dw_dim_st_marker"))\ 350 | .json(os.path.join(THIS_DIR, "../data/dim_st_marker.json")) 351 | new_dim_st_marker_sdf = process_dim_st_marker(sensor_sdf, dim_st_marker_sdf, load_id, loaded_on) 352 | return new_dim_st_marker_sdf 353 | 354 | def _run_process_fact_parking(): 355 | sensor_sdf = spark.read\ 356 | .schema(get_schema("interim_sensor"))\ 357 | .json(os.path.join(THIS_DIR, "../data/interim_sensor.json")) 358 | dim_parking_bay_sdf = spark.read\ 359 | .schema(schema=get_schema("dw_dim_parking_bay"))\ 360 | .json(os.path.join(THIS_DIR, "../data/dim_parking_bay.json")) 361 | dim_location_sdf = spark.read\ 362 | .schema(schema=get_schema("dw_dim_location"))\ 363 | .json(os.path.join(THIS_DIR, "../data/dim_location.json")) 364 | dim_st_marker_sdf = spark.read\ 365 | .schema(schema=get_schema("dw_dim_st_marker"))\ 366 | .json(os.path.join(THIS_DIR, "../data/dim_st_marker.json")) 367 | new_fact_parking = process_fact_parking(sensor_sdf, 368 | dim_parking_bay_sdf, 369 | dim_location_sdf, 370 | dim_st_marker_sdf, 371 | load_id, loaded_on) 372 | return new_fact_parking 373 | 374 | def _inspect_df(df: DataFrame): 375 | df.show() 376 | df.printSchema() 377 | print(df.count()) 378 | 379 | # _inspect_df(_run_process_dim_parking_bay()) 380 | # _inspect_df(_run_process_dim_location()) 381 | # _inspect_df(_run_process_dim_st_marker()) 382 | _inspect_df(_run_process_fact_parking()) 383 | 384 | print("done!") 385 | -------------------------------------------------------------------------------- /src/ddo_transform/ddo_transform/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Util module.""" 4 | 5 | from pyspark.sql import DataFrame, SparkSession 6 | 7 | 8 | def save_overwrite_unmanaged_table(spark: SparkSession, dataframe: DataFrame, table_name: str, path: str): 9 | """When trying to read and overwrite the same table, you get this error: 10 | 'Cannot overwrite table dw.dim_parking_bay that is also being read from;' 11 | This utility function works around this by first saving to a temporary table prior to overwriting.
12 | """ 13 | temp_table_name = table_name + "___temp" 14 | spark.sql("DROP TABLE IF EXISTS " + temp_table_name).collect() 15 | # Save temp table 16 | dataframe.write.saveAsTable(temp_table_name) 17 | # Read temp table and overwrite original table 18 | spark.read.table(temp_table_name)\ 19 | .write.mode("overwrite")\ 20 | .option("path", path)\ 21 | .saveAsTable(table_name) 22 | # Drop temp table 23 | spark.sql("DROP TABLE " + temp_table_name).collect() 24 | -------------------------------------------------------------------------------- /src/ddo_transform/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = ddo_transform 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /src/ddo_transform/docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /src/ddo_transform/docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ddo_transform documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another 17 | # directory, add these directories to sys.path here. If the directory is 18 | # relative to the documentation root, use os.path.abspath to make it 19 | # absolute, like shown here. 20 | # 21 | import os 22 | import sys 23 | sys.path.insert(0, os.path.abspath('..')) 24 | 25 | import ddo_transform 26 | 27 | # -- General configuration --------------------------------------------- 28 | 29 | # If your documentation needs a minimal Sphinx version, state it here. 30 | # 31 | # needs_sphinx = '1.0' 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 35 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # The suffix(es) of source filenames. 41 | # You can specify multiple suffix as a list of string: 42 | # 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = '.rst' 45 | 46 | # The master toctree document. 47 | master_doc = 'index' 48 | 49 | # General information about the project. 
50 | project = u'ddo_transform' 51 | copyright = u"2019, Lace Lofranco" 52 | author = u"Lace Lofranco" 53 | 54 | # The version info for the project you're documenting, acts as replacement 55 | # for |version| and |release|, also used in various other places throughout 56 | # the built documents. 57 | # 58 | # The short X.Y version. 59 | version = ddo_transform.__version__ 60 | # The full version, including alpha/beta/rc tags. 61 | release = ddo_transform.__version__ 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This patterns also effect to html_static_path and html_extra_path 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 74 | 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | pygments_style = 'sphinx' 77 | 78 | # If true, `todo` and `todoList` produce output, else they produce nothing. 79 | todo_include_todos = False 80 | 81 | 82 | # -- Options for HTML output ------------------------------------------- 83 | 84 | # The theme to use for HTML and HTML Help pages. See the documentation for 85 | # a list of builtin themes. 86 | # 87 | html_theme = 'alabaster' 88 | 89 | # Theme options are theme-specific and customize the look and feel of a 90 | # theme further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | # html_theme_options = {} 94 | 95 | # Add any paths that contain custom static files (such as style sheets) here, 96 | # relative to this directory. They are copied after the builtin static files, 97 | # so a file named "default.css" will overwrite the builtin "default.css". 98 | html_static_path = ['_static'] 99 | 100 | 101 | # -- Options for HTMLHelp output --------------------------------------- 102 | 103 | # Output file base name for HTML help builder. 104 | htmlhelp_basename = 'ddo_transformdoc' 105 | 106 | 107 | # -- Options for LaTeX output ------------------------------------------ 108 | 109 | latex_elements = { 110 | # The paper size ('letterpaper' or 'a4paper'). 111 | # 112 | # 'papersize': 'letterpaper', 113 | 114 | # The font size ('10pt', '11pt' or '12pt'). 115 | # 116 | # 'pointsize': '10pt', 117 | 118 | # Additional stuff for the LaTeX preamble. 119 | # 120 | # 'preamble': '', 121 | 122 | # Latex figure (float) alignment 123 | # 124 | # 'figure_align': 'htbp', 125 | } 126 | 127 | # Grouping the document tree into LaTeX files. List of tuples 128 | # (source start file, target name, title, author, documentclass 129 | # [howto, manual, or own class]). 130 | latex_documents = [ 131 | (master_doc, 'ddo_transform.tex', 132 | u'ddo_transform Documentation', 133 | u'Lace Lofranco', 'manual'), 134 | ] 135 | 136 | 137 | # -- Options for manual page output ------------------------------------ 138 | 139 | # One entry per manual page. List of tuples 140 | # (source start file, name, description, authors, manual section). 141 | man_pages = [ 142 | (master_doc, 'ddo_transform', 143 | u'ddo_transform Documentation', 144 | [author], 1) 145 | ] 146 | 147 | 148 | # -- Options for Texinfo output ---------------------------------------- 149 | 150 | # Grouping the document tree into Texinfo files. 
List of tuples 151 | # (source start file, target name, title, author, 152 | # dir menu entry, description, category) 153 | texinfo_documents = [ 154 | (master_doc, 'ddo_transform', 155 | u'ddo_transform Documentation', 156 | author, 157 | 'ddo_transform', 158 | 'One line description of project.', 159 | 'Miscellaneous'), 160 | ] 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /src/ddo_transform/docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /src/ddo_transform/docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst 2 | -------------------------------------------------------------------------------- /src/ddo_transform/docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to ddo_transform's documentation! 2 | ====================================== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | readme 9 | installation 10 | usage 11 | modules 12 | contributing 13 | authors 14 | history 15 | 16 | Indices and tables 17 | ================== 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /src/ddo_transform/docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Installation 5 | ============ 6 | 7 | 8 | Stable release 9 | -------------- 10 | 11 | To install ddo_transform, run this command in your terminal: 12 | 13 | .. code-block:: console 14 | 15 | $ pip install ddo_transform 16 | 17 | This is the preferred method to install ddo_transform, as it will always install the most recent stable release. 18 | 19 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide 20 | you through the process. 21 | 22 | .. _pip: https://pip.pypa.io 23 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ 24 | 25 | 26 | From sources 27 | ------------ 28 | 29 | The sources for ddo_transform can be downloaded from the `Github repo`_. 30 | 31 | You can either clone the public repository: 32 | 33 | .. code-block:: console 34 | 35 | $ git clone git://github.com/devlace/ddo_transform 36 | 37 | Or download the `tarball`_: 38 | 39 | .. code-block:: console 40 | 41 | $ curl -OL https://github.com/devlace/ddo_transform/tarball/master 42 | 43 | Once you have a copy of the source, you can install it with: 44 | 45 | .. code-block:: console 46 | 47 | $ python setup.py install 48 | 49 | 50 | .. _Github repo: https://github.com/devlace/ddo_transform 51 | .. _tarball: https://github.com/devlace/ddo_transform/tarball/master 52 | -------------------------------------------------------------------------------- /src/ddo_transform/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=ddo_transform 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 
19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /src/ddo_transform/docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | -------------------------------------------------------------------------------- /src/ddo_transform/docs/usage.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Usage 3 | ===== 4 | 5 | To use ddo_transform in a project:: 6 | 7 | import ddo_transform 8 | -------------------------------------------------------------------------------- /src/ddo_transform/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==2.4.4 -------------------------------------------------------------------------------- /src/ddo_transform/requirements_dev.txt: -------------------------------------------------------------------------------- 1 | pip 2 | bumpversion 3 | wheel 4 | watchdog 5 | flake8 6 | tox 7 | coverage 8 | Sphinx 9 | twine 10 | click 11 | pytest 12 | pytest-runner 13 | -------------------------------------------------------------------------------- /src/ddo_transform/setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.1.0 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:ddo_transform/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | max-line-length = 120 20 | 21 | [aliases] 22 | # Define setup.py command aliases here 23 | test = pytest 24 | 25 | [tool:pytest] 26 | collect_ignore = ['setup.py'] 27 | 28 | -------------------------------------------------------------------------------- /src/ddo_transform/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """The setup script.""" 5 | 6 | import os 7 | from setuptools import setup, find_packages 8 | 9 | version = os.environ['package_version'] 10 | 11 | with open('README.rst') as readme_file: 12 | readme = readme_file.read() 13 | 14 | with open('HISTORY.rst') as history_file: 15 | history = history_file.read() 16 | 17 | requirements = ['Click>=6.0', ] 18 | 19 | setup_requirements = ['pytest-runner', ] 20 | 21 | test_requirements = ['pytest', ] 22 | 23 | setup( 24 | author="Lace Lofranco", 25 | author_email='lace.lofranco@microsoft.com', 26 | classifiers=[ 27 | 'Development Status :: 2 - Pre-Alpha', 28 | 'Intended Audience :: Developers', 29 | 'License :: OSI Approved :: MIT License', 30 | 'Natural Language :: English', 31 | "Programming Language :: Python :: 2", 32 | 'Programming Language :: Python :: 2.7', 
33 | 'Programming Language :: Python :: 3', 34 | 'Programming Language :: Python :: 3.4', 35 | 'Programming Language :: Python :: 3.5', 36 | 'Programming Language :: Python :: 3.6', 37 | 'Programming Language :: Python :: 3.7', 38 | ], 39 | description="Data standardization and transformation package for the Melbourne parking sensors data pipeline.", 40 | entry_points={ 41 | 'console_scripts': [ 42 | 'ddo_transform=ddo_transform.cli:main', 43 | ], 44 | }, 45 | install_requires=requirements, 46 | license="MIT license", 47 | long_description=readme + '\n\n' + history, 48 | include_package_data=True, 49 | keywords='ddo_transform', 50 | name='ddo_transform', 51 | packages=find_packages(include=['ddo_transform']), 52 | setup_requires=setup_requirements, 53 | test_suite='tests', 54 | tests_require=test_requirements, 55 | url='https://github.com/devlace/datadevops', 56 | version=version, 57 | zip_safe=False, 58 | ) 59 | -------------------------------------------------------------------------------- /src/ddo_transform/tests/test_standardize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Tests for `ddo_transform` package.""" 5 | 6 | import os 7 | import pytest 8 | import datetime 9 | from pyspark.sql.functions import isnull 10 | 11 | from ddo_transform import standardize 12 | 13 | THIS_DIR = os.path.dirname(os.path.abspath(__file__)) 14 | 15 | 16 | @pytest.fixture 17 | def spark(): 18 | """Spark Session fixture 19 | """ 20 | from pyspark.sql import SparkSession 21 | 22 | spark = SparkSession.builder\ 23 | .master("local[2]")\ 24 | .appName("Unit Testing")\ 25 | .getOrCreate() 26 | spark.sparkContext.setLogLevel("ERROR") 27 | return spark 28 | 29 | 30 | def test_standardize_parking_bay(spark): 31 | """Test data transform""" 32 | # Arrange 33 | schema = standardize.get_schema("in_parkingbay_schema") 34 | parkingbay_sdf = spark.read.json(os.path.join(THIS_DIR, "../data/MelbParkingBayData.json"), multiLine=True, schema=schema) # noqa: E501 35 | load_id = 1 36 | loaded_on = datetime.datetime.now() 37 | # Act 38 | t_parkingbay_sdf, t_parkingbay_malformed_sdf = standardize.standardize_parking_bay(parkingbay_sdf, load_id, loaded_on) # noqa: E501 39 | # Assert 40 | assert t_parkingbay_sdf.count() != 0 41 | assert t_parkingbay_malformed_sdf.count() == 0 42 | assert t_parkingbay_sdf.filter(isnull("bay_id")).count() == 0 43 | 44 | 45 | def test_standardize_sensordata(spark): 46 | """Test data transform""" 47 | # Arrange 48 | schema = standardize.get_schema("in_sensordata_schema") 49 | sensordata_sdf = spark.read.json(os.path.join(THIS_DIR, "../data/MelbParkingSensorData.json"), multiLine=True, schema=schema) # noqa: E501 50 | load_id = 1 51 | loaded_on = datetime.datetime.now() 52 | # Act 53 | t_sensordata_sdf, t_sensordata_malformed_sdf = standardize.standardize_sensordata(sensordata_sdf, load_id, loaded_on) # noqa: E501 54 | # Assert 55 | assert t_sensordata_sdf.count() != 0 56 | assert t_sensordata_malformed_sdf.count() == 0 57 | assert t_sensordata_sdf.filter(isnull("bay_id")).count() == 0 58 | -------------------------------------------------------------------------------- /src/ddo_transform/tests/test_transform.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Tests for `ddo_transform` package.""" 5 | 6 | import os 7 | import pytest 8 | import datetime 9 | 10 | from ddo_transform import transform 11 | 12 | THIS_DIR = os.path.dirname(os.path.abspath(__file__)) 13 | 14 | 15 | @pytest.fixture
16 | def spark(): 17 | """Spark Session fixture 18 | """ 19 | from pyspark.sql import SparkSession 20 | 21 | spark = SparkSession.builder\ 22 | .master("local[2]")\ 23 | .appName("Unit Testing")\ 24 | .getOrCreate() 25 | spark.sparkContext.setLogLevel("ERROR") 26 | return spark 27 | 28 | 29 | def test_process_dim_parking_bay(spark): 30 | """Test data transform""" 31 | parkingbay_sdf = spark.read\ 32 | .schema(transform.get_schema("interim_parkingbay_schema"))\ 33 | .json(os.path.join(THIS_DIR, "../data/interim_parking_bay.json")) 34 | dim_parkingbay_sdf = spark.read\ 35 | .schema(schema=transform.get_schema("dw_dim_parking_bay"))\ 36 | .json(os.path.join(THIS_DIR, "../data/dim_parking_bay.json")) 37 | 38 | load_id = 1 39 | loaded_on = datetime.datetime.now() 40 | results_df = transform.process_dim_parking_bay(parkingbay_sdf, dim_parkingbay_sdf, load_id, loaded_on) 41 | 42 | # TODO add more asserts 43 | assert results_df.count() != 0 44 | -------------------------------------------------------------------------------- /src/ddo_transform/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27, py34, py35, py36, flake8 3 | 4 | [travis] 5 | python = 6 | 3.6: py36 7 | 3.5: py35 8 | 3.4: py34 9 | 2.7: py27 10 | 11 | [testenv:flake8] 12 | basepython = python 13 | deps = flake8 14 | commands = flake8 ddo_transform 15 | 16 | [testenv] 17 | setenv = 18 | PYTHONPATH = {toxinidir} 19 | deps = 20 | -r{toxinidir}/requirements_dev.txt 21 | ; If you want to make tox run the tests with the same versions, create a 22 | ; requirements.txt with the pinned versions and uncomment the following line: 23 | ; -r{toxinidir}/requirements.txt 24 | commands = 25 | pip install -U pip 26 | py.test --basetemp={envtmpdir} 27 | 28 | 29 | --------------------------------------------------------------------------------