├── .dockerignore ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── azure-pipelines.yml ├── deploy ├── Dockerfile ├── azuredeploy.json ├── azuredeploy.parameters.json ├── databricks │ ├── config │ │ ├── cluster.config.json │ │ ├── job.batchscoring.config.json │ │ ├── job.streamdatagen.config.json │ │ ├── job.streamscoring.config.json │ │ ├── run.downloaddata.config.json │ │ ├── run.etl.config.json │ │ └── run.trainmodelall.config.json │ ├── configure_databricks.sh │ └── create_secrets.sh └── deploy.sh ├── images ├── FileStore │ ├── GBTModel.PNG │ ├── LogRegCVPipeline.PNG │ ├── MLPipeline.PNG │ ├── PCAAnomalyPipeline.PNG │ ├── RandomForestPipeline.PNG │ ├── TransformPipeline.PNG │ └── transformation_and_actions.PNG └── archi.PNG ├── notebooks ├── .gitkeep └── databricks_notebooks │ ├── 00_demo_hello_spark.py │ ├── 01_download_data.py │ ├── 02_ETL.py │ ├── 03_explore_data.py │ ├── 04_trainmodel_multiple.scala │ ├── 04_trainmodel_pca_w_custom.scala │ ├── 04_trainmodel_pca_wo_custom.scala │ ├── 05_batch_scoring.scala │ ├── 06a_streaming_datagen.scala │ └── 06b_streaming_scoring.scala ├── references └── Lace Lofranco - Building Advanced Analytics Pipelines with Azure Databricks.pdf ├── requirements.txt ├── setup.py └── test_environment.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .env 2 | .git 3 | .cache 4 | *.md 5 | !README.md 6 | data -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # DotEnv configuration 60 | .env 61 | 62 | # Database 63 | *.db 64 | *.rdb 65 | 66 | # Pycharm 67 | .idea 68 | 69 | # VS Code 70 | .vscode/ 71 | 72 | # Spyder 73 | .spyproject/ 74 | 75 | # Jupyter NB Checkpoints 76 | .ipynb_checkpoints/ 77 | 78 | # exclude data from source control by default 79 | /data/ 80 | 81 | # Mac OS-specific storage files 82 | .DS_Store 83 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 devlace 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean data lint requirements deploy_resources deploy deploy_w_docker download_notebooks 2 | 3 | ################################################################################# 4 | # GLOBALS # 5 | ################################################################################# 6 | 7 | PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 8 | PROFILE = default 9 | PROJECT_NAME = azure-databricks-anomaly 10 | PYTHON_INTERPRETER = python3 11 | DOCKER_DEPLOY_CONTAINER = devlace/azdatabricksanomaly 12 | DATABRICKS_NOTEBOOKS_FOLDER = anomaly 13 | 14 | ifeq (,$(shell which conda)) 15 | HAS_CONDA=False 16 | else 17 | HAS_CONDA=True 18 | endif 19 | 20 | ################################################################################# 21 | # COMMANDS # 22 | ################################################################################# 23 | 24 | ## Install Python Dependencies 25 | requirements: test_environment 26 | pip install -U pip setuptools wheel 27 | pip install -r requirements.txt 28 | 29 | ## Deploy infrastructure 30 | deploy_resources: 31 | deploy/deploy.sh 32 | 33 | ## Deploys entire solution 34 | deploy: deploy_resources data 35 | deploy/databricks/create_secrets.sh 36 | deploy/databricks/configure_databricks.sh 37 | 38 | ## Deploys entire solutions using Docker 39 | deploy_w_docker: 40 | docker build -t $(DOCKER_DEPLOY_CONTAINER) -f deploy/Dockerfile . 41 | docker run -it $(DOCKER_DEPLOY_CONTAINER) 42 | 43 | ## Download notebooks in anomaly workspace folder locally 44 | download_notebooks: 45 | databricks workspace export_dir --overwrite /$(DATABRICKS_NOTEBOOKS_FOLDER) notebooks/databricks_notebooks 46 | 47 | ## Downloads models 48 | download_models: 49 | databricks fs cp --recursive --overwrite dbfs:/mnt/blob_storage/models/ models/ 50 | 51 | ## Delete all compiled Python files 52 | clean: 53 | find . -type f -name "*.py[co]" -delete 54 | find . -type d -name "__pycache__" -delete 55 | 56 | ## Lint using flake8 57 | lint: 58 | flake8 src 59 | 60 | 61 | ## Set up python interpreter environment 62 | create_environment: 63 | ifeq (True,$(HAS_CONDA)) 64 | @echo ">>> Detected conda, creating conda environment." 65 | ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER))) 66 | conda create --name $(PROJECT_NAME) python=3 67 | else 68 | conda create --name $(PROJECT_NAME) python=2.7 69 | endif 70 | @echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)" 71 | else 72 | @pip install -q virtualenv virtualenvwrapper 73 | @echo ">>> Installing virtualenvwrapper if not already intalled.\nMake sure the following lines are in shell startup file\n\ 74 | export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n" 75 | @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)" 76 | @echo ">>> New virtualenv created. 
Activate with:\nworkon $(PROJECT_NAME)" 77 | endif 78 | 79 | ## Test python environment is setup correctly 80 | test_environment: 81 | $(PYTHON_INTERPRETER) test_environment.py 82 | 83 | ################################################################################# 84 | # PROJECT RULES # 85 | ################################################################################# 86 | 87 | 88 | 89 | ################################################################################# 90 | # Self Documenting Commands # 91 | ################################################################################# 92 | 93 | .DEFAULT_GOAL := help 94 | 95 | # Inspired by 96 | # sed script explained: 97 | # /^##/: 98 | # * save line in hold space 99 | # * purge line 100 | # * Loop: 101 | # * append newline + line to hold space 102 | # * go to next line 103 | # * if line starts with doc comment, strip comment character off and loop 104 | # * remove target prerequisites 105 | # * append hold space (+ newline) to line 106 | # * replace newline plus comments by `---` 107 | # * print line 108 | # Separate expressions are necessary because labels cannot be delimited by 109 | # semicolon; see 110 | .PHONY: help 111 | help: 112 | @echo "$$(tput bold)Available rules:$$(tput sgr0)" 113 | @echo 114 | @sed -n -e "/^## / { \ 115 | h; \ 116 | s/.*//; \ 117 | :doc" \ 118 | -e "H; \ 119 | n; \ 120 | s/^## //; \ 121 | t doc" \ 122 | -e "s/:.*//; \ 123 | G; \ 124 | s/\\n## /---/; \ 125 | s/\\n/ /g; \ 126 | p; \ 127 | }" ${MAKEFILE_LIST} \ 128 | | LC_ALL='C' sort --ignore-case \ 129 | | awk -F '---' \ 130 | -v ncol=$$(tput cols) \ 131 | -v indent=19 \ 132 | -v col_on="$$(tput setaf 6)" \ 133 | -v col_off="$$(tput sgr0)" \ 134 | '{ \ 135 | printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ 136 | n = split($$2, words, " "); \ 137 | line_length = ncol - indent; \ 138 | for (i = 1; i <= n; i++) { \ 139 | line_length -= length(words[i]) + 1; \ 140 | if (line_length <= 0) { \ 141 | line_length = ncol - indent - length(words[i]) - 1; \ 142 | printf "\n%*s ", -indent, " "; \ 143 | } \ 144 | printf "%s ", words[i]; \ 145 | } \ 146 | printf "\n"; \ 147 | }' \ 148 | | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') 149 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://msdevlace.visualstudio.com/MLDevOps/_apis/build/status/devlace.azure-databricks-anomaly)](https://msdevlace.visualstudio.com/MLDevOps/_build/latest?definitionId=3) 2 | 3 | Anomaly Detection Pipeline on Azure Databricks 4 | ============================== 5 | 6 | The following is an anomaly detection data pipeline on Azure Databricks. This solution was built to demonstrate how to build Advance Analytics Pipelines on Azure Databricks, with a particular focus on the Spark MLLib library. This solution includes: 7 | 1. Initial ETL Data loading process into SparkSQL tables 8 | 2. Model training and scoring 9 | - Explanation of Pipelines, Transformer and Estimators 10 | - Sample Custom Estimator (PCAAnomaly) 11 | 3. Persisting trained models 12 | 4. 
Productionizing models through 13 | - Batch inference 14 | - Streaming 15 | 16 | # Architecture 17 | ![Architecture](images/archi.PNG?raw=true "Architecture") 18 | 19 | 20 | # Data 21 | [KDD Cup 1999 Data](http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html) 22 | 23 | # Deployment 24 | 25 | - Ensure you are in the root of the repository 26 | - To deploy the solution, use one of the following commands: 27 | 1. (*Easiest*) Using pre-built docker container: `docker run -it devlace/azdatabricksanomaly` 28 | 2. Build and run the container locally: `make deploy_w_docker` 29 | 3. Deploy using local environment (see requirements below): `make deploy` 30 | - Follow the prompts to login to Azure, name of resource group, deployment location, etc. 31 | - When prompted for a Databricks Host, enter the full name of your databricks workspace host, e.g. `https://southeastasia.azuredatabricks.net` 32 | - When prompted for a token, you can [generate a new token](https://docs.databricks.com/api/latest/authentication.html) in the databricks workspace. 33 | 34 | To view additional make commands run `make` 35 | 36 | ## For local deployment 37 | 38 | ### Requirements 39 | 40 | - [Azure CLI 2.0+](https://azure.github.io/projects/clis/) 41 | - [Python virtualenv](http://docs.python-guide.org/en/latest/dev/virtualenvs/) or [Anaconda](https://anaconda.org/anaconda/python) 42 | - [jq tool](https://stedolan.github.io/jq/download/) 43 | - Check the requirements.txt for list of necessary Python packages. (will be installed by `make requirements`) 44 | 45 | ### Development environment 46 | 47 | - The following works with [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10) 48 | - Clone this repository 49 | - `cd azure-databricks-anomaly` 50 | - Create a python environment (Virtualenv or Conda). The following uses virtualenv. 51 | - `virtualenv .` This creates a python virtual environment to work in. 52 | - `source bin/activate` This activates the virtual environment. 53 | - `make requirements`. This installs python dependencies in the virtual environment. 54 | 55 | # Project Organization 56 | ------------ 57 | 58 | ├── LICENSE 59 | ├── Makefile <- Makefile with commands like `make data` or `make train` 60 | ├── README.md <- The top-level README for developers using this project. 61 | ├── deploy <- Deployment artifacts 62 | │ │ 63 | │   └── databricks <- Deployment artifacts in relation to the Databricks workspace 64 | │ │ 65 | │ └── deploy.sh <- Deployment script to deploy all Azure Resources 66 | │ │ 67 | │ └── azuredeploy.json <- Azure ARM template w/ .parameters file 68 | │ │ 69 | │ └── Dockerfile <- Dockerfile for deployment 70 | │ 71 | ├── models <- Trained and serialized models, model predictions, or model summaries 72 | │ 73 | ├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering), 74 | │ the creator's initials, and a short `-` delimited description, e.g. 75 | │ `1.0-jqp-initial-data-exploration`. 76 | │ 77 | ├── references <- Contains the powerpoint presentation, and other reference materials. 78 | │ 79 | ├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g. 80 | │ generated with `pip freeze > requirements.txt` 81 | │ 82 | ├── setup.py <- makes project pip installable (pip install -e .) so src can be imported 83 | 84 | 85 | -------- 86 | 87 |
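The following is a minimal sketch of the local (non-Docker) deployment flow described above, assuming a WSL or Linux shell with the Azure CLI, jq and Python available (the clone URL is inferred from this project's GitHub location):

```bash
# Clone the repository and switch into it
git clone https://github.com/devlace/azure-databricks-anomaly.git
cd azure-databricks-anomaly

# Create and activate an isolated Python environment
virtualenv .
source bin/activate

# Install Python dependencies (runs test_environment.py first via the Makefile)
make requirements

# Deploy the Azure resources, create Databricks secrets and configure the workspace.
# You will be prompted for a resource group name, an Azure location, a subscription,
# then a Databricks host (e.g. https://southeastasia.azuredatabricks.net) and a personal access token.
make deploy
```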

Project based on the [cookiecutter data science project template](https://drivendata.github.io/cookiecutter-data-science/). #cookiecutterdatascience

88 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # Docker image 2 | # Build a Docker image to run, deploy, or push to a container registry. 3 | # Add steps that use Docker Compose, tag images, push to a registry, run an image, and more: 4 | # https://docs.microsoft.com/vsts/pipelines/languages/docker 5 | 6 | pool: 7 | vmImage: 'Ubuntu 16.04' 8 | 9 | variables: 10 | dockerId: 'devlace' 11 | imageName: 'azdatabricksanomaly' 12 | 13 | steps: 14 | - script: docker build -f deploy/Dockerfile -t $(dockerId)/$(imageName) . 15 | displayName: 'docker build' 16 | - script: | 17 | docker login -u $(dockerId) -p $(dockerPswd) 18 | docker push $(dockerId)/$(imageName) 19 | displayName: 'docker push' 20 | -------------------------------------------------------------------------------- /deploy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:4.6.14 2 | 3 | # Install any needed packages specified in requirements.txt 4 | RUN apt-get update 5 | RUN apt-get install -y autoconf=2.69-10 automake=1:1.15-6 build-essential=12.3 libtool=2.4.6-2 python-dev=2.7.13-2 jq=1.5+dfsg-1.3 6 | 7 | # Set the working directory to / 8 | WORKDIR / 9 | # Copy the directory contents into the container at / 10 | COPY . / 11 | 12 | RUN make requirements 13 | 14 | RUN chmod +x -R /deploy 15 | 16 | CMD ["make", "deploy"] 17 | 18 | 19 | -------------------------------------------------------------------------------- /deploy/azuredeploy.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "deployNs": { 6 | "type": "string" 7 | }, 8 | "dbricksWorkspaceName": { 9 | "defaultValue": "[concat(parameters('deployNs'), 'dbricks', uniqueString(resourceGroup().id))]", 10 | "type": "string" 11 | }, 12 | "dbricksLocation": { 13 | "defaultValue": "[resourceGroup().location]", 14 | "type": "string", 15 | "metadata": { 16 | "description": "Location of Databricks workspace" 17 | } 18 | }, 19 | "dbricksTier": { 20 | "defaultValue": "premium", 21 | "type": "string", 22 | "allowedValues": [ 23 | "premium", 24 | "standard" 25 | ] 26 | }, 27 | "eventhubsNsName": { 28 | "defaultValue": "[concat(parameters('deployNs'), 'ehns', uniqueString(resourceGroup().id))]", 29 | "type": "string", 30 | "metadata": { 31 | "description": "Event Hubs namespace name" 32 | } 33 | }, 34 | "eventhubsLocation": { 35 | "defaultValue": "[resourceGroup().location]", 36 | "type": "string", 37 | "metadata": { 38 | "description": "Event Hubs location" 39 | } 40 | }, 41 | "eventhubDataName": { 42 | "defaultValue": "[concat(parameters('deployNs'), 'ehData', uniqueString(resourceGroup().id))]", 43 | "type": "string", 44 | "metadata": { 45 | "description": "Event Hub name" 46 | } 47 | }, 48 | "eventhubAnomName": { 49 | "defaultValue": "[concat(parameters('deployNs'), 'ehAnom', uniqueString(resourceGroup().id))]", 50 | "type": "string", 51 | "metadata": { 52 | "description": "Event Hub name - anomalies" 53 | } 54 | }, 55 | "storAccountName": { 56 | "defaultValue": "[concat(parameters('deployNs'), 'stor', uniqueString(resourceGroup().id))]", 57 | "type": "string", 58 | "metadata": { 59 | "description": "Storage account name" 60 | } 61 | }, 62 | "storLocation": { 63 | "defaultValue": 
"[resourceGroup().location]", 64 | "type": "string", 65 | "metadata": { 66 | "description": "Storage account location" 67 | } 68 | } 69 | }, 70 | "variables": { 71 | "managedResourceGroupId": "[concat(subscription().id, '/resourceGroups/', variables('managedResourceGroupName'))]", 72 | "managedResourceGroupName": "[concat('databricks-rg-', parameters('dbricksWorkspaceName'), '-', uniqueString(parameters('dbricksWorkspaceName'), resourceGroup().id))]", 73 | "eventhubsDataFullName": "[concat(parameters('eventhubsNsName'), '/', parameters('eventhubDataName'))]", 74 | "eventhubsAnomFullName": "[concat(parameters('eventhubsNsName'), '/', parameters('eventhubAnomName'))]" 75 | }, 76 | "resources": [ 77 | { 78 | "apiVersion": "2018-04-01", 79 | "location": "[parameters('dbricksLocation')]", 80 | "name": "[parameters('dbricksWorkspaceName')]", 81 | "tags": { 82 | "displayName": "Databricks Workspace" 83 | }, 84 | "sku": { 85 | "name": "[parameters('dbricksTier')]" 86 | }, 87 | "properties": { 88 | "ManagedResourceGroupId": "[variables('managedResourceGroupId')]" 89 | }, 90 | "type": "Microsoft.Databricks/workspaces" 91 | }, 92 | { 93 | "type": "Microsoft.EventHub/namespaces", 94 | "sku": { 95 | "name": "Standard", 96 | "tier": "Standard", 97 | "capacity": 1 98 | }, 99 | "name": "[parameters('eventhubsNsName')]", 100 | "apiVersion": "2017-04-01", 101 | "location": "[parameters('eventhubsLocation')]", 102 | "tags": {}, 103 | "scale": null, 104 | "properties": { 105 | "isAutoInflateEnabled": true, 106 | "maximumThroughputUnits": 20 107 | }, 108 | "dependsOn": [] 109 | }, 110 | { 111 | "type": "Microsoft.EventHub/namespaces/eventhubs", 112 | "name": "[variables('eventhubsDataFullName')]", 113 | "apiVersion": "2017-04-01", 114 | "location": "[parameters('eventhubsLocation')]", 115 | "scale": null, 116 | "properties": { 117 | "messageRetentionInDays": 1, 118 | "partitionCount": 2, 119 | "status": "Active" 120 | }, 121 | "dependsOn": [ 122 | "[resourceId('Microsoft.EventHub/namespaces', parameters('eventhubsNsName'))]" 123 | ] 124 | }, 125 | { 126 | "type": "Microsoft.EventHub/namespaces/eventhubs/authorizationRules", 127 | "name": "[concat(variables('eventhubsDataFullName'), '/send')]", 128 | "apiVersion": "2017-04-01", 129 | "location": "[parameters('eventhubsLocation')]", 130 | "scale": null, 131 | "properties": { 132 | "rights": [ 133 | "Send" 134 | ] 135 | }, 136 | "dependsOn": [ 137 | "[resourceId('Microsoft.EventHub/namespaces', parameters('eventhubsNsName'))]", 138 | "[resourceId('Microsoft.EventHub/namespaces/eventhubs', parameters('eventhubsNsName'), parameters('eventhubDataName'))]" 139 | ] 140 | }, 141 | { 142 | "type": "Microsoft.EventHub/namespaces/eventhubs/authorizationRules", 143 | "name": "[concat(variables('eventhubsDataFullName'), '/listen')]", 144 | "apiVersion": "2017-04-01", 145 | "location": "[parameters('eventhubsLocation')]", 146 | "scale": null, 147 | "properties": { 148 | "rights": [ 149 | "Listen" 150 | ] 151 | }, 152 | "dependsOn": [ 153 | "[resourceId('Microsoft.EventHub/namespaces', parameters('eventhubsNsName'))]", 154 | "[resourceId('Microsoft.EventHub/namespaces/eventhubs', parameters('eventhubsNsName'), parameters('eventhubDataName'))]" 155 | ] 156 | }, 157 | { 158 | "type": "Microsoft.EventHub/namespaces/eventhubs", 159 | "name": "[variables('eventhubsAnomFullName')]", 160 | "apiVersion": "2017-04-01", 161 | "location": "[parameters('eventhubsLocation')]", 162 | "scale": null, 163 | "properties": { 164 | "messageRetentionInDays": 1, 165 | "partitionCount": 2, 166 | 
"status": "Active" 167 | }, 168 | "dependsOn": [ 169 | "[resourceId('Microsoft.EventHub/namespaces', parameters('eventhubsNsName'))]" 170 | ] 171 | }, 172 | { 173 | "type": "Microsoft.EventHub/namespaces/eventhubs/authorizationRules", 174 | "name": "[concat(variables('eventhubsAnomFullName'), '/send')]", 175 | "apiVersion": "2017-04-01", 176 | "location": "[parameters('eventhubsLocation')]", 177 | "scale": null, 178 | "properties": { 179 | "rights": [ 180 | "Send" 181 | ] 182 | }, 183 | "dependsOn": [ 184 | "[resourceId('Microsoft.EventHub/namespaces', parameters('eventhubsNsName'))]", 185 | "[resourceId('Microsoft.EventHub/namespaces/eventhubs', parameters('eventhubsNsName'), parameters('eventhubAnomName'))]" 186 | ] 187 | }, 188 | { 189 | "type": "Microsoft.EventHub/namespaces/eventhubs/authorizationRules", 190 | "name": "[concat(variables('eventhubsAnomFullName'), '/listen')]", 191 | "apiVersion": "2017-04-01", 192 | "location": "[parameters('eventhubsLocation')]", 193 | "scale": null, 194 | "properties": { 195 | "rights": [ 196 | "Listen" 197 | ] 198 | }, 199 | "dependsOn": [ 200 | "[resourceId('Microsoft.EventHub/namespaces', parameters('eventhubsNsName'))]", 201 | "[resourceId('Microsoft.EventHub/namespaces/eventhubs', parameters('eventhubsNsName'), parameters('eventhubAnomName'))]" 202 | ] 203 | }, 204 | { 205 | "type": "Microsoft.Storage/storageAccounts", 206 | "sku": { 207 | "name": "Standard_LRS", 208 | "tier": "Standard" 209 | }, 210 | "kind": "StorageV2", 211 | "name": "[parameters('storAccountName')]", 212 | "apiVersion": "2017-10-01", 213 | "location": "[parameters('storLocation')]", 214 | "tags": { 215 | "displayname": "Databricks storage" 216 | }, 217 | "properties": { 218 | "networkAcls": { 219 | "bypass": "AzureServices", 220 | "virtualNetworkRules": [], 221 | "ipRules": [], 222 | "defaultAction": "Allow" 223 | }, 224 | "supportsHttpsTrafficOnly": true, 225 | "encryption": { 226 | "services": { 227 | "file": { 228 | "enabled": true 229 | }, 230 | "blob": { 231 | "enabled": true 232 | } 233 | }, 234 | "keySource": "Microsoft.Storage" 235 | }, 236 | "accessTier": "Hot" 237 | } 238 | } 239 | ], 240 | "outputs": { 241 | "dbricksWorkspaceName": { 242 | "value": "[parameters('dbricksWorkspaceName')]", 243 | "type": "string" 244 | }, 245 | "dbricksLocation": { 246 | "value": "[parameters('dbricksLocation')]", 247 | "type": "string" 248 | }, 249 | "storAccountName": { 250 | "value": "[parameters('storAccountName')]", 251 | "type": "string" 252 | }, 253 | "eventhubsNsName": { 254 | "value": "[parameters('eventhubsNsName')]", 255 | "type": "string" 256 | }, 257 | "eventhubDataName": { 258 | "value": "[parameters('eventhubDataName')]", 259 | "type": "string" 260 | }, 261 | "eventhubAnomName": { 262 | "value": "[parameters('eventhubAnomName')]", 263 | "type": "string" 264 | } 265 | } 266 | } -------------------------------------------------------------------------------- /deploy/azuredeploy.parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "deployNs": { 6 | "value": "lace" 7 | } 8 | } 9 | } -------------------------------------------------------------------------------- /deploy/databricks/config/cluster.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cluster_name": "anomalycluster", 3 | "autoscale": { "min_workers": 1, 
"max_workers": 4 }, 4 | "spark_version": "5.5.x-scala2.11", 5 | "spark_env_vars": { 6 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 7 | }, 8 | "autotermination_minutes": 120, 9 | "node_type_id": "Standard_DS12_v2", 10 | "driver_node_type_id": "Standard_DS12_v2" 11 | } 12 | -------------------------------------------------------------------------------- /deploy/databricks/config/job.batchscoring.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Batch - Score using anomaly model", 3 | "new_cluster": { 4 | "spark_version": "5.5.x-scala2.11", 5 | "node_type_id": "Standard_DS12_v2", 6 | "num_workers": 3, 7 | "spark_env_vars": { 8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 9 | } 10 | }, 11 | "libraries": [], 12 | "timeout_seconds": 3600, 13 | "email_notifications": { 14 | "on_start": [], 15 | "on_success": [], 16 | "on_failure": [] 17 | }, 18 | "max_retries": 3, 19 | "schedule": { 20 | "quartz_cron_expression": "0 0 22 ? * *", 21 | "timezone_id": "Australia/Victoria" 22 | }, 23 | "notebook_task": { 24 | "notebook_path": "/anomaly/05_batch_scoring" 25 | } 26 | } -------------------------------------------------------------------------------- /deploy/databricks/config/job.streamdatagen.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Streaming - Send data to evenhubs", 3 | "new_cluster": { 4 | "spark_version": "5.5.x-scala2.11", 5 | "node_type_id": "Standard_DS12_v2", 6 | "num_workers": 3, 7 | "spark_env_vars": { 8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 9 | } 10 | }, 11 | "libraries": [ 12 | { 13 | "maven": { 14 | "coordinates": "com.microsoft.azure:azure-eventhubs-spark_2.11:2.3.2" 15 | } 16 | } 17 | ], 18 | "email_notifications": { 19 | "on_start": [], 20 | "on_success": [], 21 | "on_failure": [] 22 | }, 23 | "max_retries": -1, 24 | "notebook_task": { 25 | "notebook_path": "/anomaly/06a_streaming_datagen" 26 | } 27 | } -------------------------------------------------------------------------------- /deploy/databricks/config/job.streamscoring.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Streaming - Ingest data from eventhub and identify anomalies", 3 | "new_cluster": { 4 | "spark_version": "5.5.x-scala2.11", 5 | "node_type_id": "Standard_DS12_v2", 6 | "num_workers": 3, 7 | "spark_env_vars": { 8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 9 | } 10 | }, 11 | "libraries": [ 12 | { 13 | "maven": { 14 | "coordinates": "com.microsoft.azure:azure-eventhubs-spark_2.11:2.3.2" 15 | } 16 | } 17 | ], 18 | "email_notifications": { 19 | "on_start": [], 20 | "on_success": [], 21 | "on_failure": [] 22 | }, 23 | "max_retries": -1, 24 | "notebook_task": { 25 | "notebook_path": "/anomaly/06b_streaming_scoring" 26 | } 27 | } -------------------------------------------------------------------------------- /deploy/databricks/config/run.downloaddata.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "Mount storage and download data", 3 | "new_cluster": { 4 | "spark_version": "5.5.x-scala2.11", 5 | "node_type_id": "Standard_DS12_v2", 6 | "num_workers": 1, 7 | "spark_env_vars": { 8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 9 | } 10 | }, 11 | "libraries": [], 12 | "timeout_seconds": 3600, 13 | "notebook_task": { 14 | "notebook_path": "/anomaly/01_download_data" 15 | } 16 | } 
-------------------------------------------------------------------------------- /deploy/databricks/config/run.etl.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "Perform ETL (SparkSQL tables)", 3 | "new_cluster": { 4 | "spark_version": "5.5.x-scala2.11", 5 | "node_type_id": "Standard_DS12_v2", 6 | "num_workers": 1, 7 | "spark_env_vars": { 8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 9 | } 10 | }, 11 | "libraries": [], 12 | "timeout_seconds": 3600, 13 | "notebook_task": { 14 | "notebook_path": "/anomaly/02_ETL" 15 | } 16 | } -------------------------------------------------------------------------------- /deploy/databricks/config/run.trainmodelall.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "Train PCAAnomaly model", 3 | "new_cluster": { 4 | "spark_version": "5.5.x-scala2.11", 5 | "node_type_id": "Standard_DS12_v2", 6 | "num_workers": 4, 7 | "spark_env_vars": { 8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 9 | } 10 | }, 11 | "libraries": [], 12 | "timeout_seconds": 3600, 13 | "notebook_task": { 14 | "notebook_path": "/anomaly/04_trainmodel_multiple" 15 | } 16 | } -------------------------------------------------------------------------------- /deploy/databricks/configure_databricks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, 8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions 11 | # of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 
18 | # 19 | # 20 | # Description: Deploy Databricks cluster 21 | # 22 | # Usage: 23 | # 24 | # Requirments: 25 | # 26 | 27 | set -o errexit 28 | set -o pipefail 29 | set -o nounset 30 | # set -o xtrace 31 | 32 | # Set path 33 | parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) 34 | cd "$parent_path" 35 | 36 | # Constants 37 | RED='\033[0;31m' 38 | ORANGE='\033[0;33m' 39 | NC='\033[0m' 40 | 41 | cluster_config="./config/cluster.config.json" 42 | 43 | wait_for_run () { 44 | # See here: https://docs.azuredatabricks.net/api/latest/jobs.html#jobsrunresultstate 45 | declare mount_run_id=$1 46 | while : ; do 47 | life_cycle_status=$(databricks runs get --run-id $mount_run_id | jq -r ".state.life_cycle_state") 48 | result_state=$(databricks runs get --run-id $mount_run_id | jq -r ".state.result_state") 49 | if [[ $result_state == "SUCCESS" || $result_state == "SKIPPED" ]]; then 50 | break; 51 | elif [[ $life_cycle_status == "INTERNAL_ERROR" || $result_state == "FAILED" ]]; then 52 | state_message=$(databricks runs get --run-id $mount_run_id | jq -r ".state.state_message") 53 | echo -e "${RED}Error while running ${mount_run_id}: ${state_message} ${NC}" 54 | exit 1 55 | else 56 | echo "Waiting for run ${mount_run_id} to finish..." 57 | sleep 2m 58 | fi 59 | done 60 | } 61 | 62 | cluster_exists () { 63 | declare cluster_name="$1" 64 | declare cluster=$(databricks clusters list | tr -s " " | cut -d" " -f2 | grep ^${cluster_name}$) 65 | if [[ -n $cluster ]]; then 66 | return 0; # cluster exists 67 | else 68 | return 1; # cluster does not exists 69 | fi 70 | } 71 | 72 | yes_or_no () { 73 | while true; do 74 | read -p "$(echo -e ${ORANGE}"$* [y/n]: "${NC})" yn 75 | case $yn in 76 | [Yy]*) return 0 ;; 77 | [Nn]*) echo -e "${RED}Aborted${NC}" ; return 1 ;; 78 | esac 79 | done 80 | } 81 | 82 | 83 | _main() { 84 | echo -e "${ORANGE}" 85 | echo -e "!! -- WARNING --!!" 86 | echo -e "If this is the second time you are running this, this will re-upload and overwrite existing notebooks with the same names in the 'notebooks' folder. " 87 | echo -e "This will also drop and reload data in Tables." 88 | echo -e "${NC}" 89 | yes_or_no "Are you sure you want to continue (Y/N)?" || { exit 1; } 90 | 91 | # Upload notebooks 92 | echo "Uploading notebooks..." 93 | databricks workspace import_dir "../../notebooks/databricks_notebooks" "/anomaly" --overwrite 94 | 95 | # Upload notebook images to FileStore 96 | # https://docs.databricks.com/user-guide/advanced/filestore.html 97 | databricks fs cp "../../images/FileStore" "dbfs:/FileStore/images" --recursive 98 | 99 | # Upload models 100 | # echo "Uploading pre-trained models..." 101 | # databricks fs cp --recursive --overwrite models/ dbfs:/mnt/blob_storage/models/ 102 | 103 | # Setup workspace 104 | echo "Downloading data. This may take a while as cluster spins up..." 105 | wait_for_run $(databricks runs submit --json-file "./config/run.downloaddata.config.json" | jq -r ".run_id" ) 106 | echo "Performing initial ETL of data. This may take a while as cluster spins up..." 107 | wait_for_run $(databricks runs submit --json-file "./config/run.etl.config.json" | jq -r ".run_id" ) 108 | echo "Training anomaly model. This may take a while as cluster spins up..." 109 | wait_for_run $(databricks runs submit --json-file "./config/run.trainmodelall.config.json" | jq -r ".run_id" ) 110 | 111 | # Schedule and run jobs 112 | echo "Scheduling and running jobs..." 
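# Create each job from its JSON config, extract the returned job_id with jq, and trigger it immediately with run-now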
113 | databricks jobs run-now --job-id $(databricks jobs create --json-file "./config/job.streamscoring.config.json" | jq ".job_id") 114 | databricks jobs run-now --job-id $(databricks jobs create --json-file "./config/job.batchscoring.config.json" | jq ".job_id") 115 | 116 | # Create initial cluster, if not yet exists 117 | echo "Creating an interactive cluster..." 118 | cluster_name=$(cat $cluster_config | jq -r ".cluster_name") 119 | if cluster_exists $cluster_name; then 120 | echo "Cluster ${cluster_name} already exists!" 121 | else 122 | echo "Creating cluster ${cluster_name}..." 123 | databricks clusters create --json-file $cluster_config 124 | fi 125 | 126 | # Install Library 127 | echo "Installing libraries..." 128 | cluster_id=$(databricks clusters list | awk '/'$cluster_name'/ {print $1}') 129 | databricks libraries install --maven-coordinates com.microsoft.azure:azure-eventhubs-spark_2.11:2.3.2 --cluster-id $cluster_id 130 | 131 | } 132 | 133 | _main 134 | -------------------------------------------------------------------------------- /deploy/databricks/create_secrets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o allexport 4 | source .env 5 | set +o allexport 6 | 7 | scope_name="storage_scope" 8 | 9 | # Create scope, if not exists 10 | if [[ -z $(databricks secrets list-scopes | grep "$scope_name") ]]; then 11 | echo "Creating secrets scope: $scope_name" 12 | databricks secrets create-scope --scope "$scope_name" 13 | fi 14 | 15 | # Create secrets 16 | echo "Creating secrets within scope $scope_name..." 17 | databricks secrets write --scope "$scope_name" --key "storage_account" --string-value "$BLOB_STORAGE_ACCOUNT" 18 | databricks secrets write --scope "$scope_name" --key "storage_key" --string-value "$BLOB_STORAGE_KEY" 19 | databricks secrets write --scope "$scope_name" --key "eventhub_namespace" --string-value "$EVENTHUB_NAMESPACE" 20 | databricks secrets write --scope "$scope_name" --key "eventhub_data_name" --string-value "$EVENTHUB_DATA_NAME" 21 | databricks secrets write --scope "$scope_name" --key "eventhub_data_send_key" --string-value "$EVENTHUB_DATA_SEND_KEY" 22 | databricks secrets write --scope "$scope_name" --key "eventhub_data_listen_key" --string-value "$EVENTHUB_DATA_LISTEN_KEY" 23 | databricks secrets write --scope "$scope_name" --key "eventhub_anom_name" --string-value "$EVENTHUB_ANOM_NAME" 24 | databricks secrets write --scope "$scope_name" --key "eventhub_anom_send_key" --string-value "$EVENTHUB_ANOM_SEND_KEY" 25 | databricks secrets write --scope "$scope_name" --key "eventhub_anom_listen_key" --string-value "$EVENTHUB_ANOM_LISTEN_KEY" 26 | -------------------------------------------------------------------------------- /deploy/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, 8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies or 
substantial portions 11 | # of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 18 | # 19 | # 20 | # Description: Deploy ARM template which creates a Databricks account 21 | # 22 | # Usage: ./deploy.sh myResourceGroup "East US 2" 23 | # 24 | # Requirments: 25 | # - User must be logged in to the az cli with the appropriate account set. 26 | # - User must have appropraite permission to deploy to a resource group 27 | # - User must have appropriate permission to create a service principal 28 | 29 | set -o errexit 30 | set -o pipefail 31 | set -o nounset 32 | #set -o xtrace # For debugging 33 | 34 | ################### 35 | # SETUP 36 | 37 | # Check if required utilities are installed 38 | command -v jq >/dev/null 2>&1 || { echo >&2 "I require jq but it's not installed. See https://stedolan.github.io/jq/. Aborting."; exit 1; } 39 | command -v az >/dev/null 2>&1 || { echo >&2 "I require azure cli but it's not installed. See https://bit.ly/2Gc8IsS. Aborting."; exit 1; } 40 | 41 | # Globals 42 | timestamp=$(date +%s) 43 | deploy_name="deployment${timestamp}" 44 | env_file="../.env" 45 | 46 | # Constants 47 | RED='\033[0;31m' 48 | ORANGE='\033[0;33m' 49 | NC='\033[0m' 50 | 51 | # Set path 52 | parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) 53 | cd "$parent_path" 54 | 55 | # Check if user is logged in 56 | [[ -n $(az account show 2> /dev/null) ]] || { echo "Please login via the Azure CLI: "; az login; } 57 | 58 | 59 | ################### 60 | # USER PARAMETERS 61 | 62 | rg_name="${1-}" 63 | rg_location="${2-}" 64 | sub_id="${3-}" 65 | 66 | storage_container=databricks #fixed 67 | 68 | while [[ -z $rg_name ]]; do 69 | read -rp "$(echo -e ${ORANGE}"Enter Resource Group name: "${NC})" rg_name 70 | done 71 | 72 | while [[ -z $rg_location ]]; do 73 | read -rp "$(echo -e ${ORANGE}"Enter Azure Location (ei. EAST US 2): "${NC})" rg_location 74 | done 75 | 76 | while [[ -z $sub_id ]]; do 77 | # Check if user only has one sub 78 | sub_count=$(az account list --output json | jq '. | length') 79 | if (( $sub_count != 1 )); then 80 | az account list --output table 81 | read -rp "$(echo -e ${ORANGE}"Enter Azure Subscription Id you wish to deploy to (enter to use Default): "${NC})" sub_id 82 | fi 83 | # If still empty then user selected IsDefault 84 | if [[ -z $sub_id ]]; then 85 | sub_id=$(az account show --output json | jq -r '.id') 86 | fi 87 | done 88 | 89 | # Set account 90 | echo "Deploying to Subscription: $sub_id" 91 | az account set --subscription $sub_id 92 | 93 | ##################### 94 | # Deploy ARM template 95 | 96 | echo "Creating resource group: $rg_name" 97 | az group create --name "$rg_name" --location "$rg_location" 98 | 99 | echo "Deploying resources into $rg_name" 100 | arm_output=$(az group deployment create \ 101 | --name "$deploy_name" \ 102 | --resource-group "$rg_name" \ 103 | --template-file "./azuredeploy.json" \ 104 | --parameters @"./azuredeploy.parameters.json" \ 105 | --output json) 106 | 107 | if [[ -z $arm_output ]]; then 108 | echo >&2 "ARM deployment failed." 
109 | exit 1 110 | fi 111 | 112 | 113 | ##################### 114 | # Ask user to configure databricks cli 115 | dbi_workspace=$(echo $arm_output | jq -r '.properties.outputs.dbricksWorkspaceName.value') 116 | echo -e "${ORANGE}" 117 | echo "Configure your databricks cli to connect to the newly created Databricks workspace: ${dbi_workspace}. See here for more info: https://bit.ly/2GUwHcw." 118 | databricks configure --token 119 | echo -e "${NC}" 120 | 121 | 122 | ##################### 123 | # Append to .env file 124 | 125 | echo "Retrieving configuration information from newly deployed resources." 126 | 127 | # Databricks details 128 | dbricks_location=$(echo $arm_output | jq -r '.properties.outputs.dbricksLocation.value') 129 | dbi_token=$(awk '/token/ && NR==3 {print $0;exit;}' ~/.databrickscfg | cut -d' ' -f3) 130 | [[ -n $dbi_token ]] || { echo >&2 "Databricks cli not configured correctly. Please run databricks configure --token. Aborting."; exit 1; } 131 | 132 | # Retrieve storage account details 133 | storage_account=$(echo $arm_output | jq -r '.properties.outputs.storAccountName.value') 134 | storage_account_key=$(az storage account keys list \ 135 | --account-name $storage_account \ 136 | --resource-group $rg_name \ 137 | --output json | 138 | jq -r '.[0].value') 139 | 140 | # Retrieve eventhub details 141 | ehns_name=$(echo $arm_output | jq -r '.properties.outputs.eventhubsNsName.value') 142 | ## EH - data 143 | eh_data_name=$(echo $arm_output | jq -r '.properties.outputs.eventhubDataName.value') 144 | eh_data_send_key=$(az eventhubs eventhub authorization-rule keys list \ 145 | --namespace-name $ehns_name \ 146 | --eventhub-name $eh_data_name \ 147 | --name send \ 148 | --resource-group $rg_name \ 149 | --output json | 150 | jq -r '.primaryKey') 151 | eh_data_listen_key=$(az eventhubs eventhub authorization-rule keys list \ 152 | --namespace-name $ehns_name \ 153 | --eventhub-name $eh_data_name \ 154 | --name listen \ 155 | --resource-group $rg_name \ 156 | --output json | 157 | jq -r '.primaryKey') 158 | ## EH - anom 159 | eh_anom_name=$(echo $arm_output | jq -r '.properties.outputs.eventhubAnomName.value') 160 | eh_anom_send_key=$(az eventhubs eventhub authorization-rule keys list \ 161 | --namespace-name $ehns_name \ 162 | --eventhub-name $eh_anom_name \ 163 | --name send \ 164 | --resource-group $rg_name \ 165 | --output json | 166 | jq -r '.primaryKey') 167 | eh_anom_listen_key=$(az eventhubs eventhub authorization-rule keys list \ 168 | --namespace-name $ehns_name \ 169 | --eventhub-name $eh_anom_name \ 170 | --name listen \ 171 | --resource-group $rg_name \ 172 | --output json | 173 | jq -r '.primaryKey') 174 | 175 | # Create storage container 176 | # LACE TODO Idempotent? 177 | az storage container create \ 178 | --name "$storage_container" \ 179 | --account-name "$storage_account" \ 180 | --account-key "$storage_account_key" 181 | 182 | 183 | # Build .env file 184 | echo "Appending configuration to .env file." 
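# Write the retrieved keys and names to the .env file; create_secrets.sh later sources this file and stores the values as Databricks secrets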
185 | cat << EOF >> $env_file 186 | 187 | # ------ Configuration from deployment ${deploy_name} ----------- 188 | BLOB_STORAGE_ACCOUNT=${storage_account} 189 | BLOB_STORAGE_KEY=${storage_account_key} 190 | EVENTHUB_NAMESPACE=${ehns_name} 191 | EVENTHUB_DATA_NAME=${eh_data_name} 192 | EVENTHUB_DATA_SEND_KEY=${eh_data_send_key} 193 | EVENTHUB_DATA_LISTEN_KEY=${eh_data_listen_key} 194 | EVENTHUB_ANOM_NAME=${eh_anom_name} 195 | EVENTHUB_ANOM_SEND_KEY=${eh_anom_send_key} 196 | EVENTHUB_ANOM_LISTEN_KEY=${eh_anom_listen_key} 197 | DBRICKS_DOMAIN=${dbricks_location}.azuredatabricks.net 198 | DBRICKS_TOKEN=${dbi_token} 199 | 200 | EOF 201 | 202 | echo "Completed deploying Azure resources." 203 | -------------------------------------------------------------------------------- /images/FileStore/GBTModel.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/GBTModel.PNG -------------------------------------------------------------------------------- /images/FileStore/LogRegCVPipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/LogRegCVPipeline.PNG -------------------------------------------------------------------------------- /images/FileStore/MLPipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/MLPipeline.PNG -------------------------------------------------------------------------------- /images/FileStore/PCAAnomalyPipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/PCAAnomalyPipeline.PNG -------------------------------------------------------------------------------- /images/FileStore/RandomForestPipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/RandomForestPipeline.PNG -------------------------------------------------------------------------------- /images/FileStore/TransformPipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/TransformPipeline.PNG -------------------------------------------------------------------------------- /images/FileStore/transformation_and_actions.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/transformation_and_actions.PNG -------------------------------------------------------------------------------- /images/archi.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/archi.PNG -------------------------------------------------------------------------------- /notebooks/.gitkeep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/notebooks/.gitkeep -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/00_demo_hello_spark.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Spark session 4 | 5 | # COMMAND ---------- 6 | 7 | spark 8 | 9 | # COMMAND ---------- 10 | 11 | spark.version 12 | 13 | # COMMAND ---------- 14 | 15 | # MAGIC %md 16 | # MAGIC ## Spark Dataframe 17 | 18 | # COMMAND ---------- 19 | 20 | df = spark.createDataFrame([('Fiji Apple', 'Red', 3.5), 21 | ('Banana', 'Yellow', 1.0), 22 | ('Green Grape', 'Green', 2.0), 23 | ('Red Grape', 'Red', 2.0), 24 | ('Peach', 'Yellow', 3.0), 25 | ('Orange', 'Orange', 2.0), 26 | ('Green Apple', 'Green', 2.5)], 27 | ['Fruit', 'Color', 'Price']) 28 | display(df) 29 | 30 | # COMMAND ---------- 31 | 32 | df.printSchema() 33 | 34 | # COMMAND ---------- 35 | 36 | # MAGIC %md 37 | # MAGIC ### Let's mix in some Spark SQL 38 | 39 | # COMMAND ---------- 40 | 41 | df.createOrReplaceTempView("temp_df") 42 | 43 | # COMMAND ---------- 44 | 45 | # MAGIC %sql 46 | # MAGIC SELECT * FROM temp_df 47 | 48 | # COMMAND ---------- 49 | 50 | # MAGIC %md 51 | # MAGIC ## Transformation and Actions 52 | # MAGIC ![Transformation and Actions](files/images/transformation_and_actions.PNG) 53 | 54 | # COMMAND ---------- 55 | 56 | # MAGIC %md 57 | # MAGIC #### Transformation 58 | 59 | # COMMAND ---------- 60 | 61 | df_agg = df\ 62 | .select("Fruit", "Color", "Price")\ 63 | .groupBy("Color")\ 64 | .avg("Price") 65 | 66 | # COMMAND ---------- 67 | 68 | # MAGIC %md 69 | # MAGIC #### Action 70 | 71 | # COMMAND ---------- 72 | 73 | df_agg.collect() -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/01_download_data.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # Mount blob storage 4 | 5 | # COMMAND ---------- 6 | 7 | # Set mount path 8 | storage_mount_path = "/mnt/blob_storage" 9 | 10 | # Unmount if existing 11 | for mp in dbutils.fs.mounts(): 12 | if mp.mountPoint == storage_mount_path: 13 | dbutils.fs.unmount(storage_mount_path) 14 | 15 | # Refresh mounts 16 | dbutils.fs.refreshMounts() 17 | 18 | # COMMAND ---------- 19 | 20 | # Retrieve storage credentials 21 | storage_account = dbutils.secrets.get(scope = "storage_scope", key = "storage_account") 22 | storage_key = dbutils.secrets.get(scope = "storage_scope", key = "storage_key") 23 | 24 | # Try to print out: 25 | storage_key 26 | 27 | # COMMAND ---------- 28 | 29 | # Mount 30 | dbutils.fs.mount( 31 | source = "wasbs://databricks@" + storage_account + ".blob.core.windows.net", 32 | mount_point = storage_mount_path, 33 | extra_configs = {"fs.azure.account.key." 
+ storage_account + ".blob.core.windows.net": storage_key}) 34 | 35 | # Refresh mounts 36 | dbutils.fs.refreshMounts() 37 | 38 | # COMMAND ---------- 39 | 40 | # MAGIC %md 41 | # MAGIC # Download Data 42 | 43 | # COMMAND ---------- 44 | 45 | import os 46 | import gzip 47 | import shutil 48 | from urllib.request import urlretrieve 49 | 50 | def download_and_uncompress_gz(data_url, out_file): 51 | tmp_loc = '/tmp/data.gz' 52 | 53 | # Download 54 | urlretrieve(data_url, tmp_loc) 55 | 56 | # Create dir if not exist 57 | dir_path = os.path.dirname(out_file) 58 | if not os.path.exists(dir_path): 59 | os.makedirs(dir_path) 60 | 61 | # Uncompress 62 | with gzip.open(tmp_loc, 'rb') as f_in: 63 | with open(out_file, 'wb') as f_out: 64 | shutil.copyfileobj(f_in, f_out) 65 | 66 | # Cleanup 67 | os.remove(tmp_loc) 68 | 69 | 70 | # Note that Azure Databricks configures each cluster node with a FUSE mount that allows processes running on cluster nodes to read and write to the underlying 71 | # distributed storage layer with local file APIs 72 | # See here: https://docs.azuredatabricks.net/user-guide/dbfs-databricks-file-system.html#access-dbfs-using-local-file-apis 73 | # 'https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz' 74 | download_and_uncompress_gz(data_url='https://lacedemodata.blob.core.windows.net/data/kddcup.data.gz', 75 | out_file='/dbfs' + storage_mount_path + '/data/raw/kddcup.data.csv') 76 | 77 | # 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.testdata.unlabeled.gz' 78 | download_and_uncompress_gz(data_url='https://lacedemodata.blob.core.windows.net/data/kddcup.testdata.unlabeled.gz', 79 | out_file='/dbfs' + storage_mount_path + '/data/raw/kddcup.testdata.unlabeled.csv') -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/02_ETL.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # Prepare data 3 | 4 | from pyspark.sql.functions import monotonically_increasing_id, lit, concat 5 | 6 | # Set mount path 7 | storage_mount_path = "/mnt/blob_storage" 8 | 9 | raw_df = spark.read.csv(storage_mount_path + '/data/raw/kddcup.data.csv') 10 | raw_unlabeled_df = spark.read.csv(storage_mount_path + '/data/raw/kddcup.testdata.unlabeled.csv') 11 | 12 | # Add id 13 | df = raw_df.withColumn('id', concat(lit('A'), monotonically_increasing_id()))\ 14 | .select(['id'] + raw_df.columns)\ 15 | .repartition(20) 16 | unlabeled_df = raw_unlabeled_df.withColumn('id', concat(lit('B'), monotonically_increasing_id()))\ 17 | .select(['id'] + raw_unlabeled_df.columns)\ 18 | .repartition(20) 19 | 20 | # Write out to csv 21 | df.write.csv(storage_mount_path + '/data/for_streaming/kddcup.data/', mode='overwrite') 22 | unlabeled_df.write.csv(storage_mount_path + '/data/for_streaming/kddcup.testdata.unlabeled/', mode='overwrite') 23 | 24 | # COMMAND ---------- 25 | 26 | # MAGIC %md 27 | # MAGIC # Create and load SparkSQL tables 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %sql 32 | # MAGIC ------------------ 33 | # MAGIC -- Create KDD Table 34 | # MAGIC 35 | # MAGIC DROP TABLE IF EXISTS kdd_temp; 36 | # MAGIC CREATE TABLE kdd_temp 37 | # MAGIC ( 38 | # MAGIC id STRING, 39 | # MAGIC duration FLOAT, 40 | # MAGIC protocol_type STRING, 41 | # MAGIC service STRING, 42 | # MAGIC flag STRING, 43 | # MAGIC src_bytes FLOAT, 44 | # MAGIC dst_bytes FLOAT, 45 | # MAGIC land SHORT, 46 | # MAGIC wrong_fragment FLOAT, 47 | # MAGIC urgent FLOAT, 48 | # MAGIC hot FLOAT, 49 
| # MAGIC num_failed_logins FLOAT, 50 | # MAGIC logged_in SHORT, 51 | # MAGIC num_compromised FLOAT, 52 | # MAGIC root_shell FLOAT, 53 | # MAGIC su_attempted FLOAT, 54 | # MAGIC num_root FLOAT, 55 | # MAGIC num_file_creations FLOAT, 56 | # MAGIC num_shells FLOAT, 57 | # MAGIC num_access_files FLOAT, 58 | # MAGIC num_outbound_cmds FLOAT, 59 | # MAGIC is_host_login SHORT, 60 | # MAGIC is_guest_login SHORT, 61 | # MAGIC count FLOAT, 62 | # MAGIC srv_count FLOAT, 63 | # MAGIC serror_rate FLOAT, 64 | # MAGIC srv_serror_rate FLOAT, 65 | # MAGIC rerror_rate FLOAT, 66 | # MAGIC srv_rerror_rate FLOAT, 67 | # MAGIC same_srv_rate FLOAT, 68 | # MAGIC diff_srv_rate FLOAT, 69 | # MAGIC srv_diff_host_rate FLOAT, 70 | # MAGIC dst_host_count FLOAT, 71 | # MAGIC dst_host_srv_count FLOAT, 72 | # MAGIC dst_host_same_srv_rate FLOAT, 73 | # MAGIC dst_host_diff_srv_rate FLOAT, 74 | # MAGIC dst_host_same_src_port_rate FLOAT, 75 | # MAGIC dst_host_srv_diff_host_rate FLOAT, 76 | # MAGIC dst_host_serror_rate FLOAT, 77 | # MAGIC dst_host_srv_serror_rate FLOAT, 78 | # MAGIC dst_host_rerror_rate FLOAT, 79 | # MAGIC dst_host_srv_rerror_rate FLOAT, 80 | # MAGIC label STRING 81 | # MAGIC ) 82 | # MAGIC USING CSV 83 | # MAGIC LOCATION '/mnt/blob_storage/data/for_streaming/kddcup.data/' 84 | # MAGIC OPTIONS ("header"="false"); 85 | # MAGIC 86 | # MAGIC DROP TABLE IF EXISTS kdd; 87 | # MAGIC CREATE TABLE kdd 88 | # MAGIC USING org.apache.spark.sql.parquet 89 | # MAGIC AS SELECT * FROM kdd_temp; 90 | # MAGIC 91 | # MAGIC -- Drop temporary table 92 | # MAGIC DROP TABLE kdd_temp; 93 | # MAGIC 94 | # MAGIC --Refresh 95 | # MAGIC REFRESH TABLE kdd; 96 | # MAGIC 97 | # MAGIC --select 98 | # MAGIC SELECT * FROM kdd LIMIT 100; 99 | 100 | # COMMAND ---------- 101 | 102 | # MAGIC %sql 103 | # MAGIC ------------------ 104 | # MAGIC -- Create KDD_unlabelled Table 105 | # MAGIC 106 | # MAGIC DROP TABLE IF EXISTS kdd_unlabeled_temp; 107 | # MAGIC CREATE TABLE kdd_unlabeled_temp 108 | # MAGIC ( 109 | # MAGIC id STRING, 110 | # MAGIC duration FLOAT, 111 | # MAGIC protocol_type STRING, 112 | # MAGIC service STRING, 113 | # MAGIC flag STRING, 114 | # MAGIC src_bytes FLOAT, 115 | # MAGIC dst_bytes FLOAT, 116 | # MAGIC land SHORT, 117 | # MAGIC wrong_fragment FLOAT, 118 | # MAGIC urgent FLOAT, 119 | # MAGIC hot FLOAT, 120 | # MAGIC num_failed_logins FLOAT, 121 | # MAGIC logged_in SHORT, 122 | # MAGIC num_compromised FLOAT, 123 | # MAGIC root_shell FLOAT, 124 | # MAGIC su_attempted FLOAT, 125 | # MAGIC num_root FLOAT, 126 | # MAGIC num_file_creations FLOAT, 127 | # MAGIC num_shells FLOAT, 128 | # MAGIC num_access_files FLOAT, 129 | # MAGIC num_outbound_cmds FLOAT, 130 | # MAGIC is_host_login SHORT, 131 | # MAGIC is_guest_login SHORT, 132 | # MAGIC count FLOAT, 133 | # MAGIC srv_count FLOAT, 134 | # MAGIC serror_rate FLOAT, 135 | # MAGIC srv_serror_rate FLOAT, 136 | # MAGIC rerror_rate FLOAT, 137 | # MAGIC srv_rerror_rate FLOAT, 138 | # MAGIC same_srv_rate FLOAT, 139 | # MAGIC diff_srv_rate FLOAT, 140 | # MAGIC srv_diff_host_rate FLOAT, 141 | # MAGIC dst_host_count FLOAT, 142 | # MAGIC dst_host_srv_count FLOAT, 143 | # MAGIC dst_host_same_srv_rate FLOAT, 144 | # MAGIC dst_host_diff_srv_rate FLOAT, 145 | # MAGIC dst_host_same_src_port_rate FLOAT, 146 | # MAGIC dst_host_srv_diff_host_rate FLOAT, 147 | # MAGIC dst_host_serror_rate FLOAT, 148 | # MAGIC dst_host_srv_serror_rate FLOAT, 149 | # MAGIC dst_host_rerror_rate FLOAT, 150 | # MAGIC dst_host_srv_rerror_rate FLOAT 151 | # MAGIC ) 152 | # MAGIC USING CSV 153 | # MAGIC LOCATION 
'/mnt/blob_storage/data/for_streaming/kddcup.testdata.unlabeled/' 154 | # MAGIC OPTIONS ("header"="false"); 155 | # MAGIC 156 | # MAGIC DROP TABLE IF EXISTS kdd_unlabeled; 157 | # MAGIC CREATE TABLE kdd_unlabeled 158 | # MAGIC USING org.apache.spark.sql.parquet 159 | # MAGIC AS SELECT * FROM kdd_unlabeled_temp; 160 | # MAGIC 161 | # MAGIC -- Drop temporary table 162 | # MAGIC DROP TABLE kdd_unlabeled_temp; 163 | # MAGIC 164 | # MAGIC --Refresh 165 | # MAGIC REFRESH TABLE kdd_unlabeled; 166 | # MAGIC 167 | # MAGIC --Select 168 | # MAGIC SELECT * FROM kdd_unlabeled LIMIT 100; -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/03_explore_data.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | from pyspark.sql import functions as F 3 | 4 | # COMMAND ---------- 5 | 6 | df = spark.read.table("kdd") 7 | 8 | # COMMAND ---------- 9 | 10 | display(df) 11 | 12 | # COMMAND ---------- 13 | 14 | df.printSchema() 15 | 16 | # COMMAND ---------- 17 | 18 | df.count() 19 | 20 | # COMMAND ---------- 21 | 22 | # Summary on continuous features 23 | cols = df.columns 24 | noncont_features = ['id', 'protocol_type', 'service', 'flag', 'label'] 25 | cont_features = [x for x in cols if x not in noncont_features] 26 | 27 | summary_df = df.select(cont_features).summary().cache() 28 | display(summary_df) 29 | 30 | # COMMAND ---------- 31 | 32 | # Normal vs Anomalies 33 | transformed_df = (df\ 34 | .withColumn("label", F.when(df.label == "normal.", 0).otherwise(1))\ 35 | .groupBy("label") 36 | .agg(F.count("id"))) 37 | 38 | display(transformed_df) 39 | 40 | # COMMAND ---------- 41 | 42 | # Count by label 43 | transformed_df = (df\ 44 | .groupBy("label")\ 45 | .agg(F.count("label").alias("num_requests"))\ 46 | .orderBy("num_requests", ascending=False)) 47 | 48 | display(transformed_df) 49 | 50 | # COMMAND ---------- 51 | 52 | -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/04_trainmodel_multiple.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | // MAGIC %md 3 | // MAGIC ## Setup 4 | 5 | // COMMAND ---------- 6 | 7 | import org.apache.spark.ml.Pipeline 8 | import org.apache.spark.ml.feature._ 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.ml.classification._ 11 | import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, BinaryClassificationEvaluator} 12 | import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} 13 | 14 | // Model Directory 15 | val modelDir = "/mnt/blob_storage/models" 16 | val randomSeed = 123 17 | 18 | // COMMAND ---------- 19 | 20 | // MAGIC %md 21 | // MAGIC ## Load and transform data 22 | 23 | // COMMAND ---------- 24 | 25 | // Read data 26 | spark.catalog.refreshTable("kdd") // need to refresh to invalidate cache 27 | val df = spark.read.table("kdd") 28 | 29 | // Clean data 30 | val cleanDf = df 31 | .withColumn("is_anomaly", when(col("label") === "normal.", 0).otherwise(1)) 32 | .na.drop() 33 | 34 | // Clean up labels for anomaly 35 | display(cleanDf) 36 | 37 | val columns = cleanDf.columns.toSet 38 | val features = columns -- Set("id", "label", "is_anomaly") 39 | val categoricalFeatures = Set("protocol_type", "service", "flag") 40 | val continuousFeatures = features -- categoricalFeatures 41 | 42 | 43 | // COMMAND ---------- 44 | 45 | // MAGIC %md 46 | // MAGIC ## Define 
Feature Estimators and Transformers 47 | 48 | // COMMAND ---------- 49 | 50 | // Label indexer 51 | val labelIndexer = new StringIndexer() 52 | .setInputCol("label") 53 | .setOutputCol("label_index") 54 | val labelIndexerModel = labelIndexer.fit(cleanDf) 55 | 56 | // Categorical Feature Indexers 57 | val indexers = categoricalFeatures.map({ colName => 58 | new StringIndexer().setInputCol(colName).setOutputCol(colName + "_index").setHandleInvalid("keep") 59 | }).toArray 60 | 61 | // Encoders 62 | val encoder = new OneHotEncoderEstimator() 63 | .setInputCols(categoricalFeatures.map(colName => colName + "_index").toArray) 64 | .setOutputCols(categoricalFeatures.map(colName => colName + "_encoded").toArray) 65 | 66 | // Vector Assembler 67 | var selectedFeatures = continuousFeatures ++ categoricalFeatures.map(colName => colName + "_encoded") 68 | val assembler = new VectorAssembler() 69 | .setInputCols(selectedFeatures.toArray) 70 | .setOutputCol("features") 71 | 72 | // Standard Scalar 73 | val standardScalar = new StandardScaler() 74 | .setInputCol("features") 75 | .setOutputCol("norm_features") 76 | .setWithMean(true) 77 | .setWithStd(true) 78 | 79 | // Convert indexed labels back to original labels. 80 | val labelConverter = new IndexToString() 81 | .setInputCol("prediction") 82 | .setOutputCol("predicted_label") 83 | .setLabels(labelIndexerModel.labels) 84 | 85 | // COMMAND ---------- 86 | 87 | // MAGIC %md 88 | // MAGIC ## Build Data Transformation pipeline 89 | // MAGIC ![Data Transform Pipeline](files/images/TransformPipeline.PNG) 90 | 91 | // COMMAND ---------- 92 | 93 | // Transform pipeline 94 | val transformPipeline = new Pipeline().setStages(indexers ++ Array(labelIndexer, encoder, assembler, standardScalar)) 95 | val transformedDf = transformPipeline 96 | .fit(cleanDf) 97 | .transform(cleanDf) 98 | 99 | // Split data 100 | val Array(transformedTraining, transformedTest) = transformedDf.randomSplit(Array(0.8, 0.2), seed = randomSeed) 101 | 102 | display(transformedDf.select("label_index", "norm_features")) 103 | 104 | // COMMAND ---------- 105 | 106 | // MAGIC %md 107 | // MAGIC ## GBT Binary classification 108 | // MAGIC ![GBT Model](files/images/GBTModel.PNG) 109 | 110 | // COMMAND ---------- 111 | 112 | // Train a GBT model. 113 | val gbt = new GBTClassifier() 114 | .setLabelCol("is_anomaly") 115 | .setFeaturesCol("norm_features") 116 | .setMaxIter(10) 117 | .setFeatureSubsetStrategy("auto") 118 | 119 | // Fit pipeline 120 | val gbtModel = gbt.fit(transformedTraining) 121 | 122 | // Make predictions. 123 | val gbtPredictions = gbtModel.transform(transformedTest) 124 | gbtPredictions.select("prediction", "label", "features").show(10) 125 | 126 | val gbtEvaluator = new BinaryClassificationEvaluator() 127 | .setMetricName("areaUnderROC") 128 | .setLabelCol("is_anomaly") 129 | .setRawPredictionCol("rawPrediction") 130 | val gbtAccuracy = gbtEvaluator.evaluate(gbtPredictions) 131 | println(s"Test Error = ${(1.0 - gbtAccuracy)}") 132 | 133 | // COMMAND ---------- 134 | 135 | // MAGIC %md 136 | // MAGIC ## Random Forest Multiclassification - End to end pipeline 137 | // MAGIC ![RandomForest Model](files/images/RandomForestPipeline.PNG) 138 | 139 | // COMMAND ---------- 140 | 141 | // Using non-transformed data (cleanDf) 142 | val Array(training, test) = cleanDf.randomSplit(Array(0.8, 0.2), seed = 123) 143 | 144 | // Train a RandomForest model. 
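// (Unlike the GBT section above, which reuses the already-transformed training data, the stages below are
// chained together with the classifier so the whole pipeline can be fit directly on the raw cleaned
// DataFrame and later saved and reloaded as a single PipelineModel. Note that the "Test Error" printed for
// the GBT model above is really 1 - areaUnderROC, since that is the metric the evaluator computes.)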
145 | val rf = new RandomForestClassifier() 146 | .setLabelCol("label_index") 147 | .setFeaturesCol("norm_features") 148 | .setNumTrees(10) 149 | 150 | // Chain indexers and Random Forest in a Pipeline. 151 | val rfPipeline = new Pipeline().setStages(indexers ++ Array(labelIndexer, encoder, assembler, standardScalar, rf, labelConverter)) 152 | 153 | // Fit pipeline 154 | val rfPipelineModel = rfPipeline.fit(training) 155 | 156 | // Make predictions. 157 | val rfPredictions = rfPipelineModel.transform(test) 158 | rfPredictions.select("predicted_label", "label", "features").show(10) 159 | 160 | // Evaluate 161 | val rfEvaluator = new MulticlassClassificationEvaluator() 162 | .setMetricName("accuracy") 163 | .setLabelCol("label_index") 164 | .setPredictionCol("prediction") 165 | val rfAccuracy = rfEvaluator.evaluate(rfPredictions) 166 | println(s"Test Error = ${(1.0 - rfAccuracy)}") 167 | 168 | 169 | // COMMAND ---------- 170 | 171 | // MAGIC %md 172 | // MAGIC ## Logistic Regression with CrossValidation 173 | // MAGIC ![Logistic Regression w/ CrossValidation](files/images/LogRegCVPipeline.PNG) 174 | 175 | // COMMAND ---------- 176 | 177 | // Train a Logistic Regres model. 178 | val lr = new LogisticRegression() 179 | .setMaxIter(10) 180 | .setLabelCol("label_index") 181 | .setFeaturesCol("norm_features") 182 | 183 | // Define ParamGrid 184 | val lrParamGrid = new ParamGridBuilder() 185 | .addGrid(lr.regParam, Array(0.1, 0.01)) 186 | .addGrid(lr.elasticNetParam, Array(0.1, 0.5, 0.8)) 187 | .build() 188 | 189 | // Define evaluator 190 | val lrEvaluator = new MulticlassClassificationEvaluator() 191 | .setMetricName("accuracy") 192 | .setLabelCol("label_index") 193 | .setPredictionCol("prediction") 194 | 195 | // CrossValidation model 196 | val lrCv = new CrossValidator() 197 | .setEstimator(lr) 198 | .setEvaluator(lrEvaluator) 199 | .setEstimatorParamMaps(lrParamGrid) 200 | .setNumFolds(3) 201 | 202 | // Chain indexers and Random Forest in a Pipeline. 203 | val lrCvPipeline = new Pipeline().setStages(Array(lrCv, labelConverter)) 204 | 205 | // Fit model 206 | val lrCvPipelineModel = lrCvPipeline.fit(transformedTraining) 207 | 208 | // Make predictions with test 209 | val lrCvPredictions = lrCvPipelineModel.transform(transformedTest) 210 | lrCvPredictions.select("predicted_label", "label", "features").show(10) 211 | 212 | // Evaluate 213 | val lrCvAccuracy = lrEvaluator.evaluate(lrCvPredictions) 214 | println(s"Test Error = ${(1.0 - lrCvAccuracy)}") 215 | 216 | 217 | // COMMAND ---------- 218 | 219 | // MAGIC %md 220 | // MAGIC ## Save models 221 | // MAGIC - Saving Data Scientist work 222 | // MAGIC - Compose models and train in a different cluster 223 | // MAGIC - Productionizing ML models 224 | 225 | // COMMAND ---------- 226 | 227 | gbtModel.write.overwrite().save(s"$modelDir/GBT") 228 | rfPipelineModel.write.overwrite().save(s"$modelDir/RandomForestPipeline") 229 | lrCvPipelineModel.write.overwrite().save(s"$modelDir/LogRegPipeline") -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/04_trainmodel_pca_w_custom.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | // MAGIC %md 3 | // MAGIC ## Writing your own Model (Custom Spark Estimators and Transformers) 4 | 5 | // COMMAND ---------- 6 | 7 | // MAGIC %md 8 | // MAGIC ### PCA for Anomaly detection 9 | // MAGIC 1. Filter out nomalous points and perform PCA to extract Principal Components 10 | // MAGIC 2. 
Reconstruct the features using the Principal Components and the feature vectors. 11 | // MAGIC 3. To calculate the Anomaly Score, calculate the normalized error between the reconstructed features and the original feature vector 12 | // MAGIC - In this case, we use the sum of squared differences from the two vectors 13 | // MAGIC 14 | // MAGIC For more information: 15 | // MAGIC - [PCA-based Anomaly Detection](https://docs.microsoft.com/en-us/azure/machine-learning/studio-module-reference/pca-based-anomaly-detection) 16 | // MAGIC - [A randomized algorithm for principal component analysis](https://arxiv.org/abs/0809.2274). Rokhlin, Szlan and Tygert 17 | // MAGIC - [Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions](http://users.cms.caltech.edu/~jtropp/papers/HMT11-Finding-Structure-SIREV.pdf). Halko, Martinsson and Tropp. 18 | 19 | // COMMAND ---------- 20 | 21 | package org.apache.spark.ml.feature 22 | 23 | import org.apache.hadoop.fs.Path 24 | 25 | import org.apache.spark.ml._ 26 | import org.apache.spark.ml.linalg._ 27 | import org.apache.spark.ml.param._ 28 | import org.apache.spark.ml.param.shared._ 29 | import org.apache.spark.ml.util._ 30 | import org.apache.spark.sql._ 31 | import org.apache.spark.sql.functions._ 32 | import org.apache.spark.sql.types.{StructField, StructType, DoubleType} 33 | 34 | import breeze.linalg.{DenseVector, sum} 35 | import breeze.numerics.pow 36 | 37 | /** 38 | * Params for [[PCAAnomaly]] and [[PCAAnomalyModel]]. 39 | */ 40 | trait PCAAnomalyParams extends Params with HasInputCol with HasOutputCol { 41 | final val outputPCACol = new Param[String](this, "outputPCACol", "The output column with PCA features") 42 | final val outputAbsScoreCol = new Param[String](this, "outputAbsScoreCol", "The output column with non-normalized Anomaly Scores") 43 | final val labelCol = new Param[String](this, "labelCol", "Label column") 44 | setDefault(outputPCACol, "pca_features") 45 | setDefault(outputAbsScoreCol, "nonnorm_anomaly_score") 46 | setDefault(labelCol, "label") 47 | 48 | final val k: IntParam = new IntParam(this, "k", "the number of principal components (> 0)", 49 | ParamValidators.gt(0)) 50 | 51 | /** Validates and transforms the input schema. */ 52 | protected def validateAndTransformSchema(schema: StructType): StructType = { 53 | //SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) 54 | require(!schema.fieldNames.contains($(outputCol)), s"Output column ${$(outputCol)} already exists.") 55 | val outputFields = schema.fields :+ 56 | StructField($(outputPCACol), new VectorUDT, false) :+ 57 | StructField($(outputCol), DoubleType, false) 58 | StructType(outputFields) 59 | } 60 | } 61 | 62 | /** 63 | * PCA trains a model to project vectors to a lower dimensional space of the top `PCA!.k` 64 | * principal components. 
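 *
 * (Scoring sketch, writing W for the fitted k-column principal-component matrix:
 *    pca_features    z     = W^T x
 *    reconstruction  x_hat = W z
 *    raw score             = sum((x - x_hat)^2), min-max normalised to [0, 1] in transform().
 *  fit() first drops rows whose label column is non-zero, so W is learned from normal records only.)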
65 | */ 66 | class PCAAnomaly (override val uid: String) 67 | extends Estimator[PCAAnomalyModel] with PCAAnomalyParams with DefaultParamsWritable { 68 | 69 | def this() = this(Identifiable.randomUID("pca_anomaly")) 70 | 71 | def setInputCol(value: String): this.type = set(inputCol, value) 72 | def setOutputCol(value: String): this.type = set(outputCol, value) 73 | def setLabelCol(value: String): this.type = set(labelCol, value) 74 | def setOutputPCACol(value: String): this.type = set(outputPCACol, value) 75 | def setOutputAbsScoreCol(value: String): this.type = set(outputAbsScoreCol, value) 76 | def setK(value: Int): this.type = set(k, value) 77 | 78 | /** 79 | * Computes a [[PCAAnomalyModel]] that contains the principal components of the input vectors. 80 | */ 81 | override def fit(dataset: Dataset[_]): PCAAnomalyModel = { 82 | transformSchema(dataset.schema, logging = true) 83 | 84 | // remove anomalies 85 | val cleanDataset = dataset.filter(col($(labelCol)) === 0) 86 | 87 | // Fit regular PCA model 88 | val pcaModel = new PCA() 89 | .setInputCol($(inputCol)) 90 | .setOutputCol($(outputPCACol)) 91 | .setK($(k)) 92 | .fit(cleanDataset) 93 | 94 | copyValues(new PCAAnomalyModel(uid, pcaModel).setParent(this)) 95 | } 96 | 97 | override def transformSchema(schema: StructType): StructType = { 98 | validateAndTransformSchema(schema) 99 | } 100 | 101 | override def copy(extra: ParamMap): PCAAnomaly = defaultCopy(extra) 102 | } 103 | 104 | object PCAAnomaly extends DefaultParamsReadable[PCAAnomaly] { 105 | override def load(path: String): PCAAnomaly = super.load(path) 106 | } 107 | 108 | /** 109 | * Model fitted by [[PCAAnomaly]]. Uses PCA to detect anomalies 110 | * 111 | * @param pcaModel A PCA model 112 | */ 113 | class PCAAnomalyModel ( 114 | override val uid: String, 115 | val pcaModel: PCAModel) 116 | extends Model[PCAAnomalyModel] with PCAAnomalyParams with MLWritable { 117 | 118 | import PCAAnomalyModel._ 119 | 120 | def setInputCol(value: String): this.type = set(inputCol, value) 121 | def setOutputCol(value: String): this.type = set(outputCol, value) 122 | def setLabelCol(value: String): this.type = set(labelCol, value) 123 | def setOutputPCACol(value: String): this.type = set(outputPCACol, value) 124 | def setOutputAbsScoreCol(value: String): this.type = set(outputAbsScoreCol, value) 125 | def setK(value: Int): this.type = set(k, value) 126 | 127 | /** 128 | * Transform a vector by computed Principal Components. 129 | * 130 | * @note Vectors to be transformed must be the same length as the source vectors given 131 | * to `PCAAnomaly.fit()`. 
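 *
 * (The max and min used for the [0, 1] normalisation are computed with two separate aggregations over the
 *  scored DataFrame, so this transform() triggers extra Spark jobs beyond the usual lazy column projection.)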
132 | */ 133 | override def transform(dataset: Dataset[_]): DataFrame = { 134 | transformSchema(dataset.schema, logging = true) 135 | 136 | val pcaResults = pcaModel.transform(dataset) 137 | 138 | val anomalyScoreUdf = udf((originalFeatures:Vector, pcaFeatures:Vector) => { 139 | // Reconstruct vector using Principal components 140 | val reconstructedFeatures = pcaModel.pc.multiply(pcaFeatures) 141 | 142 | // Calculate error (sum of squared differences) 143 | val originalFeaturesB = DenseVector(originalFeatures.toArray) 144 | val reconstructedFeaturesB = DenseVector(reconstructedFeatures.toArray) 145 | val diff = originalFeaturesB - reconstructedFeaturesB 146 | val error = sum(pow(diff, 2)) 147 | error 148 | }) 149 | val anomalyScore = pcaResults.withColumn($(outputAbsScoreCol), anomalyScoreUdf(col($(inputCol)), col($(outputPCACol)))) 150 | 151 | // Normalize 152 | val Row(maxVal: Double) = anomalyScore.select(max($(outputAbsScoreCol))).head 153 | val Row(minVal: Double) = anomalyScore.select(min($(outputAbsScoreCol))).head 154 | val nomarlizeAnomalyScore = anomalyScore 155 | .withColumn($(outputCol), (col($(outputAbsScoreCol)) - minVal) / (maxVal - minVal)) 156 | 157 | nomarlizeAnomalyScore 158 | } 159 | 160 | override def transformSchema(schema: StructType): StructType = { 161 | validateAndTransformSchema(schema) 162 | } 163 | 164 | override def copy(extra: ParamMap): PCAAnomalyModel = { 165 | val copied = new PCAAnomalyModel(uid, pcaModel) 166 | copyValues(copied, extra).setParent(parent) 167 | } 168 | 169 | override def write: MLWriter = new PCAAnomalyModelWriter(this) 170 | } 171 | 172 | object PCAAnomalyModel extends MLReadable[PCAAnomalyModel] { 173 | 174 | private[PCAAnomalyModel] class PCAAnomalyModelWriter(instance: PCAAnomalyModel) extends MLWriter { 175 | override protected def saveImpl(path: String): Unit = { 176 | DefaultParamsWriter.saveMetadata(instance, path, sc) 177 | val pcaPath = new Path(path, "pca").toString 178 | instance.pcaModel.save(pcaPath) 179 | } 180 | } 181 | 182 | private class PCAAnomalyModelReader extends MLReader[PCAAnomalyModel] { 183 | 184 | private val className = classOf[PCAAnomalyModel].getName 185 | 186 | /** 187 | * Loads a [[PCAAnomalyModel]] from data located at the input path. 
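 * (Restores both the params metadata written by PCAAnomalyModelWriter and the nested PCAModel saved under
 *  the "pca" subdirectory.)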
188 | * 189 | * @param path path to serialized model data 190 | * @return a [[PCAAnomalyModel]] 191 | */ 192 | override def load(path: String): PCAAnomalyModel = { 193 | val metadata = DefaultParamsReader.loadMetadata(path, sc, className) 194 | val pcaPath = new Path(path, "pca").toString 195 | val pcaModel = PCAModel.load(pcaPath) 196 | val model = new PCAAnomalyModel(metadata.uid, pcaModel) 197 | DefaultParamsReader.getAndSetParams(model, metadata) 198 | model 199 | } 200 | } 201 | 202 | override def read: MLReader[PCAAnomalyModel] = new PCAAnomalyModelReader 203 | 204 | override def load(path: String): PCAAnomalyModel = super.load(path) 205 | } 206 | 207 | 208 | 209 | // COMMAND ---------- 210 | 211 | // MAGIC %md 212 | // MAGIC ## Use Custom Model in a Pipeline 213 | 214 | // COMMAND ---------- 215 | 216 | // MAGIC %md 217 | // MAGIC ### Setup 218 | 219 | // COMMAND ---------- 220 | 221 | import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoderEstimator, VectorAssembler, PCA, StandardScaler, MinMaxScaler, PCAAnomaly} 222 | import org.apache.spark.ml.{Pipeline, PipelineModel} 223 | import org.apache.spark.ml.linalg.{Vector, Vectors} 224 | import org.apache.spark.sql.functions._ 225 | import breeze.linalg.{DenseVector, sum} 226 | import breeze.numerics.pow 227 | 228 | val modelDir = "mnt/blob_storage/models/PCAAnomalyModel" 229 | 230 | // COMMAND ---------- 231 | 232 | // MAGIC %md 233 | // MAGIC ### Load and transform data 234 | 235 | // COMMAND ---------- 236 | 237 | // Read data 238 | spark.catalog.refreshTable("kdd") // need to refresh to invalidate cache 239 | val df = spark.read.table("kdd") 240 | 241 | // Clean data 242 | val cleanDf = df 243 | .withColumn("is_anomaly", when(col("label") === "normal.", 0).otherwise(1)) 244 | .na.drop() 245 | 246 | // Clean up labels for anomaly 247 | display(cleanDf) 248 | 249 | val columns = cleanDf.columns.toSet 250 | val features = columns -- Set("id", "label", "is_anomaly") 251 | val categoricalFeatures = Set("protocol_type", "service", "flag") 252 | val continuousFeatures = features -- categoricalFeatures 253 | 254 | // Split 255 | val Array(training, test) = cleanDf.randomSplit(Array(0.8, 0.2), seed = 123) 256 | 257 | 258 | // COMMAND ---------- 259 | 260 | // MAGIC %md 261 | // MAGIC ### Define Feature Estimators and Transformers 262 | 263 | // COMMAND ---------- 264 | 265 | // Indexers 266 | val indexers = categoricalFeatures.map({ colName => 267 | new StringIndexer().setInputCol(colName).setOutputCol(colName + "_index").setHandleInvalid("keep") 268 | }).toArray 269 | 270 | // Encoders 271 | val encoder = new OneHotEncoderEstimator() 272 | .setInputCols(categoricalFeatures.map(colName => colName + "_index").toArray) 273 | .setOutputCols(categoricalFeatures.map(colName => colName + "_encoded").toArray) 274 | 275 | // Vector Assembler 276 | var selectedFeatures = continuousFeatures ++ categoricalFeatures.map(colName => colName + "_encoded") 277 | val assembler = new VectorAssembler() 278 | .setInputCols(selectedFeatures.toArray) 279 | .setOutputCol("features") 280 | 281 | // Standard Scalar 282 | val standardScalar = new StandardScaler() 283 | .setInputCol("features") 284 | .setOutputCol("norm_features") 285 | .setWithMean(true) 286 | .setWithStd(true) 287 | 288 | // PCA Anomaly model 289 | val pcaAnom = new PCAAnomaly() 290 | .setInputCol("norm_features") 291 | .setOutputPCACol("pca_features") 292 | .setOutputCol("anomaly_score") 293 | .setLabelCol("is_anomaly") 294 | .setK(2) 295 | 296 | // COMMAND ---------- 297 | 298 | // MAGIC 
%md 299 | // MAGIC ### Build and Fit Pipeline using PCAAnomaly (custom model) 300 | // MAGIC ![PCAAnomaly Pipeline](files/images/PCAAnomalyPipeline.PNG) 301 | 302 | // COMMAND ---------- 303 | 304 | // Pipeline 305 | val mainPipeline = new Pipeline() 306 | .setStages(indexers ++ 307 | Array(encoder, assembler, standardScalar, pcaAnom)) //pcaAnom 308 | 309 | // Fit pipeline 310 | val mainPipelineModel = mainPipeline.fit(training) 311 | 312 | // Save pipeline 313 | mainPipelineModel 314 | .write 315 | .overwrite 316 | .save(modelDir) 317 | 318 | // COMMAND ---------- 319 | 320 | // MAGIC %md 321 | // MAGIC ### Use Model to predict anomalies 322 | 323 | // COMMAND ---------- 324 | 325 | // MAGIC %md 326 | // MAGIC #### Using training data 327 | 328 | // COMMAND ---------- 329 | 330 | // Load saved model 331 | val model = PipelineModel.load(modelDir) 332 | 333 | // Use model 334 | val transformedTraining = model.transform(training) 335 | .select("is_anomaly", "label", "anomaly_score") 336 | .cache() 337 | 338 | display(transformedTraining 339 | .groupBy("is_anomaly") 340 | .agg(avg("anomaly_score"))) 341 | 342 | // COMMAND ---------- 343 | 344 | display(transformedTraining 345 | .groupBy("label") 346 | .agg(avg("anomaly_score").alias("anomaly_score")) 347 | .sort(desc("anomaly_score"))) 348 | 349 | // COMMAND ---------- 350 | 351 | // MAGIC %md 352 | // MAGIC #### Using test data 353 | 354 | // COMMAND ---------- 355 | 356 | val transformedTest = mainPipelineModel.transform(test) 357 | .select("is_anomaly", "label", "anomaly_score") 358 | .cache() 359 | 360 | display(transformedTest 361 | .groupBy("is_anomaly") 362 | .agg(avg("anomaly_score"))) 363 | 364 | // COMMAND ---------- 365 | 366 | display(transformedTest 367 | .groupBy("label") 368 | .agg(avg("anomaly_score").alias("anomaly_score")) 369 | .sort(desc("anomaly_score"))) 370 | 371 | // COMMAND ---------- 372 | 373 | // MAGIC %md 374 | // MAGIC ### Evaluate Model using Test data 375 | 376 | // COMMAND ---------- 377 | 378 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 379 | 380 | val evaluator = new BinaryClassificationEvaluator() 381 | .setMetricName("areaUnderROC") 382 | .setLabelCol("is_anomaly") 383 | .setRawPredictionCol("anomaly_score") 384 | 385 | evaluator.evaluate(transformedTraining) -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/04_trainmodel_pca_wo_custom.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoderEstimator, VectorAssembler, PCA, StandardScaler, MinMaxScaler} 3 | import org.apache.spark.ml.Pipeline 4 | import org.apache.spark.ml.linalg.{Vector, Vectors} 5 | import org.apache.spark.sql.functions._ 6 | import breeze.linalg.{DenseVector, sum} 7 | import breeze.numerics.pow 8 | 9 | // COMMAND ---------- 10 | 11 | // MAGIC %md 12 | // MAGIC ## Read in data and perform data cleaning 13 | 14 | // COMMAND ---------- 15 | 16 | // Read data 17 | val df = spark.read.table("kdd") 18 | 19 | // Transform data 20 | val transformed_df = df.withColumnRenamed("label", "original_label") 21 | .withColumn("label_name", when(col("original_label") === "normal.", "normal").otherwise("anomaly")) 22 | 23 | // Drop nulls 24 | // Lace TODO 25 | 26 | // Clean up labels for anomaly 27 | display(transformed_df) 28 | 29 | // COMMAND ---------- 30 | 31 | // MAGIC %md 32 | // MAGIC ## Build data transformation ML pipeline 33 | 
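// MAGIC
// MAGIC The next cell chains one `StringIndexer` per categorical column, an `OneHotEncoderEstimator`, a
// MAGIC `StringIndexer` for the label, a `VectorAssembler` and a `StandardScaler`; only the resulting
// MAGIC `norm_features` and `label` columns are kept for the PCA step that follows.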
34 | // COMMAND ---------- 35 | 36 | val columns = df.columns.toSet 37 | val features = columns -- Set("id", "label", "original_label") 38 | val categoricalFeatures = Set("protocol_type", "service", "flag") 39 | val continuousFeatures = features -- categoricalFeatures 40 | | 41 | // Split data 42 | val Array(training, test) = transformed_df.randomSplit(Array(0.8, 0.2), seed = 123) 43 | 44 | // Indexers 45 | val indexers = categoricalFeatures.map({ colName => 46 | new StringIndexer().setInputCol(colName).setOutputCol(colName + "_index").setHandleInvalid("keep") 47 | }).toArray 48 | 49 | // Encoders 50 | val encoder = new OneHotEncoderEstimator() 51 | .setInputCols(categoricalFeatures.map(colName => colName + "_index").toArray) 52 | .setOutputCols(categoricalFeatures.map(colName => colName + "_encoded").toArray) 53 | 54 | // Label Indexer 55 | val labelIndexer = new StringIndexer() 56 | .setInputCol("label_name") 57 | .setOutputCol("label") 58 | 59 | // Vector Assembler 60 | var selectedFeatures = continuousFeatures ++ categoricalFeatures.map(colName => colName + "_encoded") 61 | val assembler = new VectorAssembler() 62 | .setInputCols(selectedFeatures.toArray) 63 | .setOutputCol("features") 64 | 65 | val standardScalar = new StandardScaler() 66 | .setInputCol("features") 67 | .setOutputCol("norm_features") 68 | .setWithMean(true) 69 | .setWithStd(true) 70 | 71 | // Pipeline 72 | val transformPipeline = new Pipeline() 73 | .setStages(indexers ++ Array(encoder, labelIndexer, assembler, standardScalar)) 74 | 75 | // Transform training 76 | val transformedTraining = transformPipeline 77 | .fit(training) 78 | .transform(training) 79 | .select("norm_features", "label") 80 | .cache() 81 | 82 | display(transformedTraining) 83 | 84 | // COMMAND ---------- 85 | 86 | // MAGIC %md 87 | // MAGIC ## Perform Principal Component Analysis 88 | 89 | // COMMAND ---------- 90 | 91 | // Fit PCA model 92 | val pca = new PCA() 93 | .setInputCol("norm_features") 94 | .setOutputCol("pca_features") 95 | .setK(3) 96 | .fit(transformedTraining) 97 | 98 | val pcaResult = pca 99 | .transform(transformedTraining) 100 | .select("label", "pca_features", "norm_features") 101 | .cache() 102 | 103 | display(pcaResult) 104 | 105 | // COMMAND ---------- 106 | 107 | // MAGIC %md 108 | // MAGIC ## Reconstruct features and calculate Anomaly Score 109 | // MAGIC Reconstruct the features using the Principal Components and the feature vectors. Then, calculate the normalized error, in this case the sum of squared differences from the original feature vector and the reconstructed features from the principal components. This becomes the Anomaly Score. 
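// MAGIC
// MAGIC Concretely, with `W` the k-column principal-component matrix: `pca_features = W^T x`, the
// MAGIC reconstruction is `W * pca_features`, and the raw score is `sum((x - reconstruction)^2)`, which is
// MAGIC min-max scaled to `[0, 1]` a few cells further down.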
110 | 111 | // COMMAND ---------- 112 | 113 | val reconstructionUdf = udf((v: Vector) => { 114 | // Reconstruct vector using Principal components 115 | pca.pc.multiply(v) 116 | }) 117 | val anomalyScoreUdf = udf((v:Vector, x:Vector) => { 118 | // Calculate error (sum of squared differences) 119 | val vB = DenseVector(v.toArray) 120 | val xB = DenseVector(x.toArray) 121 | val diff = vB - xB 122 | val error = sum(pow(diff, 2)) 123 | error 124 | }) 125 | val anomalyScore = pcaResult 126 | .withColumn("reconstruction", reconstructionUdf(col("pca_features"))) 127 | .withColumn("anomaly_score", anomalyScoreUdf(col("norm_features"), col("reconstruction"))) 128 | 129 | // COMMAND ---------- 130 | 131 | // MAGIC %md 132 | // MAGIC ## Normalize Anomaly Score 133 | 134 | // COMMAND ---------- 135 | 136 | // Vectorize Anomaly Score 137 | val anomalyAssembler = new VectorAssembler() 138 | .setInputCols(Array("anomaly_score")) 139 | .setOutputCol("anomaly_score_vec") 140 | 141 | // Normalize anomaly score 142 | val anomalyScoreScalar = new MinMaxScaler() 143 | .setInputCol("anomaly_score_vec") 144 | .setOutputCol("norm_anomaly_score_vec") 145 | 146 | // Pipeline 147 | val postTransformPipeline = new Pipeline() 148 | .setStages(Array(anomalyAssembler, anomalyScoreScalar)) 149 | 150 | val postTransformPipelineModel = postTransformPipeline 151 | .fit(anomalyScore) 152 | 153 | val vecToDoubleUdf = udf((v: Vector) => { v.toArray(0) }) 154 | val predictions = postTransformPipelineModel 155 | .transform(anomalyScore) 156 | .withColumn("norm_anomaly_score", vecToDoubleUdf(col("norm_anomaly_score_vec"))) 157 | .select("label", "norm_anomaly_score") 158 | .cache() 159 | 160 | display(predictions) 161 | 162 | // COMMAND ---------- 163 | 164 | // MAGIC %md 165 | // MAGIC ## Evaluate Model 166 | 167 | // COMMAND ---------- 168 | 169 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 170 | 171 | val evaluator = new BinaryClassificationEvaluator() 172 | .setMetricName("areaUnderROC") 173 | .setLabelCol("label") 174 | .setRawPredictionCol("norm_anomaly_score") 175 | 176 | var auc = evaluator.evaluate(predictions) 177 | 178 | // COMMAND ---------- 179 | 180 | 181 | 182 | // COMMAND ---------- 183 | 184 | // MAGIC %md 185 | // MAGIC # Custom Transformer and Estimator 186 | 187 | // COMMAND ---------- 188 | 189 | package org.apache.spark.ml.feature 190 | 191 | import org.apache.hadoop.fs.Path 192 | 193 | import org.apache.spark.ml._ 194 | import org.apache.spark.ml.linalg._ 195 | import org.apache.spark.ml.param._ 196 | import org.apache.spark.ml.param.shared._ 197 | import org.apache.spark.ml.util._ 198 | // import org.apache.spark.ml.feature.{PCA, PCAModel} 199 | import org.apache.spark.rdd.RDD 200 | import org.apache.spark.sql._ 201 | import org.apache.spark.sql.functions._ 202 | import org.apache.spark.sql.types.{StructField, StructType} 203 | import org.apache.spark.mllib.linalg.VectorUDT 204 | 205 | import breeze.linalg.{DenseVector, sum} 206 | import breeze.numerics.pow 207 | 208 | 209 | /** 210 | * Params for [[PCAAnomaly]] and [[PCAAnomalyModel]]. 
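 *
 * (This second copy of the custom model is the simplified one: unlike the version in
 *  04_trainmodel_pca_w_custom, fit() does not filter anomalous rows out before fitting PCA and transform()
 *  emits the raw reconstruction error without min-max normalisation.)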
211 | */ 212 | trait PCAAnomalyParams extends Params with HasInputCol with HasOutputCol { 213 | //final val inputCol= new Param[String](this, "inputCol", "The input column") 214 | //final val outputCol = new Param[String](this, "outputCol", "The output column") 215 | final val outputPCACol = new Param[String](this, "outputPCACol", "The output column with PCA features") 216 | final val k: IntParam = new IntParam(this, "k", "the number of principal components (> 0)", 217 | ParamValidators.gt(0)) 218 | 219 | /** Validates and transforms the input schema. */ 220 | protected def validateAndTransformSchema(schema: StructType): StructType = { 221 | //SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) 222 | require(!schema.fieldNames.contains($(outputCol)), s"Output column ${$(outputCol)} already exists.") 223 | val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) 224 | StructType(outputFields) 225 | } 226 | } 227 | 228 | /** 229 | * PCA trains a model to project vectors to a lower dimensional space of the top `PCA!.k` 230 | * principal components. 231 | */ 232 | class PCAAnomaly (override val uid: String) 233 | extends Estimator[PCAAnomalyModel] with PCAAnomalyParams with DefaultParamsWritable { 234 | 235 | def this() = this(Identifiable.randomUID("pca_anomaly")) 236 | 237 | def setInputCol(value: String): this.type = set(inputCol, value) 238 | def setOutputCol(value: String): this.type = set(outputCol, value) 239 | def setOutputPCACol(value: String): this.type = set(outputPCACol, value) 240 | def setK(value: Int): this.type = set(k, value) 241 | 242 | /** 243 | * Computes a [[PCAAnomalyModel]] that contains the principal components of the input vectors. 244 | */ 245 | override def fit(dataset: Dataset[_]): PCAAnomalyModel = { 246 | transformSchema(dataset.schema, logging = true) 247 | 248 | // Fit regular PCA model 249 | val pcaModel = new PCA() 250 | .setInputCol($(inputCol)) 251 | .setOutputCol($(outputPCACol)) 252 | .setK($(k)) 253 | .fit(dataset) 254 | 255 | copyValues(new PCAAnomalyModel(uid, pcaModel).setParent(this)) 256 | } 257 | 258 | override def transformSchema(schema: StructType): StructType = { 259 | validateAndTransformSchema(schema) 260 | } 261 | 262 | override def copy(extra: ParamMap): PCAAnomaly = defaultCopy(extra) 263 | } 264 | 265 | object PCAAnomaly extends DefaultParamsReadable[PCAAnomaly] { 266 | override def load(path: String): PCAAnomaly = super.load(path) 267 | } 268 | 269 | /** 270 | * Model fitted by [[PCAAnomaly]]. Uses PCA to detect anomalies 271 | * 272 | * @param pcaModel A PCA model 273 | */ 274 | class PCAAnomalyModel ( 275 | override val uid: String, 276 | val pcaModel: PCAModel) 277 | extends Model[PCAAnomalyModel] with PCAAnomalyParams with MLWritable { 278 | 279 | import PCAAnomalyModel._ 280 | 281 | /** @group setParam */ 282 | def setInputCol(value: String): this.type = set(inputCol, value) 283 | 284 | /** @group setParam */ 285 | def setOutputCol(value: String): this.type = set(outputCol, value) 286 | 287 | /** 288 | * Transform a vector by computed Principal Components. 289 | * 290 | * @note Vectors to be transformed must be the same length as the source vectors given 291 | * to `PCAAnomaly.fit()`. 
292 | */ 293 | override def transform(dataset: Dataset[_]): DataFrame = { 294 | transformSchema(dataset.schema, logging = true) 295 | 296 | val pcaResults = pcaModel.transform(dataset) 297 | 298 | val anomalyScoreUdf = udf((originalFeatures:Vector, pcaFeatures:Vector) => { 299 | // Reconstruct vector using Principal components 300 | val reconstructedFeatures = pcaModel.pc.multiply(pcaFeatures) 301 | 302 | // Calculate error (sum of squared differences) 303 | val originalFeaturesB = DenseVector(originalFeatures.toArray) 304 | val reconstructedFeaturesB = DenseVector(reconstructedFeatures.toArray) 305 | val diff = originalFeaturesB - reconstructedFeaturesB 306 | val error = sum(pow(diff, 2)) 307 | error 308 | }) 309 | pcaResults.withColumn($(outputCol), anomalyScoreUdf(col($(inputCol)), col($(outputPCACol)))) 310 | } 311 | 312 | override def transformSchema(schema: StructType): StructType = { 313 | validateAndTransformSchema(schema) 314 | } 315 | 316 | override def copy(extra: ParamMap): PCAAnomalyModel = { 317 | val copied = new PCAAnomalyModel(uid, pcaModel) 318 | copyValues(copied, extra).setParent(parent) 319 | } 320 | 321 | override def write: MLWriter = new PCAAnomalyModelWriter(this) 322 | } 323 | 324 | object PCAAnomalyModel extends MLReadable[PCAAnomalyModel] { 325 | 326 | private[PCAAnomalyModel] class PCAAnomalyModelWriter(instance: PCAAnomalyModel) extends MLWriter { 327 | override protected def saveImpl(path: String): Unit = { 328 | DefaultParamsWriter.saveMetadata(instance, path, sc) 329 | val pcaPath = new Path(path, "pca").toString 330 | instance.pcaModel.save(pcaPath) 331 | } 332 | } 333 | 334 | private class PCAAnomalyModelReader extends MLReader[PCAAnomalyModel] { 335 | 336 | private val className = classOf[PCAAnomalyModel].getName 337 | 338 | /** 339 | * Loads a [[PCAAnomalyModel]] from data located at the input path. 
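 * (Unlike the reader in 04_trainmodel_pca_w_custom, this one does not call
 *  DefaultParamsReader.getAndSetParams, so the inputCol/outputPCACol/outputCol/k values saved in the
 *  metadata are not restored onto the reloaded model.)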
340 | * 341 | * @param path path to serialized model data 342 | * @return a [[PCAAnomalyModel]] 343 | */ 344 | override def load(path: String): PCAAnomalyModel = { 345 | val metadata = DefaultParamsReader.loadMetadata(path, sc, className) 346 | val pcaPath = new Path(path, "pca").toString 347 | val pcaModel = PCAModel.load(pcaPath) 348 | new PCAAnomalyModel(metadata.uid, pcaModel) 349 | } 350 | } 351 | 352 | override def read: MLReader[PCAAnomalyModel] = new PCAAnomalyModelReader 353 | 354 | override def load(path: String): PCAAnomalyModel = super.load(path) 355 | } 356 | 357 | 358 | 359 | // COMMAND ---------- 360 | 361 | import org.apache.spark.ml.feature.PCAAnomaly 362 | 363 | // Fit PCA model 364 | val pcaAnomaly = new PCAAnomaly() 365 | .setInputCol("norm_features") 366 | .setOutputPCACol("pca_features") 367 | .setOutputCol("anomaly_score") 368 | .setK(3) 369 | .fit(transformedTraining) 370 | 371 | val pcaResult = pcaAnomaly 372 | .transform(transformedTraining) 373 | .select("label", "anomaly_score", "pca_features", "norm_features") 374 | .cache() 375 | 376 | display(pcaResult) 377 | 378 | // COMMAND ---------- 379 | 380 | pcaAnomaly.save("/mnt/blob_storage/models/PCAAnomalyModel") 381 | 382 | // COMMAND ---------- 383 | 384 | // Pipeline 385 | 386 | val pcaAnom = new PCAAnomaly() 387 | .setInputCol("norm_features") 388 | .setOutputPCACol("pca_features") 389 | .setOutputCol("anomaly_score") 390 | .setK(3) 391 | 392 | val mainPipeline = new Pipeline() 393 | .setStages(indexers ++ Array(encoder, labelIndexer, assembler, standardScalar, pcaAnom, anomalyAssembler, anomalyScoreScalar)) 394 | 395 | val mainResult = mainPipeline.fit(training) 396 | -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/05_batch_scoring.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | import org.apache.spark.ml.{Pipeline, PipelineModel} 3 | import org.apache.spark.ml.classification._ 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types._ 6 | 7 | // COMMAND ---------- 8 | 9 | display(spark.catalog.listTables()) 10 | 11 | // COMMAND ---------- 12 | 13 | display(dbutils.fs.ls("/mnt/blob_storage/models")) 14 | 15 | // COMMAND ---------- 16 | 17 | // Load data 18 | // In production, you may need to filter since last run 19 | val df = spark.read.table("kdd_unlabeled") 20 | 21 | // Clean data 22 | val cleanDf = df.na.drop() // For production, may need to save this to another table, or impute null values 23 | 24 | // Load model 25 | val modelLoc = "/mnt/blob_storage/models/RandomForestPipeline" 26 | val model = PipelineModel.load(modelLoc) 27 | 28 | // Make predictions 29 | val predictions = model.transform(cleanDf) 30 | 31 | // Save data 32 | predictions.write.mode("append").saveAsTable("kdd_predictions") -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/06a_streaming_datagen.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | import org.apache.spark.eventhubs.{ ConnectionStringBuilder, EventHubsConf, EventPosition } 3 | import org.apache.spark.sql.functions.{ explode, split, to_json, struct } 4 | import org.apache.spark.sql.streaming.Trigger.ProcessingTime 5 | 6 | // Retrieve storage credentials 7 | val ehNamespace = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_namespace") 8 | val ehData = 
dbutils.secrets.get(scope = "storage_scope", key = "eventhub_data_name") 9 | val ehDataSendKey = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_data_send_key") 10 | 11 | // Set data path 12 | val data_path = "/mnt/blob_storage/data/for_streaming" 13 | 14 | val connectionString = ConnectionStringBuilder() 15 | .setNamespaceName(ehNamespace) 16 | .setEventHubName(ehData) 17 | .setSasKeyName("send") 18 | .setSasKey(ehDataSendKey) 19 | .build 20 | 21 | val eventHubsConf = EventHubsConf(connectionString) 22 | .setStartingPosition(EventPosition.fromEndOfStream) 23 | 24 | // COMMAND ---------- 25 | 26 | val kdd_schema = spark.read.table("kdd_unlabeled").schema 27 | val kdd_unlabeled_df = spark 28 | .readStream 29 | .schema(kdd_schema) 30 | .csv(s"$data_path/kddcup.testdata.unlabeled/") 31 | 32 | val kdd_unlabeled_df_json = kdd_unlabeled_df.select(to_json( 33 | struct( 34 | $"id", 35 | $"duration", 36 | $"protocol_type", 37 | $"service", 38 | $"src_bytes", 39 | $"dst_bytes", 40 | $"flag", 41 | $"land", 42 | $"wrong_fragment", 43 | $"urgent")).alias("body")) 44 | 45 | // COMMAND ---------- 46 | 47 | // // Output to console 48 | // var query = kdd_unlabeled_df_json 49 | // .writeStream 50 | // .outputMode("append") 51 | // .format("console") 52 | // .option("truncate", false) 53 | // .start() 54 | // query.awaitTermination() 55 | 56 | // COMMAND ---------- 57 | 58 | val query = 59 | kdd_unlabeled_df_json 60 | .writeStream 61 | .format("eventhubs") 62 | .outputMode("update") 63 | .options(eventHubsConf.toMap) 64 | .trigger(ProcessingTime("10 seconds")) 65 | .option("checkpointLocation", s"$data_path/checkpoints/kdd_unlabeled_gen/") 66 | .start() -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/06b_streaming_scoring.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | import org.apache.spark.eventhubs.{ ConnectionStringBuilder, EventHubsConf, EventPosition } 3 | import org.apache.spark.sql.functions.{ explode, split } 4 | import org.apache.spark.sql.streaming.Trigger.ProcessingTime 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.ml.{Pipeline, PipelineModel} 8 | import org.apache.spark.ml.feature._ 9 | import org.apache.spark.ml.linalg.{Vector, Vectors} 10 | 11 | // COMMAND ---------- 12 | 13 | // MAGIC %md 14 | // MAGIC ## Setup 15 | // MAGIC Retrieve secrets, setup EventHub connection, load save Anomaly Model 16 | 17 | // COMMAND ---------- 18 | 19 | // Retrieve storage credentials 20 | val ehNamespace = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_namespace") 21 | val ehData = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_data_name") 22 | val ehDataListenKey = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_data_listen_key") 23 | val ehAnom = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_anom_name") 24 | val ehAnomSendKey = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_anom_send_key") 25 | 26 | // Set storage mount path 27 | val storage_mount_path = "/mnt/blob_storage" 28 | 29 | // Set data path 30 | val data_path = "/mnt/blob_storage/data/for_streaming" 31 | 32 | // Load model 33 | val model = PipelineModel.load(s"$storage_mount_path/models/RandomForestPipeline") 34 | 35 | // Setup EH connection 36 | val dataEhConnectionString = ConnectionStringBuilder() 37 | .setNamespaceName(ehNamespace) 38 | 
.setEventHubName(ehData) 39 | .setSasKeyName("listen") 40 | .setSasKey(ehDataListenKey) 41 | .build 42 | val dataEhConf = EventHubsConf(dataEhConnectionString) 43 | .setStartingPosition(EventPosition.fromEndOfStream) 44 | 45 | val anomEhConnectionString = ConnectionStringBuilder() 46 | .setNamespaceName(ehNamespace) 47 | .setEventHubName(ehAnom) 48 | .setSasKeyName("send") 49 | .setSasKey(ehAnomSendKey) 50 | .build 51 | val anomEhConf = EventHubsConf(anomEhConnectionString) 52 | .setStartingPosition(EventPosition.fromEndOfStream) 53 | 54 | 55 | // COMMAND ---------- 56 | 57 | // MAGIC %md 58 | // MAGIC ## Read message from EventHubs 59 | 60 | // COMMAND ---------- 61 | 62 | // Read stream 63 | val incomingStream = spark 64 | .readStream 65 | .format("eventhubs") 66 | .options(dataEhConf.toMap) 67 | .load() 68 | 69 | // Event Hub message format is JSON and contains "body" field 70 | // Body is binary, so we cast it to string to see the actual content of the message 71 | val messages = 72 | incomingStream 73 | .withColumn("Offset", $"offset".cast(LongType)) 74 | .withColumn("Time (readable)", $"enqueuedTime".cast(TimestampType)) 75 | .withColumn("Timestamp", $"enqueuedTime".cast(LongType)) 76 | .withColumn("Body", $"body".cast(StringType)) 77 | .withWatermark("Time (readable)", "10 minutes") 78 | .select("Offset", "Time (readable)", "Timestamp", "Body") 79 | 80 | messages.printSchema 81 | 82 | // COMMAND ---------- 83 | 84 | // MAGIC %md 85 | // MAGIC ## Transform and enrich message through joining with static data 86 | 87 | // COMMAND ---------- 88 | 89 | var messageTransformed = 90 | messages 91 | .select( 92 | get_json_object($"Body", "$.id").cast(StringType).alias("id"), 93 | get_json_object($"Body", "$.duration").cast(FloatType).alias("duration"), 94 | get_json_object($"Body", "$.protocol_type").cast(StringType).alias("protocol_type"), 95 | get_json_object($"Body", "$.service").cast(StringType).alias("service"), 96 | get_json_object($"Body", "$.src_bytes").cast(FloatType).alias("src_bytes"), 97 | get_json_object($"Body", "$.dst_bytes").cast(FloatType).alias("dst_bytes"), 98 | get_json_object($"Body", "$.flag").cast(StringType).alias("flag"), 99 | get_json_object($"Body", "$.land").cast(ShortType).alias("land"), 100 | get_json_object($"Body", "$.wrong_fragment").cast(FloatType).alias("wrong_fragment"), 101 | get_json_object($"Body", "$.urgent").cast(FloatType).alias("urgent"), 102 | $"Timestamp") 103 | 104 | // Join with static table 105 | val kdd_unlabeled = spark.read.table("kdd_unlabeled") 106 | val messageAll = messageTransformed 107 | .join(kdd_unlabeled, messageTransformed("id") === kdd_unlabeled("id"), "left_outer") 108 | .drop(kdd_unlabeled("id")) 109 | .drop(kdd_unlabeled("duration")) 110 | .drop(kdd_unlabeled("protocol_type")) 111 | .drop(kdd_unlabeled("service")) 112 | .drop(kdd_unlabeled("src_bytes")) 113 | .drop(kdd_unlabeled("dst_bytes")) 114 | .drop(kdd_unlabeled("flag")) 115 | .drop(kdd_unlabeled("land")) 116 | .drop(kdd_unlabeled("wrong_fragment")) 117 | .drop(kdd_unlabeled("urgent")) 118 | 119 | messageAll.printSchema 120 | 121 | // COMMAND ---------- 122 | 123 | // MAGIC %md 124 | // MAGIC ## Use model to identify Anomalies in data stream 125 | 126 | // COMMAND ---------- 127 | 128 | // Make predictions 129 | val anomalies = model.transform(messageAll).filter("prediction == 1") 130 | 131 | // COMMAND ---------- 132 | 133 | // MAGIC %md 134 | // MAGIC ## Output anomalies 135 | 136 | // COMMAND ---------- 137 | 138 | // // Output to console 139 | // var query = anomalies 
140 | // .select("id", "probability", "prediction") //filter for easy viewing 141 | // .writeStream 142 | // .outputMode("append") 143 | // .format("console") 144 | // .option("truncate", false) 145 | // .start() 146 | // query.awaitTermination() 147 | 148 | // COMMAND ---------- 149 | 150 | // Wrap in body tag 151 | val anomalies_wrapper = anomalies.select(to_json( 152 | struct( 153 | $"id", 154 | $"probability")).alias("body")) 155 | 156 | val query = 157 | anomalies_wrapper 158 | .writeStream 159 | .format("eventhubs") 160 | .outputMode("update") 161 | .options(anomEhConf.toMap) 162 | .trigger(ProcessingTime("10 seconds")) 163 | .option("checkpointLocation", s"$data_path/checkpoints/anomalies/") 164 | .start() 165 | 166 | // COMMAND ---------- 167 | 168 | println(query.lastProgress) -------------------------------------------------------------------------------- /references/Lace Lofranco - Building Advanced Analytics Pipelines with Azure Databricks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/references/Lace Lofranco - Building Advanced Analytics Pipelines with Azure Databricks.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv>=0.5.1 2 | databricks-cli==0.8.2 3 | msrestazure~=0.4.32 4 | azure-cli==2.0.67 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='src', 5 | packages=find_packages(), 6 | version='0.1.0', 7 | description='An anomaly detection data pipeline on Azure Databricks', 8 | author='Lace Lofranco', 9 | license='MIT', 10 | ) 11 | -------------------------------------------------------------------------------- /test_environment.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | REQUIRED_PYTHON = "python3" 4 | 5 | 6 | def main(): 7 | system_major = sys.version_info.major 8 | if REQUIRED_PYTHON == "python": 9 | required_major = 2 10 | elif REQUIRED_PYTHON == "python3": 11 | required_major = 3 12 | else: 13 | raise ValueError("Unrecognized python interpreter: {}".format( 14 | REQUIRED_PYTHON)) 15 | 16 | if system_major != required_major: 17 | raise TypeError( 18 | "This project requires Python {}. Found: Python {}".format( 19 | required_major, sys.version)) 20 | else: 21 | print(">>> Development environment passes all tests!") 22 | 23 | 24 | if __name__ == '__main__': 25 | main() 26 | --------------------------------------------------------------------------------