├── .dockerignore ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── azure-pipelines.yml ├── deploy ├── Dockerfile ├── azuredeploy.json ├── azuredeploy.parameters.json ├── databricks │ ├── config │ │ ├── cluster.config.json │ │ ├── job.batchscoring.config.json │ │ ├── job.streamdatagen.config.json │ │ ├── job.streamscoring.config.json │ │ ├── run.downloaddata.config.json │ │ ├── run.etl.config.json │ │ └── run.trainmodelall.config.json │ ├── configure_databricks.sh │ └── create_secrets.sh └── deploy.sh ├── images ├── FileStore │ ├── GBTModel.PNG │ ├── LogRegCVPipeline.PNG │ ├── MLPipeline.PNG │ ├── PCAAnomalyPipeline.PNG │ ├── RandomForestPipeline.PNG │ ├── TransformPipeline.PNG │ └── transformation_and_actions.PNG └── archi.PNG ├── notebooks ├── .gitkeep └── databricks_notebooks │ ├── 00_demo_hello_spark.py │ ├── 01_download_data.py │ ├── 02_ETL.py │ ├── 03_explore_data.py │ ├── 04_trainmodel_multiple.scala │ ├── 04_trainmodel_pca_w_custom.scala │ ├── 04_trainmodel_pca_wo_custom.scala │ ├── 05_batch_scoring.scala │ ├── 06a_streaming_datagen.scala │ └── 06b_streaming_scoring.scala ├── references └── Lace Lofranco - Building Advanced Analytics Pipelines with Azure Databricks.pdf ├── requirements.txt ├── setup.py └── test_environment.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .env 2 | .git 3 | .cache 4 | *.md 5 | !README.md 6 | data -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # DotEnv configuration 60 | .env 61 | 62 | # Database 63 | *.db 64 | *.rdb 65 | 66 | # Pycharm 67 | .idea 68 | 69 | # VS Code 70 | .vscode/ 71 | 72 | # Spyder 73 | .spyproject/ 74 | 75 | # Jupyter NB Checkpoints 76 | .ipynb_checkpoints/ 77 | 78 | # exclude data from source control by default 79 | /data/ 80 | 81 | # Mac OS-specific storage files 82 | .DS_Store 83 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 devlace 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean data lint requirements deploy_resources deploy deploy_w_docker download_notebooks 2 | 3 | ################################################################################# 4 | # GLOBALS # 5 | ################################################################################# 6 | 7 | PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 8 | PROFILE = default 9 | PROJECT_NAME = azure-databricks-anomaly 10 | PYTHON_INTERPRETER = python3 11 | DOCKER_DEPLOY_CONTAINER = devlace/azdatabricksanomaly 12 | DATABRICKS_NOTEBOOKS_FOLDER = anomaly 13 | 14 | ifeq (,$(shell which conda)) 15 | HAS_CONDA=False 16 | else 17 | HAS_CONDA=True 18 | endif 19 | 20 | ################################################################################# 21 | # COMMANDS # 22 | ################################################################################# 23 | 24 | ## Install Python Dependencies 25 | requirements: test_environment 26 | pip install -U pip setuptools wheel 27 | pip install -r requirements.txt 28 | 29 | ## Deploy infrastructure 30 | deploy_resources: 31 | deploy/deploy.sh 32 | 33 | ## Deploys entire solution 34 | deploy: deploy_resources data 35 | deploy/databricks/create_secrets.sh 36 | deploy/databricks/configure_databricks.sh 37 | 38 | ## Deploys entire solutions using Docker 39 | deploy_w_docker: 40 | docker build -t $(DOCKER_DEPLOY_CONTAINER) -f deploy/Dockerfile . 41 | docker run -it $(DOCKER_DEPLOY_CONTAINER) 42 | 43 | ## Download notebooks in anomaly workspace folder locally 44 | download_notebooks: 45 | databricks workspace export_dir --overwrite /$(DATABRICKS_NOTEBOOKS_FOLDER) notebooks/databricks_notebooks 46 | 47 | ## Downloads models 48 | download_models: 49 | databricks fs cp --recursive --overwrite dbfs:/mnt/blob_storage/models/ models/ 50 | 51 | ## Delete all compiled Python files 52 | clean: 53 | find . -type f -name "*.py[co]" -delete 54 | find . -type d -name "__pycache__" -delete 55 | 56 | ## Lint using flake8 57 | lint: 58 | flake8 src 59 | 60 | 61 | ## Set up python interpreter environment 62 | create_environment: 63 | ifeq (True,$(HAS_CONDA)) 64 | @echo ">>> Detected conda, creating conda environment." 65 | ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER))) 66 | conda create --name $(PROJECT_NAME) python=3 67 | else 68 | conda create --name $(PROJECT_NAME) python=2.7 69 | endif 70 | @echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)" 71 | else 72 | @pip install -q virtualenv virtualenvwrapper 73 | @echo ">>> Installing virtualenvwrapper if not already intalled.\nMake sure the following lines are in shell startup file\n\ 74 | export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n" 75 | @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)" 76 | @echo ">>> New virtualenv created. 
Activate with:\nworkon $(PROJECT_NAME)" 77 | endif 78 | 79 | ## Test python environment is setup correctly 80 | test_environment: 81 | $(PYTHON_INTERPRETER) test_environment.py 82 | 83 | ################################################################################# 84 | # PROJECT RULES # 85 | ################################################################################# 86 | 87 | 88 | 89 | ################################################################################# 90 | # Self Documenting Commands # 91 | ################################################################################# 92 | 93 | .DEFAULT_GOAL := help 94 | 95 | # Inspired by 96 | # sed script explained: 97 | # /^##/: 98 | # * save line in hold space 99 | # * purge line 100 | # * Loop: 101 | # * append newline + line to hold space 102 | # * go to next line 103 | # * if line starts with doc comment, strip comment character off and loop 104 | # * remove target prerequisites 105 | # * append hold space (+ newline) to line 106 | # * replace newline plus comments by `---` 107 | # * print line 108 | # Separate expressions are necessary because labels cannot be delimited by 109 | # semicolon; see 110 | .PHONY: help 111 | help: 112 | @echo "$$(tput bold)Available rules:$$(tput sgr0)" 113 | @echo 114 | @sed -n -e "/^## / { \ 115 | h; \ 116 | s/.*//; \ 117 | :doc" \ 118 | -e "H; \ 119 | n; \ 120 | s/^## //; \ 121 | t doc" \ 122 | -e "s/:.*//; \ 123 | G; \ 124 | s/\\n## /---/; \ 125 | s/\\n/ /g; \ 126 | p; \ 127 | }" ${MAKEFILE_LIST} \ 128 | | LC_ALL='C' sort --ignore-case \ 129 | | awk -F '---' \ 130 | -v ncol=$$(tput cols) \ 131 | -v indent=19 \ 132 | -v col_on="$$(tput setaf 6)" \ 133 | -v col_off="$$(tput sgr0)" \ 134 | '{ \ 135 | printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ 136 | n = split($$2, words, " "); \ 137 | line_length = ncol - indent; \ 138 | for (i = 1; i <= n; i++) { \ 139 | line_length -= length(words[i]) + 1; \ 140 | if (line_length <= 0) { \ 141 | line_length = ncol - indent - length(words[i]) - 1; \ 142 | printf "\n%*s ", -indent, " "; \ 143 | } \ 144 | printf "%s ", words[i]; \ 145 | } \ 146 | printf "\n"; \ 147 | }' \ 148 | | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') 149 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://msdevlace.visualstudio.com/MLDevOps/_apis/build/status/devlace.azure-databricks-anomaly)](https://msdevlace.visualstudio.com/MLDevOps/_build/latest?definitionId=3) 2 | 3 | Anomaly Detection Pipeline on Azure Databricks 4 | ============================== 5 | 6 | The following is an anomaly detection data pipeline on Azure Databricks. This solution was built to demonstrate how to build Advance Analytics Pipelines on Azure Databricks, with a particular focus on the Spark MLLib library. This solution includes: 7 | 1. Initial ETL Data loading process into SparkSQL tables 8 | 2. Model training and scoring 9 | - Explanation of Pipelines, Transformer and Estimators 10 | - Sample Custom Estimator (PCAAnomaly) 11 | 3. Persisting trained models 12 | 4. 
Productionizing models through 13 | - Batch inference 14 | - Streaming 15 | 16 | # Architecture 17 | ![Architecture](images/archi.PNG?raw=true "Architecture") 18 | 19 | 20 | # Data 21 | [KDD Cup 1999 Data](http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html) 22 | 23 | # Deployment 24 | 25 | - Ensure you are in the root of the repository 26 | - To deploy the solution, use one of the following commands: 27 | 1. (*Easiest*) Using pre-built docker container: `docker run -it devlace/azdatabricksanomaly` 28 | 2. Build and run the container locally: `make deploy_w_docker` 29 | 3. Deploy using local environment (see requirements below): `make deploy` 30 | - Follow the prompts to login to Azure, name of resource group, deployment location, etc. 31 | - When prompted for a Databricks Host, enter the full name of your databricks workspace host, e.g. `https://southeastasia.azuredatabricks.net` 32 | - When prompted for a token, you can [generate a new token](https://docs.databricks.com/api/latest/authentication.html) in the databricks workspace. 33 | 34 | To view additional make commands run `make` 35 | 36 | ## For local deployment 37 | 38 | ### Requirements 39 | 40 | - [Azure CLI 2.0+](https://azure.github.io/projects/clis/) 41 | - [Python virtualenv](http://docs.python-guide.org/en/latest/dev/virtualenvs/) or [Anaconda](https://anaconda.org/anaconda/python) 42 | - [jq tool](https://stedolan.github.io/jq/download/) 43 | - Check the requirements.txt for list of necessary Python packages. (will be installed by `make requirements`) 44 | 45 | ### Development environment 46 | 47 | - The following works with [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10) 48 | - Clone this repository 49 | - `cd azure-databricks-anomaly` 50 | - Create a python environment (Virtualenv or Conda). The following uses virtualenv. 51 | - `virtualenv .` This creates a python virtual environment to work in. 52 | - `source bin/activate` This activates the virtual environment. 53 | - `make requirements`. This installs python dependencies in the virtual environment. 54 | 55 | # Project Organization 56 | ------------ 57 | 58 | ├── LICENSE 59 | ├── Makefile <- Makefile with commands like `make data` or `make train` 60 | ├── README.md <- The top-level README for developers using this project. 61 | ├── deploy <- Deployment artifacts 62 | │ │ 63 | │   └── databricks <- Deployment artifacts in relation to the Databricks workspace 64 | │ │ 65 | │ └── deploy.sh <- Deployment script to deploy all Azure Resources 66 | │ │ 67 | │ └── azuredeploy.json <- Azure ARM template w/ .parameters file 68 | │ │ 69 | │ └── Dockerfile <- Dockerfile for deployment 70 | │ 71 | ├── models <- Trained and serialized models, model predictions, or model summaries 72 | │ 73 | ├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering), 74 | │ the creator's initials, and a short `-` delimited description, e.g. 75 | │ `1.0-jqp-initial-data-exploration`. 76 | │ 77 | ├── references <- Contains the powerpoint presentation, and other reference materials. 78 | │ 79 | ├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g. 80 | │ generated with `pip freeze > requirements.txt` 81 | │ 82 | ├── setup.py <- makes project pip installable (pip install -e .) so src can be imported 83 | 84 | 85 | -------- 86 | 87 |
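The following is a minimal sketch of the local (non-Docker) deployment flow described above, assuming a WSL or Linux shell with the Azure CLI, jq and Python available (the clone URL is inferred from this project's GitHub location):

```bash
# Clone the repository and switch into it
git clone https://github.com/devlace/azure-databricks-anomaly.git
cd azure-databricks-anomaly

# Create and activate an isolated Python environment
virtualenv .
source bin/activate

# Install Python dependencies (runs test_environment.py first via the Makefile)
make requirements

# Deploy the Azure resources, create Databricks secrets and configure the workspace.
# You will be prompted for a resource group name, an Azure location, a subscription,
# then a Databricks host (e.g. https://southeastasia.azuredatabricks.net) and a personal access token.
make deploy
```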

Project based on the [cookiecutter data science project template](https://drivendata.github.io/cookiecutter-data-science/). #cookiecutterdatascience

88 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # Docker image 2 | # Build a Docker image to run, deploy, or push to a container registry. 3 | # Add steps that use Docker Compose, tag images, push to a registry, run an image, and more: 4 | # https://docs.microsoft.com/vsts/pipelines/languages/docker 5 | 6 | pool: 7 | vmImage: 'Ubuntu 16.04' 8 | 9 | variables: 10 | dockerId: 'devlace' 11 | imageName: 'azdatabricksanomaly' 12 | 13 | steps: 14 | - script: docker build -f deploy/Dockerfile -t $(dockerId)/$(imageName) . 15 | displayName: 'docker build' 16 | - script: | 17 | docker login -u $(dockerId) -p $(dockerPswd) 18 | docker push $(dockerId)/$(imageName) 19 | displayName: 'docker push' 20 | -------------------------------------------------------------------------------- /deploy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:4.6.14 2 | 3 | # Install any needed packages specified in requirements.txt 4 | RUN apt-get update 5 | RUN apt-get install -y autoconf=2.69-10 automake=1:1.15-6 build-essential=12.3 libtool=2.4.6-2 python-dev=2.7.13-2 jq=1.5+dfsg-1.3 6 | 7 | # Set the working directory to / 8 | WORKDIR / 9 | # Copy the directory contents into the container at / 10 | COPY . / 11 | 12 | RUN make requirements 13 | 14 | RUN chmod +x -R /deploy 15 | 16 | CMD ["make", "deploy"] 17 | 18 | 19 | -------------------------------------------------------------------------------- /deploy/azuredeploy.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "deployNs": { 6 | "type": "string" 7 | }, 8 | "dbricksWorkspaceName": { 9 | "defaultValue": "[concat(parameters('deployNs'), 'dbricks', uniqueString(resourceGroup().id))]", 10 | "type": "string" 11 | }, 12 | "dbricksLocation": { 13 | "defaultValue": "[resourceGroup().location]", 14 | "type": "string", 15 | "metadata": { 16 | "description": "Location of Databricks workspace" 17 | } 18 | }, 19 | "dbricksTier": { 20 | "defaultValue": "premium", 21 | "type": "string", 22 | "allowedValues": [ 23 | "premium", 24 | "standard" 25 | ] 26 | }, 27 | "eventhubsNsName": { 28 | "defaultValue": "[concat(parameters('deployNs'), 'ehns', uniqueString(resourceGroup().id))]", 29 | "type": "string", 30 | "metadata": { 31 | "description": "Event Hubs namespace name" 32 | } 33 | }, 34 | "eventhubsLocation": { 35 | "defaultValue": "[resourceGroup().location]", 36 | "type": "string", 37 | "metadata": { 38 | "description": "Event Hubs location" 39 | } 40 | }, 41 | "eventhubDataName": { 42 | "defaultValue": "[concat(parameters('deployNs'), 'ehData', uniqueString(resourceGroup().id))]", 43 | "type": "string", 44 | "metadata": { 45 | "description": "Event Hub name" 46 | } 47 | }, 48 | "eventhubAnomName": { 49 | "defaultValue": "[concat(parameters('deployNs'), 'ehAnom', uniqueString(resourceGroup().id))]", 50 | "type": "string", 51 | "metadata": { 52 | "description": "Event Hub name - anomalies" 53 | } 54 | }, 55 | "storAccountName": { 56 | "defaultValue": "[concat(parameters('deployNs'), 'stor', uniqueString(resourceGroup().id))]", 57 | "type": "string", 58 | "metadata": { 59 | "description": "Storage account name" 60 | } 61 | }, 62 | "storLocation": { 63 | "defaultValue": 
"[resourceGroup().location]", 64 | "type": "string", 65 | "metadata": { 66 | "description": "Storage account location" 67 | } 68 | } 69 | }, 70 | "variables": { 71 | "managedResourceGroupId": "[concat(subscription().id, '/resourceGroups/', variables('managedResourceGroupName'))]", 72 | "managedResourceGroupName": "[concat('databricks-rg-', parameters('dbricksWorkspaceName'), '-', uniqueString(parameters('dbricksWorkspaceName'), resourceGroup().id))]", 73 | "eventhubsDataFullName": "[concat(parameters('eventhubsNsName'), '/', parameters('eventhubDataName'))]", 74 | "eventhubsAnomFullName": "[concat(parameters('eventhubsNsName'), '/', parameters('eventhubAnomName'))]" 75 | }, 76 | "resources": [ 77 | { 78 | "apiVersion": "2018-04-01", 79 | "location": "[parameters('dbricksLocation')]", 80 | "name": "[parameters('dbricksWorkspaceName')]", 81 | "tags": { 82 | "displayName": "Databricks Workspace" 83 | }, 84 | "sku": { 85 | "name": "[parameters('dbricksTier')]" 86 | }, 87 | "properties": { 88 | "ManagedResourceGroupId": "[variables('managedResourceGroupId')]" 89 | }, 90 | "type": "Microsoft.Databricks/workspaces" 91 | }, 92 | { 93 | "type": "Microsoft.EventHub/namespaces", 94 | "sku": { 95 | "name": "Standard", 96 | "tier": "Standard", 97 | "capacity": 1 98 | }, 99 | "name": "[parameters('eventhubsNsName')]", 100 | "apiVersion": "2017-04-01", 101 | "location": "[parameters('eventhubsLocation')]", 102 | "tags": {}, 103 | "scale": null, 104 | "properties": { 105 | "isAutoInflateEnabled": true, 106 | "maximumThroughputUnits": 20 107 | }, 108 | "dependsOn": [] 109 | }, 110 | { 111 | "type": "Microsoft.EventHub/namespaces/eventhubs", 112 | "name": "[variables('eventhubsDataFullName')]", 113 | "apiVersion": "2017-04-01", 114 | "location": "[parameters('eventhubsLocation')]", 115 | "scale": null, 116 | "properties": { 117 | "messageRetentionInDays": 1, 118 | "partitionCount": 2, 119 | "status": "Active" 120 | }, 121 | "dependsOn": [ 122 | "[resourceId('Microsoft.EventHub/namespaces', parameters('eventhubsNsName'))]" 123 | ] 124 | }, 125 | { 126 | "type": "Microsoft.EventHub/namespaces/eventhubs/authorizationRules", 127 | "name": "[concat(variables('eventhubsDataFullName'), '/send')]", 128 | "apiVersion": "2017-04-01", 129 | "location": "[parameters('eventhubsLocation')]", 130 | "scale": null, 131 | "properties": { 132 | "rights": [ 133 | "Send" 134 | ] 135 | }, 136 | "dependsOn": [ 137 | "[resourceId('Microsoft.EventHub/namespaces', parameters('eventhubsNsName'))]", 138 | "[resourceId('Microsoft.EventHub/namespaces/eventhubs', parameters('eventhubsNsName'), parameters('eventhubDataName'))]" 139 | ] 140 | }, 141 | { 142 | "type": "Microsoft.EventHub/namespaces/eventhubs/authorizationRules", 143 | "name": "[concat(variables('eventhubsDataFullName'), '/listen')]", 144 | "apiVersion": "2017-04-01", 145 | "location": "[parameters('eventhubsLocation')]", 146 | "scale": null, 147 | "properties": { 148 | "rights": [ 149 | "Listen" 150 | ] 151 | }, 152 | "dependsOn": [ 153 | "[resourceId('Microsoft.EventHub/namespaces', parameters('eventhubsNsName'))]", 154 | "[resourceId('Microsoft.EventHub/namespaces/eventhubs', parameters('eventhubsNsName'), parameters('eventhubDataName'))]" 155 | ] 156 | }, 157 | { 158 | "type": "Microsoft.EventHub/namespaces/eventhubs", 159 | "name": "[variables('eventhubsAnomFullName')]", 160 | "apiVersion": "2017-04-01", 161 | "location": "[parameters('eventhubsLocation')]", 162 | "scale": null, 163 | "properties": { 164 | "messageRetentionInDays": 1, 165 | "partitionCount": 2, 166 | 
"status": "Active" 167 | }, 168 | "dependsOn": [ 169 | "[resourceId('Microsoft.EventHub/namespaces', parameters('eventhubsNsName'))]" 170 | ] 171 | }, 172 | { 173 | "type": "Microsoft.EventHub/namespaces/eventhubs/authorizationRules", 174 | "name": "[concat(variables('eventhubsAnomFullName'), '/send')]", 175 | "apiVersion": "2017-04-01", 176 | "location": "[parameters('eventhubsLocation')]", 177 | "scale": null, 178 | "properties": { 179 | "rights": [ 180 | "Send" 181 | ] 182 | }, 183 | "dependsOn": [ 184 | "[resourceId('Microsoft.EventHub/namespaces', parameters('eventhubsNsName'))]", 185 | "[resourceId('Microsoft.EventHub/namespaces/eventhubs', parameters('eventhubsNsName'), parameters('eventhubAnomName'))]" 186 | ] 187 | }, 188 | { 189 | "type": "Microsoft.EventHub/namespaces/eventhubs/authorizationRules", 190 | "name": "[concat(variables('eventhubsAnomFullName'), '/listen')]", 191 | "apiVersion": "2017-04-01", 192 | "location": "[parameters('eventhubsLocation')]", 193 | "scale": null, 194 | "properties": { 195 | "rights": [ 196 | "Listen" 197 | ] 198 | }, 199 | "dependsOn": [ 200 | "[resourceId('Microsoft.EventHub/namespaces', parameters('eventhubsNsName'))]", 201 | "[resourceId('Microsoft.EventHub/namespaces/eventhubs', parameters('eventhubsNsName'), parameters('eventhubAnomName'))]" 202 | ] 203 | }, 204 | { 205 | "type": "Microsoft.Storage/storageAccounts", 206 | "sku": { 207 | "name": "Standard_LRS", 208 | "tier": "Standard" 209 | }, 210 | "kind": "StorageV2", 211 | "name": "[parameters('storAccountName')]", 212 | "apiVersion": "2017-10-01", 213 | "location": "[parameters('storLocation')]", 214 | "tags": { 215 | "displayname": "Databricks storage" 216 | }, 217 | "properties": { 218 | "networkAcls": { 219 | "bypass": "AzureServices", 220 | "virtualNetworkRules": [], 221 | "ipRules": [], 222 | "defaultAction": "Allow" 223 | }, 224 | "supportsHttpsTrafficOnly": true, 225 | "encryption": { 226 | "services": { 227 | "file": { 228 | "enabled": true 229 | }, 230 | "blob": { 231 | "enabled": true 232 | } 233 | }, 234 | "keySource": "Microsoft.Storage" 235 | }, 236 | "accessTier": "Hot" 237 | } 238 | } 239 | ], 240 | "outputs": { 241 | "dbricksWorkspaceName": { 242 | "value": "[parameters('dbricksWorkspaceName')]", 243 | "type": "string" 244 | }, 245 | "dbricksLocation": { 246 | "value": "[parameters('dbricksLocation')]", 247 | "type": "string" 248 | }, 249 | "storAccountName": { 250 | "value": "[parameters('storAccountName')]", 251 | "type": "string" 252 | }, 253 | "eventhubsNsName": { 254 | "value": "[parameters('eventhubsNsName')]", 255 | "type": "string" 256 | }, 257 | "eventhubDataName": { 258 | "value": "[parameters('eventhubDataName')]", 259 | "type": "string" 260 | }, 261 | "eventhubAnomName": { 262 | "value": "[parameters('eventhubAnomName')]", 263 | "type": "string" 264 | } 265 | } 266 | } -------------------------------------------------------------------------------- /deploy/azuredeploy.parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "deployNs": { 6 | "value": "lace" 7 | } 8 | } 9 | } -------------------------------------------------------------------------------- /deploy/databricks/config/cluster.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cluster_name": "anomalycluster", 3 | "autoscale": { "min_workers": 1, 
"max_workers": 4 }, 4 | "spark_version": "5.5.x-scala2.11", 5 | "spark_env_vars": { 6 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 7 | }, 8 | "autotermination_minutes": 120, 9 | "node_type_id": "Standard_DS12_v2", 10 | "driver_node_type_id": "Standard_DS12_v2" 11 | } 12 | -------------------------------------------------------------------------------- /deploy/databricks/config/job.batchscoring.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Batch - Score using anomaly model", 3 | "new_cluster": { 4 | "spark_version": "5.5.x-scala2.11", 5 | "node_type_id": "Standard_DS12_v2", 6 | "num_workers": 3, 7 | "spark_env_vars": { 8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 9 | } 10 | }, 11 | "libraries": [], 12 | "timeout_seconds": 3600, 13 | "email_notifications": { 14 | "on_start": [], 15 | "on_success": [], 16 | "on_failure": [] 17 | }, 18 | "max_retries": 3, 19 | "schedule": { 20 | "quartz_cron_expression": "0 0 22 ? * *", 21 | "timezone_id": "Australia/Victoria" 22 | }, 23 | "notebook_task": { 24 | "notebook_path": "/anomaly/05_batch_scoring" 25 | } 26 | } -------------------------------------------------------------------------------- /deploy/databricks/config/job.streamdatagen.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Streaming - Send data to evenhubs", 3 | "new_cluster": { 4 | "spark_version": "5.5.x-scala2.11", 5 | "node_type_id": "Standard_DS12_v2", 6 | "num_workers": 3, 7 | "spark_env_vars": { 8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 9 | } 10 | }, 11 | "libraries": [ 12 | { 13 | "maven": { 14 | "coordinates": "com.microsoft.azure:azure-eventhubs-spark_2.11:2.3.2" 15 | } 16 | } 17 | ], 18 | "email_notifications": { 19 | "on_start": [], 20 | "on_success": [], 21 | "on_failure": [] 22 | }, 23 | "max_retries": -1, 24 | "notebook_task": { 25 | "notebook_path": "/anomaly/06a_streaming_datagen" 26 | } 27 | } -------------------------------------------------------------------------------- /deploy/databricks/config/job.streamscoring.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Streaming - Ingest data from eventhub and identify anomalies", 3 | "new_cluster": { 4 | "spark_version": "5.5.x-scala2.11", 5 | "node_type_id": "Standard_DS12_v2", 6 | "num_workers": 3, 7 | "spark_env_vars": { 8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 9 | } 10 | }, 11 | "libraries": [ 12 | { 13 | "maven": { 14 | "coordinates": "com.microsoft.azure:azure-eventhubs-spark_2.11:2.3.2" 15 | } 16 | } 17 | ], 18 | "email_notifications": { 19 | "on_start": [], 20 | "on_success": [], 21 | "on_failure": [] 22 | }, 23 | "max_retries": -1, 24 | "notebook_task": { 25 | "notebook_path": "/anomaly/06b_streaming_scoring" 26 | } 27 | } -------------------------------------------------------------------------------- /deploy/databricks/config/run.downloaddata.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "Mount storage and download data", 3 | "new_cluster": { 4 | "spark_version": "5.5.x-scala2.11", 5 | "node_type_id": "Standard_DS12_v2", 6 | "num_workers": 1, 7 | "spark_env_vars": { 8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 9 | } 10 | }, 11 | "libraries": [], 12 | "timeout_seconds": 3600, 13 | "notebook_task": { 14 | "notebook_path": "/anomaly/01_download_data" 15 | } 16 | } 
-------------------------------------------------------------------------------- /deploy/databricks/config/run.etl.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "Perform ETL (SparkSQL tables)", 3 | "new_cluster": { 4 | "spark_version": "5.5.x-scala2.11", 5 | "node_type_id": "Standard_DS12_v2", 6 | "num_workers": 1, 7 | "spark_env_vars": { 8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 9 | } 10 | }, 11 | "libraries": [], 12 | "timeout_seconds": 3600, 13 | "notebook_task": { 14 | "notebook_path": "/anomaly/02_ETL" 15 | } 16 | } -------------------------------------------------------------------------------- /deploy/databricks/config/run.trainmodelall.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "Train PCAAnomaly model", 3 | "new_cluster": { 4 | "spark_version": "5.5.x-scala2.11", 5 | "node_type_id": "Standard_DS12_v2", 6 | "num_workers": 4, 7 | "spark_env_vars": { 8 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 9 | } 10 | }, 11 | "libraries": [], 12 | "timeout_seconds": 3600, 13 | "notebook_task": { 14 | "notebook_path": "/anomaly/04_trainmodel_multiple" 15 | } 16 | } -------------------------------------------------------------------------------- /deploy/databricks/configure_databricks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, 8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions 11 | # of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 
18 | # 19 | # 20 | # Description: Deploy Databricks cluster 21 | # 22 | # Usage: 23 | # 24 | # Requirments: 25 | # 26 | 27 | set -o errexit 28 | set -o pipefail 29 | set -o nounset 30 | # set -o xtrace 31 | 32 | # Set path 33 | parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) 34 | cd "$parent_path" 35 | 36 | # Constants 37 | RED='\033[0;31m' 38 | ORANGE='\033[0;33m' 39 | NC='\033[0m' 40 | 41 | cluster_config="./config/cluster.config.json" 42 | 43 | wait_for_run () { 44 | # See here: https://docs.azuredatabricks.net/api/latest/jobs.html#jobsrunresultstate 45 | declare mount_run_id=$1 46 | while : ; do 47 | life_cycle_status=$(databricks runs get --run-id $mount_run_id | jq -r ".state.life_cycle_state") 48 | result_state=$(databricks runs get --run-id $mount_run_id | jq -r ".state.result_state") 49 | if [[ $result_state == "SUCCESS" || $result_state == "SKIPPED" ]]; then 50 | break; 51 | elif [[ $life_cycle_status == "INTERNAL_ERROR" || $result_state == "FAILED" ]]; then 52 | state_message=$(databricks runs get --run-id $mount_run_id | jq -r ".state.state_message") 53 | echo -e "${RED}Error while running ${mount_run_id}: ${state_message} ${NC}" 54 | exit 1 55 | else 56 | echo "Waiting for run ${mount_run_id} to finish..." 57 | sleep 2m 58 | fi 59 | done 60 | } 61 | 62 | cluster_exists () { 63 | declare cluster_name="$1" 64 | declare cluster=$(databricks clusters list | tr -s " " | cut -d" " -f2 | grep ^${cluster_name}$) 65 | if [[ -n $cluster ]]; then 66 | return 0; # cluster exists 67 | else 68 | return 1; # cluster does not exists 69 | fi 70 | } 71 | 72 | yes_or_no () { 73 | while true; do 74 | read -p "$(echo -e ${ORANGE}"$* [y/n]: "${NC})" yn 75 | case $yn in 76 | [Yy]*) return 0 ;; 77 | [Nn]*) echo -e "${RED}Aborted${NC}" ; return 1 ;; 78 | esac 79 | done 80 | } 81 | 82 | 83 | _main() { 84 | echo -e "${ORANGE}" 85 | echo -e "!! -- WARNING --!!" 86 | echo -e "If this is the second time you are running this, this will re-upload and overwrite existing notebooks with the same names in the 'notebooks' folder. " 87 | echo -e "This will also drop and reload data in Tables." 88 | echo -e "${NC}" 89 | yes_or_no "Are you sure you want to continue (Y/N)?" || { exit 1; } 90 | 91 | # Upload notebooks 92 | echo "Uploading notebooks..." 93 | databricks workspace import_dir "../../notebooks/databricks_notebooks" "/anomaly" --overwrite 94 | 95 | # Upload notebook images to FileStore 96 | # https://docs.databricks.com/user-guide/advanced/filestore.html 97 | databricks fs cp "../../images/FileStore" "dbfs:/FileStore/images" --recursive 98 | 99 | # Upload models 100 | # echo "Uploading pre-trained models..." 101 | # databricks fs cp --recursive --overwrite models/ dbfs:/mnt/blob_storage/models/ 102 | 103 | # Setup workspace 104 | echo "Downloading data. This may take a while as cluster spins up..." 105 | wait_for_run $(databricks runs submit --json-file "./config/run.downloaddata.config.json" | jq -r ".run_id" ) 106 | echo "Performing initial ETL of data. This may take a while as cluster spins up..." 107 | wait_for_run $(databricks runs submit --json-file "./config/run.etl.config.json" | jq -r ".run_id" ) 108 | echo "Training anomaly model. This may take a while as cluster spins up..." 109 | wait_for_run $(databricks runs submit --json-file "./config/run.trainmodelall.config.json" | jq -r ".run_id" ) 110 | 111 | # Schedule and run jobs 112 | echo "Scheduling and running jobs..." 
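# Create each job from its JSON config, extract the returned job_id with jq, and trigger it immediately with run-now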
113 | databricks jobs run-now --job-id $(databricks jobs create --json-file "./config/job.streamscoring.config.json" | jq ".job_id") 114 | databricks jobs run-now --job-id $(databricks jobs create --json-file "./config/job.batchscoring.config.json" | jq ".job_id") 115 | 116 | # Create initial cluster, if not yet exists 117 | echo "Creating an interactive cluster..." 118 | cluster_name=$(cat $cluster_config | jq -r ".cluster_name") 119 | if cluster_exists $cluster_name; then 120 | echo "Cluster ${cluster_name} already exists!" 121 | else 122 | echo "Creating cluster ${cluster_name}..." 123 | databricks clusters create --json-file $cluster_config 124 | fi 125 | 126 | # Install Library 127 | echo "Installing libraries..." 128 | cluster_id=$(databricks clusters list | awk '/'$cluster_name'/ {print $1}') 129 | databricks libraries install --maven-coordinates com.microsoft.azure:azure-eventhubs-spark_2.11:2.3.2 --cluster-id $cluster_id 130 | 131 | } 132 | 133 | _main 134 | -------------------------------------------------------------------------------- /deploy/databricks/create_secrets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o allexport 4 | source .env 5 | set +o allexport 6 | 7 | scope_name="storage_scope" 8 | 9 | # Create scope, if not exists 10 | if [[ -z $(databricks secrets list-scopes | grep "$scope_name") ]]; then 11 | echo "Creating secrets scope: $scope_name" 12 | databricks secrets create-scope --scope "$scope_name" 13 | fi 14 | 15 | # Create secrets 16 | echo "Creating secrets within scope $scope_name..." 17 | databricks secrets write --scope "$scope_name" --key "storage_account" --string-value "$BLOB_STORAGE_ACCOUNT" 18 | databricks secrets write --scope "$scope_name" --key "storage_key" --string-value "$BLOB_STORAGE_KEY" 19 | databricks secrets write --scope "$scope_name" --key "eventhub_namespace" --string-value "$EVENTHUB_NAMESPACE" 20 | databricks secrets write --scope "$scope_name" --key "eventhub_data_name" --string-value "$EVENTHUB_DATA_NAME" 21 | databricks secrets write --scope "$scope_name" --key "eventhub_data_send_key" --string-value "$EVENTHUB_DATA_SEND_KEY" 22 | databricks secrets write --scope "$scope_name" --key "eventhub_data_listen_key" --string-value "$EVENTHUB_DATA_LISTEN_KEY" 23 | databricks secrets write --scope "$scope_name" --key "eventhub_anom_name" --string-value "$EVENTHUB_ANOM_NAME" 24 | databricks secrets write --scope "$scope_name" --key "eventhub_anom_send_key" --string-value "$EVENTHUB_ANOM_SEND_KEY" 25 | databricks secrets write --scope "$scope_name" --key "eventhub_anom_listen_key" --string-value "$EVENTHUB_ANOM_LISTEN_KEY" 26 | -------------------------------------------------------------------------------- /deploy/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Access granted under MIT Open Source License: https://en.wikipedia.org/wiki/MIT_License 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the "Software"), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, 8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all copies or 
substantial portions 11 | # of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 14 | # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 16 | # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 18 | # 19 | # 20 | # Description: Deploy ARM template which creates a Databricks account 21 | # 22 | # Usage: ./deploy.sh myResourceGroup "East US 2" 23 | # 24 | # Requirments: 25 | # - User must be logged in to the az cli with the appropriate account set. 26 | # - User must have appropraite permission to deploy to a resource group 27 | # - User must have appropriate permission to create a service principal 28 | 29 | set -o errexit 30 | set -o pipefail 31 | set -o nounset 32 | #set -o xtrace # For debugging 33 | 34 | ################### 35 | # SETUP 36 | 37 | # Check if required utilities are installed 38 | command -v jq >/dev/null 2>&1 || { echo >&2 "I require jq but it's not installed. See https://stedolan.github.io/jq/. Aborting."; exit 1; } 39 | command -v az >/dev/null 2>&1 || { echo >&2 "I require azure cli but it's not installed. See https://bit.ly/2Gc8IsS. Aborting."; exit 1; } 40 | 41 | # Globals 42 | timestamp=$(date +%s) 43 | deploy_name="deployment${timestamp}" 44 | env_file="../.env" 45 | 46 | # Constants 47 | RED='\033[0;31m' 48 | ORANGE='\033[0;33m' 49 | NC='\033[0m' 50 | 51 | # Set path 52 | parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) 53 | cd "$parent_path" 54 | 55 | # Check if user is logged in 56 | [[ -n $(az account show 2> /dev/null) ]] || { echo "Please login via the Azure CLI: "; az login; } 57 | 58 | 59 | ################### 60 | # USER PARAMETERS 61 | 62 | rg_name="${1-}" 63 | rg_location="${2-}" 64 | sub_id="${3-}" 65 | 66 | storage_container=databricks #fixed 67 | 68 | while [[ -z $rg_name ]]; do 69 | read -rp "$(echo -e ${ORANGE}"Enter Resource Group name: "${NC})" rg_name 70 | done 71 | 72 | while [[ -z $rg_location ]]; do 73 | read -rp "$(echo -e ${ORANGE}"Enter Azure Location (ei. EAST US 2): "${NC})" rg_location 74 | done 75 | 76 | while [[ -z $sub_id ]]; do 77 | # Check if user only has one sub 78 | sub_count=$(az account list --output json | jq '. | length') 79 | if (( $sub_count != 1 )); then 80 | az account list --output table 81 | read -rp "$(echo -e ${ORANGE}"Enter Azure Subscription Id you wish to deploy to (enter to use Default): "${NC})" sub_id 82 | fi 83 | # If still empty then user selected IsDefault 84 | if [[ -z $sub_id ]]; then 85 | sub_id=$(az account show --output json | jq -r '.id') 86 | fi 87 | done 88 | 89 | # Set account 90 | echo "Deploying to Subscription: $sub_id" 91 | az account set --subscription $sub_id 92 | 93 | ##################### 94 | # Deploy ARM template 95 | 96 | echo "Creating resource group: $rg_name" 97 | az group create --name "$rg_name" --location "$rg_location" 98 | 99 | echo "Deploying resources into $rg_name" 100 | arm_output=$(az group deployment create \ 101 | --name "$deploy_name" \ 102 | --resource-group "$rg_name" \ 103 | --template-file "./azuredeploy.json" \ 104 | --parameters @"./azuredeploy.parameters.json" \ 105 | --output json) 106 | 107 | if [[ -z $arm_output ]]; then 108 | echo >&2 "ARM deployment failed." 
109 | exit 1 110 | fi 111 | 112 | 113 | ##################### 114 | # Ask user to configure databricks cli 115 | dbi_workspace=$(echo $arm_output | jq -r '.properties.outputs.dbricksWorkspaceName.value') 116 | echo -e "${ORANGE}" 117 | echo "Configure your databricks cli to connect to the newly created Databricks workspace: ${dbi_workspace}. See here for more info: https://bit.ly/2GUwHcw." 118 | databricks configure --token 119 | echo -e "${NC}" 120 | 121 | 122 | ##################### 123 | # Append to .env file 124 | 125 | echo "Retrieving configuration information from newly deployed resources." 126 | 127 | # Databricks details 128 | dbricks_location=$(echo $arm_output | jq -r '.properties.outputs.dbricksLocation.value') 129 | dbi_token=$(awk '/token/ && NR==3 {print $0;exit;}' ~/.databrickscfg | cut -d' ' -f3) 130 | [[ -n $dbi_token ]] || { echo >&2 "Databricks cli not configured correctly. Please run databricks configure --token. Aborting."; exit 1; } 131 | 132 | # Retrieve storage account details 133 | storage_account=$(echo $arm_output | jq -r '.properties.outputs.storAccountName.value') 134 | storage_account_key=$(az storage account keys list \ 135 | --account-name $storage_account \ 136 | --resource-group $rg_name \ 137 | --output json | 138 | jq -r '.[0].value') 139 | 140 | # Retrieve eventhub details 141 | ehns_name=$(echo $arm_output | jq -r '.properties.outputs.eventhubsNsName.value') 142 | ## EH - data 143 | eh_data_name=$(echo $arm_output | jq -r '.properties.outputs.eventhubDataName.value') 144 | eh_data_send_key=$(az eventhubs eventhub authorization-rule keys list \ 145 | --namespace-name $ehns_name \ 146 | --eventhub-name $eh_data_name \ 147 | --name send \ 148 | --resource-group $rg_name \ 149 | --output json | 150 | jq -r '.primaryKey') 151 | eh_data_listen_key=$(az eventhubs eventhub authorization-rule keys list \ 152 | --namespace-name $ehns_name \ 153 | --eventhub-name $eh_data_name \ 154 | --name listen \ 155 | --resource-group $rg_name \ 156 | --output json | 157 | jq -r '.primaryKey') 158 | ## EH - anom 159 | eh_anom_name=$(echo $arm_output | jq -r '.properties.outputs.eventhubAnomName.value') 160 | eh_anom_send_key=$(az eventhubs eventhub authorization-rule keys list \ 161 | --namespace-name $ehns_name \ 162 | --eventhub-name $eh_anom_name \ 163 | --name send \ 164 | --resource-group $rg_name \ 165 | --output json | 166 | jq -r '.primaryKey') 167 | eh_anom_listen_key=$(az eventhubs eventhub authorization-rule keys list \ 168 | --namespace-name $ehns_name \ 169 | --eventhub-name $eh_anom_name \ 170 | --name listen \ 171 | --resource-group $rg_name \ 172 | --output json | 173 | jq -r '.primaryKey') 174 | 175 | # Create storage container 176 | # LACE TODO Idempotent? 177 | az storage container create \ 178 | --name "$storage_container" \ 179 | --account-name "$storage_account" \ 180 | --account-key "$storage_account_key" 181 | 182 | 183 | # Build .env file 184 | echo "Appending configuration to .env file." 
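# Write the retrieved keys and names to the .env file; create_secrets.sh later sources this file and stores the values as Databricks secrets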
185 | cat << EOF >> $env_file 186 | 187 | # ------ Configuration from deployment ${deploy_name} ----------- 188 | BLOB_STORAGE_ACCOUNT=${storage_account} 189 | BLOB_STORAGE_KEY=${storage_account_key} 190 | EVENTHUB_NAMESPACE=${ehns_name} 191 | EVENTHUB_DATA_NAME=${eh_data_name} 192 | EVENTHUB_DATA_SEND_KEY=${eh_data_send_key} 193 | EVENTHUB_DATA_LISTEN_KEY=${eh_data_listen_key} 194 | EVENTHUB_ANOM_NAME=${eh_anom_name} 195 | EVENTHUB_ANOM_SEND_KEY=${eh_anom_send_key} 196 | EVENTHUB_ANOM_LISTEN_KEY=${eh_anom_listen_key} 197 | DBRICKS_DOMAIN=${dbricks_location}.azuredatabricks.net 198 | DBRICKS_TOKEN=${dbi_token} 199 | 200 | EOF 201 | 202 | echo "Completed deploying Azure resources." 203 | -------------------------------------------------------------------------------- /images/FileStore/GBTModel.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/GBTModel.PNG -------------------------------------------------------------------------------- /images/FileStore/LogRegCVPipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/LogRegCVPipeline.PNG -------------------------------------------------------------------------------- /images/FileStore/MLPipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/MLPipeline.PNG -------------------------------------------------------------------------------- /images/FileStore/PCAAnomalyPipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/PCAAnomalyPipeline.PNG -------------------------------------------------------------------------------- /images/FileStore/RandomForestPipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/RandomForestPipeline.PNG -------------------------------------------------------------------------------- /images/FileStore/TransformPipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/TransformPipeline.PNG -------------------------------------------------------------------------------- /images/FileStore/transformation_and_actions.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/FileStore/transformation_and_actions.PNG -------------------------------------------------------------------------------- /images/archi.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/images/archi.PNG -------------------------------------------------------------------------------- /notebooks/.gitkeep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/notebooks/.gitkeep -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/00_demo_hello_spark.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Spark session 4 | 5 | # COMMAND ---------- 6 | 7 | spark 8 | 9 | # COMMAND ---------- 10 | 11 | spark.version 12 | 13 | # COMMAND ---------- 14 | 15 | # MAGIC %md 16 | # MAGIC ## Spark Dataframe 17 | 18 | # COMMAND ---------- 19 | 20 | df = spark.createDataFrame([('Fiji Apple', 'Red', 3.5), 21 | ('Banana', 'Yellow', 1.0), 22 | ('Green Grape', 'Green', 2.0), 23 | ('Red Grape', 'Red', 2.0), 24 | ('Peach', 'Yellow', 3.0), 25 | ('Orange', 'Orange', 2.0), 26 | ('Green Apple', 'Green', 2.5)], 27 | ['Fruit', 'Color', 'Price']) 28 | display(df) 29 | 30 | # COMMAND ---------- 31 | 32 | df.printSchema() 33 | 34 | # COMMAND ---------- 35 | 36 | # MAGIC %md 37 | # MAGIC ### Let's mix in some Spark SQL 38 | 39 | # COMMAND ---------- 40 | 41 | df.createOrReplaceTempView("temp_df") 42 | 43 | # COMMAND ---------- 44 | 45 | # MAGIC %sql 46 | # MAGIC SELECT * FROM temp_df 47 | 48 | # COMMAND ---------- 49 | 50 | # MAGIC %md 51 | # MAGIC ## Transformation and Actions 52 | # MAGIC ![Transformation and Actions](files/images/transformation_and_actions.PNG) 53 | 54 | # COMMAND ---------- 55 | 56 | # MAGIC %md 57 | # MAGIC #### Transformation 58 | 59 | # COMMAND ---------- 60 | 61 | df_agg = df\ 62 | .select("Fruit", "Color", "Price")\ 63 | .groupBy("Color")\ 64 | .avg("Price") 65 | 66 | # COMMAND ---------- 67 | 68 | # MAGIC %md 69 | # MAGIC #### Action 70 | 71 | # COMMAND ---------- 72 | 73 | df_agg.collect() -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/01_download_data.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # Mount blob storage 4 | 5 | # COMMAND ---------- 6 | 7 | # Set mount path 8 | storage_mount_path = "/mnt/blob_storage" 9 | 10 | # Unmount if existing 11 | for mp in dbutils.fs.mounts(): 12 | if mp.mountPoint == storage_mount_path: 13 | dbutils.fs.unmount(storage_mount_path) 14 | 15 | # Refresh mounts 16 | dbutils.fs.refreshMounts() 17 | 18 | # COMMAND ---------- 19 | 20 | # Retrieve storage credentials 21 | storage_account = dbutils.secrets.get(scope = "storage_scope", key = "storage_account") 22 | storage_key = dbutils.secrets.get(scope = "storage_scope", key = "storage_key") 23 | 24 | # Try to print out: 25 | storage_key 26 | 27 | # COMMAND ---------- 28 | 29 | # Mount 30 | dbutils.fs.mount( 31 | source = "wasbs://databricks@" + storage_account + ".blob.core.windows.net", 32 | mount_point = storage_mount_path, 33 | extra_configs = {"fs.azure.account.key." 
+ storage_account + ".blob.core.windows.net": storage_key}) 34 | 35 | # Refresh mounts 36 | dbutils.fs.refreshMounts() 37 | 38 | # COMMAND ---------- 39 | 40 | # MAGIC %md 41 | # MAGIC # Download Data 42 | 43 | # COMMAND ---------- 44 | 45 | import os 46 | import gzip 47 | import shutil 48 | from urllib.request import urlretrieve 49 | 50 | def download_and_uncompress_gz(data_url, out_file): 51 | tmp_loc = '/tmp/data.gz' 52 | 53 | # Download 54 | urlretrieve(data_url, tmp_loc) 55 | 56 | # Create dir if not exist 57 | dir_path = os.path.dirname(out_file) 58 | if not os.path.exists(dir_path): 59 | os.makedirs(dir_path) 60 | 61 | # Uncompress 62 | with gzip.open(tmp_loc, 'rb') as f_in: 63 | with open(out_file, 'wb') as f_out: 64 | shutil.copyfileobj(f_in, f_out) 65 | 66 | # Cleanup 67 | os.remove(tmp_loc) 68 | 69 | 70 | # Note that Azure Databricks configures each cluster node with a FUSE mount that allows processes running on cluster nodes to read and write to the underlying 71 | # distributed storage layer with local file APIs 72 | # See here: https://docs.azuredatabricks.net/user-guide/dbfs-databricks-file-system.html#access-dbfs-using-local-file-apis 73 | # 'https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz' 74 | download_and_uncompress_gz(data_url='https://lacedemodata.blob.core.windows.net/data/kddcup.data.gz', 75 | out_file='/dbfs' + storage_mount_path + '/data/raw/kddcup.data.csv') 76 | 77 | # 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.testdata.unlabeled.gz' 78 | download_and_uncompress_gz(data_url='https://lacedemodata.blob.core.windows.net/data/kddcup.testdata.unlabeled.gz', 79 | out_file='/dbfs' + storage_mount_path + '/data/raw/kddcup.testdata.unlabeled.csv') -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/02_ETL.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # Prepare data 3 | 4 | from pyspark.sql.functions import monotonically_increasing_id, lit, concat 5 | 6 | # Set mount path 7 | storage_mount_path = "/mnt/blob_storage" 8 | 9 | raw_df = spark.read.csv(storage_mount_path + '/data/raw/kddcup.data.csv') 10 | raw_unlabeled_df = spark.read.csv(storage_mount_path + '/data/raw/kddcup.testdata.unlabeled.csv') 11 | 12 | # Add id 13 | df = raw_df.withColumn('id', concat(lit('A'), monotonically_increasing_id()))\ 14 | .select(['id'] + raw_df.columns)\ 15 | .repartition(20) 16 | unlabeled_df = raw_unlabeled_df.withColumn('id', concat(lit('B'), monotonically_increasing_id()))\ 17 | .select(['id'] + raw_unlabeled_df.columns)\ 18 | .repartition(20) 19 | 20 | # Write out to csv 21 | df.write.csv(storage_mount_path + '/data/for_streaming/kddcup.data/', mode='overwrite') 22 | unlabeled_df.write.csv(storage_mount_path + '/data/for_streaming/kddcup.testdata.unlabeled/', mode='overwrite') 23 | 24 | # COMMAND ---------- 25 | 26 | # MAGIC %md 27 | # MAGIC # Create and load SparkSQL tables 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %sql 32 | # MAGIC ------------------ 33 | # MAGIC -- Create KDD Table 34 | # MAGIC 35 | # MAGIC DROP TABLE IF EXISTS kdd_temp; 36 | # MAGIC CREATE TABLE kdd_temp 37 | # MAGIC ( 38 | # MAGIC id STRING, 39 | # MAGIC duration FLOAT, 40 | # MAGIC protocol_type STRING, 41 | # MAGIC service STRING, 42 | # MAGIC flag STRING, 43 | # MAGIC src_bytes FLOAT, 44 | # MAGIC dst_bytes FLOAT, 45 | # MAGIC land SHORT, 46 | # MAGIC wrong_fragment FLOAT, 47 | # MAGIC urgent FLOAT, 48 | # MAGIC hot FLOAT, 49 
| # MAGIC num_failed_logins FLOAT, 50 | # MAGIC logged_in SHORT, 51 | # MAGIC num_compromised FLOAT, 52 | # MAGIC root_shell FLOAT, 53 | # MAGIC su_attempted FLOAT, 54 | # MAGIC num_root FLOAT, 55 | # MAGIC num_file_creations FLOAT, 56 | # MAGIC num_shells FLOAT, 57 | # MAGIC num_access_files FLOAT, 58 | # MAGIC num_outbound_cmds FLOAT, 59 | # MAGIC is_host_login SHORT, 60 | # MAGIC is_guest_login SHORT, 61 | # MAGIC count FLOAT, 62 | # MAGIC srv_count FLOAT, 63 | # MAGIC serror_rate FLOAT, 64 | # MAGIC srv_serror_rate FLOAT, 65 | # MAGIC rerror_rate FLOAT, 66 | # MAGIC srv_rerror_rate FLOAT, 67 | # MAGIC same_srv_rate FLOAT, 68 | # MAGIC diff_srv_rate FLOAT, 69 | # MAGIC srv_diff_host_rate FLOAT, 70 | # MAGIC dst_host_count FLOAT, 71 | # MAGIC dst_host_srv_count FLOAT, 72 | # MAGIC dst_host_same_srv_rate FLOAT, 73 | # MAGIC dst_host_diff_srv_rate FLOAT, 74 | # MAGIC dst_host_same_src_port_rate FLOAT, 75 | # MAGIC dst_host_srv_diff_host_rate FLOAT, 76 | # MAGIC dst_host_serror_rate FLOAT, 77 | # MAGIC dst_host_srv_serror_rate FLOAT, 78 | # MAGIC dst_host_rerror_rate FLOAT, 79 | # MAGIC dst_host_srv_rerror_rate FLOAT, 80 | # MAGIC label STRING 81 | # MAGIC ) 82 | # MAGIC USING CSV 83 | # MAGIC LOCATION '/mnt/blob_storage/data/for_streaming/kddcup.data/' 84 | # MAGIC OPTIONS ("header"="false"); 85 | # MAGIC 86 | # MAGIC DROP TABLE IF EXISTS kdd; 87 | # MAGIC CREATE TABLE kdd 88 | # MAGIC USING org.apache.spark.sql.parquet 89 | # MAGIC AS SELECT * FROM kdd_temp; 90 | # MAGIC 91 | # MAGIC -- Drop temporary table 92 | # MAGIC DROP TABLE kdd_temp; 93 | # MAGIC 94 | # MAGIC --Refresh 95 | # MAGIC REFRESH TABLE kdd; 96 | # MAGIC 97 | # MAGIC --select 98 | # MAGIC SELECT * FROM kdd LIMIT 100; 99 | 100 | # COMMAND ---------- 101 | 102 | # MAGIC %sql 103 | # MAGIC ------------------ 104 | # MAGIC -- Create KDD_unlabelled Table 105 | # MAGIC 106 | # MAGIC DROP TABLE IF EXISTS kdd_unlabeled_temp; 107 | # MAGIC CREATE TABLE kdd_unlabeled_temp 108 | # MAGIC ( 109 | # MAGIC id STRING, 110 | # MAGIC duration FLOAT, 111 | # MAGIC protocol_type STRING, 112 | # MAGIC service STRING, 113 | # MAGIC flag STRING, 114 | # MAGIC src_bytes FLOAT, 115 | # MAGIC dst_bytes FLOAT, 116 | # MAGIC land SHORT, 117 | # MAGIC wrong_fragment FLOAT, 118 | # MAGIC urgent FLOAT, 119 | # MAGIC hot FLOAT, 120 | # MAGIC num_failed_logins FLOAT, 121 | # MAGIC logged_in SHORT, 122 | # MAGIC num_compromised FLOAT, 123 | # MAGIC root_shell FLOAT, 124 | # MAGIC su_attempted FLOAT, 125 | # MAGIC num_root FLOAT, 126 | # MAGIC num_file_creations FLOAT, 127 | # MAGIC num_shells FLOAT, 128 | # MAGIC num_access_files FLOAT, 129 | # MAGIC num_outbound_cmds FLOAT, 130 | # MAGIC is_host_login SHORT, 131 | # MAGIC is_guest_login SHORT, 132 | # MAGIC count FLOAT, 133 | # MAGIC srv_count FLOAT, 134 | # MAGIC serror_rate FLOAT, 135 | # MAGIC srv_serror_rate FLOAT, 136 | # MAGIC rerror_rate FLOAT, 137 | # MAGIC srv_rerror_rate FLOAT, 138 | # MAGIC same_srv_rate FLOAT, 139 | # MAGIC diff_srv_rate FLOAT, 140 | # MAGIC srv_diff_host_rate FLOAT, 141 | # MAGIC dst_host_count FLOAT, 142 | # MAGIC dst_host_srv_count FLOAT, 143 | # MAGIC dst_host_same_srv_rate FLOAT, 144 | # MAGIC dst_host_diff_srv_rate FLOAT, 145 | # MAGIC dst_host_same_src_port_rate FLOAT, 146 | # MAGIC dst_host_srv_diff_host_rate FLOAT, 147 | # MAGIC dst_host_serror_rate FLOAT, 148 | # MAGIC dst_host_srv_serror_rate FLOAT, 149 | # MAGIC dst_host_rerror_rate FLOAT, 150 | # MAGIC dst_host_srv_rerror_rate FLOAT 151 | # MAGIC ) 152 | # MAGIC USING CSV 153 | # MAGIC LOCATION 
'/mnt/blob_storage/data/for_streaming/kddcup.testdata.unlabeled/' 154 | # MAGIC OPTIONS ("header"="false"); 155 | # MAGIC 156 | # MAGIC DROP TABLE IF EXISTS kdd_unlabeled; 157 | # MAGIC CREATE TABLE kdd_unlabeled 158 | # MAGIC USING org.apache.spark.sql.parquet 159 | # MAGIC AS SELECT * FROM kdd_unlabeled_temp; 160 | # MAGIC 161 | # MAGIC -- Drop temporary table 162 | # MAGIC DROP TABLE kdd_unlabeled_temp; 163 | # MAGIC 164 | # MAGIC --Refresh 165 | # MAGIC REFRESH TABLE kdd_unlabeled; 166 | # MAGIC 167 | # MAGIC --Select 168 | # MAGIC SELECT * FROM kdd_unlabeled LIMIT 100; -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/03_explore_data.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | from pyspark.sql import functions as F 3 | 4 | # COMMAND ---------- 5 | 6 | df = spark.read.table("kdd") 7 | 8 | # COMMAND ---------- 9 | 10 | display(df) 11 | 12 | # COMMAND ---------- 13 | 14 | df.printSchema() 15 | 16 | # COMMAND ---------- 17 | 18 | df.count() 19 | 20 | # COMMAND ---------- 21 | 22 | # Summary on continuous features 23 | cols = df.columns 24 | noncont_features = ['id', 'protocol_type', 'service', 'flag', 'label'] 25 | cont_features = [x for x in cols if x not in noncont_features] 26 | 27 | summary_df = df.select(cont_features).summary().cache() 28 | display(summary_df) 29 | 30 | # COMMAND ---------- 31 | 32 | # Normal vs Anomalies 33 | transformed_df = (df\ 34 | .withColumn("label", F.when(df.label == "normal.", 0).otherwise(1))\ 35 | .groupBy("label") 36 | .agg(F.count("id"))) 37 | 38 | display(transformed_df) 39 | 40 | # COMMAND ---------- 41 | 42 | # Count by label 43 | transformed_df = (df\ 44 | .groupBy("label")\ 45 | .agg(F.count("label").alias("num_requests"))\ 46 | .orderBy("num_requests", ascending=False)) 47 | 48 | display(transformed_df) 49 | 50 | # COMMAND ---------- 51 | 52 | -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/04_trainmodel_multiple.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | // MAGIC %md 3 | // MAGIC ## Setup 4 | 5 | // COMMAND ---------- 6 | 7 | import org.apache.spark.ml.Pipeline 8 | import org.apache.spark.ml.feature._ 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.ml.classification._ 11 | import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, BinaryClassificationEvaluator} 12 | import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} 13 | 14 | // Model Directory 15 | val modelDir = "/mnt/blob_storage/models" 16 | val randomSeed = 123 17 | 18 | // COMMAND ---------- 19 | 20 | // MAGIC %md 21 | // MAGIC ## Load and transform data 22 | 23 | // COMMAND ---------- 24 | 25 | // Read data 26 | spark.catalog.refreshTable("kdd") // need to refresh to invalidate cache 27 | val df = spark.read.table("kdd") 28 | 29 | // Clean data 30 | val cleanDf = df 31 | .withColumn("is_anomaly", when(col("label") === "normal.", 0).otherwise(1)) 32 | .na.drop() 33 | 34 | // Clean up labels for anomaly 35 | display(cleanDf) 36 | 37 | val columns = cleanDf.columns.toSet 38 | val features = columns -- Set("id", "label", "is_anomaly") 39 | val categoricalFeatures = Set("protocol_type", "service", "flag") 40 | val continuousFeatures = features -- categoricalFeatures 41 | 42 | 43 | // COMMAND ---------- 44 | 45 | // MAGIC %md 46 | // MAGIC ## Define 
Feature Estimators and Transformers 47 | 48 | // COMMAND ---------- 49 | 50 | // Label indexer 51 | val labelIndexer = new StringIndexer() 52 | .setInputCol("label") 53 | .setOutputCol("label_index") 54 | val labelIndexerModel = labelIndexer.fit(cleanDf) 55 | 56 | // Categorical Feature Indexers 57 | val indexers = categoricalFeatures.map({ colName => 58 | new StringIndexer().setInputCol(colName).setOutputCol(colName + "_index").setHandleInvalid("keep") 59 | }).toArray 60 | 61 | // Encoders 62 | val encoder = new OneHotEncoderEstimator() 63 | .setInputCols(categoricalFeatures.map(colName => colName + "_index").toArray) 64 | .setOutputCols(categoricalFeatures.map(colName => colName + "_encoded").toArray) 65 | 66 | // Vector Assembler 67 | var selectedFeatures = continuousFeatures ++ categoricalFeatures.map(colName => colName + "_encoded") 68 | val assembler = new VectorAssembler() 69 | .setInputCols(selectedFeatures.toArray) 70 | .setOutputCol("features") 71 | 72 | // Standard Scalar 73 | val standardScalar = new StandardScaler() 74 | .setInputCol("features") 75 | .setOutputCol("norm_features") 76 | .setWithMean(true) 77 | .setWithStd(true) 78 | 79 | // Convert indexed labels back to original labels. 80 | val labelConverter = new IndexToString() 81 | .setInputCol("prediction") 82 | .setOutputCol("predicted_label") 83 | .setLabels(labelIndexerModel.labels) 84 | 85 | // COMMAND ---------- 86 | 87 | // MAGIC %md 88 | // MAGIC ## Build Data Transformation pipeline 89 | // MAGIC ![Data Transform Pipeline](files/images/TransformPipeline.PNG) 90 | 91 | // COMMAND ---------- 92 | 93 | // Transform pipeline 94 | val transformPipeline = new Pipeline().setStages(indexers ++ Array(labelIndexer, encoder, assembler, standardScalar)) 95 | val transformedDf = transformPipeline 96 | .fit(cleanDf) 97 | .transform(cleanDf) 98 | 99 | // Split data 100 | val Array(transformedTraining, transformedTest) = transformedDf.randomSplit(Array(0.8, 0.2), seed = randomSeed) 101 | 102 | display(transformedDf.select("label_index", "norm_features")) 103 | 104 | // COMMAND ---------- 105 | 106 | // MAGIC %md 107 | // MAGIC ## GBT Binary classification 108 | // MAGIC ![GBT Model](files/images/GBTModel.PNG) 109 | 110 | // COMMAND ---------- 111 | 112 | // Train a GBT model. 113 | val gbt = new GBTClassifier() 114 | .setLabelCol("is_anomaly") 115 | .setFeaturesCol("norm_features") 116 | .setMaxIter(10) 117 | .setFeatureSubsetStrategy("auto") 118 | 119 | // Fit pipeline 120 | val gbtModel = gbt.fit(transformedTraining) 121 | 122 | // Make predictions. 123 | val gbtPredictions = gbtModel.transform(transformedTest) 124 | gbtPredictions.select("prediction", "label", "features").show(10) 125 | 126 | val gbtEvaluator = new BinaryClassificationEvaluator() 127 | .setMetricName("areaUnderROC") 128 | .setLabelCol("is_anomaly") 129 | .setRawPredictionCol("rawPrediction") 130 | val gbtAccuracy = gbtEvaluator.evaluate(gbtPredictions) 131 | println(s"Test Error = ${(1.0 - gbtAccuracy)}") 132 | 133 | // COMMAND ---------- 134 | 135 | // MAGIC %md 136 | // MAGIC ## Random Forest Multiclassification - End to end pipeline 137 | // MAGIC ![RandomForest Model](files/images/RandomForestPipeline.PNG) 138 | 139 | // COMMAND ---------- 140 | 141 | // Using non-transformed data (cleanDf) 142 | val Array(training, test) = cleanDf.randomSplit(Array(0.8, 0.2), seed = 123) 143 | 144 | // Train a RandomForest model. 
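// (Unlike the GBT section above, which reuses the already-transformed training data, the stages below are
// chained together with the classifier so the whole pipeline can be fit directly on the raw cleaned
// DataFrame and later saved and reloaded as a single PipelineModel. Note that the "Test Error" printed for
// the GBT model above is really 1 - areaUnderROC, since that is the metric the evaluator computes.)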
145 | val rf = new RandomForestClassifier() 146 | .setLabelCol("label_index") 147 | .setFeaturesCol("norm_features") 148 | .setNumTrees(10) 149 | 150 | // Chain indexers and Random Forest in a Pipeline. 151 | val rfPipeline = new Pipeline().setStages(indexers ++ Array(labelIndexer, encoder, assembler, standardScalar, rf, labelConverter)) 152 | 153 | // Fit pipeline 154 | val rfPipelineModel = rfPipeline.fit(training) 155 | 156 | // Make predictions. 157 | val rfPredictions = rfPipelineModel.transform(test) 158 | rfPredictions.select("predicted_label", "label", "features").show(10) 159 | 160 | // Evaluate 161 | val rfEvaluator = new MulticlassClassificationEvaluator() 162 | .setMetricName("accuracy") 163 | .setLabelCol("label_index") 164 | .setPredictionCol("prediction") 165 | val rfAccuracy = rfEvaluator.evaluate(rfPredictions) 166 | println(s"Test Error = ${(1.0 - rfAccuracy)}") 167 | 168 | 169 | // COMMAND ---------- 170 | 171 | // MAGIC %md 172 | // MAGIC ## Logistic Regression with CrossValidation 173 | // MAGIC ![Logistic Regression w/ CrossValidation](files/images/LogRegCVPipeline.PNG) 174 | 175 | // COMMAND ---------- 176 | 177 | // Train a Logistic Regres model. 178 | val lr = new LogisticRegression() 179 | .setMaxIter(10) 180 | .setLabelCol("label_index") 181 | .setFeaturesCol("norm_features") 182 | 183 | // Define ParamGrid 184 | val lrParamGrid = new ParamGridBuilder() 185 | .addGrid(lr.regParam, Array(0.1, 0.01)) 186 | .addGrid(lr.elasticNetParam, Array(0.1, 0.5, 0.8)) 187 | .build() 188 | 189 | // Define evaluator 190 | val lrEvaluator = new MulticlassClassificationEvaluator() 191 | .setMetricName("accuracy") 192 | .setLabelCol("label_index") 193 | .setPredictionCol("prediction") 194 | 195 | // CrossValidation model 196 | val lrCv = new CrossValidator() 197 | .setEstimator(lr) 198 | .setEvaluator(lrEvaluator) 199 | .setEstimatorParamMaps(lrParamGrid) 200 | .setNumFolds(3) 201 | 202 | // Chain indexers and Random Forest in a Pipeline. 203 | val lrCvPipeline = new Pipeline().setStages(Array(lrCv, labelConverter)) 204 | 205 | // Fit model 206 | val lrCvPipelineModel = lrCvPipeline.fit(transformedTraining) 207 | 208 | // Make predictions with test 209 | val lrCvPredictions = lrCvPipelineModel.transform(transformedTest) 210 | lrCvPredictions.select("predicted_label", "label", "features").show(10) 211 | 212 | // Evaluate 213 | val lrCvAccuracy = lrEvaluator.evaluate(lrCvPredictions) 214 | println(s"Test Error = ${(1.0 - lrCvAccuracy)}") 215 | 216 | 217 | // COMMAND ---------- 218 | 219 | // MAGIC %md 220 | // MAGIC ## Save models 221 | // MAGIC - Saving Data Scientist work 222 | // MAGIC - Compose models and train in a different cluster 223 | // MAGIC - Productionizing ML models 224 | 225 | // COMMAND ---------- 226 | 227 | gbtModel.write.overwrite().save(s"$modelDir/GBT") 228 | rfPipelineModel.write.overwrite().save(s"$modelDir/RandomForestPipeline") 229 | lrCvPipelineModel.write.overwrite().save(s"$modelDir/LogRegPipeline") -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/04_trainmodel_pca_w_custom.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | // MAGIC %md 3 | // MAGIC ## Writing your own Model (Custom Spark Estimators and Transformers) 4 | 5 | // COMMAND ---------- 6 | 7 | // MAGIC %md 8 | // MAGIC ### PCA for Anomaly detection 9 | // MAGIC 1. Filter out nomalous points and perform PCA to extract Principal Components 10 | // MAGIC 2. 
Reconstruct the features using the Principal Components and the feature vectors. 11 | // MAGIC 3. To calculate the Anomaly Score, calculate the normalized error between the reconstructed features and the original feature vector 12 | // MAGIC - In this case, we use the sum of squared differences from the two vectors 13 | // MAGIC 14 | // MAGIC For more information: 15 | // MAGIC - [PCA-based Anomaly Detection](https://docs.microsoft.com/en-us/azure/machine-learning/studio-module-reference/pca-based-anomaly-detection) 16 | // MAGIC - [A randomized algorithm for principal component analysis](https://arxiv.org/abs/0809.2274). Rokhlin, Szlan and Tygert 17 | // MAGIC - [Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions](http://users.cms.caltech.edu/~jtropp/papers/HMT11-Finding-Structure-SIREV.pdf). Halko, Martinsson and Tropp. 18 | 19 | // COMMAND ---------- 20 | 21 | package org.apache.spark.ml.feature 22 | 23 | import org.apache.hadoop.fs.Path 24 | 25 | import org.apache.spark.ml._ 26 | import org.apache.spark.ml.linalg._ 27 | import org.apache.spark.ml.param._ 28 | import org.apache.spark.ml.param.shared._ 29 | import org.apache.spark.ml.util._ 30 | import org.apache.spark.sql._ 31 | import org.apache.spark.sql.functions._ 32 | import org.apache.spark.sql.types.{StructField, StructType, DoubleType} 33 | 34 | import breeze.linalg.{DenseVector, sum} 35 | import breeze.numerics.pow 36 | 37 | /** 38 | * Params for [[PCAAnomaly]] and [[PCAAnomalyModel]]. 39 | */ 40 | trait PCAAnomalyParams extends Params with HasInputCol with HasOutputCol { 41 | final val outputPCACol = new Param[String](this, "outputPCACol", "The output column with PCA features") 42 | final val outputAbsScoreCol = new Param[String](this, "outputAbsScoreCol", "The output column with non-normalized Anomaly Scores") 43 | final val labelCol = new Param[String](this, "labelCol", "Label column") 44 | setDefault(outputPCACol, "pca_features") 45 | setDefault(outputAbsScoreCol, "nonnorm_anomaly_score") 46 | setDefault(labelCol, "label") 47 | 48 | final val k: IntParam = new IntParam(this, "k", "the number of principal components (> 0)", 49 | ParamValidators.gt(0)) 50 | 51 | /** Validates and transforms the input schema. */ 52 | protected def validateAndTransformSchema(schema: StructType): StructType = { 53 | //SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) 54 | require(!schema.fieldNames.contains($(outputCol)), s"Output column ${$(outputCol)} already exists.") 55 | val outputFields = schema.fields :+ 56 | StructField($(outputPCACol), new VectorUDT, false) :+ 57 | StructField($(outputCol), DoubleType, false) 58 | StructType(outputFields) 59 | } 60 | } 61 | 62 | /** 63 | * PCA trains a model to project vectors to a lower dimensional space of the top `PCA!.k` 64 | * principal components. 
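 *
 * (Scoring sketch, writing W for the fitted k-column principal-component matrix:
 *    pca_features    z     = W^T x
 *    reconstruction  x_hat = W z
 *    raw score             = sum((x - x_hat)^2), min-max normalised to [0, 1] in transform().
 *  fit() first drops rows whose label column is non-zero, so W is learned from normal records only.)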
65 | */ 66 | class PCAAnomaly (override val uid: String) 67 | extends Estimator[PCAAnomalyModel] with PCAAnomalyParams with DefaultParamsWritable { 68 | 69 | def this() = this(Identifiable.randomUID("pca_anomaly")) 70 | 71 | def setInputCol(value: String): this.type = set(inputCol, value) 72 | def setOutputCol(value: String): this.type = set(outputCol, value) 73 | def setLabelCol(value: String): this.type = set(labelCol, value) 74 | def setOutputPCACol(value: String): this.type = set(outputPCACol, value) 75 | def setOutputAbsScoreCol(value: String): this.type = set(outputAbsScoreCol, value) 76 | def setK(value: Int): this.type = set(k, value) 77 | 78 | /** 79 | * Computes a [[PCAAnomalyModel]] that contains the principal components of the input vectors. 80 | */ 81 | override def fit(dataset: Dataset[_]): PCAAnomalyModel = { 82 | transformSchema(dataset.schema, logging = true) 83 | 84 | // remove anomalies 85 | val cleanDataset = dataset.filter(col($(labelCol)) === 0) 86 | 87 | // Fit regular PCA model 88 | val pcaModel = new PCA() 89 | .setInputCol($(inputCol)) 90 | .setOutputCol($(outputPCACol)) 91 | .setK($(k)) 92 | .fit(cleanDataset) 93 | 94 | copyValues(new PCAAnomalyModel(uid, pcaModel).setParent(this)) 95 | } 96 | 97 | override def transformSchema(schema: StructType): StructType = { 98 | validateAndTransformSchema(schema) 99 | } 100 | 101 | override def copy(extra: ParamMap): PCAAnomaly = defaultCopy(extra) 102 | } 103 | 104 | object PCAAnomaly extends DefaultParamsReadable[PCAAnomaly] { 105 | override def load(path: String): PCAAnomaly = super.load(path) 106 | } 107 | 108 | /** 109 | * Model fitted by [[PCAAnomaly]]. Uses PCA to detect anomalies 110 | * 111 | * @param pcaModel A PCA model 112 | */ 113 | class PCAAnomalyModel ( 114 | override val uid: String, 115 | val pcaModel: PCAModel) 116 | extends Model[PCAAnomalyModel] with PCAAnomalyParams with MLWritable { 117 | 118 | import PCAAnomalyModel._ 119 | 120 | def setInputCol(value: String): this.type = set(inputCol, value) 121 | def setOutputCol(value: String): this.type = set(outputCol, value) 122 | def setLabelCol(value: String): this.type = set(labelCol, value) 123 | def setOutputPCACol(value: String): this.type = set(outputPCACol, value) 124 | def setOutputAbsScoreCol(value: String): this.type = set(outputAbsScoreCol, value) 125 | def setK(value: Int): this.type = set(k, value) 126 | 127 | /** 128 | * Transform a vector by computed Principal Components. 129 | * 130 | * @note Vectors to be transformed must be the same length as the source vectors given 131 | * to `PCAAnomaly.fit()`. 
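 *
 * (The max and min used for the [0, 1] normalisation are computed with two separate aggregations over the
 *  scored DataFrame, so this transform() triggers extra Spark jobs beyond the usual lazy column projection.)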
132 | */ 133 | override def transform(dataset: Dataset[_]): DataFrame = { 134 | transformSchema(dataset.schema, logging = true) 135 | 136 | val pcaResults = pcaModel.transform(dataset) 137 | 138 | val anomalyScoreUdf = udf((originalFeatures:Vector, pcaFeatures:Vector) => { 139 | // Reconstruct vector using Principal components 140 | val reconstructedFeatures = pcaModel.pc.multiply(pcaFeatures) 141 | 142 | // Calculate error (sum of squared differences) 143 | val originalFeaturesB = DenseVector(originalFeatures.toArray) 144 | val reconstructedFeaturesB = DenseVector(reconstructedFeatures.toArray) 145 | val diff = originalFeaturesB - reconstructedFeaturesB 146 | val error = sum(pow(diff, 2)) 147 | error 148 | }) 149 | val anomalyScore = pcaResults.withColumn($(outputAbsScoreCol), anomalyScoreUdf(col($(inputCol)), col($(outputPCACol)))) 150 | 151 | // Normalize 152 | val Row(maxVal: Double) = anomalyScore.select(max($(outputAbsScoreCol))).head 153 | val Row(minVal: Double) = anomalyScore.select(min($(outputAbsScoreCol))).head 154 | val nomarlizeAnomalyScore = anomalyScore 155 | .withColumn($(outputCol), (col($(outputAbsScoreCol)) - minVal) / (maxVal - minVal)) 156 | 157 | nomarlizeAnomalyScore 158 | } 159 | 160 | override def transformSchema(schema: StructType): StructType = { 161 | validateAndTransformSchema(schema) 162 | } 163 | 164 | override def copy(extra: ParamMap): PCAAnomalyModel = { 165 | val copied = new PCAAnomalyModel(uid, pcaModel) 166 | copyValues(copied, extra).setParent(parent) 167 | } 168 | 169 | override def write: MLWriter = new PCAAnomalyModelWriter(this) 170 | } 171 | 172 | object PCAAnomalyModel extends MLReadable[PCAAnomalyModel] { 173 | 174 | private[PCAAnomalyModel] class PCAAnomalyModelWriter(instance: PCAAnomalyModel) extends MLWriter { 175 | override protected def saveImpl(path: String): Unit = { 176 | DefaultParamsWriter.saveMetadata(instance, path, sc) 177 | val pcaPath = new Path(path, "pca").toString 178 | instance.pcaModel.save(pcaPath) 179 | } 180 | } 181 | 182 | private class PCAAnomalyModelReader extends MLReader[PCAAnomalyModel] { 183 | 184 | private val className = classOf[PCAAnomalyModel].getName 185 | 186 | /** 187 | * Loads a [[PCAAnomalyModel]] from data located at the input path. 
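 * (Restores both the params metadata written by PCAAnomalyModelWriter and the nested PCAModel saved under
 *  the "pca" subdirectory.)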
188 | * 189 | * @param path path to serialized model data 190 | * @return a [[PCAAnomalyModel]] 191 | */ 192 | override def load(path: String): PCAAnomalyModel = { 193 | val metadata = DefaultParamsReader.loadMetadata(path, sc, className) 194 | val pcaPath = new Path(path, "pca").toString 195 | val pcaModel = PCAModel.load(pcaPath) 196 | val model = new PCAAnomalyModel(metadata.uid, pcaModel) 197 | DefaultParamsReader.getAndSetParams(model, metadata) 198 | model 199 | } 200 | } 201 | 202 | override def read: MLReader[PCAAnomalyModel] = new PCAAnomalyModelReader 203 | 204 | override def load(path: String): PCAAnomalyModel = super.load(path) 205 | } 206 | 207 | 208 | 209 | // COMMAND ---------- 210 | 211 | // MAGIC %md 212 | // MAGIC ## Use Custom Model in a Pipeline 213 | 214 | // COMMAND ---------- 215 | 216 | // MAGIC %md 217 | // MAGIC ### Setup 218 | 219 | // COMMAND ---------- 220 | 221 | import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoderEstimator, VectorAssembler, PCA, StandardScaler, MinMaxScaler, PCAAnomaly} 222 | import org.apache.spark.ml.{Pipeline, PipelineModel} 223 | import org.apache.spark.ml.linalg.{Vector, Vectors} 224 | import org.apache.spark.sql.functions._ 225 | import breeze.linalg.{DenseVector, sum} 226 | import breeze.numerics.pow 227 | 228 | val modelDir = "mnt/blob_storage/models/PCAAnomalyModel" 229 | 230 | // COMMAND ---------- 231 | 232 | // MAGIC %md 233 | // MAGIC ### Load and transform data 234 | 235 | // COMMAND ---------- 236 | 237 | // Read data 238 | spark.catalog.refreshTable("kdd") // need to refresh to invalidate cache 239 | val df = spark.read.table("kdd") 240 | 241 | // Clean data 242 | val cleanDf = df 243 | .withColumn("is_anomaly", when(col("label") === "normal.", 0).otherwise(1)) 244 | .na.drop() 245 | 246 | // Clean up labels for anomaly 247 | display(cleanDf) 248 | 249 | val columns = cleanDf.columns.toSet 250 | val features = columns -- Set("id", "label", "is_anomaly") 251 | val categoricalFeatures = Set("protocol_type", "service", "flag") 252 | val continuousFeatures = features -- categoricalFeatures 253 | 254 | // Split 255 | val Array(training, test) = cleanDf.randomSplit(Array(0.8, 0.2), seed = 123) 256 | 257 | 258 | // COMMAND ---------- 259 | 260 | // MAGIC %md 261 | // MAGIC ### Define Feature Estimators and Transformers 262 | 263 | // COMMAND ---------- 264 | 265 | // Indexers 266 | val indexers = categoricalFeatures.map({ colName => 267 | new StringIndexer().setInputCol(colName).setOutputCol(colName + "_index").setHandleInvalid("keep") 268 | }).toArray 269 | 270 | // Encoders 271 | val encoder = new OneHotEncoderEstimator() 272 | .setInputCols(categoricalFeatures.map(colName => colName + "_index").toArray) 273 | .setOutputCols(categoricalFeatures.map(colName => colName + "_encoded").toArray) 274 | 275 | // Vector Assembler 276 | var selectedFeatures = continuousFeatures ++ categoricalFeatures.map(colName => colName + "_encoded") 277 | val assembler = new VectorAssembler() 278 | .setInputCols(selectedFeatures.toArray) 279 | .setOutputCol("features") 280 | 281 | // Standard Scalar 282 | val standardScalar = new StandardScaler() 283 | .setInputCol("features") 284 | .setOutputCol("norm_features") 285 | .setWithMean(true) 286 | .setWithStd(true) 287 | 288 | // PCA Anomaly model 289 | val pcaAnom = new PCAAnomaly() 290 | .setInputCol("norm_features") 291 | .setOutputPCACol("pca_features") 292 | .setOutputCol("anomaly_score") 293 | .setLabelCol("is_anomaly") 294 | .setK(2) 295 | 296 | // COMMAND ---------- 297 | 298 | // MAGIC 
%md 299 | // MAGIC ### Build and Fit Pipeline using PCAAnomaly (custom model) 300 | // MAGIC ![PCAAnomaly Pipeline](files/images/PCAAnomalyPipeline.PNG) 301 | 302 | // COMMAND ---------- 303 | 304 | // Pipeline 305 | val mainPipeline = new Pipeline() 306 | .setStages(indexers ++ 307 | Array(encoder, assembler, standardScalar, pcaAnom)) //pcaAnom 308 | 309 | // Fit pipeline 310 | val mainPipelineModel = mainPipeline.fit(training) 311 | 312 | // Save pipeline 313 | mainPipelineModel 314 | .write 315 | .overwrite 316 | .save(modelDir) 317 | 318 | // COMMAND ---------- 319 | 320 | // MAGIC %md 321 | // MAGIC ### Use Model to predict anomalies 322 | 323 | // COMMAND ---------- 324 | 325 | // MAGIC %md 326 | // MAGIC #### Using training data 327 | 328 | // COMMAND ---------- 329 | 330 | // Load saved model 331 | val model = PipelineModel.load(modelDir) 332 | 333 | // Use model 334 | val transformedTraining = model.transform(training) 335 | .select("is_anomaly", "label", "anomaly_score") 336 | .cache() 337 | 338 | display(transformedTraining 339 | .groupBy("is_anomaly") 340 | .agg(avg("anomaly_score"))) 341 | 342 | // COMMAND ---------- 343 | 344 | display(transformedTraining 345 | .groupBy("label") 346 | .agg(avg("anomaly_score").alias("anomaly_score")) 347 | .sort(desc("anomaly_score"))) 348 | 349 | // COMMAND ---------- 350 | 351 | // MAGIC %md 352 | // MAGIC #### Using test data 353 | 354 | // COMMAND ---------- 355 | 356 | val transformedTest = mainPipelineModel.transform(test) 357 | .select("is_anomaly", "label", "anomaly_score") 358 | .cache() 359 | 360 | display(transformedTest 361 | .groupBy("is_anomaly") 362 | .agg(avg("anomaly_score"))) 363 | 364 | // COMMAND ---------- 365 | 366 | display(transformedTest 367 | .groupBy("label") 368 | .agg(avg("anomaly_score").alias("anomaly_score")) 369 | .sort(desc("anomaly_score"))) 370 | 371 | // COMMAND ---------- 372 | 373 | // MAGIC %md 374 | // MAGIC ### Evaluate Model using Test data 375 | 376 | // COMMAND ---------- 377 | 378 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 379 | 380 | val evaluator = new BinaryClassificationEvaluator() 381 | .setMetricName("areaUnderROC") 382 | .setLabelCol("is_anomaly") 383 | .setRawPredictionCol("anomaly_score") 384 | 385 | evaluator.evaluate(transformedTraining) -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/04_trainmodel_pca_wo_custom.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoderEstimator, VectorAssembler, PCA, StandardScaler, MinMaxScaler} 3 | import org.apache.spark.ml.Pipeline 4 | import org.apache.spark.ml.linalg.{Vector, Vectors} 5 | import org.apache.spark.sql.functions._ 6 | import breeze.linalg.{DenseVector, sum} 7 | import breeze.numerics.pow 8 | 9 | // COMMAND ---------- 10 | 11 | // MAGIC %md 12 | // MAGIC ## Read in data and perform data cleaning 13 | 14 | // COMMAND ---------- 15 | 16 | // Read data 17 | val df = spark.read.table("kdd") 18 | 19 | // Transform data 20 | val transformed_df = df.withColumnRenamed("label", "original_label") 21 | .withColumn("label_name", when(col("original_label") === "normal.", "normal").otherwise("anomaly")) 22 | 23 | // Drop nulls 24 | // Lace TODO 25 | 26 | // Clean up labels for anomaly 27 | display(transformed_df) 28 | 29 | // COMMAND ---------- 30 | 31 | // MAGIC %md 32 | // MAGIC ## Build data transformation ML pipeline 33 | 
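// MAGIC
// MAGIC The next cell chains one `StringIndexer` per categorical column, an `OneHotEncoderEstimator`, a
// MAGIC `StringIndexer` for the label, a `VectorAssembler` and a `StandardScaler`; only the resulting
// MAGIC `norm_features` and `label` columns are kept for the PCA step that follows.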
34 | // COMMAND ---------- 35 | 36 | val columns = df.columns.toSet 37 | val features = columns -- Set("id", "label", "original_label") 38 | val categoricalFeatures = Set("protocol_type", "service", "flag") 39 | val continuousFeatures = features -- categoricalFeatures 40 | | 41 | // Split data 42 | val Array(training, test) = transformed_df.randomSplit(Array(0.8, 0.2), seed = 123) 43 | 44 | // Indexers 45 | val indexers = categoricalFeatures.map({ colName => 46 | new StringIndexer().setInputCol(colName).setOutputCol(colName + "_index").setHandleInvalid("keep") 47 | }).toArray 48 | 49 | // Encoders 50 | val encoder = new OneHotEncoderEstimator() 51 | .setInputCols(categoricalFeatures.map(colName => colName + "_index").toArray) 52 | .setOutputCols(categoricalFeatures.map(colName => colName + "_encoded").toArray) 53 | 54 | // Label Indexer 55 | val labelIndexer = new StringIndexer() 56 | .setInputCol("label_name") 57 | .setOutputCol("label") 58 | 59 | // Vector Assembler 60 | var selectedFeatures = continuousFeatures ++ categoricalFeatures.map(colName => colName + "_encoded") 61 | val assembler = new VectorAssembler() 62 | .setInputCols(selectedFeatures.toArray) 63 | .setOutputCol("features") 64 | 65 | val standardScalar = new StandardScaler() 66 | .setInputCol("features") 67 | .setOutputCol("norm_features") 68 | .setWithMean(true) 69 | .setWithStd(true) 70 | 71 | // Pipeline 72 | val transformPipeline = new Pipeline() 73 | .setStages(indexers ++ Array(encoder, labelIndexer, assembler, standardScalar)) 74 | 75 | // Transform training 76 | val transformedTraining = transformPipeline 77 | .fit(training) 78 | .transform(training) 79 | .select("norm_features", "label") 80 | .cache() 81 | 82 | display(transformedTraining) 83 | 84 | // COMMAND ---------- 85 | 86 | // MAGIC %md 87 | // MAGIC ## Perform Principal Component Analysis 88 | 89 | // COMMAND ---------- 90 | 91 | // Fit PCA model 92 | val pca = new PCA() 93 | .setInputCol("norm_features") 94 | .setOutputCol("pca_features") 95 | .setK(3) 96 | .fit(transformedTraining) 97 | 98 | val pcaResult = pca 99 | .transform(transformedTraining) 100 | .select("label", "pca_features", "norm_features") 101 | .cache() 102 | 103 | display(pcaResult) 104 | 105 | // COMMAND ---------- 106 | 107 | // MAGIC %md 108 | // MAGIC ## Reconstruct features and calculate Anomaly Score 109 | // MAGIC Reconstruct the features using the Principal Components and the feature vectors. Then, calculate the normalized error, in this case the sum of squared differences from the original feature vector and the reconstructed features from the principal components. This becomes the Anomaly Score. 
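// MAGIC
// MAGIC Concretely, with `W` the k-column principal-component matrix: `pca_features = W^T x`, the
// MAGIC reconstruction is `W * pca_features`, and the raw score is `sum((x - reconstruction)^2)`, which is
// MAGIC min-max scaled to `[0, 1]` a few cells further down.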
110 | 111 | // COMMAND ---------- 112 | 113 | val reconstructionUdf = udf((v: Vector) => { 114 | // Reconstruct vector using Principal components 115 | pca.pc.multiply(v) 116 | }) 117 | val anomalyScoreUdf = udf((v:Vector, x:Vector) => { 118 | // Calculate error (sum of squared differences) 119 | val vB = DenseVector(v.toArray) 120 | val xB = DenseVector(x.toArray) 121 | val diff = vB - xB 122 | val error = sum(pow(diff, 2)) 123 | error 124 | }) 125 | val anomalyScore = pcaResult 126 | .withColumn("reconstruction", reconstructionUdf(col("pca_features"))) 127 | .withColumn("anomaly_score", anomalyScoreUdf(col("norm_features"), col("reconstruction"))) 128 | 129 | // COMMAND ---------- 130 | 131 | // MAGIC %md 132 | // MAGIC ## Normalize Anomaly Score 133 | 134 | // COMMAND ---------- 135 | 136 | // Vectorize Anomaly Score 137 | val anomalyAssembler = new VectorAssembler() 138 | .setInputCols(Array("anomaly_score")) 139 | .setOutputCol("anomaly_score_vec") 140 | 141 | // Normalize anomaly score 142 | val anomalyScoreScalar = new MinMaxScaler() 143 | .setInputCol("anomaly_score_vec") 144 | .setOutputCol("norm_anomaly_score_vec") 145 | 146 | // Pipeline 147 | val postTransformPipeline = new Pipeline() 148 | .setStages(Array(anomalyAssembler, anomalyScoreScalar)) 149 | 150 | val postTransformPipelineModel = postTransformPipeline 151 | .fit(anomalyScore) 152 | 153 | val vecToDoubleUdf = udf((v: Vector) => { v.toArray(0) }) 154 | val predictions = postTransformPipelineModel 155 | .transform(anomalyScore) 156 | .withColumn("norm_anomaly_score", vecToDoubleUdf(col("norm_anomaly_score_vec"))) 157 | .select("label", "norm_anomaly_score") 158 | .cache() 159 | 160 | display(predictions) 161 | 162 | // COMMAND ---------- 163 | 164 | // MAGIC %md 165 | // MAGIC ## Evaluate Model 166 | 167 | // COMMAND ---------- 168 | 169 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 170 | 171 | val evaluator = new BinaryClassificationEvaluator() 172 | .setMetricName("areaUnderROC") 173 | .setLabelCol("label") 174 | .setRawPredictionCol("norm_anomaly_score") 175 | 176 | var auc = evaluator.evaluate(predictions) 177 | 178 | // COMMAND ---------- 179 | 180 | 181 | 182 | // COMMAND ---------- 183 | 184 | // MAGIC %md 185 | // MAGIC # Custom Transformer and Estimator 186 | 187 | // COMMAND ---------- 188 | 189 | package org.apache.spark.ml.feature 190 | 191 | import org.apache.hadoop.fs.Path 192 | 193 | import org.apache.spark.ml._ 194 | import org.apache.spark.ml.linalg._ 195 | import org.apache.spark.ml.param._ 196 | import org.apache.spark.ml.param.shared._ 197 | import org.apache.spark.ml.util._ 198 | // import org.apache.spark.ml.feature.{PCA, PCAModel} 199 | import org.apache.spark.rdd.RDD 200 | import org.apache.spark.sql._ 201 | import org.apache.spark.sql.functions._ 202 | import org.apache.spark.sql.types.{StructField, StructType} 203 | import org.apache.spark.mllib.linalg.VectorUDT 204 | 205 | import breeze.linalg.{DenseVector, sum} 206 | import breeze.numerics.pow 207 | 208 | 209 | /** 210 | * Params for [[PCAAnomaly]] and [[PCAAnomalyModel]]. 
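 *
 * (This second copy of the custom model is the simplified one: unlike the version in
 *  04_trainmodel_pca_w_custom, fit() does not filter anomalous rows out before fitting PCA and transform()
 *  emits the raw reconstruction error without min-max normalisation.)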
211 | */ 212 | trait PCAAnomalyParams extends Params with HasInputCol with HasOutputCol { 213 | //final val inputCol= new Param[String](this, "inputCol", "The input column") 214 | //final val outputCol = new Param[String](this, "outputCol", "The output column") 215 | final val outputPCACol = new Param[String](this, "outputPCACol", "The output column with PCA features") 216 | final val k: IntParam = new IntParam(this, "k", "the number of principal components (> 0)", 217 | ParamValidators.gt(0)) 218 | 219 | /** Validates and transforms the input schema. */ 220 | protected def validateAndTransformSchema(schema: StructType): StructType = { 221 | //SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) 222 | require(!schema.fieldNames.contains($(outputCol)), s"Output column ${$(outputCol)} already exists.") 223 | val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) 224 | StructType(outputFields) 225 | } 226 | } 227 | 228 | /** 229 | * PCA trains a model to project vectors to a lower dimensional space of the top `PCA!.k` 230 | * principal components. 231 | */ 232 | class PCAAnomaly (override val uid: String) 233 | extends Estimator[PCAAnomalyModel] with PCAAnomalyParams with DefaultParamsWritable { 234 | 235 | def this() = this(Identifiable.randomUID("pca_anomaly")) 236 | 237 | def setInputCol(value: String): this.type = set(inputCol, value) 238 | def setOutputCol(value: String): this.type = set(outputCol, value) 239 | def setOutputPCACol(value: String): this.type = set(outputPCACol, value) 240 | def setK(value: Int): this.type = set(k, value) 241 | 242 | /** 243 | * Computes a [[PCAAnomalyModel]] that contains the principal components of the input vectors. 244 | */ 245 | override def fit(dataset: Dataset[_]): PCAAnomalyModel = { 246 | transformSchema(dataset.schema, logging = true) 247 | 248 | // Fit regular PCA model 249 | val pcaModel = new PCA() 250 | .setInputCol($(inputCol)) 251 | .setOutputCol($(outputPCACol)) 252 | .setK($(k)) 253 | .fit(dataset) 254 | 255 | copyValues(new PCAAnomalyModel(uid, pcaModel).setParent(this)) 256 | } 257 | 258 | override def transformSchema(schema: StructType): StructType = { 259 | validateAndTransformSchema(schema) 260 | } 261 | 262 | override def copy(extra: ParamMap): PCAAnomaly = defaultCopy(extra) 263 | } 264 | 265 | object PCAAnomaly extends DefaultParamsReadable[PCAAnomaly] { 266 | override def load(path: String): PCAAnomaly = super.load(path) 267 | } 268 | 269 | /** 270 | * Model fitted by [[PCAAnomaly]]. Uses PCA to detect anomalies 271 | * 272 | * @param pcaModel A PCA model 273 | */ 274 | class PCAAnomalyModel ( 275 | override val uid: String, 276 | val pcaModel: PCAModel) 277 | extends Model[PCAAnomalyModel] with PCAAnomalyParams with MLWritable { 278 | 279 | import PCAAnomalyModel._ 280 | 281 | /** @group setParam */ 282 | def setInputCol(value: String): this.type = set(inputCol, value) 283 | 284 | /** @group setParam */ 285 | def setOutputCol(value: String): this.type = set(outputCol, value) 286 | 287 | /** 288 | * Transform a vector by computed Principal Components. 289 | * 290 | * @note Vectors to be transformed must be the same length as the source vectors given 291 | * to `PCAAnomaly.fit()`. 
292 | */ 293 | override def transform(dataset: Dataset[_]): DataFrame = { 294 | transformSchema(dataset.schema, logging = true) 295 | 296 | val pcaResults = pcaModel.transform(dataset) 297 | 298 | val anomalyScoreUdf = udf((originalFeatures:Vector, pcaFeatures:Vector) => { 299 | // Reconstruct vector using Principal components 300 | val reconstructedFeatures = pcaModel.pc.multiply(pcaFeatures) 301 | 302 | // Calculate error (sum of squared differences) 303 | val originalFeaturesB = DenseVector(originalFeatures.toArray) 304 | val reconstructedFeaturesB = DenseVector(reconstructedFeatures.toArray) 305 | val diff = originalFeaturesB - reconstructedFeaturesB 306 | val error = sum(pow(diff, 2)) 307 | error 308 | }) 309 | pcaResults.withColumn($(outputCol), anomalyScoreUdf(col($(inputCol)), col($(outputPCACol)))) 310 | } 311 | 312 | override def transformSchema(schema: StructType): StructType = { 313 | validateAndTransformSchema(schema) 314 | } 315 | 316 | override def copy(extra: ParamMap): PCAAnomalyModel = { 317 | val copied = new PCAAnomalyModel(uid, pcaModel) 318 | copyValues(copied, extra).setParent(parent) 319 | } 320 | 321 | override def write: MLWriter = new PCAAnomalyModelWriter(this) 322 | } 323 | 324 | object PCAAnomalyModel extends MLReadable[PCAAnomalyModel] { 325 | 326 | private[PCAAnomalyModel] class PCAAnomalyModelWriter(instance: PCAAnomalyModel) extends MLWriter { 327 | override protected def saveImpl(path: String): Unit = { 328 | DefaultParamsWriter.saveMetadata(instance, path, sc) 329 | val pcaPath = new Path(path, "pca").toString 330 | instance.pcaModel.save(pcaPath) 331 | } 332 | } 333 | 334 | private class PCAAnomalyModelReader extends MLReader[PCAAnomalyModel] { 335 | 336 | private val className = classOf[PCAAnomalyModel].getName 337 | 338 | /** 339 | * Loads a [[PCAAnomalyModel]] from data located at the input path. 
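 * (Unlike the reader in 04_trainmodel_pca_w_custom, this one does not call
 *  DefaultParamsReader.getAndSetParams, so the inputCol/outputPCACol/outputCol/k values saved in the
 *  metadata are not restored onto the reloaded model.)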
340 | * 341 | * @param path path to serialized model data 342 | * @return a [[PCAAnomalyModel]] 343 | */ 344 | override def load(path: String): PCAAnomalyModel = { 345 | val metadata = DefaultParamsReader.loadMetadata(path, sc, className) 346 | val pcaPath = new Path(path, "pca").toString 347 | val pcaModel = PCAModel.load(pcaPath) 348 | new PCAAnomalyModel(metadata.uid, pcaModel) 349 | } 350 | } 351 | 352 | override def read: MLReader[PCAAnomalyModel] = new PCAAnomalyModelReader 353 | 354 | override def load(path: String): PCAAnomalyModel = super.load(path) 355 | } 356 | 357 | 358 | 359 | // COMMAND ---------- 360 | 361 | import org.apache.spark.ml.feature.PCAAnomaly 362 | 363 | // Fit PCA model 364 | val pcaAnomaly = new PCAAnomaly() 365 | .setInputCol("norm_features") 366 | .setOutputPCACol("pca_features") 367 | .setOutputCol("anomaly_score") 368 | .setK(3) 369 | .fit(transformedTraining) 370 | 371 | val pcaResult = pcaAnomaly 372 | .transform(transformedTraining) 373 | .select("label", "anomaly_score", "pca_features", "norm_features") 374 | .cache() 375 | 376 | display(pcaResult) 377 | 378 | // COMMAND ---------- 379 | 380 | pcaAnomaly.save("/mnt/blob_storage/models/PCAAnomalyModel") 381 | 382 | // COMMAND ---------- 383 | 384 | // Pipeline 385 | 386 | val pcaAnom = new PCAAnomaly() 387 | .setInputCol("norm_features") 388 | .setOutputPCACol("pca_features") 389 | .setOutputCol("anomaly_score") 390 | .setK(3) 391 | 392 | val mainPipeline = new Pipeline() 393 | .setStages(indexers ++ Array(encoder, labelIndexer, assembler, standardScalar, pcaAnom, anomalyAssembler, anomalyScoreScalar)) 394 | 395 | val mainResult = mainPipeline.fit(training) 396 | -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/05_batch_scoring.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | import org.apache.spark.ml.{Pipeline, PipelineModel} 3 | import org.apache.spark.ml.classification._ 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types._ 6 | 7 | // COMMAND ---------- 8 | 9 | display(spark.catalog.listTables()) 10 | 11 | // COMMAND ---------- 12 | 13 | display(dbutils.fs.ls("/mnt/blob_storage/models")) 14 | 15 | // COMMAND ---------- 16 | 17 | // Load data 18 | // In production, you may need to filter since last run 19 | val df = spark.read.table("kdd_unlabeled") 20 | 21 | // Clean data 22 | val cleanDf = df.na.drop() // For production, may need to save this to another table, or impute null values 23 | 24 | // Load model 25 | val modelLoc = "/mnt/blob_storage/models/RandomForestPipeline" 26 | val model = PipelineModel.load(modelLoc) 27 | 28 | // Make predictions 29 | val predictions = model.transform(cleanDf) 30 | 31 | // Save data 32 | predictions.write.mode("append").saveAsTable("kdd_predictions") -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/06a_streaming_datagen.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | import org.apache.spark.eventhubs.{ ConnectionStringBuilder, EventHubsConf, EventPosition } 3 | import org.apache.spark.sql.functions.{ explode, split, to_json, struct } 4 | import org.apache.spark.sql.streaming.Trigger.ProcessingTime 5 | 6 | // Retrieve storage credentials 7 | val ehNamespace = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_namespace") 8 | val ehData = 
dbutils.secrets.get(scope = "storage_scope", key = "eventhub_data_name") 9 | val ehDataSendKey = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_data_send_key") 10 | 11 | // Set data path 12 | val data_path = "/mnt/blob_storage/data/for_streaming" 13 | 14 | val connectionString = ConnectionStringBuilder() 15 | .setNamespaceName(ehNamespace) 16 | .setEventHubName(ehData) 17 | .setSasKeyName("send") 18 | .setSasKey(ehDataSendKey) 19 | .build 20 | 21 | val eventHubsConf = EventHubsConf(connectionString) 22 | .setStartingPosition(EventPosition.fromEndOfStream) 23 | 24 | // COMMAND ---------- 25 | 26 | val kdd_schema = spark.read.table("kdd_unlabeled").schema 27 | val kdd_unlabeled_df = spark 28 | .readStream 29 | .schema(kdd_schema) 30 | .csv(s"$data_path/kddcup.testdata.unlabeled/") 31 | 32 | val kdd_unlabeled_df_json = kdd_unlabeled_df.select(to_json( 33 | struct( 34 | $"id", 35 | $"duration", 36 | $"protocol_type", 37 | $"service", 38 | $"src_bytes", 39 | $"dst_bytes", 40 | $"flag", 41 | $"land", 42 | $"wrong_fragment", 43 | $"urgent")).alias("body")) 44 | 45 | // COMMAND ---------- 46 | 47 | // // Output to console 48 | // var query = kdd_unlabeled_df_json 49 | // .writeStream 50 | // .outputMode("append") 51 | // .format("console") 52 | // .option("truncate", false) 53 | // .start() 54 | // query.awaitTermination() 55 | 56 | // COMMAND ---------- 57 | 58 | val query = 59 | kdd_unlabeled_df_json 60 | .writeStream 61 | .format("eventhubs") 62 | .outputMode("update") 63 | .options(eventHubsConf.toMap) 64 | .trigger(ProcessingTime("10 seconds")) 65 | .option("checkpointLocation", s"$data_path/checkpoints/kdd_unlabeled_gen/") 66 | .start() -------------------------------------------------------------------------------- /notebooks/databricks_notebooks/06b_streaming_scoring.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | import org.apache.spark.eventhubs.{ ConnectionStringBuilder, EventHubsConf, EventPosition } 3 | import org.apache.spark.sql.functions.{ explode, split } 4 | import org.apache.spark.sql.streaming.Trigger.ProcessingTime 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.ml.{Pipeline, PipelineModel} 8 | import org.apache.spark.ml.feature._ 9 | import org.apache.spark.ml.linalg.{Vector, Vectors} 10 | 11 | // COMMAND ---------- 12 | 13 | // MAGIC %md 14 | // MAGIC ## Setup 15 | // MAGIC Retrieve secrets, setup EventHub connection, load save Anomaly Model 16 | 17 | // COMMAND ---------- 18 | 19 | // Retrieve storage credentials 20 | val ehNamespace = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_namespace") 21 | val ehData = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_data_name") 22 | val ehDataListenKey = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_data_listen_key") 23 | val ehAnom = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_anom_name") 24 | val ehAnomSendKey = dbutils.secrets.get(scope = "storage_scope", key = "eventhub_anom_send_key") 25 | 26 | // Set storage mount path 27 | val storage_mount_path = "/mnt/blob_storage" 28 | 29 | // Set data path 30 | val data_path = "/mnt/blob_storage/data/for_streaming" 31 | 32 | // Load model 33 | val model = PipelineModel.load(s"$storage_mount_path/models/RandomForestPipeline") 34 | 35 | // Setup EH connection 36 | val dataEhConnectionString = ConnectionStringBuilder() 37 | .setNamespaceName(ehNamespace) 38 | 
.setEventHubName(ehData) 39 | .setSasKeyName("listen") 40 | .setSasKey(ehDataListenKey) 41 | .build 42 | val dataEhConf = EventHubsConf(dataEhConnectionString) 43 | .setStartingPosition(EventPosition.fromEndOfStream) 44 | 45 | val anomEhConnectionString = ConnectionStringBuilder() 46 | .setNamespaceName(ehNamespace) 47 | .setEventHubName(ehAnom) 48 | .setSasKeyName("send") 49 | .setSasKey(ehAnomSendKey) 50 | .build 51 | val anomEhConf = EventHubsConf(anomEhConnectionString) 52 | .setStartingPosition(EventPosition.fromEndOfStream) 53 | 54 | 55 | // COMMAND ---------- 56 | 57 | // MAGIC %md 58 | // MAGIC ## Read message from EventHubs 59 | 60 | // COMMAND ---------- 61 | 62 | // Read stream 63 | val incomingStream = spark 64 | .readStream 65 | .format("eventhubs") 66 | .options(dataEhConf.toMap) 67 | .load() 68 | 69 | // Event Hub message format is JSON and contains "body" field 70 | // Body is binary, so we cast it to string to see the actual content of the message 71 | val messages = 72 | incomingStream 73 | .withColumn("Offset", $"offset".cast(LongType)) 74 | .withColumn("Time (readable)", $"enqueuedTime".cast(TimestampType)) 75 | .withColumn("Timestamp", $"enqueuedTime".cast(LongType)) 76 | .withColumn("Body", $"body".cast(StringType)) 77 | .withWatermark("Time (readable)", "10 minutes") 78 | .select("Offset", "Time (readable)", "Timestamp", "Body") 79 | 80 | messages.printSchema 81 | 82 | // COMMAND ---------- 83 | 84 | // MAGIC %md 85 | // MAGIC ## Transform and enrich message through joining with static data 86 | 87 | // COMMAND ---------- 88 | 89 | var messageTransformed = 90 | messages 91 | .select( 92 | get_json_object($"Body", "$.id").cast(StringType).alias("id"), 93 | get_json_object($"Body", "$.duration").cast(FloatType).alias("duration"), 94 | get_json_object($"Body", "$.protocol_type").cast(StringType).alias("protocol_type"), 95 | get_json_object($"Body", "$.service").cast(StringType).alias("service"), 96 | get_json_object($"Body", "$.src_bytes").cast(FloatType).alias("src_bytes"), 97 | get_json_object($"Body", "$.dst_bytes").cast(FloatType).alias("dst_bytes"), 98 | get_json_object($"Body", "$.flag").cast(StringType).alias("flag"), 99 | get_json_object($"Body", "$.land").cast(ShortType).alias("land"), 100 | get_json_object($"Body", "$.wrong_fragment").cast(FloatType).alias("wrong_fragment"), 101 | get_json_object($"Body", "$.urgent").cast(FloatType).alias("urgent"), 102 | $"Timestamp") 103 | 104 | // Join with static table 105 | val kdd_unlabeled = spark.read.table("kdd_unlabeled") 106 | val messageAll = messageTransformed 107 | .join(kdd_unlabeled, messageTransformed("id") === kdd_unlabeled("id"), "left_outer") 108 | .drop(kdd_unlabeled("id")) 109 | .drop(kdd_unlabeled("duration")) 110 | .drop(kdd_unlabeled("protocol_type")) 111 | .drop(kdd_unlabeled("service")) 112 | .drop(kdd_unlabeled("src_bytes")) 113 | .drop(kdd_unlabeled("dst_bytes")) 114 | .drop(kdd_unlabeled("flag")) 115 | .drop(kdd_unlabeled("land")) 116 | .drop(kdd_unlabeled("wrong_fragment")) 117 | .drop(kdd_unlabeled("urgent")) 118 | 119 | messageAll.printSchema 120 | 121 | // COMMAND ---------- 122 | 123 | // MAGIC %md 124 | // MAGIC ## Use model to identify Anomalies in data stream 125 | 126 | // COMMAND ---------- 127 | 128 | // Make predictions 129 | val anomalies = model.transform(messageAll).filter("prediction == 1") 130 | 131 | // COMMAND ---------- 132 | 133 | // MAGIC %md 134 | // MAGIC ## Output anomalies 135 | 136 | // COMMAND ---------- 137 | 138 | // // Output to console 139 | // var query = anomalies 
140 | // .select("id", "probability", "prediction") //filter for easy viewing 141 | // .writeStream 142 | // .outputMode("append") 143 | // .format("console") 144 | // .option("truncate", false) 145 | // .start() 146 | // query.awaitTermination() 147 | 148 | // COMMAND ---------- 149 | 150 | // Wrap in body tag 151 | val anomalies_wrapper = anomalies.select(to_json( 152 | struct( 153 | $"id", 154 | $"probability")).alias("body")) 155 | 156 | val query = 157 | anomalies_wrapper 158 | .writeStream 159 | .format("eventhubs") 160 | .outputMode("update") 161 | .options(anomEhConf.toMap) 162 | .trigger(ProcessingTime("10 seconds")) 163 | .option("checkpointLocation", s"$data_path/checkpoints/anomalies/") 164 | .start() 165 | 166 | // COMMAND ---------- 167 | 168 | println(query.lastProgress) -------------------------------------------------------------------------------- /references/Lace Lofranco - Building Advanced Analytics Pipelines with Azure Databricks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devlace/azure-databricks-anomaly/081c56515ba8d8f614e85ae4b7207eb75c6a0900/references/Lace Lofranco - Building Advanced Analytics Pipelines with Azure Databricks.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv>=0.5.1 2 | databricks-cli==0.8.2 3 | msrestazure~=0.4.32 4 | azure-cli==2.0.67 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='src', 5 | packages=find_packages(), 6 | version='0.1.0', 7 | description='An anomaly detection data pipeline on Azure Databricks', 8 | author='Lace Lofranco', 9 | license='MIT', 10 | ) 11 | -------------------------------------------------------------------------------- /test_environment.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | REQUIRED_PYTHON = "python3" 4 | 5 | 6 | def main(): 7 | system_major = sys.version_info.major 8 | if REQUIRED_PYTHON == "python": 9 | required_major = 2 10 | elif REQUIRED_PYTHON == "python3": 11 | required_major = 3 12 | else: 13 | raise ValueError("Unrecognized python interpreter: {}".format( 14 | REQUIRED_PYTHON)) 15 | 16 | if system_major != required_major: 17 | raise TypeError( 18 | "This project requires Python {}. Found: Python {}".format( 19 | required_major, sys.version)) 20 | else: 21 | print(">>> Development environment passes all tests!") 22 | 23 | 24 | if __name__ == '__main__': 25 | main() 26 | --------------------------------------------------------------------------------