├── .devcontainer
│   ├── Dockerfile
│   └── devcontainer.json
├── .github
│   ├── CODE_OF_CONDUCT.md
│   ├── ISSUE_TEMPLATE.md
│   └── PULL_REQUEST_TEMPLATE.md
├── .gitignore
├── .vscode
│   └── settings.json
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE.md
├── Makefile
├── README.md
├── docs
│   ├── advance
│   │   ├── cicd.md
│   │   └── registered_model_stages.md
│   └── images
│       ├── batch_scoring.png
│       ├── model_training.png
│       ├── result_azure_resources.png
│       ├── result_batch_scoring.png
│       ├── result_databricks_job.png
│       ├── result_mlflow_experiment.png
│       └── result_mlflow_model_registry.png
├── ml_experiments
│   └── Feature Store Taxi example notebook.ipynb
├── ml_ops
│   ├── README.md
│   ├── deployment
│   │   ├── README.md
│   │   ├── arm_templates
│   │   │   └── databricks_and_storage.json
│   │   └── databricks
│   │       ├── cluster_template.json
│   │       ├── job_template_taxi_fares_batch_scoring.json
│   │       └── job_template_taxi_fares_training.json
│   ├── orchestrator
│   │   ├── README.md
│   │   ├── taxi_fares_orchestrator_batch_score.py
│   │   └── taxi_fares_orchestrator_train.py
│   ├── src
│   │   ├── README.md
│   │   ├── setup.py
│   │   └── taxi_fares_mlops
│   │       ├── __init__.py
│   │       ├── feature_engineering.py
│   │       ├── publish_model.py
│   │       ├── scoring_batch.py
│   │       ├── training.py
│   │       └── utils.py
│   └── tests
│       ├── README.md
│       ├── __init__.py
│       └── taxi_fares
│           ├── __init__.py
│           ├── data
│           │   └── taxi_fares_unit_test_training.csv
│           ├── test_publish_model.py
│           ├── test_training.py
│           └── test_utils.py
├── ml_source
│   ├── README.md
│   ├── src
│   │   ├── README.md
│   │   ├── monitoring
│   │   │   ├── __init__.py
│   │   │   └── app_logger.py
│   │   ├── setup.py
│   │   └── taxi_fares
│   │       ├── __init__.py
│   │       ├── feature_eng
│   │       │   ├── __init__.py
│   │       │   └── features.py
│   │       ├── training
│   │       │   ├── __init__.py
│   │       │   ├── evaluate.py
│   │       │   └── train.py
│   │       └── utils
│   │           ├── __init__.py
│   │           └── pyspark_utils.py
│   └── tests
│       ├── README.md
│       ├── __init__.py
│       ├── monitoring
│       │   ├── __init__.py
│       │   └── test_app_logger.py
│       └── taxi_fares
│           ├── __init__.py
│           ├── feature_eng
│           │   ├── __init__.py
│           │   └── test_features.py
│           └── utils
│               ├── __init__.py
│               └── test_pyspark_utils.py
└── requirements.txt
/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/blob/master/containers/python-3-anaconda/.devcontainer/base.Dockerfile
2 | ARG VARIANT="3"
3 | FROM mcr.microsoft.com/vscode/devcontainers/anaconda:0-${VARIANT}
4 |
5 | # Additional packages
6 | RUN sudo apt-get update
7 | RUN sudo apt-get install --reinstall build-essential -y
8 | RUN sudo apt-get install default-jdk -y
9 |
10 | # Get local user
11 | ARG USERNAME=vscode
12 |
13 | # Change conda to be owned by the local user
14 | RUN chown -R $USERNAME:$USERNAME /opt/conda
15 |
16 | # Activate local user
17 | USER $USERNAME
18 |
19 | # Conda init
20 | RUN conda init bash
21 |
22 | # [Optional] If your pip requirements rarely change, uncomment this section to add them to the image.
23 | COPY requirements.txt /tmp/pip-tmp/
24 | RUN pip --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
25 | && sudo rm -rf /tmp/pip-tmp
26 | RUN pip --disable-pip-version-check --no-cache-dir install databricks-feature-store
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.155.1/containers/python-3
3 | {
4 | "name": "Anaconda (Python 3)",
5 | "build": {
6 | "dockerfile": "Dockerfile",
7 | "context": "..",
8 | "args": {
9 | // Update 'VARIANT'
10 | "VARIANT": "3",
11 | // Options
12 | "INSTALL_NODE": "false",
13 | }
14 | },
15 | "mounts": [
16 | "source=${localEnv:HOME}/.ssh,target=/home/vscode/.ssh,type=bind",
17 | "source=${localEnv:HOME}/.gitconfig,target=/home/vscode/.gitconfig,type=bind",
18 | ],
19 | // Set *default* container specific settings.json values on container create.
20 | "settings": {
21 | "terminal.integrated.shell.linux": "/bin/bash",
22 | "python.pythonPath": "/opt/conda/bin/python",
23 | "python.linting.enabled": true,
24 | "python.linting.pylintEnabled": false,
25 | "python.linting.flake8Enabled": true,
26 | "python.linting.flake8Path": "/opt/conda/bin/flake8",
27 | "python.linting.flake8Args": [
28 | "--max-line-length=88"
29 | ],
30 | "python.formatting.provider": "black",
31 | "python.formatting.blackPath": "/opt/conda/bin/black",
32 | "python.testing.promptToConfigure": false,
33 | "[python]": {
34 | "editor.formatOnSave": true,
35 | "editor.codeActionsOnSave": {
36 | "source.organizeImports": true
37 | },
38 | "files.trimTrailingWhitespace": true
39 | },
40 | },
41 | // Add the IDs of extensions you want installed when the container is created.
42 | "extensions": [
43 | "ms-python.python",
44 | "yzhang.markdown-all-in-one",
45 | "streetsidesoftware.code-spell-checker",
46 | "njpwerner.autodocstring",
47 | "GitHub.copilot"
48 | ],
49 | // Use 'forwardPorts' to make a list of ports inside the container available locally.
50 | "forwardPorts": [
51 | 5000
52 | ],
53 | // Use 'postCreateCommand' to run commands after the container is created.
54 | // "postCreateCommand": "python --version",
56 | // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
56 | "remoteUser": "vscode"
57 | }
--------------------------------------------------------------------------------
/.github/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
4 | > Please provide us with the following information:
5 | > ---------------------------------------------------------------
6 |
7 | ### This issue is for a: (mark with an `x`)
8 | ```
9 | - [ ] bug report -> please search issues before submitting
10 | - [ ] feature request
11 | - [ ] documentation issue or request
12 | - [ ] regression (a behavior that used to work and stopped in a new release)
13 | ```
14 |
15 | ### Minimal steps to reproduce
16 | >
17 |
18 | ### Any log messages given by the failure
19 | >
20 |
21 | ### Expected/desired behavior
22 | >
23 |
24 | ### OS and Version?
25 | > Windows 7, 8 or 10. Linux (which distribution). macOS (Yosemite? El Capitan? Sierra?)
26 |
27 | ### Versions
28 | >
29 |
30 | ### Mention any other details that might be useful
31 |
32 | > ---------------------------------------------------------------
33 | > Thanks! We'll be in touch soon.
34 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Purpose
2 |
3 | * ...
4 |
5 | ## Does this introduce a breaking change?
6 |
7 | ```
8 | [ ] Yes
9 | [ ] No
10 | ```
11 |
12 | ## Pull Request Type
13 | What kind of change does this Pull Request introduce?
14 |
15 |
16 | ```
17 | [ ] Bugfix
18 | [ ] Feature
19 | [ ] Code style update (formatting, local variables)
20 | [ ] Refactoring (no functional changes, no api changes)
21 | [ ] Documentation content changes
22 | [ ] Other... Please describe:
23 | ```
24 |
25 | ## How to Test
26 | * Get the code
27 |
28 | ```
29 | git clone [repo-address]
30 | cd [repo-name]
31 | git checkout [branch-name]
32 | npm install
33 | ```
34 |
35 | * Test the code
36 |
37 | ```
38 | ```
39 |
40 | ## What to Check
41 | Verify that the following are valid
42 | * ...
43 |
44 | ## Other Information
45 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
139 |
140 | # Local files
141 | **/.DS_Store
142 | .vscode/settings.json
143 | mlruns
144 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.pythonPath": "/opt/conda/bin/python",
3 | "python.testing.unittestArgs": [
4 | "-v",
5 | "-s",
6 | "./ml_source",
7 | "-p",
8 | "test*.py"
9 | ],
10 | "python.testing.pytestEnabled": false,
11 | "python.testing.unittestEnabled": true,
12 | "python.sortImports.path": "/opt/conda/bin/isort"
13 | }
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ## [project-title] Changelog
2 |
3 |
4 | # x.y.z (yyyy-mm-dd)
5 |
6 | *Features*
7 | * ...
8 |
9 | *Bug Fixes*
10 | * ...
11 |
12 | *Breaking Changes*
13 | * ...
14 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to [project-title]
2 |
3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
6 |
7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
9 | provided by the bot. You will only need to do this once across all repos using our CLA.
10 |
11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
14 |
15 | - [Code of Conduct](#coc)
16 | - [Issues and Bugs](#issue)
17 | - [Feature Requests](#feature)
18 | - [Submission Guidelines](#submit)
19 |
20 | ## Code of Conduct
21 | Help us keep this project open and inclusive. Please read and follow our [Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
22 |
23 | ## Found an Issue?
24 | If you find a bug in the source code or a mistake in the documentation, you can help us by
25 | [submitting an issue](#submit-issue) to the GitHub Repository. Even better, you can
26 | [submit a Pull Request](#submit-pr) with a fix.
27 |
28 | ## Want a Feature?
29 | You can *request* a new feature by [submitting an issue](#submit-issue) to the GitHub
30 | Repository. If you would like to *implement* a new feature, please submit an issue with
31 | a proposal for your work first, to be sure that we can use it.
32 |
33 | * **Small Features** can be crafted and directly [submitted as a Pull Request](#submit-pr).
34 |
35 | ## Submission Guidelines
36 |
37 | ### Submitting an Issue
38 | Before you submit an issue, search the archive; your question may already have been answered.
39 |
40 | If your issue appears to be a bug, and hasn't been reported, open a new issue.
41 | Help us to maximize the effort we can spend fixing issues and adding new
42 | features, by not reporting duplicate issues. Providing the following information will increase the
43 | chances of your issue being dealt with quickly:
44 |
45 | * **Overview of the Issue** - if an error is being thrown a non-minified stack trace helps
46 | * **Version** - what version is affected (e.g. 0.1.2)
47 | * **Motivation for or Use Case** - explain what you are trying to do and why the current behavior is a bug for you
48 | * **Browsers and Operating System** - is this a problem with all browsers?
49 | * **Reproduce the Error** - provide a live example or an unambiguous set of steps
50 | * **Related Issues** - has a similar issue been reported before?
51 | * **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be
52 | causing the problem (line of code or commit)
53 |
54 | You can file new issues by providing the above information at the corresponding repository's issues link: https://github.com/[organization-name]/[repository-name]/issues/new.
55 |
56 | ### Submitting a Pull Request (PR)
57 | Before you submit your Pull Request (PR) consider the following guidelines:
58 |
59 | * Search the repository (https://github.com/[organization-name]/[repository-name]/pulls) for an open or closed PR
60 | that relates to your submission. You don't want to duplicate effort.
61 |
62 | * Make your changes in a new git fork:
63 |
64 | * Commit your changes using a descriptive commit message
65 | * Push your fork to GitHub:
66 | * In GitHub, create a pull request
67 | * If we suggest changes then:
68 | * Make the required updates.
69 | * Rebase your fork and force push to your GitHub repository (this will update your Pull Request):
70 |
71 | ```shell
72 | git rebase master -i
73 | git push -f
74 | ```
75 |
76 | That's it! Thank you for your contribution!
77 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: clean clean-test clean-pyc clean-build
2 | SHELL=/bin/bash
3 |
4 | ## remove Python file artifacts
5 | clean-pyc:
6 | find . -name '*.pyc' -exec rm -f {} +
7 | find . -name '*.pyo' -exec rm -f {} +
8 | find . -name '*~' -exec rm -f {} +
9 | find . -name '__pycache__' -exec rm -fr {} +
10 |
11 | ## remove test and coverage artifacts
12 | clean-test:
13 | rm -f .coverage
14 | rm -fr htmlcov/
15 | rm -fr .pytest_cache
16 |
17 | ## remove build artifacts
18 | clean-build:
19 | rm -fr build/
20 | rm -fr dist/
21 | rm -fr .eggs/
22 | find . -name '*.egg-info' -exec rm -fr {} +
23 | find . -name '*.egg' -exec rm -f {} +
24 |
25 | ## remove all build, test, coverage and Python artifacts
26 | clean: clean-build clean-pyc clean-test
27 |
28 | ## package ml
29 | dist-ml: clean
30 | python ml_source/src/setup.py bdist_wheel
31 | rm -fr build/
32 |
33 | ## package mlops
34 | dist-mlops: clean
35 | python ml_ops/src/setup.py bdist_wheel
36 | rm -fr build/
37 |
38 | ## package all
39 | dist: dist-ml dist-mlops
40 |
41 | ## install ml locally
42 | install-ml: clean
43 | python ml_source/src/setup.py install
44 | rm -fr build/
45 |
46 | ## install mlops locally
47 | install-mlops: clean
48 | python ml_ops/src/setup.py install
49 | rm -fr build/
50 |
51 | ## install all locally
52 | install: install-ml install-mlops
53 |
54 | ## unit test ml locally
55 | test-ml: install-ml
56 | cd ml_source && coverage run --source=taxi_fares,monitoring -m unittest discover
57 | cd ml_source && coverage report -m
58 |
59 | ## unit test mlops locally
60 | test-mlops: install-mlops
61 | cd ml_ops && coverage run --source=taxi_fares_mlops -m unittest discover
62 | cd ml_ops && coverage report -m
63 |
64 | ## unit test all locally
65 | test: test-ml test-mlops
66 | coverage combine ml_source/.coverage ml_ops/.coverage
67 | coverage report
68 |
69 | ## lint all python src and tests
70 | lint:
71 | flake8 --max-line-length=88 ml_ops/src ml_ops/tests ml_source/src ml_source/tests
72 |
73 | ## databricks authenticate
74 | databricks-authenticate:
75 | $(info Authenticate Databricks CLI)
76 | $(info Follow https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/ for getting Host and token value)
77 | databricks configure --token
78 | $(info Taking Backup of .databrickscfg file in .env/databrickscfg)
79 | mkdir -p .env
80 | cp ~/.databrickscfg .env/.databrickscfg
81 | $(info Creating env script file for mlflow)
82 | DATABRICKS_HOST="$$(cat ~/.databrickscfg | grep '^host' | cut -d' ' -f 3)"; \
83 | DATABRICKS_TOKEN="$$(cat ~/.databrickscfg | grep '^token' | cut -d' ' -f 3)"; \
84 | echo "export MLFLOW_TRACKING_URI=databricks"> .env/.databricks_env.sh; \
85 | echo "export DATABRICKS_HOST=$$DATABRICKS_HOST" >> .env/.databricks_env.sh; \
86 | echo "export DATABRICKS_TOKEN=$$DATABRICKS_TOKEN" >> .env/.databricks_env.sh
87 |
88 | ## databricks init (create cluster, base workspace, mlflow experiment, secret scope)
89 | databricks-init:
90 | echo "Creating databricks workspace root directory"; \
91 | databricks workspace mkdirs /azure-databricks-mlops-mlflow; \
92 | echo "Creating databricks dbfs root directory"; \
93 | databricks fs mkdirs dbfs:/FileStore/libraries/azure-databricks-mlops-mlflow; \
94 | CLUSTER_ID="$$(databricks clusters list --output json | \
95 | jq ".clusters[] | select(.cluster_name == \"azure-databricks-mlops-mlflow\") | .cluster_id")"; \
96 | echo "Got existing cluster azure-databricks-mlops-mlflow with id: $$CLUSTER_ID"; \
97 | if [[ $$CLUSTER_ID == "" ]]; then \
98 | echo "Creating databricks cluster azure-databricks-mlops-mlflow"; \
99 | databricks clusters create --json-file ml_ops/deployment/databricks/cluster_template.json; \
100 | fi; \
101 | SECRET_SCOPE_NAME="$$(databricks secrets list-scopes --output json | \
102 | jq ".scopes[] | select(.name == \"azure-databricks-mlops-mlflow\") | .name")"; \
103 | echo "Got existing secret scope $$SECRET_SCOPE_NAME"; \
104 | if [[ $$SECRET_SCOPE_NAME == "" ]]; then \
105 | echo "Creating databricks secret scope azure-databricks-mlops-mlflow"; \
106 | databricks secrets create-scope --scope azure-databricks-mlops-mlflow --initial-manage-principal users; \
107 | fi; \
108 | MLFLOW_EXPERIMENT_ID="$$(source .env/.databricks_env.sh && mlflow experiments list | \
109 | grep '/azure-databricks-mlops-mlflow/Experiment' | \
110 | cut -d' ' -f 1)"; \
111 | echo "Got existing mlflow experiment id: $$MLFLOW_EXPERIMENT_ID"; \
112 | if [[ "$$MLFLOW_EXPERIMENT_ID" == "" ]]; then \
113 | echo "Creating mlflow experiment in databricks workspace /azure-databricks-mlops-mlflow/Experiment directory"; \
114 | source .env/.databricks_env.sh && mlflow experiments create --experiment-name /azure-databricks-mlops-mlflow/Experiment; \
115 | fi; \
116 |
117 | ## databricks secrets put
118 | databricks-secrets-put:
119 | $(info Put databricks secret azure-blob-storage-account-name)
120 | @read -p "Enter Azure Blob storage Account Name: " stg_account_name; \
121 | databricks secrets put --scope azure-databricks-mlops-mlflow --key azure-blob-storage-account-name --string-value $$stg_account_name
122 | $(info Put databricks secret azure-blob-storage-container-name)
123 | @read -p "Enter Azure Blob storage Container Name: " stg_container_name; \
124 | databricks secrets put --scope azure-databricks-mlops-mlflow --key azure-blob-storage-container-name --string-value $$stg_container_name
125 | $(info Put databricks secret azure-shared-access-key)
126 | $(info Mount Blob Storage https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-storage)
127 | @read -p "Enter Azure Blob storage Shared Access Key: " shared_access_key; \
128 | databricks secrets put --scope azure-databricks-mlops-mlflow --key azure-blob-storage-shared-access-key --string-value $$shared_access_key
129 |
130 | ## databricks secrets put application insights key
131 | databricks-add-app-insights-key:
132 | $(info Put app insights key)
133 | @read -p "Enter App insights key: " app_insights_key; \
134 | if [[ "$$app_insights_key" != '' ]]; then \
135 | echo "Setting app insights key : $$app_insights_key "; \
136 | databricks secrets put --scope azure-databricks-mlops-mlflow --key app_insights_key --string-value "$$app_insights_key"; \
137 | fi; \
138 |
139 | ## databricks deploy (upload wheel packages to databricks DBFS workspace)
140 | databricks-deploy-code: dist
141 | $(info Upload wheel packages into databricks dbfs root directory)
142 | databricks fs cp --overwrite --recursive dist/ dbfs:/FileStore/libraries/azure-databricks-mlops-mlflow/
143 | $(info Importing orchestrator notebooks into databricks workspace root directory)
144 | databricks workspace import_dir --overwrite ml_ops/orchestrator/ /azure-databricks-mlops-mlflow/
145 | $(info Create or update databricks jobs)
146 |
147 | ## databricks deploy jobs (create databricks jobs)
148 | databricks-deploy-jobs: databricks-deploy-code
149 | $(info Getting required values from databricks)
150 | CLUSTER_ID="$$(databricks clusters list --output json | \
151 | jq ".clusters[] | select(.cluster_name == \"azure-databricks-mlops-mlflow\") | .cluster_id")"; \
152 | echo "Got existing cluster id: $$CLUSTER_ID"; \
153 | TRAINING_JOB_ID="$$(databricks jobs list --output json | \
154 | jq ".jobs[] | select(.settings.name == \"taxi_fares_model_training\") | .job_id")"; \
155 | echo "Got existing taxi_fares_model_training job id: $$TRAINING_JOB_ID"; \
156 | if [[ "$$TRAINING_JOB_ID" == "" ]]; then \
157 | databricks jobs create --json "{\"name\": \"taxi_fares_model_training\", \"existing_cluster_id\": $$CLUSTER_ID}"; \
158 | TRAINING_JOB_ID="$$(databricks jobs list --output json | \
159 | jq ".jobs[] | select(.settings.name == \"taxi_fares_model_training\") | .job_id")"; \
160 | echo "Created taxi_fares_model_training with job id: $$TRAINING_JOB_ID"; \
161 | fi; \
162 | BATCH_SCORING_JOB_ID="$$(databricks jobs list --output json | \
163 | jq ".jobs[] | select(.settings.name == \"taxi_fares_batch_scoring\") | .job_id")"; \
164 | echo "Got existing taxi_fares_batch_scoring job id: $$BATCH_SCORING_JOB_ID"; \
165 | if [[ "$$BATCH_SCORING_JOB_ID" == "" ]]; then \
166 | databricks jobs create --json "{\"name\": \"taxi_fares_batch_scoring\", \"existing_cluster_id\": $$CLUSTER_ID}"; \
167 | BATCH_SCORING_JOB_ID="$$(databricks jobs list --output json | \
168 | jq ".jobs[] | select(.settings.name == \"taxi_fares_batch_scoring\") | .job_id")"; \
169 | echo "Created taxi_fares_batch_scoring with job id: $$BATCH_SCORING_JOB_ID"; \
170 | fi; \
171 | MLFLOW_EXPERIMENT_ID="$$(source .env/.databricks_env.sh && mlflow experiments list | \
172 | grep '/azure-databricks-mlops-mlflow/Experiment' | \
173 | cut -d' ' -f 1)"; \
174 | echo "Got existing mlflow experiment id: $$MLFLOW_EXPERIMENT_ID"; \
175 | echo "Updating taxi_fares_model_training by using template ml_ops/deployment/databricks/job_template_taxi_fares_training.json"; \
176 | TRAINING_JOB_UPDATE_JSON="$$(cat ml_ops/deployment/databricks/job_template_taxi_fares_training.json | \
177 | sed "s/\"FILL_JOB_ID\"/$$TRAINING_JOB_ID/" | \
178 | sed "s/FILL_MLFLOW_EXPERIMENT_ID/$$MLFLOW_EXPERIMENT_ID/" | \
179 | sed "s/\"FILL_CLUSTER_ID\"/$$CLUSTER_ID/")"; \
180 | databricks jobs reset --job-id $$TRAINING_JOB_ID --json "$$TRAINING_JOB_UPDATE_JSON"; \
181 | echo "Updating taxi_fares_batch_scoring by using template ml_ops/deployment/databricks/job_template_taxi_fares_batch_scoring.json"; \
182 | BATCH_SCORING_JOB_UPDATE_JSON="$$(cat ml_ops/deployment/databricks/job_template_taxi_fares_batch_scoring.json | \
183 | sed "s/\"FILL_JOB_ID\"/$$BATCH_SCORING_JOB_ID/" | \
184 | sed "s/FILL_MLFLOW_EXPERIMENT_ID/$$MLFLOW_EXPERIMENT_ID/" | \
185 | sed "s/\"FILL_CLUSTER_ID\"/$$CLUSTER_ID/")"; \
186 | databricks jobs reset --job-id $$BATCH_SCORING_JOB_ID --json "$$BATCH_SCORING_JOB_UPDATE_JSON"; \
187 |
188 | ## deploy databricks all
189 | deploy: databricks-deploy-jobs
190 |
191 | ## run databricks taxi_fares_model_training job
192 | run-taxifares-model-training:
193 | $(info Triggering model training job)
194 | TRAINING_JOB_ID="$$(databricks jobs list --output json | \
195 | jq ".jobs[] | select(.settings.name == \"taxi_fares_model_training\") | .job_id")"; \
196 | RUN_ID="$$(databricks jobs run-now --job-id $$TRAINING_JOB_ID | \
197 | jq ".number_in_job")"; \
198 | DATABRICKS_HOST="$$(cat ~/.databrickscfg | grep '^host' | cut -d' ' -f 3)"; \
199 | DATABRICKS_ORG_ID="$$(echo $$DATABRICKS_HOST | cut -d'-' -f 2 | cut -d'.' -f 1)"; \
200 | echo "Open the following link in browser to check result -"; \
201 | echo "$$DATABRICKS_HOST/?o=$$DATABRICKS_ORG_ID/#job/$$TRAINING_JOB_ID/run/$$RUN_ID"; \
202 |
203 |
204 | ## run databricks taxi_fares_batch_scoring job
205 | run-taxifares-batch-scoring:
206 | $(info Triggering batch scoring job)
207 | BATCH_SCORING_JOB_ID="$$(databricks jobs list --output json | \
208 | jq ".jobs[] | select(.settings.name == \"taxi_fares_batch_scoring\") | .job_id")"; \
209 | RUN_ID="$$(databricks jobs run-now --job-id $$BATCH_SCORING_JOB_ID | \
210 | jq ".number_in_job")"; \
211 | DATABRICKS_HOST="$$(cat ~/.databrickscfg | grep '^host' | cut -d' ' -f 3)"; \
212 | DATABRICKS_ORG_ID="$$(echo $$DATABRICKS_HOST | cut -d'-' -f 2 | cut -d'.' -f 1)"; \
213 | echo "Open the following link in browser to check result -"; \
214 | echo "$$DATABRICKS_HOST/?o=$$DATABRICKS_ORG_ID/#job/$$BATCH_SCORING_JOB_ID/run/$$RUN_ID"; \
215 |
216 | # continuous integration (CI)
217 | ci: lint test dist
218 |
219 | # continuous deployment (CD)
220 | cd: deploy
221 |
222 | # train model
223 | train: run-taxifares-model-training
224 |
225 | # batch scoring
226 | score: run-taxifares-batch-scoring
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | page_type: sample
3 | ms.custom:
4 | - team=cse
5 | ms.contributors:
6 | - prdeb-12/21/2021
7 | - anchugh-12/21/2021
8 | languages:
9 | - python
10 | products:
11 | - azure-databricks
12 | - azure-blob-storage
13 | - azure-monitor
14 | ---
15 |
16 | # Azure Databricks MLOps using MLflow
17 |
18 | This is a template or sample for [MLOps](https://github.com/microsoft/mlops) for [Python](https://www.python.org) based source code in [Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/) using [MLflow](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/) without using [MLflow Project](https://mlflow.org/docs/latest/projects.html#).
19 |
20 | This template provides the following features:
21 |
22 | - A way to run Python based MLOps without using [MLflow Project](https://mlflow.org/docs/latest/projects.html#), but still using [MLflow](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/) for managing the end-to-end machine learning lifecycle.
23 | - Sample of machine learning source code structure along with Unit Test cases
24 | - Sample of MLOps code structure along with Unit Test cases
25 | - Demo setup to try on users subscription
26 |
27 | ## Problem Summary
28 |
29 | - This demonstrates deployment scenario of [Orchestrate MLOps on Azure Databricks using Databricks Notebook](https://docs.microsoft.com/en-us/azure/architecture/reference-architectures/ai/orchestrate-mlops-azure-databricks)
30 |
31 | ## Products/Technologies/Languages Used
32 |
33 | - Products & Technologies:
34 | - Azure Databricks
35 | - Azure Blob Storage
36 | - Azure Monitor
37 | - Languages:
38 | - Python
39 |
40 | ## Architecture
41 |
42 | ### Model Training
43 |
44 | 
45 |
46 | ### Batch Scoring
47 |
48 | 
49 |
50 | ## Individual Components
51 |
52 | - [ml_experiments](./ml_experiments/) - sample ML experiment notebook.
53 | - [ml_data](./ml_data/) - dummy data for sample model
54 | - [ml_ops](./ml_ops/) - sample MLOps code along with Unit Test cases, orchestrator, deployment setup.
55 | - [ml_source](./ml_source/) - sample ML code along with Unit Test cases
56 | - [Makefile](./Makefile) - for build, test in local environment
57 | - [requirements.txt](./requirements.txt) - python dependencies
58 |
59 | ## Getting Started
60 |
61 | ### Prerequisites
62 |
63 | - [Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/) workspace
64 | - [Azure Data Lake Storage Gen2](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction) account
65 | - [Visual Studio Code](https://code.visualstudio.com/) in local environment for development
66 | - [Docker](https://www.docker.com/) in local environment for development
67 |
68 | ### Development
69 |
70 | 1. `git clone https://github.com/Azure-Samples/azure-databricks-mlops-mlflow.git`
71 | 2. `cd azure-databricks-mlops-mlflow`
72 | 3. Open cloned repository in Visual Studio Code [Remote Container](https://code.visualstudio.com/docs/remote/containers)
73 | 4. Open a [terminal](https://code.visualstudio.com/docs/remote/containers#_opening-a-terminal) in Remote Container from Visual Studio Code
74 | 5. `make install` to install sample packages (`taxi_fares` and `taxi_fares_mlops`) locally
75 | 6. `make test` to unit test the code locally
76 |
77 | ### Package
78 |
79 | 1. `make dist` to build the ML and MLOps wheel packages (`taxi_fares` and `taxi_fares_mlops`) locally
80 |
81 | ### Deployment
82 |
83 | 1. `make databricks-deploy-code` to deploy Databricks orchestrator notebooks and the ML and MLOps Python wheel packages, whenever code changes.
84 | 2. `make databricks-deploy-jobs` to deploy Databricks Jobs, whenever the job specifications change.
85 |
86 | ### Run training and batch scoring
87 |
88 | 1. To trigger training, execute `make run-taxifares-model-training`
89 | 2. To trigger batch scoring, execute `make run-taxifares-batch-scoring`
90 |
91 | **NOTE:** the Databricks environment must be created before [deployment](#deployment) and [running](#run-training-and-batch-scoring); to create a demo environment, follow the [Demo](#demo) section.
92 |
93 | ### Observability
94 |
95 | Check logs, create alerts, etc. in [Application Insights](https://docs.microsoft.com/en-us/azure/azure-monitor/app/app-insights-overview). The following are a few sample [Kusto Queries](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/) for checking logs, traces, exceptions, etc.
96 |
97 | - Check for Error, Info, Debug Logs
98 |
99 | Kusto Query for checking general logs for a specific MLflow experiment, filtered by `mlflow_experiment_id`
100 |
101 | ```kusto
102 | traces
103 | | extend mlflow_experiment_id = customDimensions.mlflow_experiment_id
104 | | where timestamp > ago(30m)
105 | | where mlflow_experiment_id == "<mlflow_experiment_id>"
106 | | limit 1000
107 | ```
108 |
109 | Kusto Query for checking general logs for a specific Databricks job execution filtered by `mlflow_experiment_id` and `mlflow_run_id`
110 |
111 | ```kusto
112 | traces
113 | | extend mlflow_run_id = customDimensions.mlflow_run_id
114 | | extend mlflow_experiment_id = customDimensions.mlflow_experiment_id
115 | | where timestamp > ago(30m)
116 | | where mlflow_experiment_id == "<mlflow_experiment_id>"
117 | | where mlflow_run_id == "<mlflow_run_id>"
118 | | limit 1000
119 | ```
120 |
121 | - Check for Exceptions
122 |
123 | Kusto Query for checking exception logs, if any
124 |
125 | ```kusto
126 | exceptions
127 | | where timestamp > ago(30m)
128 | | limit 1000
129 | ```
130 |
131 | - Check for duration of different stages in MLOps
132 |
133 | Sample Kusto Query for checking duration of different stages in MLOps
134 |
135 | ```kusto
136 | dependencies
137 | | where timestamp > ago(30m)
138 | | where cloud_RoleName == 'TaxiFares_Training'
139 | | limit 1000
140 | ```
141 |
142 | To correlate dependencies, exceptions, and traces, `operation_Id` can be used as a filter in the above Kusto Queries.
143 |
144 | ## Demo
145 |
146 | 1. Create Databricks workspace, a storage account (Azure Data Lake Storage Gen2) and Application Insights
147 | 1. Create an [Azure Account](https://azure.microsoft.com/en-in/free/)
148 | 2. [Deploy resources](https://docs.microsoft.com/en-us/azure/azure-resource-manager/templates/deploy-portal#deploy-resources-from-custom-template) from [custom ARM template](ml_ops/deployment/arm_templates/databricks_and_storage.json)
149 | 2. Initialize Databricks (create cluster, base workspace, mlflow experiment, secret scope)
150 | 1. Get [Databricks CLI](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/) Host and Token
151 | 2. Authenticate Databricks CLI `make databricks-authenticate`
152 | 3. Execute `make databricks-init`
153 | 3. Create Azure Data Lake Storage Gen2 Container and upload data
154 | 1. [Create](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal#create-a-container) Azure Data Lake Storage Gen2 Container named - `taxifares`
155 | 2. [Upload](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal#upload-a-block-blob) as blob [taxi-fares data files](./ml_data/) into Azure Data Lake Storage Gen2 container named - `taxifares`
156 | 4. Put secrets to [Mount ADLS Gen2 Storage using Shared Access Key](https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-storage)
157 | 1. Get Azure Data Lake Storage Gen2 account name created in step 1
158 | 2. Get [Shared Key](https://docs.microsoft.com/en-us/rest/api/storageservices/authorize-with-shared-key) for Azure Data Lake Storage Gen2 account
159 | 3. Execute `make databricks-secrets-put` to put secret in Databricks secret scope
160 | 5. Put Application Insights Key as a secret in Databricks secret scope (optional)
161 | 1. Get [Application Insights Key](https://docs.microsoft.com/en-us/azure/azure-monitor/app/create-new-resource#copy-the-instrumentation-key) created in step 1
162 | 2. Execute `make databricks-add-app-insights-key` to put secret in Databricks secret scope
163 | 6. Package and deploy into Databricks (Databricks Jobs, Orchestrator Notebooks, ML and MLOps Python wheel packages)
164 | 1. Execute `make deploy`
165 | 7. Run Databricks Jobs
166 | 1. To trigger training, execute `make run-taxifares-model-training`
167 | 2. To trigger batch scoring, execute `make run-taxifares-batch-scoring`
168 | 8. Expected results
169 | 1. Azure resources
170 | 
171 | 2. Databricks jobs
172 | 
173 | 3. Databricks mlflow experiment
174 | 
175 | 4. Databricks mlflow model registry
176 | 
177 | 5. Output of batch scoring
178 | 
179 |
180 | ## Additional Details
181 |
182 | 1. [Continuous Integration (CI) & Continuous Deployment (CD)](docs/advance/cicd.md)
183 | 2. [Registered Models Stages and Transitioning](docs/advance/registered_model_stages.md)
184 |
185 | ## Related resources
186 |
187 | 1. [Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/)
188 | 2. [MLflow](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/)
189 | 3. [MLflow Project](https://mlflow.org/docs/latest/projects.html#)
190 | 4. [Run MLflow Projects on Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/projects)
191 | 5. [Databricks Widgets](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/databricks-utils#--widget-utilities)
192 | 6. [Databricks Notebook-scoped Python libraries](https://docs.microsoft.com/en-us/azure/databricks/libraries/notebooks-python-libraries)
193 | 7. [Databricks CLI](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/)
194 | 8. [Azure Data Lake Storage Gen2](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction)
195 | 9. [Application Insights](https://docs.microsoft.com/en-us/azure/azure-monitor/app/app-insights-overview)
196 | 10. [Kusto Query Language](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/)
197 |
198 | ## Glossaries
199 |
200 | 1. [Application developer](https://docs.microsoft.com/en-us/azure/machine-learning/team-data-science-process/overview) : a role that works mainly on operationalizing machine learning.
201 | 2. [Data scientist](https://docs.microsoft.com/en-us/azure/machine-learning/team-data-science-process/roles-tasks#structure-of-data-science-groups-and-teams) : a role that performs the data science parts of the project.
202 |
203 | ## Contributors
204 |
205 | - [Julien Chomarat](https://github.com/jchomarat)
206 | - [Benjamin Guinebertière](https://github.com/benjguin)
207 | - [Ankit Sinha](https://github.com/ankitbko)
208 | - [Prabal Deb](https://github.com/prabdeb)
209 | - [Megha Patil](https://github.com/meghapatilcode)
210 | - [Srikantan Sankaran](https://github.com/ssrikantan)
211 | - [Frédéric Le Coquil](https://github.com/flecoqui)
212 | - [Anand Chugh](https://github.com/anandchugh)
213 |
--------------------------------------------------------------------------------
/docs/advance/cicd.md:
--------------------------------------------------------------------------------
1 | # Continuous Integration (CI) & Continuous Deployment (CD)
2 |
3 | CI and CD can be performed on any platform, such as `Azure DevOps Pipelines` or `GitHub Actions`, where the following `make` commands from the [Makefile](../../Makefile) are useful.
4 |
5 | - CI: execute `make ci` from the Pipeline/Action stage.
6 | - CD: execute `make cd` from the Pipeline/Action stage.
7 |
8 | **NOTE:** Set the environment variables `DATABRICKS_HOST` and `DATABRICKS_TOKEN` before executing the CD stage.
9 |
10 | ## Reference
11 |
12 | - [Design a CI/CD pipeline using Azure DevOps](https://docs.microsoft.com/en-us/azure/architecture/example-scenario/apps/devops-dotnet-webapp)
13 | - [GitHub Actions](https://docs.github.com/en/actions)
--------------------------------------------------------------------------------
/docs/advance/registered_model_stages.md:
--------------------------------------------------------------------------------
1 | # Registered Models Stages and Transitioning
2 |
3 | This document describes a possible way of transitioning a model through the different stages available in the [MLflow Model Registry](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-registry#model-registry-concepts).
4 |
5 | 1. In this demo setup, the [Continuous Integration (CI)](cicd.md) step currently [registers](../../ml_ops/src/taxi_fares_mlops/publish_model.py) the model in the MLflow Model Registry in the `None` stage.
6 | 2. The registered model can then be [transitioned](https://www.mlflow.org/docs/latest/model-registry.html#transitioning-an-mlflow-models-stage) to the next stage, `Staging`, after the integration test step.
7 | 3. Finally, the model can be transitioned to the `Production` stage during the Continuous Deployment (CD) step (a sketch of such a transition follows below).
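
A minimal sketch of a stage transition using the MLflow client; the registered model name and version below are illustrative placeholders, not values taken from this repository:

```py
from mlflow.tracking import MlflowClient

client = MlflowClient()  # uses the Databricks MLflow tracking URI configured in the environment

# Promote a registered model version to the next stage
# ("taxi_fares" and version 1 are placeholders).
client.transition_model_version_stage(
    name="taxi_fares",
    version=1,
    stage="Staging",  # or "Production" during the CD step
    archive_existing_versions=True,  # archive older versions already in this stage
)
```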
8 |
9 | ## References
10 |
11 | - [MLflow Model Registry on Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-registry)
12 | - [MLflow Model Registry](https://www.mlflow.org/docs/latest/model-registry.html)
13 |
--------------------------------------------------------------------------------
/docs/images/batch_scoring.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/batch_scoring.png
--------------------------------------------------------------------------------
/docs/images/model_training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/model_training.png
--------------------------------------------------------------------------------
/docs/images/result_azure_resources.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_azure_resources.png
--------------------------------------------------------------------------------
/docs/images/result_batch_scoring.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_batch_scoring.png
--------------------------------------------------------------------------------
/docs/images/result_databricks_job.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_databricks_job.png
--------------------------------------------------------------------------------
/docs/images/result_mlflow_experiment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_mlflow_experiment.png
--------------------------------------------------------------------------------
/docs/images/result_mlflow_model_registry.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_mlflow_model_registry.png
--------------------------------------------------------------------------------
/ml_ops/README.md:
--------------------------------------------------------------------------------
1 | # MLOps
2 |
3 | ## Overview
4 |
5 | This contains the MLOps code, which is developed, unit tested, packaged, and delivered independently, and is typically maintained by an application developer in an organization.
6 |
7 | ## Contents
8 |
9 | 1. [src](src/) : MLOps source code, that will be packaged as Python `wheel`.
10 | 2. [tests](tests/) : unit test cases for `src`.
11 | 3. [orchestrator](orchestrator/) : Databricks Python Notebooks for MLOps orchestrator.
12 | 4. [deployment](deployment/) : deployment templates ([ARM](https://docs.microsoft.com/en-us/azure/azure-resource-manager/templates/overview) and Databricks Jobs, Cluster).
13 |
--------------------------------------------------------------------------------
/ml_ops/deployment/README.md:
--------------------------------------------------------------------------------
1 | # Deployment
2 |
3 | ## Overview
4 |
5 | This document covers the deployment guide for MLOps.
6 |
7 | ## Databricks Cluster
8 |
9 | For the orchestrator job, either an existing cluster can be used or a new cluster can be created. In either case, make sure the following [properties](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/clusters#--request-structure-of-the-cluster-definition) are set on the cluster.
10 |
11 | - Cluster Mode: High Concurrency
12 | - DataBricks Runtime Version : 8.1 LTS ML (includes Apache Spark 3.0.1, Scala 2.12)
13 | - Enable Autoscaling: True
14 | - Worker Type: Standard_F4s
15 | - Driver Type: Standard_F4s
16 | - Spark Settings under “Spark Config” (Edit > Advanced Options > Spark)
17 |
18 | ```configuration
19 | spark.databricks.cluster.profile serverless
20 | spark.databricks.repl.allowedLanguages sql,python,r
21 | spark.databricks.conda.condaMagic.enabled true
22 | ```
23 |
24 | ## Databricks Job
25 |
26 | An orchestrator Databricks Job can be created from a [Databricks Job create template](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs#--create) using the following example CLI command -
27 |
28 | ```sh
29 | databricks jobs create --json-file <job_template>.json
30 | ```
31 |
32 | An orchestrator Databricks Job can be updated from a [Databricks Job reset template](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs#--reset) using the following example CLI command -
33 |
34 | ```sh
35 | databricks jobs reset --job-id <job_id> --json-file <job_template>.json
36 | ```
37 |
38 | ## Databricks MLflow Experiment
39 |
40 | An MLflow experiment can be created using the [Databricks Workspace Portal](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/tracking#workspace-experiments) or the following CLI commands -
41 |
42 | ```sh
43 | export MLFLOW_TRACKING_URI=databricks
44 | export DATABRICKS_HOST=<databricks_host>
45 | export DATABRICKS_TOKEN=<databricks_token>
46 | mlflow experiments create --experiment-name /<workspace_folder>/<experiment_name>
47 | ```
48 |
49 | Get `DATABRICKS_HOST` and `DATABRICKS_TOKEN` from [Databricks CLI Reference](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/)
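
The experiment can also be created from Python with the MLflow client; a minimal sketch, assuming the same environment variables are already exported and reusing the workspace path created by the `databricks-init` Makefile target:

```py
import mlflow

mlflow.set_tracking_uri("databricks")  # relies on DATABRICKS_HOST / DATABRICKS_TOKEN

# Create the experiment under a Databricks workspace path
experiment_id = mlflow.create_experiment("/azure-databricks-mlops-mlflow/Experiment")
print(f"Created MLflow experiment with id: {experiment_id}")
```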
50 |
51 | ## Databricks DBFS Upload
52 |
53 | The following CLI command can be used to upload a wheel package into Databricks DBFS.
54 |
55 | ```sh
56 | databricks fs cp --overwrite python-package.whl dbfs:/<dbfs_path>
57 | ```
58 |
59 | ## Databricks Notebook Import
60 |
61 | The following CLI command can be used to import an orchestrator Python file as a Databricks notebook into the Databricks workspace.
62 |
63 | ```sh
64 | databricks workspace import -l PYTHON -f SOURCE -o <python_file>.py <workspace_path>
65 | ```
66 |
67 | ## Orchestrator Databricks Job trigger
68 |
69 | The orchestrator Databricks job can be triggered in the following ways (an API sketch follows the list) -
70 |
71 | - Scheduled:
72 |   - Cron-based scheduling.
73 | - Manual:
74 |   - From the Databricks workspace portal, by clicking `Run Now With Different Parameters`.
75 |   - Via [Databricks-CLI](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/jobs-cli).
76 |   - Via [Databricks-API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs#--run-now).
77 |
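For the API route, a minimal Python sketch that triggers a run through the Jobs `run-now` endpoint; the job id is a placeholder and the host/token are read from the environment:

```py
import os

import requests

host = os.environ["DATABRICKS_HOST"]    # e.g. https://adb-0000000000000000.0.azuredatabricks.net
token = os.environ["DATABRICKS_TOKEN"]

# Trigger a run of an existing job (job_id is a placeholder)
response = requests.post(
    f"{host}/api/2.1/jobs/run-now",
    headers={"Authorization": f"Bearer {token}"},
    json={"job_id": 123},
    timeout=30,
)
response.raise_for_status()
print(f"Triggered run_id: {response.json()['run_id']}")
```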
--------------------------------------------------------------------------------
/ml_ops/deployment/arm_templates/databricks_and_storage.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
3 | "contentVersion": "1.0.0.1",
4 | "parameters": {
5 | "location": {
6 | "type": "string",
7 | "defaultValue": "[resourceGroup().location]",
8 | "metadata": {
9 | "description": "Resources location."
10 | }
11 | },
12 | "dbWorkspaceName": {
13 | "type": "string",
14 | "defaultValue": "databricks-mlops-mlflow",
15 | "metadata": {
16 | "description": "The name of the Azure Databricks workspace to create."
17 | }
18 | },
19 | "dbTier": {
20 | "defaultValue": "standard",
21 | "type": "string",
22 | "allowedValues": [
23 | "standard",
24 | "premium"
25 | ],
26 | "metadata": {
27 | "description": "The pricing tier of Databricks workspace."
28 | }
29 | },
30 | "stgAccountName": {
31 | "type": "string",
32 | "defaultValue": "[concat('storage', uniqueString(parameters('location'), resourceGroup().id))]",
33 | "metadata": {
34 | "description": "Storage account name."
35 | }
36 | },
37 | "stgAccountType": {
38 | "type": "string",
39 | "defaultValue": "Standard_RAGRS",
40 | "metadata": {
41 | "description": "Storage account type."
42 | }
43 | },
44 | "stgKind": {
45 | "type": "string",
46 | "defaultValue": "StorageV2",
47 | "metadata": {
48 | "description": "Storage account kind."
49 | }
50 | },
51 | "stgAccessTier": {
52 | "type": "string",
53 | "defaultValue": "Cool",
54 | "metadata": {
55 | "description": "Storage account tier."
56 | }
57 | },
58 | "stgIsHnsEnabled": {
59 | "type": "bool",
60 | "defaultValue": true,
61 | "metadata": {
62 | "description": "Enable ADLS Gen2."
63 | }
64 | },
65 | "aiName": {
66 | "type": "string",
67 | "defaultValue": "[concat('ai', uniqueString(parameters('location'), resourceGroup().id))]",
68 | "metadata": {
69 | "description": "Application Insights name."
70 | }
71 | }
72 | },
73 | "variables": {
74 | "managedResourceGroupName": "[concat('databricks-rg-', parameters('dbWorkspaceName'), '-', uniqueString(parameters('dbWorkspaceName'), resourceGroup().id))]",
75 | "managedResourceGroupId": "[concat(subscription().id, '/resourceGroups/', variables('managedResourceGroupName'))]"
76 | },
77 | "resources": [
78 | {
79 | "type": "Microsoft.Databricks/workspaces",
80 | "apiVersion": "2018-04-01",
81 | "name": "[parameters('dbWorkspaceName')]",
82 | "location": "[parameters('location')]",
83 | "sku": {
84 | "name": "[parameters('dbTier')]"
85 | },
86 | "comments": "Please do not use an existing resource group for ManagedResourceGroupId.",
87 | "properties": {
88 | "ManagedResourceGroupId": "[variables('managedResourceGroupId')]",
89 | "parameters": {}
90 | },
91 | "dependsOn": [],
92 | "tags": {
93 | "Purpose": "Demo",
94 | "Project": "azure-databricks-mlops-mlflow"
95 | }
96 | },
97 | {
98 | "type": "Microsoft.Storage/storageAccounts",
99 | "apiVersion": "2019-06-01",
100 | "name": "[parameters('stgAccountName')]",
101 | "location": "[parameters('location')]",
102 | "properties": {
103 | "accessTier": "[parameters('stgAccessTier')]",
104 | "isHnsEnabled": "[parameters('stgIsHnsEnabled')]"
105 | },
106 | "dependsOn": [],
107 | "sku": {
108 | "name": "[parameters('stgAccountType')]"
109 | },
110 | "kind": "[parameters('stgKind')]",
111 | "tags": {
112 | "Purpose": "Demo",
113 | "Project": "azure-databricks-mlops-mlflow"
114 | }
115 | },
116 | {
117 | "type": "Microsoft.Insights/components",
118 | "apiVersion": "2020-02-02",
119 | "name": "[parameters('aiName')]",
120 | "location": "[parameters('location')]",
121 | "kind": "other",
122 | "tags": {
123 | "Purpose": "Demo",
124 | "Project": "azure-databricks-mlops-mlflow"
125 | },
126 | "properties": {
127 | "Application_Type": "web",
128 | "Flow_Type": "Bluefield",
129 | "Request_Source": "CustomDeployment"
130 | }
131 | }
132 | ],
133 | "outputs": {}
134 | }
--------------------------------------------------------------------------------
/ml_ops/deployment/databricks/cluster_template.json:
--------------------------------------------------------------------------------
1 | {
2 | "cluster_name": "azure-databricks-mlops-mlflow",
3 | "spark_version": "10.4.x-cpu-ml-scala2.12",
4 | "num_workers": 0,
5 | "spark_conf": {
6 | "spark.databricks.cluster.profile": "singleNode",
7 | "spark.databricks.conda.condaMagic.enabled": "true",
8 | "spark.master": "local[*]"
9 | },
10 | "node_type_id": "Standard_F4",
11 | "driver_node_type_id": "Standard_F4",
12 | "custom_tags": {
13 | "ResourceClass": "SingleNode"
14 | },
15 | "autotermination_minutes": 30,
16 | "enable_elastic_disk": true
17 | }
--------------------------------------------------------------------------------
/ml_ops/deployment/databricks/job_template_taxi_fares_batch_scoring.json:
--------------------------------------------------------------------------------
1 | {
2 | "job_id": "FILL_JOB_ID",
3 | "name": "taxi_fares_batch_scoring",
4 | "existing_cluster_id": "FILL_CLUSTER_ID",
5 | "notebook_task": {
6 | "notebook_path": "/azure-databricks-mlops-mlflow/taxi_fares_orchestrator_batch_score",
7 | "base_parameters": {
8 | "taxi_fares_raw_data": "/databricks-datasets/nyctaxi-with-zipcodes/subsampled",
9 | "taxi_fares_mount_point": "/mnt/data_batch",
10 | "mlflow_experiment_id": "FILL_MLFLOW_EXPERIMENT_ID",
11 | "execute_feature_engineering": "true",
12 | "scoring_data_start_date": "2016-02-01",
13 | "scoring_data_end_date": "2016-02-29",
14 | "trained_model_version": "",
15 | "wheel_package_dbfs_base_path": "/dbfs/FileStore/libraries/azure-databricks-mlops-mlflow",
16 | "wheel_package_taxi_fares_version": "0.0.1",
17 | "wheel_package_taxi_fares_mlops_version": "0.0.1"
18 | }
19 | },
20 | "timeout_seconds": 86400,
21 | "email_notifications": {
22 | "on_start": [],
23 | "on_success": [],
24 | "on_failure": []
25 | }
26 | }
--------------------------------------------------------------------------------
/ml_ops/deployment/databricks/job_template_taxi_fares_training.json:
--------------------------------------------------------------------------------
1 | {
2 | "job_id": "FILL_JOB_ID",
3 | "name": "taxi_fares_model_training",
4 | "existing_cluster_id": "FILL_CLUSTER_ID",
5 | "notebook_task": {
6 | "notebook_path": "/azure-databricks-mlops-mlflow/taxi_fares_orchestrator_train",
7 | "base_parameters": {
8 | "taxi_fares_raw_data": "/databricks-datasets/nyctaxi-with-zipcodes/subsampled",
9 | "mlflow_experiment_id": "FILL_MLFLOW_EXPERIMENT_ID",
10 | "wheel_package_dbfs_base_path": "/dbfs/FileStore/libraries/azure-databricks-mlops-mlflow",
11 | "wheel_package_taxi_fares_version": "0.0.1",
12 | "wheel_package_taxi_fares_mlops_version": "0.0.1",
13 | "execute_feature_engineering": "true",
14 | "training_data_end_date": "2016-01-31",
15 | "training_data_start_date": "2016-01-01",
16 | "training_num_leaves": "32",
17 | "training_objective": "regression",
18 | "training_metric": "rmse",
19 | "training_num_rounds": "100"
20 | }
21 | },
22 | "timeout_seconds": 86400,
23 | "email_notifications": {
24 | "on_start": [],
25 | "on_success": [],
26 | "on_failure": []
27 | }
28 | }
--------------------------------------------------------------------------------
/ml_ops/orchestrator/README.md:
--------------------------------------------------------------------------------
1 | # Orchestrator
2 |
3 | ## Overview
4 |
5 | This document covers the design guidelines for the following orchestrators -
6 |
7 | 1. [taxi_fares_orchestrator_train.py](taxi_fares_orchestrator_train.py)
8 | 2. [taxi_fares_orchestrator_batch_score.py](taxi_fares_orchestrator_batch_score.py)
9 |
10 | ## Considerations
11 |
12 | - It will be a Databricks notebook in the Databricks workspace.
13 | - It will be stored in Git as a Python file.
14 | - It will use `dbutils` widgets for parametrization.
15 | - It will use `pip` magic commands for managing libraries.
16 | - It will be executed from a Databricks Job.
17 | - It will perform logging in Application Insights.
18 | - It will log artifacts, metrics, parameters, and the trained model into MLflow.
19 |
20 | ## Parameters
21 |
22 | ### Define Parameters
23 |
24 | Parameters are defined using `dbutils.widgets.text`, for example:
25 |
26 | ```py
27 | dbutils.widgets.text("<parameter_name>", "<default_value>")
28 | ```
29 |
30 | ### Read Parameters
31 |
32 | Parameters are read using `dbutils.widgets.get`, for example:
33 |
34 | ```py
35 | param_value = dbutils.widgets.get("<parameter_name>")
36 | ```
37 |
38 | ## Installation of libraries
39 |
40 | ### How to enable %pip magic commands
41 |
42 | Starting with Databricks Runtime for Machine Learning 6.4, this feature can be enabled when creating a cluster.
43 | To do this, set `spark.databricks.conda.condaMagic.enabled` to `true` under “Spark Config” (Edit > Advanced Options > Spark).
44 |
45 | ### How to install libraries using pip
46 |
47 | Libraries are installed as [Notebook-scoped Python libraries](https://docs.microsoft.com/en-us/azure/databricks/libraries/notebooks-python-libraries), for example:
48 |
49 | ```sh
50 | %pip install dbfs/<dbfs_path>/<wheel_package_name>.whl
51 | ```
52 |
53 | ## Calling MLOps Python Functions
54 |
55 | MLOps Python functions are packaged as a wheel package, and the orchestrator notebook calls the Python functions from that wheel package (a consolidated sketch of this pattern follows this file).
56 |
57 | ## Execution of Orchestrator
58 |
59 | Orchestrators are executed from a Databricks Job.
60 |
61 | ## Error handling
62 |
63 | For error handling, a `try..except` block is used to handle exceptions, for example -
64 |
65 | ```py
66 | try:
67 | model = run_training()
68 | except Exception as ex:
69 |     logger.error(f"Encountered error: {ex}") # To log exception in Application Insights
70 | raise Exception(f"Encountered error - {ex}") from ex # To fail the Databricks Job Run
71 | ```
72 |
73 | ## Observability
74 |
75 | The [OpenCensus](https://docs.microsoft.com/en-us/azure/azure-monitor/app/opencensus-python) library is used to capture logs and metrics and send them to Application Insights.
76 |
77 | ## Secret Management
78 |
79 | The following secrets need to be stored in [Databricks Secret Scope](https://docs.microsoft.com/en-us/azure/databricks/security/secrets/):
80 |
81 | - Application Insights Instrumentation Key
82 | - Azure ADLS Gen2 Storage Details (account name, container name, shared access key)
83 |
84 | Secrets are read using `dbutils.secrets.get`, for example:
85 |
86 | ```py
87 | secret_value = dbutils.secrets.get(scope="<scope_name>", key="<secret_key_name>")
88 | ```
89 |
90 | ## References
91 |
92 | 1. [Enable pip magic commands](https://databricks.com/blog/2020/06/17/simplify-python-environment-management-on-databricks-runtime-for-machine-learning-using-pip-and-conda.html)
93 | 2. [OpenCensus](https://docs.microsoft.com/en-us/azure/azure-monitor/app/opencensus-python)
94 | 3. [Databricks Jobs API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs)
95 | 4. [Databricks Clusters API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/clusters)
96 | 5. [Databricks CLI](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/)
97 |
--------------------------------------------------------------------------------
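A stripped-down sketch consolidating the snippets from the README above, assuming it runs as a Databricks notebook (so `dbutils` is available) and that the `monitoring` wheel is already installed. The component name and the `do_work` placeholder are illustrative only; the repository's actual orchestrator notebooks follow below.

```py
# Databricks notebook source
# Minimal sketch of the orchestrator pattern described in the README above.
# "Example_Orchestrator" and do_work are placeholders, not repository code.
import mlflow
from monitoring.app_logger import AppLogger

# Define and read a parameter via dbutils widgets.
dbutils.widgets.text("mlflow_experiment_id", "")
mlflow_experiment_id = dbutils.widgets.get("mlflow_experiment_id")

# Read a secret from the Databricks secret scope used by this sample.
app_insights_key = dbutils.secrets.get(
    scope="azure-databricks-mlops-mlflow", key="app_insights_key"
)
app_logger = AppLogger(config={"app_insights_key": app_insights_key})
logger = app_logger.get_logger(component_name="Example_Orchestrator")


def do_work():
    """Placeholder for a call into the MLOps wheel, e.g. taxi_fares_mlops.training.run."""
    mlflow.log_param("example_param", "example_value")


mlflow.start_run(experiment_id=int(mlflow_experiment_id))
try:
    do_work()
except Exception as ex:
    logger.exception(f"Encountered error - {ex}")         # log to Application Insights
    raise Exception(f"Encountered error - {ex}") from ex  # fail the Databricks job run
finally:
    mlflow.end_run()
```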
/ml_ops/orchestrator/taxi_fares_orchestrator_batch_score.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | """Orchestrator notebook for taxifares training."""
3 | # Initialization of dbutils to avoid linting errors during developing in vscode
4 | from pyspark.sql import SparkSession
5 |
6 |
7 | def get_dbutils(spark):
8 | """Return dbutils for databricks."""
9 | if spark.conf.get("spark.databricks.service.client.enabled") == "true":
10 | from pyspark.dbutils import DBUtils
11 |
12 | return DBUtils(spark)
13 | else:
14 | import IPython
15 |
16 | return IPython.get_ipython().user_ns["dbutils"]
17 |
18 |
19 | spark = SparkSession.builder.appName("Pipeline").getOrCreate()
20 | dbutils = get_dbutils(spark)
21 |
22 | # COMMAND ----------
23 |
24 | # Define parameters
25 | dbutils.widgets.text(
26 | "taxi_fares_raw_data", "/databricks-datasets/nyctaxi-with-zipcodes/subsampled"
27 | )
28 | dbutils.widgets.text("taxi_fares_mount_point", "/mnt/data")
29 | dbutils.widgets.text("mlflow_experiment_id", "")
30 | dbutils.widgets.text("wheel_package_dbfs_base_path", "")
31 | dbutils.widgets.text("wheel_package_taxi_fares_version", "")
32 | dbutils.widgets.text("wheel_package_taxi_fares_mlops_version", "")
33 | dbutils.widgets.text("execute_feature_engineering", "true")
34 | dbutils.widgets.text("trained_model_version", "")
35 | dbutils.widgets.text("scoring_data_start_date", "2016-02-01")
36 | dbutils.widgets.text("training_data_end_date", "2016-02-29")
37 |
38 | # COMMAND ----------
39 |
40 | # Get wheel package parameters
41 | wheel_package_dbfs_base_path = dbutils.widgets.get(
42 | "wheel_package_dbfs_base_path")
43 | wheel_package_taxi_fares_version = dbutils.widgets.get(
44 | "wheel_package_taxi_fares_version"
45 | )
46 | wheel_package_taxi_fares_mlops_version = dbutils.widgets.get(
47 | "wheel_package_taxi_fares_mlops_version"
48 | )
49 |
50 | # COMMAND ----------
51 |
52 | # MAGIC %pip install $wheel_package_dbfs_base_path/taxi_fares-$wheel_package_taxi_fares_version-py3-none-any.whl # noqa: E501
53 | # MAGIC %pip install $wheel_package_dbfs_base_path/taxi_fares_mlops-$wheel_package_taxi_fares_mlops_version-py3-none-any.whl # noqa: E501
54 |
55 | # COMMAND ----------
56 |
57 | # Imports
58 | import os # noqa: E402
59 | import shutil # noqa: E402
60 | from datetime import datetime # noqa: E402
61 | from pathlib import Path # noqa: E402
62 |
63 | import mlflow # noqa: E402
64 | from databricks import feature_store # noqa: E402
65 | from monitoring.app_logger import AppLogger, get_disabled_logger # noqa: E402
66 | from taxi_fares.utils.pyspark_utils import rounded_taxi_data # noqa: E402
67 | from taxi_fares_mlops.feature_engineering import run as run_feature_engineering # noqa
68 | from taxi_fares_mlops.scoring_batch import run as run_scoring_batch # noqa: E402
69 |
70 | # COMMAND ----------
71 |
72 | # Get other parameters
73 | mlflow_experiment_id = dbutils.widgets.get("mlflow_experiment_id")
74 | execute_feature_engineering = dbutils.widgets.get(
75 | "execute_feature_engineering")
76 | taxi_fares_raw_data = dbutils.widgets.get("taxi_fares_raw_data")
77 | taxi_fares_mount_point = dbutils.widgets.get("taxi_fares_mount_point")
78 | trained_model_version = dbutils.widgets.get("trained_model_version")
79 | scoring_data_start_date = dbutils.widgets.get("scoring_data_start_date")
80 | scoring_data_end_date = dbutils.widgets.get("scoring_data_end_date")
81 |
82 | # COMMAND ----------
83 |
84 | # Initiate mlflow experiment
85 | mlflow.start_run(experiment_id=int(mlflow_experiment_id),
86 | run_name="batch_scoring")
87 | mlflow_run = mlflow.active_run()
88 | mlflow_run_id = mlflow_run.info.run_id
89 | mlflow_log_tmp_dir = "/tmp/" + str(mlflow_run_id) # nosec: B108
90 | Path(mlflow_log_tmp_dir).mkdir(parents=True, exist_ok=True)
91 |
92 | # initiate app logger
93 | if any(
94 | [
95 | True
96 | for secret in dbutils.secrets.list(scope="azure-databricks-mlops-mlflow")
97 | if "app_insights_key" in secret.key
98 | ]
99 | ):
100 | app_insights_key = dbutils.secrets.get(
101 | scope="azure-databricks-mlops-mlflow", key="app_insights_key"
102 | )
103 | config = {"app_insights_key": app_insights_key}
104 | app_logger = AppLogger(config=config)
105 | else:
106 | app_logger = get_disabled_logger()
107 | try:
108 | logger = app_logger.get_logger(
109 | component_name="Batch_Score_Orchestrator",
110 | custom_dimensions={
111 | "mlflow_run_id": mlflow_run_id,
112 | "mlflow_experiment_id": int(mlflow_experiment_id),
113 | },
114 | )
115 | tracer = app_logger.get_tracer(
116 | component_name="Batch_Score_Orchestrator",
117 | )
118 | except Exception as ex:
119 | print(ex)
120 | mlflow.end_run()
121 | shutil.rmtree(mlflow_log_tmp_dir, ignore_errors=True)
122 | raise Exception(f"ERROR - in initializing app logger - {ex}") from ex
123 |
124 | logger.info(f"Stating batch scoring with mlflow run id {mlflow_run_id}")
125 |
126 | # COMMAND ----------
127 |
128 | # Mount ADLS Gen2 storage container
129 | try:
130 | logger.info(f"Mounting {taxi_fares_mount_point}")
131 | if any(mount.mountPoint == taxi_fares_mount_point for mount in dbutils.fs.mounts()):
132 | logger.info(f"Mount point exists {taxi_fares_mount_point}")
133 | else:
134 | storage_account_name = dbutils.secrets.get(
135 | scope="azure-databricks-mlops-mlflow", key="azure-blob-storage-account-name"
136 | )
137 | storage_container_name = dbutils.secrets.get(
138 | scope="azure-databricks-mlops-mlflow",
139 | key="azure-blob-storage-container-name",
140 | )
141 | storage_shared_key_name = dbutils.secrets.get(
142 | scope="azure-databricks-mlops-mlflow",
143 | key="azure-blob-storage-shared-access-key",
144 | )
145 | dbutils.fs.mount(
146 | source=f"wasbs://{storage_container_name}@{storage_account_name}.blob.core.windows.net", # noqa: E501
147 | mount_point=taxi_fares_mount_point,
148 | extra_configs={
149 | f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net": storage_shared_key_name # noqa: E501
150 | },
151 | )
152 | except Exception as ex:
153 | print(ex)
154 | mlflow.end_run()
155 | shutil.rmtree(mlflow_log_tmp_dir, ignore_errors=True)
156 | logger.exception(f"ERROR - in mounting adls - {ex}")
157 | raise Exception(f"ERROR - in mounting adls - {ex}") from ex
158 |
159 | # COMMAND ----------
160 |
161 | # Clean up function
162 |
163 |
164 | def clean():
165 | dbutils.fs.unmount(taxi_fares_mount_point)
166 | mlflow.log_artifacts(mlflow_log_tmp_dir)
167 | shutil.rmtree(mlflow_log_tmp_dir)
168 | mlflow.end_run()
169 |
170 |
171 | # COMMAND ----------
172 |
173 | # Get batch scoring raw data
174 | try:
175 | logger.info("Reading training raw data")
176 | raw_data_file = taxi_fares_raw_data
177 | raw_data = spark.read.format("delta").load(raw_data_file)
178 | mlflow.log_param("data_raw_rows", raw_data.count())
179 | mlflow.log_param("data_raw_cols", len(raw_data.columns))
180 | except Exception as ex:
181 | clean()
182 | logger.exception(f"ERROR - in reading raw data - {ex}")
183 | raise Exception(f"ERROR - in reading raw data - {ex}") from ex
184 |
185 | # COMMAND ----------
186 |
187 |
188 | # Run feature engineering on batch scoring raw data
189 | if execute_feature_engineering == "true":
190 | try:
191 | logger.info("Starting feature engineering")
192 | with tracer.span("run_feature_engineering"):
193 | feature_engineered_data = run_feature_engineering(
194 | df_input=raw_data,
195 | start_date=datetime.strptime(
196 | scoring_data_start_date, "%Y-%m-%d"),
197 | end_date=datetime.strptime(scoring_data_end_date, "%Y-%m-%d"),
198 | mlflow=mlflow,
199 | mlflow_log_tmp_dir=mlflow_log_tmp_dir,
200 | explain_features=True,
201 | app_logger=app_logger,
202 | parent_tracer=tracer,
203 | )
204 | except Exception as ex:
205 | clean()
206 | logger.exception(f"ERROR - in feature engineering - {ex}")
207 | raise Exception(f"ERROR - in feature engineering - {ex}") from ex
208 | else:
209 | logger.info("Skipping feature engineering")
210 |
211 | # COMMAND ----------
212 |
213 | # MAGIC %sql
214 | # MAGIC CREATE DATABASE IF NOT EXISTS feature_store_taxi_example;
215 |
216 | # COMMAND ----------
217 |
218 | # Save features to feature store
219 | fs = feature_store.FeatureStoreClient()
220 | if execute_feature_engineering == "true":
221 | try:
222 | spark.conf.set("spark.sql.shuffle.partitions", "5")
223 |
224 | fs.create_table(
225 | name="feature_store_taxi_example.trip_pickup_features",
226 | primary_keys=["zip", "ts"],
227 | df=feature_engineered_data[0],
228 | partition_columns="yyyy_mm",
229 | description="Taxi Fares. Pickup Features",
230 | )
231 | fs.create_table(
232 | name="feature_store_taxi_example.trip_dropoff_features",
233 | primary_keys=["zip", "ts"],
234 | df=feature_engineered_data[1],
235 | partition_columns="yyyy_mm",
236 | description="Taxi Fares. Dropoff Features",
237 | )
238 |
239 | # Write the pickup features DataFrame to the feature store table
240 | fs.write_table(
241 | name="feature_store_taxi_example.trip_pickup_features",
242 | df=feature_engineered_data[0],
243 | mode="merge",
244 | )
245 | # Write the dropoff features DataFrame to the feature store table
246 | fs.write_table(
247 | name="feature_store_taxi_example.trip_dropoff_features",
248 | df=feature_engineered_data[1],
249 | mode="merge",
250 | )
251 | except Exception as ex:
252 | clean()
253 | logger.exception(
254 | f"ERROR - in feature saving into feature store - {ex}")
255 | raise Exception(
256 | f"ERROR - in feature saving into feature store - {ex}") from ex
257 | else:
258 | logger.info("Skipping feature saving into feature store")
259 |
260 | # COMMAND ----------
261 |
262 | # Batch scoring
263 | try:
264 | logger.info("Starting batch scoring")
265 | with tracer.span("run_scoring_batch"):
266 | run_scoring_batch(
267 | trained_model_name="taxi_fares",
268 | score_df=rounded_taxi_data(raw_data),
269 | mlflow=mlflow,
270 | mlflow_log_tmp_dir=mlflow_log_tmp_dir,
271 | trained_model_version=trained_model_version,
272 | app_logger=app_logger,
273 | parent_tracer=tracer,
274 | )
275 | except Exception as ex:
276 | clean()
277 | logger.exception(f"ERROR - in batch scoring - {ex}")
278 | raise Exception(f"ERROR - in batch scoring - {ex}") from ex
279 |
280 |
281 | # COMMAND ----------
282 |
283 | # Batch scoring result publish
284 | try:
285 | logger.info("Starting batch scoring result publish to adls")
286 | with tracer.span("publish_result"):
287 | result_path = "/".join(
288 | [
289 | "/dbfs",
290 | taxi_fares_mount_point,
291 | "batch_scoring_result",
292 | str(mlflow_run_id),
293 | ]
294 | )
295 | Path(result_path).mkdir(parents=True, exist_ok=True)
296 | shutil.copyfile(
297 | os.path.join(mlflow_log_tmp_dir, "batch_scoring_result.html"),
298 | os.path.join(
299 | result_path,
300 | "batch_scoring_result.html",
301 | ),
302 | )
303 | shutil.copyfile(
304 | os.path.join(mlflow_log_tmp_dir, "batch_scoring_result.csv"),
305 | os.path.join(
306 | result_path,
307 | "batch_scoring_result.csv",
308 | ),
309 | )
310 | logger.info(f"Published score result in {result_path}")
311 | except Exception as ex:
312 | clean()
313 | logger.exception(f"ERROR - in batch scoring result publish to adls - {ex}")
314 | raise Exception(
315 | f"ERROR - in batch scoring result publish to adls - {ex}") from ex
316 |
317 |
318 | # COMMAND ----------
319 |
320 | # End
321 | logger.info(f"Completed batch scoring with mlflow run id {mlflow_run_id}")
322 | clean()
323 |
--------------------------------------------------------------------------------
/ml_ops/orchestrator/taxi_fares_orchestrator_train.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | """Orchestrator notebook for taxifares training."""
3 | # Initialization of dbutils to avoid linting errors during developing in vscode
4 | from pyspark.sql import SparkSession
5 |
6 |
7 | def get_dbutils(spark):
8 | """Return dbutils for databricks."""
9 | if spark.conf.get("spark.databricks.service.client.enabled") == "true":
10 | from pyspark.dbutils import DBUtils
11 |
12 | return DBUtils(spark)
13 | else:
14 | import IPython
15 |
16 | return IPython.get_ipython().user_ns["dbutils"]
17 |
18 |
19 | spark = SparkSession.builder.appName("Pipeline").getOrCreate()
20 | dbutils = get_dbutils(spark)
21 |
22 | # COMMAND ----------
23 |
24 | # Define parameters
25 | dbutils.widgets.text(
26 | "taxi_fares_raw_data", "/databricks-datasets/nyctaxi-with-zipcodes/subsampled"
27 | )
28 | dbutils.widgets.text("mlflow_experiment_id", "")
29 | dbutils.widgets.text("wheel_package_dbfs_base_path", "")
30 | dbutils.widgets.text("wheel_package_taxi_fares_version", "")
31 | dbutils.widgets.text("wheel_package_taxi_fares_mlops_version", "")
32 | dbutils.widgets.text("execute_feature_engineering", "true")
33 | dbutils.widgets.text("training_data_start_date", "2016-01-01")
34 | dbutils.widgets.text("training_data_end_date", "2016-01-31")
35 | dbutils.widgets.text("training_num_leaves", "32")
36 | dbutils.widgets.text("training_objective", "regression")
37 | dbutils.widgets.text("training_metric", "rmse")
38 | dbutils.widgets.text("training_num_rounds", "100")
39 |
40 | # COMMAND ----------
41 |
42 | # Get wheel package parameters
43 | wheel_package_dbfs_base_path = dbutils.widgets.get(
44 | "wheel_package_dbfs_base_path")
45 | wheel_package_taxi_fares_version = dbutils.widgets.get(
46 | "wheel_package_taxi_fares_version"
47 | )
48 | wheel_package_taxi_fares_mlops_version = dbutils.widgets.get(
49 | "wheel_package_taxi_fares_mlops_version"
50 | )
51 |
52 | # COMMAND ----------
53 |
54 | # MAGIC %pip install $wheel_package_dbfs_base_path/taxi_fares-$wheel_package_taxi_fares_version-py3-none-any.whl # noqa: E501
55 | # MAGIC %pip install $wheel_package_dbfs_base_path/taxi_fares_mlops-$wheel_package_taxi_fares_mlops_version-py3-none-any.whl # noqa: E501
56 |
57 | # COMMAND ----------
58 |
59 | # Imports
60 | import shutil # noqa: E402
61 | from datetime import datetime # noqa: E402
62 | from pathlib import Path # noqa: E402
63 |
64 | import mlflow # noqa: E402
65 | from databricks import feature_store # noqa: E402
66 | from databricks.feature_store import FeatureLookup # noqa: E402
67 | from monitoring.app_logger import AppLogger, get_disabled_logger # noqa: E402
68 | from taxi_fares.utils.pyspark_utils import rounded_taxi_data # noqa: E402
69 | from taxi_fares_mlops.feature_engineering import run as run_feature_engineering # noqa
70 | from taxi_fares_mlops.publish_model import run as run_publish_model # noqa: E402
71 | from taxi_fares_mlops.training import run as run_training # noqa: E402
72 |
73 | # COMMAND ----------
74 |
75 | # Get other parameters
76 | mlflow_experiment_id = dbutils.widgets.get("mlflow_experiment_id")
77 | execute_feature_engineering = dbutils.widgets.get(
78 | "execute_feature_engineering")
79 | training_data_start_date = dbutils.widgets.get("training_data_start_date")
80 | training_data_end_date = dbutils.widgets.get("training_data_end_date")
81 | taxi_fares_raw_data = dbutils.widgets.get("taxi_fares_raw_data")
82 | training_num_leaves = int(dbutils.widgets.get("training_num_leaves"))
83 | training_objective = dbutils.widgets.get("training_objective")
84 | training_metric = dbutils.widgets.get("training_metric")
85 | training_num_rounds = int(dbutils.widgets.get("training_num_rounds"))
86 |
87 | # COMMAND ----------
88 |
89 | # Initiate mlflow experiment
90 | mlflow.start_run(experiment_id=int(mlflow_experiment_id), run_name="training")
91 | mlflow_run = mlflow.active_run()
92 | mlflow_run_id = mlflow_run.info.run_id
93 | mlflow_log_tmp_dir = "/tmp/" + str(mlflow_run_id) # nosec: B108
94 | Path(mlflow_log_tmp_dir).mkdir(parents=True, exist_ok=True)
95 |
96 | # initiate app logger
97 | if any(
98 | [
99 | True
100 | for secret in dbutils.secrets.list(scope="azure-databricks-mlops-mlflow")
101 | if "app_insights_key" in secret.key
102 | ]
103 | ):
104 | app_insights_key = dbutils.secrets.get(
105 | scope="azure-databricks-mlops-mlflow", key="app_insights_key"
106 | )
107 | config = {"app_insights_key": app_insights_key}
108 | app_logger = AppLogger(config=config)
109 | else:
110 | app_logger = get_disabled_logger()
111 | try:
112 | logger = app_logger.get_logger(
113 | component_name="Train_Orchestrator",
114 | custom_dimensions={
115 | "mlflow_run_id": mlflow_run_id,
116 | "mlflow_experiment_id": int(mlflow_experiment_id),
117 | },
118 | )
119 | tracer = app_logger.get_tracer(
120 | component_name="Train_Orchestrator",
121 | )
122 | except Exception as ex:
123 | print(ex)
124 | mlflow.end_run()
125 | shutil.rmtree(mlflow_log_tmp_dir, ignore_errors=True)
126 | raise Exception(f"ERROR - in initializing app logger - {ex}") from ex
127 |
128 | logger.info(f"Stating training with mlflow run id {mlflow_run_id}")
129 |
130 | # COMMAND ----------
131 |
132 | # Clean up function
133 |
134 |
135 | def clean():
136 | mlflow.log_artifacts(mlflow_log_tmp_dir)
137 | shutil.rmtree(mlflow_log_tmp_dir)
138 | mlflow.end_run()
139 |
140 |
141 | # COMMAND ----------
142 |
143 | # Get training raw data
144 | try:
145 | logger.info("Reading training raw data")
146 | raw_data_file = taxi_fares_raw_data
147 | raw_data = spark.read.format("delta").load(raw_data_file)
148 | mlflow.log_param("data_raw_rows", raw_data.count())
149 | mlflow.log_param("data_raw_cols", len(raw_data.columns))
150 | except Exception as ex:
151 | clean()
152 | logger.exception(f"ERROR - in reading raw data - {ex}")
153 | raise Exception(f"ERROR - in reading raw data - {ex}") from ex
154 |
155 | # COMMAND ----------
156 |
157 | # Run feature engineering
158 | if execute_feature_engineering == "true":
159 | try:
160 | logger.info("Starting feature engineering")
161 | with tracer.span("run_feature_engineering"):
162 | feature_engineered_data = run_feature_engineering(
163 | df_input=raw_data,
164 | start_date=datetime.strptime(
165 | training_data_start_date, "%Y-%m-%d"),
166 | end_date=datetime.strptime(training_data_end_date, "%Y-%m-%d"),
167 | mlflow=mlflow,
168 | mlflow_log_tmp_dir=mlflow_log_tmp_dir,
169 | explain_features=True,
170 | app_logger=app_logger,
171 | parent_tracer=tracer,
172 | )
173 | except Exception as ex:
174 | clean()
175 | logger.exception(f"ERROR - in feature engineering - {ex}")
176 | raise Exception(f"ERROR - in feature engineering - {ex}") from ex
177 | else:
178 | logger.info("Skipping feature engineering")
179 |
180 | # COMMAND ----------
181 |
182 | # MAGIC %sql
183 | # MAGIC CREATE DATABASE IF NOT EXISTS feature_store_taxi_example;
184 |
185 | # COMMAND ----------
186 |
187 | # Save features to feature store
188 | fs = feature_store.FeatureStoreClient()
189 | if execute_feature_engineering == "true":
190 | try:
191 | spark.conf.set("spark.sql.shuffle.partitions", "5")
192 |
193 | fs.create_table(
194 | name="feature_store_taxi_example.trip_pickup_features",
195 | primary_keys=["zip", "ts"],
196 | df=feature_engineered_data[0],
197 | partition_columns="yyyy_mm",
198 | description="Taxi Fares. Pickup Features",
199 | )
200 | fs.create_table(
201 | name="feature_store_taxi_example.trip_dropoff_features",
202 | primary_keys=["zip", "ts"],
203 | df=feature_engineered_data[1],
204 | partition_columns="yyyy_mm",
205 | description="Taxi Fares. Dropoff Features",
206 | )
207 |
208 | # Write the pickup features DataFrame to the feature store table
209 | fs.write_table(
210 | name="feature_store_taxi_example.trip_pickup_features",
211 | df=feature_engineered_data[0],
212 | mode="merge",
213 | )
214 | # Write the dropoff features DataFrame to the feature store table
215 | fs.write_table(
216 | name="feature_store_taxi_example.trip_dropoff_features",
217 | df=feature_engineered_data[1],
218 | mode="merge",
219 | )
220 | except Exception as ex:
221 | clean()
222 | logger.exception(
223 | f"ERROR - in feature saving into feature store - {ex}")
224 | raise Exception(
225 | f"ERROR - in feature saving into feature store - {ex}") from ex
226 | else:
227 | logger.info("Skipping feature saving into feature store")
228 |
229 | # COMMAND ----------
230 |
231 | # Load features from feature store
232 | try:
233 | pickup_features_table = "feature_store_taxi_example.trip_pickup_features"
234 | dropoff_features_table = "feature_store_taxi_example.trip_dropoff_features"
235 |
236 | pickup_feature_lookups = [
237 | FeatureLookup(
238 | table_name=pickup_features_table,
239 | feature_names=[
240 | "mean_fare_window_1h_pickup_zip",
241 | "count_trips_window_1h_pickup_zip",
242 | ],
243 | lookup_key=["pickup_zip", "rounded_pickup_datetime"],
244 | ),
245 | ]
246 |
247 | dropoff_feature_lookups = [
248 | FeatureLookup(
249 | table_name=dropoff_features_table,
250 | feature_names=["count_trips_window_30m_dropoff_zip",
251 | "dropoff_is_weekend"],
252 | lookup_key=["dropoff_zip", "rounded_dropoff_datetime"],
253 | ),
254 | ]
255 |
256 | # unless additional feature engineering was performed,
257 | # exclude them to avoid training on them.
258 | exclude_columns = ["rounded_pickup_datetime", "rounded_dropoff_datetime"]
259 |
260 | # Create the training set that includes the raw input data merged with
261 | # corresponding features from both feature tables
262 | with tracer.span("create_training_set"):
263 | training_set = fs.create_training_set(
264 | rounded_taxi_data(raw_data),
265 | feature_lookups=pickup_feature_lookups + dropoff_feature_lookups,
266 | label="fare_amount",
267 | exclude_columns=exclude_columns,
268 | )
269 |
270 | # Load the TrainingSet into a dataframe which can be passed into
271 | # sklearn for training a model
272 | training_df = training_set.load_df()
273 |
274 | logger.info(
275 | f"Shape of training dataframe, rows: {training_df.count()}, cols: {len(training_df.columns)}" # noqa: E501
276 | )
277 | mlflow.log_param("training_data_rows", training_df.count())
278 | mlflow.log_param("training_data_columns", len(training_df.columns))
279 | except Exception as ex:
280 | clean()
281 | logger.exception(f"ERROR - in feature loading from feature store - {ex}")
282 | raise Exception(
283 | f"ERROR - in feature loading from feature store - {ex}") from ex
284 |
285 | # COMMAND ----------
286 |
287 | # Run training
288 | try:
289 | logger.info("Starting model training")
290 | params = {
291 | "num_leaves": training_num_leaves,
292 | "objective": training_objective,
293 | "metric": training_metric,
294 | }
295 | num_rounds = training_num_rounds
296 | with tracer.span("run_training"):
297 | trained_model = run_training(
298 | training_df,
299 | mlflow,
300 | params=params,
301 | num_rounds=num_rounds,
302 | app_logger=app_logger,
303 | parent_tracer=tracer,
304 | )
305 | except Exception as ex:
306 | clean()
307 | logger.exception(f"ERROR - in model training - {ex}")
308 | raise Exception(f"ERROR - in model training - {ex}") from ex
309 |
310 | # COMMAND ----------
311 |
312 | # Publish trained model
313 | try:
314 | logger.info("Starting publish model")
315 | with tracer.span("run_publish_model"):
316 | run_publish_model(
317 | trained_model=trained_model,
318 | training_set=training_set,
319 | mlflow=mlflow,
320 | model_name="taxi_fares",
321 | app_logger=app_logger,
322 | parent_tracer=tracer,
323 | )
324 | except Exception as ex:
325 | clean()
326 | logger.exception(f"ERROR - in publish trained model - {ex}")
327 | raise Exception(f"ERROR - in publish trained model - {ex}") from ex
328 |
329 | # COMMAND ----------
330 |
331 | # End
332 | logger.info(f"Completed training with mlflow run id {mlflow_run_id}")
333 | clean()
334 |
--------------------------------------------------------------------------------
/ml_ops/src/README.md:
--------------------------------------------------------------------------------
1 | # SRC
2 |
3 | ## Overview
4 |
5 | Source code for MLOps, based on the following -
6 |
7 | 1. `taxi_fares_mlops` contains the MLOps source code for the `taxi_fares` machine learning code.
8 | 2. The MLOps Python functions are called from the orchestrator Databricks notebooks.
9 | 3. Ops-related integrations (MLflow tracking, Application Insights metrics, tracing, etc.) happen in the MLOps source code.
10 | 4. Machine learning (data science) logic is generally not written in the MLOps code.
11 | 5. DataFrame I/O happens in the orchestrator Databricks notebooks, not in the MLOps source code.
12 |
--------------------------------------------------------------------------------
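As a minimal sketch of the convention the modules in this package follow (each exposes a `run(...)` entry point that receives the active `mlflow` module, an optional `AppLogger`, and an optional parent tracer), a hypothetical new MLOps step might look like the following; the step name and logged parameter are illustrative only, not part of the repository:

```py
# Hypothetical MLOps step following the same run(...) convention as
# feature_engineering.py, training.py, and scoring_batch.py below.
import logging

import mlflow
from monitoring.app_logger import AppLogger, get_disabled_logger
from opencensus.trace.tracer import Tracer


def run(
    mlflow: mlflow,
    app_logger: AppLogger = get_disabled_logger(),
    parent_tracer: Tracer = None,
) -> None:
    """Entry point called from the orchestrator Databricks notebook."""
    logger = logging.getLogger(__name__)
    try:
        component_name = "Taxi_Fares_Example_Step"
        mlflow_run = mlflow.active_run()
        logger = app_logger.get_logger(
            component_name=component_name,
            custom_dimensions={"mlflow_run_id": mlflow_run.info.run_id},
        )
        tracer = app_logger.get_tracer(
            component_name=component_name, parent_tracer=parent_tracer
        )
        with tracer.span("example_step"):
            mlflow.log_param("example_param", "example_value")
        logger.info("Completed example MLOps step")
    except Exception as exp:
        logger.error("an exception occurred in example step")
        raise Exception("an exception occurred in example step") from exp
```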
/ml_ops/src/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from setuptools import find_packages, setup
4 |
5 |
6 | def read(fname):
7 | return open(os.path.join(os.path.dirname(__file__), fname)).read()
8 |
9 |
10 | requirements_file_name = "requirements.txt"
11 | with open(requirements_file_name) as f:
12 | required_packages = f.read().splitlines()
13 | required_packages = [
14 | package.strip(" ")
15 | for package in required_packages
16 | if package.strip(" ") and "#" not in package
17 | ]
18 |
19 | setup(
20 | name="taxi_fares_mlops",
21 | version="0.0.1",
22 | author="",
23 | author_email="",
24 | description=(""),
25 | license="",
26 | keywords="",
27 | url="",
28 | package_dir={"": "ml_ops/src"},
29 | packages=find_packages(where="ml_ops/src"),
30 | classifiers=[],
31 | install_requires=required_packages,
32 | )
33 |
--------------------------------------------------------------------------------
/ml_ops/src/taxi_fares_mlops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_ops/src/taxi_fares_mlops/__init__.py
--------------------------------------------------------------------------------
/ml_ops/src/taxi_fares_mlops/feature_engineering.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import math
3 | from datetime import datetime
4 | from pathlib import Path
5 | from typing import Tuple
6 |
7 | import matplotlib.pyplot as plt
8 | import mlflow
9 | import seaborn as sns
10 | from monitoring.app_logger import AppLogger, get_disabled_logger
11 | from opencensus.trace.tracer import Tracer
12 | from pyspark.sql.dataframe import DataFrame
13 | from taxi_fares.feature_eng.features import dropoff_features_fn, pickup_features_fn
14 |
15 |
16 | def run(
17 | df_input: DataFrame,
18 | start_date: datetime,
19 | end_date: datetime,
20 | mlflow: mlflow,
21 | mlflow_log_tmp_dir: str,
22 | explain_features: bool = True,
23 | app_logger: AppLogger = get_disabled_logger(),
24 | parent_tracer: Tracer = None,
25 | ) -> Tuple[DataFrame, DataFrame]:
26 | """MLOps feature engineering entry point.
27 |
28 | Args:
29 |         df_input (DataFrame): raw input data (Spark DataFrame)
30 | mlflow (mlflow): mlflow object that is having an active run
31 | initiated by mlflow.start_run
32 | mlflow_log_tmp_dir (str): directory for putting files to be logged
33 | in mlflow artifacts
34 | explain_features (bool, optional): explain features, possible only with
35 | training data. Defaults to True.
36 | app_logger (monitoring.app_logger.AppLogger): AppLogger object default
37 | to monitoring.app_logger.get_disabled_logger
38 | parent_tracer (Tracer): OpenCensus parent tracer for correlation
39 | Returns:
40 |         Tuple[DataFrame, DataFrame]: feature engineered (pickup_features, dropoff_features)
41 | """
42 | logger = logging.getLogger(__name__)
43 | try:
44 | component_name = "Taxi_Fares_Feature_Eng"
45 | # mlflow tracking
46 | mlflow_run = mlflow.active_run()
47 | mlflow_run_id = mlflow_run.info.run_id
48 | mlflow_experiment_id = mlflow_run.info.experiment_id
49 |
50 | logger = app_logger.get_logger(
51 | component_name=component_name,
52 | custom_dimensions={
53 | "mlflow_run_id": mlflow_run_id,
54 | "mlflow_experiment_id": mlflow_experiment_id,
55 | },
56 | )
57 | tracer = app_logger.get_tracer(
58 | component_name=component_name, parent_tracer=parent_tracer
59 | )
60 |
61 | logger.info("Running MLOps feature engineering")
62 | logger.info(
63 | f"Shape of input dataframe, rows: {df_input.count()}, cols: {len(df_input.columns)}" # noqa: E501
64 | )
65 |
66 | logger.info("Getting pickup features")
67 | with tracer.span("pickup_features"):
68 | pickup_features = pickup_features_fn(
69 | df_input,
70 | ts_column="tpep_pickup_datetime",
71 | start_date=start_date,
72 | end_date=end_date,
73 | )
74 | logger.info(
75 | f"Shape of pickup features dataframe, rows: {pickup_features.count()}, cols: {len(pickup_features.columns)}" # noqa: E501
76 | )
77 | mlflow.log_param(
78 | "feature_engineering_pickup_features",
79 | (pickup_features.count(), len(pickup_features.columns)),
80 | )
81 |
82 | logger.info("Getting drop off features")
83 | with tracer.span("dropoff_features"):
84 | dropoff_features = dropoff_features_fn(
85 | df_input,
86 | ts_column="tpep_dropoff_datetime",
87 | start_date=start_date,
88 | end_date=end_date,
89 | )
90 | logger.info(
91 | f"Shape of dropoff features dataframe, rows: {dropoff_features.count()}, cols: {len(dropoff_features.columns)}" # noqa: E501
92 | )
93 | mlflow.log_param(
94 | "feature_engineering_dropoff_features",
95 | (dropoff_features.count(), len(dropoff_features.columns)),
96 | )
97 |
98 | with tracer.span("explain_features"):
99 | if explain_features:
100 | logger.info("Getting feature explanations - statistics")
101 | feature_statistic_pickup_features = (
102 | pickup_features.describe().toPandas()
103 | )
104 | feature_statistic_pickup_features.to_html(
105 | Path(
106 | mlflow_log_tmp_dir,
107 | "feature_statistic_pickup_features.html",
108 | ),
109 | justify="center",
110 | na_rep="",
111 | )
112 | feature_statistic_dropoff_features = (
113 | dropoff_features.describe().toPandas()
114 | )
115 | feature_statistic_dropoff_features.to_html(
116 | Path(
117 | mlflow_log_tmp_dir,
118 | "feature_statistic_dropoff_features.html",
119 | ),
120 | justify="center",
121 | na_rep="",
122 | )
123 | logger.info("Getting feature explanations - box plot")
124 | pickup_features_pandas = pickup_features.toPandas()[
125 | [
126 | "mean_fare_window_1h_pickup_zip",
127 | "count_trips_window_1h_pickup_zip",
128 | ]
129 | ]
130 | numeric_cols = pickup_features_pandas.columns
131 | plot_data = pickup_features_pandas.copy()
132 | select_top_k = len(numeric_cols)
133 | n_col = 2
134 | n_row = math.ceil(select_top_k / n_col)
135 | s_col = 5
136 | s_row = 3
137 | fig, axs = plt.subplots(
138 | n_row, n_col, figsize=(s_col * n_col, s_row * n_row), sharey=False
139 | )
140 | axs = axs.flatten()
141 | for index, col in enumerate(numeric_cols[:select_top_k]):
142 | ax = sns.boxplot(
143 | x="count_trips_window_1h_pickup_zip",
144 | y=col,
145 | data=plot_data,
146 | ax=axs[index],
147 | )
148 | ax.set(title=col, ylabel="")
149 | fig.tight_layout()
150 | fig.savefig(
151 | Path(mlflow_log_tmp_dir, "feature_pickup_features_boxplot.png")
152 | )
153 | dropoff_features_pandas = dropoff_features.toPandas()[
154 | ["count_trips_window_30m_dropoff_zip", "dropoff_is_weekend"]
155 | ]
156 | numeric_cols = dropoff_features_pandas.columns
157 | plot_data = dropoff_features_pandas.copy()
158 | select_top_k = len(numeric_cols)
159 | n_col = 2
160 | n_row = math.ceil(select_top_k / n_col)
161 | s_col = 5
162 | s_row = 3
163 | fig, axs = plt.subplots(
164 | n_row, n_col, figsize=(s_col * n_col, s_row * n_row), sharey=False
165 | )
166 | axs = axs.flatten()
167 | for index, col in enumerate(numeric_cols[:select_top_k]):
168 | ax = sns.boxplot(
169 | x="dropoff_is_weekend", y=col, data=plot_data, ax=axs[index]
170 | )
171 | ax.set(title=col, ylabel="")
172 | fig.tight_layout()
173 | fig.savefig(
174 | Path(mlflow_log_tmp_dir, "feature_dropoff_features_boxplot.png")
175 | )
176 |
177 | logger.info("Completed MLOps feature engineering")
178 | return (pickup_features, dropoff_features)
179 | except Exception as exp:
180 | logger.error("an exception occurred in Feature Eng")
181 | raise Exception("an exception occurred in Feature Eng") from exp
182 |
--------------------------------------------------------------------------------
/ml_ops/src/taxi_fares_mlops/publish_model.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import lightgbm as lgb
4 | import mlflow
5 | from databricks import feature_store
6 | from databricks.feature_store.training_set import TrainingSet
7 | from mlflow.entities.model_registry import ModelVersion
8 | from monitoring.app_logger import AppLogger, get_disabled_logger
9 | from opencensus.trace.tracer import Tracer
10 |
11 | from taxi_fares_mlops.utils import get_latest_model_version
12 |
13 |
14 | def run(
15 | trained_model: lgb.Booster,
16 | training_set: TrainingSet,
17 | mlflow: mlflow,
18 | model_name: str = "taxi_fares",
19 | app_logger: AppLogger = get_disabled_logger(),
20 | parent_tracer: Tracer = None,
21 | ) -> ModelVersion:
22 | """MLOps publish model in mlflow model registry - entry point.
23 |
24 | Args:
25 |         trained_model (lgb.Booster): trained LightGBM model
26 | mlflow (mlflow): mlflow object that is having an active run
27 | initiated by mlflow.start_run
28 | model_name (str, optional): model name in mlflow model registry.
29 | Defaults to "taxi_fares".
30 |         app_logger (monitoring.app_logger.AppLogger): AppLogger object default
31 | to monitoring.app_logger.get_disabled_logger
32 | parent_tracer (Tracer): OpenCensus parent tracer for correlation
33 | Returns:
34 | mlflow.entities.model_registry.ModelVersion: registered model details
35 | """
36 | logger = logging.getLogger(__name__)
37 | try:
38 | component_name = "Taxi_Fares_Publish_Model"
39 |
40 | # mlflow tracking
41 | mlflow_run = mlflow.active_run()
42 | mlflow_run_id = mlflow_run.info.run_id
43 | mlflow_experiment_id = mlflow_run.info.experiment_id
44 |
45 | logger = app_logger.get_logger(
46 | component_name=component_name,
47 | custom_dimensions={
48 | "mlflow_run_id": mlflow_run_id,
49 | "mlflow_experiment_id": mlflow_experiment_id,
50 | },
51 | )
52 | tracer = app_logger.get_tracer(
53 | component_name=component_name, parent_tracer=parent_tracer
54 | )
55 |
56 | logger.info("Publishing trained model into mlflow model registry")
57 | with tracer.span("register_model"):
58 | fs = feature_store.FeatureStoreClient()
59 | fs.log_model(
60 | trained_model,
61 | artifact_path="model_packaged",
62 | flavor=mlflow.lightgbm,
63 | training_set=training_set,
64 | registered_model_name=model_name,
65 | )
66 | model_version = get_latest_model_version(model_name)
67 | mlflow.log_param("model_version", model_version)
68 | mlflow.log_param("model_name", model_name)
69 |
70 | logger.info(f"published model name: {model_name}, version: {model_version}")
71 | logger.info("Completed MLOps publish model")
72 | except Exception as exp:
73 | logger.error("an exception occurred in publish model")
74 | raise Exception("an exception occurred in publish model") from exp
75 |
--------------------------------------------------------------------------------
/ml_ops/src/taxi_fares_mlops/scoring_batch.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 |
4 | import mlflow
5 | import pandas as pd
6 | import pyspark.sql.functions as func
7 | from databricks import feature_store
8 | from monitoring.app_logger import AppLogger, get_disabled_logger
9 | from opencensus.trace.tracer import Tracer
10 |
11 | from taxi_fares_mlops.utils import get_latest_model_version
12 |
13 |
14 | def run(
15 | trained_model_name: str,
16 | score_df: pd.DataFrame,
17 | mlflow: mlflow,
18 | mlflow_log_tmp_dir: str,
19 | trained_model_version: str = None,
20 | app_logger: AppLogger = get_disabled_logger(),
21 | parent_tracer: Tracer = None,
22 | ) -> None:
23 | """[summary]
24 |
25 | Args:
26 |         trained_model_name (str): name of the trained model in mlflow model registry
27 |         score_df (DataFrame): input dataframe for batch scoring; features are
28 |             looked up from the feature store during scoring.
29 | mlflow (mlflow): mlflow object that is having an active run
30 | initiated by mlflow.start_run
31 |         mlflow_log_tmp_dir (str): directory for putting files to be logged
32 |             in mlflow artifacts
33 |         app_logger (monitoring.app_logger.AppLogger): AppLogger object default
34 |             to monitoring.app_logger.get_disabled_logger
35 | parent_tracer (Tracer): OpenCensus parent tracer for correlation
36 | """
37 | logger = logging.getLogger(__name__)
38 | try:
39 | component_name = "Taxi_Fares_Scoring_Batch"
40 | # mlflow tracking
41 | mlflow_run = mlflow.active_run()
42 | mlflow_run_id = mlflow_run.info.run_id
43 | mlflow_experiment_id = mlflow_run.info.experiment_id
44 |
45 | logger = app_logger.get_logger(
46 | component_name=component_name,
47 | custom_dimensions={
48 | "mlflow_run_id": mlflow_run_id,
49 | "mlflow_experiment_id": mlflow_experiment_id,
50 | },
51 | )
52 | tracer = app_logger.get_tracer(
53 | component_name=component_name, parent_tracer=parent_tracer
54 | )
55 |
56 | logger.info("Running MLOps batch scoring")
57 | with tracer.span("batch_scoring"):
58 | cols = [
59 | "fare_amount",
60 | "trip_distance",
61 | "pickup_zip",
62 | "dropoff_zip",
63 | "rounded_pickup_datetime",
64 | "rounded_dropoff_datetime",
65 | ]
66 | score_df_reordered = score_df.select(cols)
67 | if trained_model_version is None or trained_model_version == "":
68 | trained_model_version = get_latest_model_version(
69 | trained_model_name)
70 | else:
71 | trained_model_version = int(trained_model_version)
72 | model_uri = f"models:/{trained_model_name}/{trained_model_version}"
73 | mlflow.log_param("trained_model_version", trained_model_version)
74 | logger.info(f"trained model version {trained_model_version}")
75 | fs = feature_store.FeatureStoreClient()
76 | predictions = fs.score_batch(model_uri, score_df_reordered)
77 | cols = [
78 | "prediction",
79 | "fare_amount",
80 | "trip_distance",
81 | "pickup_zip",
82 | "dropoff_zip",
83 | "rounded_pickup_datetime",
84 | "rounded_dropoff_datetime",
85 | "mean_fare_window_1h_pickup_zip",
86 | "count_trips_window_1h_pickup_zip",
87 | "count_trips_window_30m_dropoff_zip",
88 | "dropoff_is_weekend",
89 | ]
90 |
91 | with_predictions_reordered = (
92 | predictions.select(
93 | cols,
94 | )
95 | .withColumnRenamed(
96 | "prediction",
97 | "predicted_fare_amount",
98 | )
99 | .withColumn(
100 | "predicted_fare_amount",
101 | func.round("predicted_fare_amount", 2),
102 | )
103 | )
104 | with_predictions_reordered.toPandas().to_html(
105 | Path(
106 | mlflow_log_tmp_dir,
107 | "batch_scoring_result.html",
108 | ),
109 | justify="center",
110 | na_rep="",
111 | )
112 | with_predictions_reordered.toPandas().to_csv(
113 | Path(
114 | mlflow_log_tmp_dir,
115 | "batch_scoring_result.csv",
116 | ),
117 | index=False,
118 | )
119 | logger.info("Completed MLOps batch scoring")
120 | except Exception as exp:
121 | logger.error("an exception occurred in scoring batch")
122 | raise Exception("an exception occurred in scoring batch") from exp
123 |
--------------------------------------------------------------------------------
/ml_ops/src/taxi_fares_mlops/training.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import Dict
3 |
4 | import lightgbm as lgb
5 | import mlflow
6 | import pandas as pd
7 | from monitoring.app_logger import AppLogger, get_disabled_logger
8 | from opencensus.trace.tracer import Tracer
9 | from taxi_fares.training.evaluate import get_model_metrics, split_data
10 | from taxi_fares.training.train import train
11 |
12 |
13 | def run(
14 | train_df: pd.DataFrame,
15 | mlflow: mlflow,
16 | params: Dict = {"num_leaves": 32,
17 | "objective": "regression", "metric": "rmse"},
18 | num_rounds: int = 100,
19 | app_logger: AppLogger = get_disabled_logger(),
20 | parent_tracer: Tracer = None,
21 | ) -> lgb.Booster:
22 | """MLOps training entry point.
23 |
24 | Args:
25 | train_df (pd.DataFrame): data for training, output of feature engineering
26 | mlflow (mlflow): mlflow object that is having an active run
27 | initiated by mlflow.start_run
28 |         app_logger (monitoring.app_logger.AppLogger): AppLogger object default
29 | to monitoring.app_logger.get_disabled_logger
30 | parent_tracer (Tracer): OpenCensus parent tracer for correlation
31 | Returns:
32 |         lgb.Booster: trained LightGBM model
33 | """
34 | logger = logging.getLogger(__name__)
35 | try:
36 | component_name = "Taxi_Fares_Training"
37 |
38 | # mlflow tracking
39 | mlflow_run = mlflow.active_run()
40 | mlflow_run_id = mlflow_run.info.run_id
41 | mlflow_experiment_id = mlflow_run.info.experiment_id
42 |
43 | logger = app_logger.get_logger(
44 | component_name=component_name,
45 | custom_dimensions={
46 | "mlflow_run_id": mlflow_run_id,
47 | "mlflow_experiment_id": mlflow_experiment_id,
48 | },
49 | )
50 | tracer = app_logger.get_tracer(
51 | component_name=component_name, parent_tracer=parent_tracer
52 | )
53 |
54 | logger.info("Running MLOps training")
55 |
56 | params = {"num_leaves": 32,
57 | "objective": "regression", "metric": "rmse"}
58 | num_rounds = 100
59 | for k, v in params.items():
60 | logger.info(f"Training parameter {k}: {v}")
61 | logger.info(f"Training parameter num_rounds: {num_rounds}")
62 |
63 | logger.info("Splitting data for train and test")
64 | data = split_data(train_df)
65 |
66 | logger.info("Train the model")
67 | with tracer.span("train_model"):
68 | mlflow.lightgbm.autolog()
69 | model = train(data["train"], params, num_rounds)
70 |
71 | logger.info("Log the metrics for the model")
72 | metrics = get_model_metrics(model, data["test"])
73 | for (k, v) in metrics.items():
74 | logger.info(f"Metric {k}: {v}")
75 | mlflow.log_metric(k, v)
76 |
77 | logger.info("Completed MLOps training")
78 | return model
79 | except Exception as exp:
80 | logger.error("an exception occurred in training")
81 | raise Exception("an exception occurred in training") from exp
82 |
--------------------------------------------------------------------------------
/ml_ops/src/taxi_fares_mlops/utils.py:
--------------------------------------------------------------------------------
1 | from mlflow.tracking import MlflowClient
2 |
3 |
4 | def get_latest_model_version(model_name: str) -> int:
5 | latest_version = 1
6 | mlflow_client = MlflowClient()
7 | for mv in mlflow_client.search_model_versions(f"name='{model_name}'"):
8 | version_int = int(mv.version)
9 | if version_int > latest_version:
10 | latest_version = version_int
11 | return latest_version
12 |
--------------------------------------------------------------------------------
/ml_ops/tests/README.md:
--------------------------------------------------------------------------------
1 | # TESTS
2 |
3 | Unit test cases for the `taxi_fares_mlops` MLOps source code.
4 |
--------------------------------------------------------------------------------
/ml_ops/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_ops/tests/__init__.py
--------------------------------------------------------------------------------
/ml_ops/tests/taxi_fares/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_ops/tests/taxi_fares/__init__.py
--------------------------------------------------------------------------------
/ml_ops/tests/taxi_fares/data/taxi_fares_unit_test_training.csv:
--------------------------------------------------------------------------------
1 | trip_distance,pickup_zip,dropoff_zip,mean_fare_window_1h_pickup_zip,count_trips_window_1h_pickup_zip,count_trips_window_30m_dropoff_zip,dropoff_is_weekend,fare_amount
2 | 4.94,10282,10171,13,2,1,1,19
3 | 0.28,10110,10110,3.5,1,2,0,3.5
4 | 0.7,10103,10023,7.5,2,1,0,5
5 | 0.8,10022,10017,6,1,1,0,6
6 | 4.51,10110,10282,17,1,1,0,17
7 | 1.8,10009,10065,8,2,1,0,7
8 | 2.58,10153,10199,7.75,2,2,0,12
9 | 1.4,10112,10069,11,1,1,0,11
10 | 1.21,10023,10153,7.75,2,1,1,7.5
11 | 0.6,10012,10003,7.5,2,2,1,6
--------------------------------------------------------------------------------
/ml_ops/tests/taxi_fares/test_publish_model.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import unittest
3 | from unittest.mock import MagicMock, patch
4 |
5 | from taxi_fares_mlops.publish_model import run
6 |
7 |
8 | class TestEvaluateMethods(unittest.TestCase):
9 | logger = logging.getLogger(__name__)
10 | logging.basicConfig(
11 | format="%(asctime)s %(module)s %(levelname)s: %(message)s",
12 | datefmt="%m/%d/%Y %I:%M:%S %p",
13 | level=logging.INFO,
14 | )
15 |
16 | @patch("taxi_fares_mlops.publish_model.feature_store")
17 | @patch("taxi_fares_mlops.publish_model.get_latest_model_version")
18 |     def test_publish_model(self, mock_get_latest_model_version, mock_feature_store):
19 | self.logger.info("unittest test_publish_model")
20 | run(MagicMock(), MagicMock(), MagicMock())
21 | assert True
22 |
23 | def test_publish_model_exception(self):
24 | self.logger.info("unittest test_publish_model exception")
25 | with self.assertRaises(Exception):
26 | run(None, None, None)
27 | assert True
28 |
29 |
30 | if __name__ == "__main__":
31 | unittest.main()
32 |
--------------------------------------------------------------------------------
/ml_ops/tests/taxi_fares/test_training.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import unittest
4 | from unittest.mock import MagicMock
5 |
6 | import lightgbm as lgb
7 | import pandas as pd
8 | from pyspark.sql import SparkSession
9 | from taxi_fares_mlops.training import run
10 |
11 |
12 | class TestEvaluateMethods(unittest.TestCase):
13 | @classmethod
14 | def setUpClass(cls):
15 | cls.spark = (
16 | SparkSession.builder.master("local[*]").appName("Unit-tests").getOrCreate()
17 | )
18 |
19 | logger = logging.getLogger(__name__)
20 | logging.basicConfig(
21 | format="%(asctime)s %(module)s %(levelname)s: %(message)s",
22 | datefmt="%m/%d/%Y %I:%M:%S %p",
23 | level=logging.INFO,
24 | )
25 |
26 | def test_training(self):
27 | self.logger.info("unittest test_training")
28 | data_file = os.path.join(
29 | "tests/taxi_fares/data", "taxi_fares_unit_test_training.csv"
30 | )
31 | train_df_pandas = pd.read_csv(data_file)
32 | train_df = self.spark.createDataFrame(train_df_pandas)
33 | model = run(train_df, MagicMock())
34 |
35 | assert isinstance(model, lgb.Booster)
36 |
37 | def test_training_exception(self):
38 | self.logger.info("unittest test_training exception")
39 | with self.assertRaises(Exception):
40 | model = run(MagicMock(), MagicMock())
41 | assert model is not None
42 |
43 |
44 | if __name__ == "__main__":
45 | unittest.main()
46 |
--------------------------------------------------------------------------------
/ml_ops/tests/taxi_fares/test_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from unittest.mock import patch
3 |
4 | from taxi_fares_mlops.utils import get_latest_model_version
5 |
6 |
7 | class TestUtils(unittest.TestCase):
8 | @patch("taxi_fares_mlops.utils.MlflowClient")
9 | def test_get_latest_model_version(self, mock_mlflow_client):
10 | assert get_latest_model_version("taxi_fares") == 1
11 |
--------------------------------------------------------------------------------
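The test above works because `MagicMock` iteration defaults to an empty sequence, so `get_latest_model_version` falls back to `1`. A hedged sketch of an additional case that configures the mock to exercise the version comparison as well (the version numbers below are arbitrary):

```py
# Sketch of an additional test case; the version numbers are arbitrary.
import unittest
from types import SimpleNamespace
from unittest.mock import patch

from taxi_fares_mlops.utils import get_latest_model_version


class TestUtilsLatestVersion(unittest.TestCase):
    @patch("taxi_fares_mlops.utils.MlflowClient")
    def test_get_latest_model_version_multiple(self, mock_mlflow_client):
        # search_model_versions returns objects with a string `version` field.
        mock_mlflow_client.return_value.search_model_versions.return_value = [
            SimpleNamespace(version="1"),
            SimpleNamespace(version="3"),
            SimpleNamespace(version="2"),
        ]
        assert get_latest_model_version("taxi_fares") == 3


if __name__ == "__main__":
    unittest.main()
```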
/ml_source/README.md:
--------------------------------------------------------------------------------
1 | # ML Source
2 |
3 | ## Overview
4 |
5 | This contains the machine learning code, which is developed, unit tested, packaged, and delivered independently, and is typically maintained by data scientists in an organization.
6 |
7 | ## Contents
8 |
9 | 1. [src](src/) : machine learning source code, that will be packaged as Python `wheel`.
10 | 2. [tests](tests/) : unit test cases for `src`.
11 |
--------------------------------------------------------------------------------
/ml_source/src/README.md:
--------------------------------------------------------------------------------
1 | # SRC
2 |
3 | ## Overview
4 |
5 | Source code for machine learning, based on the following -
6 |
7 | 1. [taxi_fares](taxi_fares/) contains the machine learning source code.
8 | 2. `monitoring` contains the logging class used for logging into Application Insights.
9 | 3. The machine learning Python functions are called from the MLOps Python functions.
10 |
--------------------------------------------------------------------------------
/ml_source/src/monitoring/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/monitoring/__init__.py
--------------------------------------------------------------------------------
/ml_source/src/monitoring/app_logger.py:
--------------------------------------------------------------------------------
1 | """This module is used to log traces into Azure Application Insights."""
2 | import logging
3 | import uuid
4 | from os import getenv
5 |
6 | from opencensus.ext.azure.common import utils
7 | from opencensus.ext.azure.log_exporter import AzureLogHandler
8 | from opencensus.ext.azure.trace_exporter import AzureExporter
9 | from opencensus.trace import config_integration
10 | from opencensus.trace.samplers import AlwaysOffSampler, AlwaysOnSampler
11 | from opencensus.trace.tracer import Tracer
12 |
13 |
14 | class CustomDimensionsFilter(logging.Filter):
15 | """Add custom-dimensions like run_id in each log by using filters."""
16 |
17 | def __init__(self, custom_dimensions=None):
18 | """Initialize CustomDimensionsFilter."""
19 | self.custom_dimensions = custom_dimensions or {}
20 |
21 | def filter(self, record):
22 | """Add the default custom_dimensions into the current log record."""
23 | dim = {**self.custom_dimensions, **
24 | getattr(record, "custom_dimensions", {})}
25 | record.custom_dimensions = dim
26 | return True
27 |
28 |
29 | class AppLogger:
30 | """Logger wrapper that attach the handler to Application Insights."""
31 |
32 | HANDLER_NAME = "Azure Application Insights Handler"
33 |
34 | def __init__(self, config=None):
35 | """Create an instance of the Logger class.
36 |
37 | Args:
38 | config:([dict], optional):
39 | Contains the setting for logger {"log_level": logging.debug,"env":"dev",
40 | "app_insights_key":""}
41 |             Note: a parent tracer for correlation is supplied to get_tracer,
42 |                 not to this constructor.
43 | """
44 | self.config = {"log_level": logging.INFO, "logging_enabled": "true"}
45 | self.APPINSIGHTS_INSTRUMENTATION_KEY = "APPINSIGHTS_INSTRUMENTATION_KEY"
46 | self.update_config(config)
47 | pass
48 |
49 | def _initialize_azure_log_handler(self, component_name, custom_dimensions):
50 | """Initialize azure log handler."""
51 | # Adding logging to trace_integrations
52 | # This will help in adding trace and span ids to logs
53 | # https://github.com/census-instrumentation/opencensus-python/tree/master/contrib/opencensus-ext-logging
54 |
55 | config_integration.trace_integrations(["logging"])
56 | logging.basicConfig(
57 | format="%(asctime)s name=%(name)s level=%(levelname)s "
58 | "traceId=%(traceId)s spanId=%(spanId)s %(message)s"
59 | )
60 | app_insights_cs = "InstrumentationKey=" + self._get_app_insights_key()
61 | log_handler = AzureLogHandler(
62 | connection_string=app_insights_cs, export_interval=0.0
63 | )
64 | log_handler.add_telemetry_processor(self._get_callback(component_name))
65 | log_handler.name = self.HANDLER_NAME
66 | log_handler.addFilter(CustomDimensionsFilter(custom_dimensions))
67 | return log_handler
68 |
69 | def _initialize_azure_log_exporter(self, component_name):
70 | """Initialize azure log exporter."""
71 | app_insights_cs = "InstrumentationKey=" + self._get_app_insights_key()
72 | log_exporter = AzureExporter(
73 | connection_string=app_insights_cs, export_interval=0.0
74 | )
75 | log_exporter.add_telemetry_processor(
76 | self._get_callback(component_name))
77 | return log_exporter
78 |
79 | def _initialize_logger(self, log_handler, component_name):
80 | """Initialize Logger."""
81 | logger = logging.getLogger(component_name)
82 | logger.setLevel(self.log_level)
83 | if self.config.get("logging_enabled") == "true":
84 | if not any(x for x in logger.handlers if x.name == self.HANDLER_NAME):
85 | logger.addHandler(log_handler)
86 | return logger
87 |
88 | def get_logger(self, component_name="TaxiFaresMlOps", custom_dimensions={}):
89 | """Get Logger Object.
90 |
91 | Args:
92 | component_name (str, optional): Name of logger. Defaults to "TaxiFaresMlOps".
93 | custom_dimensions (dict, optional): {"key":"value"}
94 | to capture with every log.
95 | Defaults to {}.
96 |
97 | Returns:
98 | Logger: A logger.
99 | """
100 | self.update_config(self.config)
101 | handler = self._initialize_azure_log_handler(
102 | component_name, custom_dimensions)
103 | return self._initialize_logger(handler, component_name)
104 |
105 | def get_tracer(self, component_name="TaxiFaresMlOps", parent_tracer=None):
106 | """Get Tracer Object.
107 |
108 | Args:
109 | component_name (str, optional): Name of logger. Defaults to "TaxiFaresMlOps".
110 | parent_tracer([opencensus.trace.tracer], optional):
111 | Contains parent tracer required for setting coorelation.
112 |
113 | Returns:
114 | opencensus.trace.tracer: A Tracer.
115 | """
116 | self.update_config(self.config)
117 | sampler = AlwaysOnSampler()
118 | exporter = self._initialize_azure_log_exporter(component_name)
119 | if self.config.get("logging_enabled") != "true":
120 | sampler = AlwaysOffSampler()
121 | if parent_tracer is None:
122 | tracer = Tracer(exporter=exporter, sampler=sampler)
123 | else:
124 | tracer = Tracer(
125 | span_context=parent_tracer.span_context,
126 | exporter=exporter,
127 | sampler=sampler,
128 | )
129 | return tracer
130 |
131 | def _get_app_insights_key(self):
132 | """Get Application Insights Key."""
133 | try:
134 | if self.app_insights_key is None:
135 | self.app_insights_key = getenv(
136 | self.APPINSIGHTS_INSTRUMENTATION_KEY, None
137 | )
138 | if self.app_insights_key is not None:
139 | utils.validate_instrumentation_key(self.app_insights_key)
140 | return self.app_insights_key
141 | else:
142 | raise Exception("ApplicationInsights Key is not set")
143 | except Exception as exp:
144 |             raise Exception(f"Exception in getting app insights key -> {exp}")
145 |
146 | def _get_callback(self, component_name):
147 | def _callback_add_role_name(envelope):
148 | """Add role name for logger."""
149 | envelope.tags["ai.cloud.role"] = component_name
150 | envelope.tags["ai.cloud.roleInstance"] = component_name
151 |
152 | return _callback_add_role_name
153 |
154 | def update_config(self, config=None):
155 | """Update logger configuration."""
156 | if config is not None:
157 | self.config.update(config)
158 | self.app_insights_key = self.config.get("app_insights_key")
159 | self.log_level = self.config.get("log_level")
160 |
161 |
162 | def get_disabled_logger():
163 | """Get a disabled AppLogger.
164 |
165 | Returns:
166 | AppLogger: A disabled AppLogger
167 | """
168 | return AppLogger(
169 | config={"logging_enabled": "false",
170 | "app_insights_key": str(uuid.uuid1())}
171 | )
172 |
--------------------------------------------------------------------------------
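A minimal usage sketch for AppLogger, assuming a valid Application Insights instrumentation key is supplied either in the config dict or through the APPINSIGHTS_INSTRUMENTATION_KEY environment variable; the component name, span name, and custom dimensions below are illustrative only.

import logging

from monitoring.app_logger import AppLogger

# Illustrative config: replace the placeholder with a real instrumentation key
# (a GUID), or drop the key and export APPINSIGHTS_INSTRUMENTATION_KEY instead.
config = {
    "log_level": logging.DEBUG,
    "logging_enabled": "true",
    "app_insights_key": "00000000-0000-0000-0000-000000000000",
}

app_logger = AppLogger(config=config)
logger = app_logger.get_logger(component_name="TaxiFaresMlOps")
tracer = app_logger.get_tracer(component_name="TaxiFaresMlOps")

with tracer.span(name="feature_engineering"):
    # custom_dimensions passed via "extra" are merged into each record by
    # CustomDimensionsFilter and surface as customDimensions in Application Insights.
    logger.info("Job started", extra={"custom_dimensions": {"run_id": "example"}})
--------------------------------------------------------------------------------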
/ml_source/src/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from setuptools import find_packages, setup
4 |
5 |
6 | # Utility function to read the README file.
7 | # Used for the long_description. It's nice, because now 1) we have a top level
8 | # README file and 2) it's easier to type in the README file than to put a raw
9 | # string in below ...
10 | def read(fname):
11 | return open(os.path.join(os.path.dirname(__file__), fname)).read()
12 |
13 |
14 | requirements_file_name = "requirements.txt"
15 | with open(requirements_file_name) as f:
16 | required_packages = f.read().splitlines()
17 | required_packages = [
18 | package.strip(" ")
19 | for package in required_packages
20 | if package.strip(" ") and "#" not in package
21 | ]
22 | setup(
23 | name="taxi_fares",
24 | version="0.0.1",
25 | author="",
26 | author_email="",
27 | description=(""),
28 | license="",
29 | keywords="",
30 | url="",
31 | package_dir={"": "ml_source/src"},
32 | packages=find_packages(where="ml_source/src"),
33 | classifiers=[],
34 | install_requires=required_packages,
35 | )
36 |
--------------------------------------------------------------------------------
/ml_source/src/taxi_fares/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/taxi_fares/__init__.py
--------------------------------------------------------------------------------
/ml_source/src/taxi_fares/feature_eng/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/taxi_fares/feature_eng/__init__.py
--------------------------------------------------------------------------------
/ml_source/src/taxi_fares/feature_eng/features.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from pyspark.sql.dataframe import DataFrame
4 | from pyspark.sql.functions import col, count, mean, to_timestamp, unix_timestamp, window
5 | from pyspark.sql.types import FloatType, IntegerType
6 | from taxi_fares.utils.pyspark_utils import filter_df_by_ts, is_weekend, partition_id
7 |
8 |
9 | def pickup_features_fn(
10 | df: DataFrame, ts_column: str, start_date: datetime, end_date: datetime
11 | ) -> DataFrame:
12 |     """
13 |     Compute the pickup_features feature group.
14 |     Rows are restricted to the time range [start_date, end_date) on
15 |     ts_column before the hourly pickup features are computed.
16 |     """
17 | df = filter_df_by_ts(df, ts_column, start_date, end_date)
18 | pickupzip_features = (
19 | df.groupBy(
20 | "pickup_zip", window("tpep_pickup_datetime", "1 hour", "15 minutes")
21 | ) # 1 hour window, sliding every 15 minutes
22 | .agg(
23 | mean("fare_amount").alias("mean_fare_window_1h_pickup_zip"),
24 | count("*").alias("count_trips_window_1h_pickup_zip"),
25 | )
26 | .select(
27 | col("pickup_zip").alias("zip"),
28 | unix_timestamp(col("window.end")).alias("ts").cast(IntegerType()),
29 | partition_id(to_timestamp(col("window.end"))).alias("yyyy_mm"),
30 | col("mean_fare_window_1h_pickup_zip").cast(FloatType()),
31 | col("count_trips_window_1h_pickup_zip").cast(IntegerType()),
32 | )
33 | )
34 | return pickupzip_features
35 |
36 |
37 | def dropoff_features_fn(
38 | df: DataFrame, ts_column: str, start_date: datetime, end_date: datetime
39 | ) -> DataFrame:
40 |     """
41 |     Compute the dropoff_features feature group.
42 |     Rows are restricted to the time range [start_date, end_date) on
43 |     ts_column before the 30-minute dropoff features are computed.
44 |     """
45 | df = filter_df_by_ts(df, ts_column, start_date, end_date)
46 | dropoffzip_features = (
47 | df.groupBy("dropoff_zip", window("tpep_dropoff_datetime", "30 minute"))
48 | .agg(count("*").alias("count_trips_window_30m_dropoff_zip"))
49 | .select(
50 | col("dropoff_zip").alias("zip"),
51 | unix_timestamp(col("window.end")).alias("ts").cast(IntegerType()),
52 | partition_id(to_timestamp(col("window.end"))).alias("yyyy_mm"),
53 | col("count_trips_window_30m_dropoff_zip").cast(IntegerType()),
54 | is_weekend(col("window.end")).alias("dropoff_is_weekend"),
55 | )
56 | )
57 | return dropoffzip_features
58 |
--------------------------------------------------------------------------------
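A minimal sketch of invoking the two feature functions on a raw trips DataFrame. The sample rows and date strings are illustrative; like the repo's own unit test, the date bounds are passed as ISO strings even though the signature hints at datetime. Writing the resulting features to the Databricks Feature Store is handled elsewhere in the sample (ml_ops), not here.

from pyspark.sql import SparkSession
from taxi_fares.feature_eng.features import dropoff_features_fn, pickup_features_fn

spark = SparkSession.builder.master("local[*]").appName("features-sketch").getOrCreate()

# Illustrative raw trips with the columns the feature functions expect.
raw_df = spark.createDataFrame(
    [
        ("2019-01-01 00:10:00", "2019-01-01 00:25:00", 2.5, 9.5, 10001, 10002),
        ("2019-01-01 00:40:00", "2019-01-01 01:05:00", 1.2, 6.0, 10001, 10003),
    ],
    [
        "tpep_pickup_datetime", "tpep_dropoff_datetime",
        "trip_distance", "fare_amount", "pickup_zip", "dropoff_zip",
    ],
)

pickup_features = pickup_features_fn(
    raw_df, "tpep_pickup_datetime", "2019-01-01 00:00:00", "2019-02-01 00:00:00"
)
dropoff_features = dropoff_features_fn(
    raw_df, "tpep_dropoff_datetime", "2019-01-01 00:00:00", "2019-02-01 00:00:00"
)
pickup_features.show()
dropoff_features.show()
--------------------------------------------------------------------------------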
/ml_source/src/taxi_fares/training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/taxi_fares/training/__init__.py
--------------------------------------------------------------------------------
/ml_source/src/taxi_fares/training/evaluate.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import lightgbm as lgb
4 | import numpy as np
5 | from pyspark.sql.dataframe import DataFrame
6 | from sklearn.metrics import mean_squared_error
7 | from sklearn.model_selection import train_test_split
8 |
9 |
10 | def split_data(df: DataFrame) -> dict:
11 | """Split the dataframe into test and train data.
12 |
13 |     Args:
14 |         df (DataFrame): processed Spark dataframe to split for training and evaluation
15 | 
16 |     Returns:
17 |         dict: split data for train and test -
18 | {
19 |             "train": {
20 |                 "X": np.array,
21 |                 "y": np.array,
22 |             },
23 |             "test": {
24 | "X": np.array,
25 | "y": np.array,
26 | }
27 | }
28 | """
29 | features_and_label = df.columns
30 |
31 | # Collect data into a Pandas array for training
32 | data = df.toPandas()[features_and_label]
33 |
34 | train, test = train_test_split(data, random_state=123)
35 | X_train = train.drop(["fare_amount"], axis=1)
36 | y_train = train.fare_amount
37 | X_test = test.drop(["fare_amount"], axis=1)
38 | y_test = test.fare_amount
39 |
40 | data = {"train": {"X": X_train, "y": y_train}, "test": {"X": X_test, "y": y_test}}
41 | return data
42 |
43 |
44 | def get_model_metrics(model: lgb.Booster, test_data: Dict[str, np.ndarray]) -> dict:
45 | """Evaluate the metrics for the model.
46 |
47 | Args:
48 |         model (lgb.Booster): trained LightGBM model
49 |         test_data (Dict[str, np.ndarray]): test data with X key for features and y key for labels
50 |
51 | Returns:
52 | dict: mse metrics
53 | """
54 | preds = model.predict(test_data["X"])
55 | mse = mean_squared_error(preds, test_data["y"])
56 | metrics = {"mse": mse}
57 | return metrics
58 |
--------------------------------------------------------------------------------
/ml_source/src/taxi_fares/training/train.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import lightgbm as lgb
4 | import numpy as np
5 |
6 |
7 | def train(
8 | train_data: Dict[str, np.ndarray], params: dict, num_rounds: int
9 | ) -> lgb.Booster:
10 | train_lgb_dataset = lgb.Dataset(train_data["X"], label=train_data["y"].values)
11 |
12 | # Train a lightGBM model
13 | model = lgb.train(params, train_lgb_dataset, num_rounds)
14 | return model
15 |
--------------------------------------------------------------------------------
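A minimal sketch that ties split_data, train, and get_model_metrics together. The synthetic DataFrame and the LightGBM parameters are illustrative stand-ins for what the ml_ops training orchestration actually supplies.

from pyspark.sql import SparkSession
from taxi_fares.training.evaluate import get_model_metrics, split_data
from taxi_fares.training.train import train

spark = SparkSession.builder.master("local[*]").appName("training-sketch").getOrCreate()

# Synthetic training frame: every column other than fare_amount is a feature.
df = spark.createDataFrame(
    [(float(d), c, float(3 * d + c)) for d in range(20) for c in range(5)],
    ["trip_distance", "count_trips_window_1h_pickup_zip", "fare_amount"],
)

data = split_data(df)  # {"train": {"X", "y"}, "test": {"X", "y"}}

# Illustrative LightGBM parameters.
params = {
    "objective": "regression",
    "metric": "rmse",
    "num_leaves": 8,
    "learning_rate": 0.1,
    "verbose": -1,
}
model = train(data["train"], params, num_rounds=50)

print(get_model_metrics(model, data["test"]))  # -> {"mse": ...}
--------------------------------------------------------------------------------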
/ml_source/src/taxi_fares/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/taxi_fares/utils/__init__.py
--------------------------------------------------------------------------------
/ml_source/src/taxi_fares/utils/pyspark_utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | from datetime import datetime, timedelta
3 |
4 | from pyspark.sql.column import Column
5 | from pyspark.sql.dataframe import DataFrame
6 | from pyspark.sql.functions import col, lit, udf
7 | from pyspark.sql.types import IntegerType, StringType
8 | from pytz import timezone
9 |
10 |
11 | @udf(returnType=IntegerType())
12 | def is_weekend(dt: Column) -> Column:
13 | tz = "America/New_York"
14 | return int(dt.astimezone(timezone(tz)).weekday() >= 5) # 5 = Saturday, 6 = Sunday
15 |
16 |
17 | @udf(returnType=StringType())
18 | def partition_id(dt: Column) -> Column:
19 | # datetime -> "YYYY-MM"
20 | return f"{dt.year:04d}-{dt.month:02d}"
21 |
22 |
23 | def filter_df_by_ts(
24 | df: DataFrame, ts_column: str, start_date: datetime, end_date: datetime
25 | ) -> DataFrame:
26 | if ts_column and start_date:
27 | df = df.filter(col(ts_column) >= start_date)
28 | if ts_column and end_date:
29 | df = df.filter(col(ts_column) < end_date)
30 | return df
31 |
32 |
33 | def rounded_unix_timestamp(dt, num_minutes=15):
34 | """
35 |     Round datetime dt up to the next num_minutes interval, then return the unix timestamp.
36 | """
37 | nsecs = dt.minute * 60 + dt.second + dt.microsecond * 1e-6
38 | delta = math.ceil(nsecs / (60 * num_minutes)) * (60 * num_minutes) - nsecs
39 | return int((dt + timedelta(seconds=delta)).timestamp())
40 |
41 |
42 | rounded_unix_timestamp_udf = udf(rounded_unix_timestamp, IntegerType())
43 |
44 |
45 | def rounded_taxi_data(taxi_data_df):
46 | # Round the taxi data timestamp to 15 and 30 minute intervals so we can join with
47 | # the pickup and dropoff features
48 | # respectively.
49 | taxi_data_df = (
50 | taxi_data_df.withColumn(
51 | "rounded_pickup_datetime",
52 | rounded_unix_timestamp_udf(taxi_data_df["tpep_pickup_datetime"], lit(15)),
53 | )
54 | .withColumn(
55 | "rounded_dropoff_datetime",
56 | rounded_unix_timestamp_udf(taxi_data_df["tpep_dropoff_datetime"], lit(30)),
57 | )
58 | .drop("tpep_pickup_datetime")
59 | .drop("tpep_dropoff_datetime")
60 | )
61 | taxi_data_df.createOrReplaceTempView("taxi_data")
62 | return taxi_data_df
63 |
--------------------------------------------------------------------------------
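A quick worked example for rounded_unix_timestamp, which needs no Spark session; the timestamps are illustrative. 00:07:30 is ceiled to the next 15-minute boundary (00:15:00), while a timestamp already on a boundary comes back unchanged.

from datetime import datetime, timezone

from taxi_fares.utils.pyspark_utils import rounded_unix_timestamp

dt = datetime(2019, 1, 1, 0, 7, 30, tzinfo=timezone.utc)
ts = rounded_unix_timestamp(dt, num_minutes=15)
print(datetime.fromtimestamp(ts, tz=timezone.utc))  # 2019-01-01 00:15:00+00:00

on_boundary = datetime(2019, 1, 1, 0, 30, tzinfo=timezone.utc)
print(rounded_unix_timestamp(on_boundary, num_minutes=30)
      == int(on_boundary.timestamp()))  # True
--------------------------------------------------------------------------------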
/ml_source/tests/README.md:
--------------------------------------------------------------------------------
1 | # TESTS
2 |
3 | Unit test cases for `taxi_fares` machine learning source code.
4 |
--------------------------------------------------------------------------------
/ml_source/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/__init__.py
--------------------------------------------------------------------------------
/ml_source/tests/monitoring/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/monitoring/__init__.py
--------------------------------------------------------------------------------
/ml_source/tests/monitoring/test_app_logger.py:
--------------------------------------------------------------------------------
1 | """Test src/monitoring/app_logger.py."""
2 |
3 | import logging
4 | import unittest
5 | import uuid
6 |
7 | from monitoring.app_logger import AppLogger, get_disabled_logger
8 |
9 | test_instrumentation_key = str(uuid.uuid1())
10 | test_invalid_instrumentation_key = "invalid_instrumentation_key"
11 |
12 |
13 | class TestAppLogger(unittest.TestCase):
14 | @classmethod
15 | def setUpClass(cls):
16 | cls.valid_config = {
17 | "log_level": "DEBUG",
18 | "logging_enabled": "true",
19 | "app_insights_key": test_instrumentation_key,
20 | }
21 | cls.invalid_config = {
22 | "log_level": "DEBUG",
23 | "logging_enabled": "false",
24 | "app_insights_key": test_invalid_instrumentation_key,
25 | }
26 |
27 | def test_logger_creation_valid_instrumentation_key(self):
28 | """Test with valid formatted instrumentation key."""
29 | global test_instrumentation_key
30 | try:
31 | app_logger = AppLogger(
32 | config=self.valid_config,
33 | )
34 | assert app_logger is not None
35 | except Exception:
36 | assert False
37 |
38 | def test_logger_creation_invalid_instrumentation_key(self):
39 | """Test with invalid instrumentation key."""
40 | global test_invalid_instrumentation_key
41 | with self.assertRaises(Exception):
42 | logging.disable(logging.CRITICAL)
43 | app_logger = AppLogger(
44 | config=self.invalid_config,
45 | )
46 | app_logger.get_logger()
47 | assert app_logger is not None
48 |
49 | def test_logger_creation_no_instrumentation_key(self):
50 | """Test with no instrumentation key."""
51 | with self.assertRaises(Exception):
52 | logging.disable(logging.CRITICAL)
53 | config = {"log_level": logging.DEBUG, "logging_enabled": "false"}
54 | app_logger = AppLogger(config=config)
55 | app_logger.get_logger()
56 | assert app_logger is not None
57 |
58 | def test_logging(self):
59 | """Test to use logging functions."""
60 | global test_instrumentation_key
61 | try:
62 | component_name = "TestComponent"
63 | app_logger = AppLogger(config=self.valid_config)
64 | assert app_logger is not None
65 | test_logger = app_logger.get_logger(
66 | component_name=component_name,
67 | )
68 |
69 | assert test_logger is not None
70 | test_logger.info("Test Logging")
71 | except Exception:
72 | assert False
73 |
74 | def test_tracing(self):
75 | """Test for Tracer."""
76 | global test_instrumentation_key
77 | try:
78 | component_name = "TestComponent"
79 | app_logger = AppLogger(config=self.valid_config)
80 | assert app_logger is not None
81 |
82 | tracer = app_logger.get_tracer(
83 | component_name=component_name,
84 | )
85 | tracer_with_parent = app_logger.get_tracer(
86 | component_name=component_name, parent_tracer=tracer
87 | )
88 | test_logger = app_logger.get_logger(
89 | component_name=component_name,
90 | )
91 |
92 | assert test_logger is not None
93 | assert tracer is not None
94 | assert tracer_with_parent is not None
95 |
96 | with tracer.span(name="testspan"):
97 | test_logger.info("in test span")
98 | except Exception:
99 | assert False
100 |
101 | def test_tracing_with_disabled_logger(self):
102 |         """Test getting a tracer from a disabled logger."""
103 | app_logger = get_disabled_logger()
104 | tracer = app_logger.get_tracer()
105 | assert tracer is not None
106 |
107 | def test_exception(self):
108 | """Test for calling logger.exception method."""
109 | global test_instrumentation_key
110 | try:
111 | component_name = "TestComponent"
112 | app_logger = AppLogger(
113 | config=self.valid_config,
114 | )
115 | assert app_logger is not None
116 |
117 | test_logger = app_logger.get_logger(
118 | component_name=component_name,
119 | )
120 | assert test_logger is not None
121 | try:
122 | raise Exception("Testing exception logging")
123 | except Exception as exp:
124 | test_logger.exception(exp)
125 | except Exception:
126 | assert False
127 |
128 | def test_logging_level(self):
129 | """Test for changing logger level in config."""
130 | try:
131 | global test_instrumentation_key
132 | component_name = "TestComponent"
133 | valid_config = self.valid_config.copy()
134 | valid_config["log_level"] = logging.ERROR
135 | app_logger = AppLogger(
136 | config=valid_config,
137 | )
138 | assert app_logger.config["log_level"] == logging.ERROR
139 | test_logger = app_logger.get_logger(
140 | component_name=component_name,
141 | )
142 |
143 | test_logger.error("Testing logging level")
144 | except Exception:
145 | assert False
146 |
147 | def test_logging_extra_params(self):
148 | """Test logging extra params."""
149 | try:
150 | global test_instrumentation_key
151 | component_name = "TestComponent"
152 | app_logger = AppLogger(
153 | config=self.valid_config,
154 | )
155 | test_logger = app_logger.get_logger(
156 | component_name=component_name,
157 | )
158 | extra_params = {"custom_dimensions": {"key1": "value1"}}
159 | test_logger.info("Logging extra params", extra=extra_params)
160 | except Exception:
161 | assert False
162 |
163 | def test_disabled_logger(self):
164 | """Test disabled logger."""
165 | try:
166 |
167 | def do_work(app_logger=get_disabled_logger()):
168 | component_name = "TestComponent"
169 | test_logger = app_logger.get_logger(
170 | component_name=component_name,
171 | )
172 | extra_params = {"custom_dimensions": {"key1": "value1"}}
173 | test_logger.info("Logging extra params", extra=extra_params)
174 |
175 | do_work()
176 | except Exception:
177 | assert False
178 |
179 |
180 | if __name__ == "__main__":
181 | unittest.main()
182 |
--------------------------------------------------------------------------------
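One illustrative way to run these monitoring tests together with the taxi_fares tests via plain unittest discovery; it assumes the working directory is ml_source/ and that the packages under src/ are importable (the repo's Makefile likely provides equivalent targets), so treat it as a sketch rather than the project's canonical test entry point.

import unittest

# Discover every test_*.py module under ml_source/tests and run it.
suite = unittest.defaultTestLoader.discover(start_dir="tests", pattern="test_*.py")
result = unittest.TextTestRunner(verbosity=2).run(suite)
raise SystemExit(0 if result.wasSuccessful() else 1)
--------------------------------------------------------------------------------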
/ml_source/tests/taxi_fares/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/taxi_fares/__init__.py
--------------------------------------------------------------------------------
/ml_source/tests/taxi_fares/feature_eng/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/taxi_fares/feature_eng/__init__.py
--------------------------------------------------------------------------------
/ml_source/tests/taxi_fares/feature_eng/test_features.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from pyspark.sql import SparkSession
4 | from src.taxi_fares.feature_eng.features import pickup_features_fn
5 |
6 |
7 | class TestFeatures(unittest.TestCase):
8 | @classmethod
9 | def setUpClass(cls):
10 | cls.spark = (
11 | SparkSession.builder.master("local[*]").appName("Unit-tests").getOrCreate()
12 | )
13 |
14 | @classmethod
15 | def tearDownClass(cls):
16 | cls.spark.stop()
17 |
18 | def test_if_pickup_features_are_computed(self):
19 | df = self.spark.createDataFrame(
20 | [
21 | ("2019-01-01 00:00:00", "2019-01-01 01:00:00", 1.0, 1, 10000, 10001),
22 | ("2019-01-01 00:15:00", "2019-01-01 01:15:00", 2.0, 2, 10002, 10003),
23 | ("2019-01-01 00:30:00", "2019-01-01 01:30:00", 3.0, 3, 10004, 10005),
24 | ("2019-01-01 00:45:00", "2019-01-01 01:45:00", 4.0, 4, 10006, 10007),
25 | ("2019-01-01 01:00:00", "2019-01-01 02:00:00", 5.0, 5, 10008, 10009),
26 | ("2019-01-01 01:15:00", "2019-01-01 02:15:00", 6.0, 6, 10010, 10011),
27 | ("2019-01-01 01:30:00", "2019-01-01 02:30:00", 7.0, 7, 10012, 10013),
28 | ("2019-01-01 01:45:00", "2019-01-01 02:45:00", 8.0, 8, 10014, 10015),
29 | ("2019-01-01 02:00:00", "2019-01-01 03:00:00", 9.0, 9, 10016, 10017),
30 | ("2019-01-01 02:15:00", "2019-01-01 03:15:00", 10.0, 10, 10018, 10019),
31 | ("2019-01-01 02:30:00", "2019-01-01 03:30:00", 11.0, 11, 10020, 10021),
32 | ("2019-01-01 02:45:00", "2019-01-01 03:45:00", 12.0, 12, 10022, 10023),
33 | ],
34 | [
35 | "tpep_pickup_datetime",
36 | "tpep_dropoff_datetime",
37 | "trip_distance",
38 | "fare_amount",
39 | "pickup_zip",
40 | "dropoff_zip",
41 | ],
42 | )
43 | df = pickup_features_fn(
44 | df, "tpep_pickup_datetime", "2019-01-01 00:00:00", "2019-01-01 01:45:00"
45 | )
46 | self.assertEqual(df.count(), 28)
47 |
--------------------------------------------------------------------------------
/ml_source/tests/taxi_fares/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/taxi_fares/utils/__init__.py
--------------------------------------------------------------------------------
/ml_source/tests/taxi_fares/utils/test_pyspark_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from pyspark.sql import SparkSession
4 | from src.taxi_fares.utils.pyspark_utils import filter_df_by_ts
5 |
6 |
7 | class TestPysparkUtils(unittest.TestCase):
8 | @classmethod
9 | def setUpClass(cls):
10 | cls.spark = (
11 | SparkSession.builder.master("local[*]").appName("Unit-tests").getOrCreate()
12 | )
13 |
14 | @classmethod
15 | def tearDownClass(cls):
16 | cls.spark.stop()
17 |
18 | def test_if_df_is_getting_filtered_by_ts(self):
19 | df = self.spark.createDataFrame(
20 | [
21 | ("2019-01-01 00:00:00", 1),
22 | ("2019-01-01 00:15:00", 2),
23 | ("2019-01-01 00:30:00", 3),
24 | ("2019-01-01 00:45:00", 4),
25 | ("2019-01-01 01:00:00", 5),
26 | ("2019-01-01 01:15:00", 6),
27 | ("2019-01-01 01:30:00", 7),
28 | ("2019-01-01 01:45:00", 8),
29 | ("2019-01-01 02:00:00", 9),
30 | ("2019-01-01 02:15:00", 10),
31 | ("2019-01-01 02:30:00", 11),
32 | ("2019-01-01 02:45:00", 12),
33 | ],
34 | ["tpep_pickup_datetime", "fare_amount"],
35 | )
36 | df = filter_df_by_ts(
37 | df, "tpep_pickup_datetime", "2019-01-01 00:00:00", "2019-01-01 01:45:00"
38 | )
39 | self.assertEqual(df.count(), 7)
40 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-learn==1.5.0
2 | pandas==1.2.4
3 | black==24.3.0
4 | coverage==5.5
5 | databricks-cli==0.14.3
6 | mlflow==2.21.0
7 | opencensus-ext-azure==1.0.7
8 | opencensus-ext-logging==0.1.0
9 | protobuf==3.18.3
10 | lightgbm==4.6.0
11 | isort==5.10.1
12 |
--------------------------------------------------------------------------------