├── .devcontainer
│   ├── Dockerfile
│   └── devcontainer.json
├── .github
│   ├── CODE_OF_CONDUCT.md
│   ├── ISSUE_TEMPLATE.md
│   └── PULL_REQUEST_TEMPLATE.md
├── .gitignore
├── .vscode
│   └── settings.json
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE.md
├── Makefile
├── README.md
├── docs
│   ├── advance
│   │   ├── cicd.md
│   │   └── registered_model_stages.md
│   └── images
│       ├── batch_scoring.png
│       ├── model_training.png
│       ├── result_azure_resources.png
│       ├── result_batch_scoring.png
│       ├── result_databricks_job.png
│       ├── result_mlflow_experiment.png
│       └── result_mlflow_model_registry.png
├── ml_experiments
│   └── Feature Store Taxi example notebook.ipynb
├── ml_ops
│   ├── README.md
│   ├── deployment
│   │   ├── README.md
│   │   ├── arm_templates
│   │   │   └── databricks_and_storage.json
│   │   └── databricks
│   │       ├── cluster_template.json
│   │       ├── job_template_taxi_fares_batch_scoring.json
│   │       └── job_template_taxi_fares_training.json
│   ├── orchestrator
│   │   ├── README.md
│   │   ├── taxi_fares_orchestrator_batch_score.py
│   │   └── taxi_fares_orchestrator_train.py
│   ├── src
│   │   ├── README.md
│   │   ├── setup.py
│   │   └── taxi_fares_mlops
│   │       ├── __init__.py
│   │       ├── feature_engineering.py
│   │       ├── publish_model.py
│   │       ├── scoring_batch.py
│   │       ├── training.py
│   │       └── utils.py
│   └── tests
│       ├── README.md
│       ├── __init__.py
│       └── taxi_fares
│           ├── __init__.py
│           ├── data
│           │   └── taxi_fares_unit_test_training.csv
│           ├── test_publish_model.py
│           ├── test_training.py
│           └── test_utils.py
├── ml_source
│   ├── README.md
│   ├── src
│   │   ├── README.md
│   │   ├── monitoring
│   │   │   ├── __init__.py
│   │   │   └── app_logger.py
│   │   ├── setup.py
│   │   └── taxi_fares
│   │       ├── __init__.py
│   │       ├── feature_eng
│   │       │   ├── __init__.py
│   │       │   └── features.py
│   │       ├── training
│   │       │   ├── __init__.py
│   │       │   ├── evaluate.py
│   │       │   └── train.py
│   │       └── utils
│   │           ├── __init__.py
│   │           └── pyspark_utils.py
│   └── tests
│       ├── README.md
│       ├── __init__.py
│       ├── monitoring
│       │   ├── __init__.py
│       │   └── test_app_logger.py
│       └── taxi_fares
│           ├── __init__.py
│           ├── feature_eng
│           │   ├── __init__.py
│           │   └── test_features.py
│           └── utils
│               ├── __init__.py
│               └── test_pyspark_utils.py
└── requirements.txt
/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/blob/master/containers/python-3-anaconda/.devcontainer/base.Dockerfile
2 | ARG VARIANT="3"
3 | FROM mcr.microsoft.com/vscode/devcontainers/anaconda:0-${VARIANT}
4 |
5 | # Additional packages
6 | RUN sudo apt-get update
7 | RUN sudo apt-get install --reinstall build-essential -y
8 | RUN sudo apt-get install default-jdk -y
9 |
10 | # Get local user
11 | ARG USERNAME=vscode
12 |
13 | # Change conda to be owned by the local user
14 | RUN chown -R $USERNAME:$USERNAME /opt/conda
15 |
16 | # Activate local user
17 | USER $USERNAME
18 |
19 | # Conda init
20 | RUN conda init bash
21 |
22 | # [Optional] If your pip requirements rarely change, uncomment this section to add them to the image.
23 | COPY requirements.txt /tmp/pip-tmp/
24 | RUN pip --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
25 | && sudo rm -rf /tmp/pip-tmp
26 | RUN pip --disable-pip-version-check --no-cache-dir install databricks-feature-store
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.155.1/containers/python-3
3 | {
4 | "name": "Anaconda (Python 3)",
5 | "build": {
6 | "dockerfile": "Dockerfile",
7 | "context": "..",
8 | "args": {
9 | // Update 'VARIANT'
10 | "VARIANT": "3",
11 | // Options
12 | "INSTALL_NODE": "false",
13 | }
14 | },
15 | "mounts": [
16 | "source=${localEnv:HOME}/.ssh,target=/home/vscode/.ssh,type=bind",
17 | "source=${localEnv:HOME}/.gitconfig,target=/home/vscode/.gitconfig,type=bind",
18 | ],
19 | // Set *default* container specific settings.json values on container create.
20 | "settings": {
21 | "terminal.integrated.shell.linux": "/bin/bash",
22 | "python.pythonPath": "/opt/conda/bin/python",
23 | "python.linting.enabled": true,
24 | "python.linting.pylintEnabled": false,
25 | "python.linting.flake8Enabled": true,
26 | "python.linting.flake8Path": "/opt/conda/bin/flake8",
27 | "python.linting.flake8Args": [
28 | "--max-line-length=88"
29 | ],
30 | "python.formatting.provider": "black",
31 | "python.formatting.blackPath": "/opt/conda/bin/black",
32 | "python.testing.promptToConfigure": false,
33 | "[python]": {
34 | "editor.formatOnSave": true,
35 | "editor.codeActionsOnSave": {
36 | "source.organizeImports": true
37 | },
38 | "files.trimTrailingWhitespace": true
39 | },
40 | },
41 | // Add the IDs of extensions you want installed when the container is created.
42 | "extensions": [
43 | "ms-python.python",
44 | "yzhang.markdown-all-in-one",
45 | "streetsidesoftware.code-spell-checker",
46 | "njpwerner.autodocstring",
47 | "GitHub.copilot"
48 | ],
49 | // Use 'forwardPorts' to make a list of ports inside the container available locally.
50 | "forwardPorts": [
51 | 5000
52 | ],
53 | // Use 'postCreateCommand' to run commands after the container is created.
54 | // "postCreateCommand": "python --version",
56 | // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
56 | "remoteUser": "vscode"
57 | }
--------------------------------------------------------------------------------
/.github/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
4 | > Please provide us with the following information:
5 | > ---------------------------------------------------------------
6 |
7 | ### This issue is for a: (mark with an `x`)
8 | ```
9 | - [ ] bug report -> please search issues before submitting
10 | - [ ] feature request
11 | - [ ] documentation issue or request
12 | - [ ] regression (a behavior that used to work and stopped in a new release)
13 | ```
14 |
15 | ### Minimal steps to reproduce
16 | >
17 |
18 | ### Any log messages given by the failure
19 | >
20 |
21 | ### Expected/desired behavior
22 | >
23 |
24 | ### OS and Version?
25 | > Windows 7, 8 or 10. Linux (which distribution). macOS (Yosemite? El Capitan? Sierra?)
26 |
27 | ### Versions
28 | >
29 |
30 | ### Mention any other details that might be useful
31 |
32 | > ---------------------------------------------------------------
33 | > Thanks! We'll be in touch soon.
34 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Purpose
2 |
3 | * ...
4 |
5 | ## Does this introduce a breaking change?
6 |
7 | ```
8 | [ ] Yes
9 | [ ] No
10 | ```
11 |
12 | ## Pull Request Type
13 | What kind of change does this Pull Request introduce?
14 |
15 |
16 | ```
17 | [ ] Bugfix
18 | [ ] Feature
19 | [ ] Code style update (formatting, local variables)
20 | [ ] Refactoring (no functional changes, no api changes)
21 | [ ] Documentation content changes
22 | [ ] Other... Please describe:
23 | ```
24 |
25 | ## How to Test
26 | * Get the code
27 |
28 | ```
29 | git clone [repo-address]
30 | cd [repo-name]
31 | git checkout [branch-name]
32 | npm install
33 | ```
34 |
35 | * Test the code
36 |
37 | ```
38 | ```
39 |
40 | ## What to Check
41 | Verify that the following are valid
42 | * ...
43 |
44 | ## Other Information
45 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
139 |
140 | # Local files
141 | **/.DS_Store
142 | .vscode/settings.json
143 | mlruns
144 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.pythonPath": "/opt/conda/bin/python",
3 | "python.testing.unittestArgs": [
4 | "-v",
5 | "-s",
6 | "./ml_source",
7 | "-p",
8 | "test*.py"
9 | ],
10 | "python.testing.pytestEnabled": false,
11 | "python.testing.unittestEnabled": true,
12 | "python.sortImports.path": "/opt/conda/bin/isort"
13 | }
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ## [project-title] Changelog
2 |
3 |
4 | # x.y.z (yyyy-mm-dd)
5 |
6 | *Features*
7 | * ...
8 |
9 | *Bug Fixes*
10 | * ...
11 |
12 | *Breaking Changes*
13 | * ...
14 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to [project-title]
2 |
3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
6 |
7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
9 | provided by the bot. You will only need to do this once across all repos using our CLA.
10 |
11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
14 |
15 | - [Code of Conduct](#coc)
16 | - [Issues and Bugs](#issue)
17 | - [Feature Requests](#feature)
18 | - [Submission Guidelines](#submit)
19 |
20 | ## Code of Conduct
21 | Help us keep this project open and inclusive. Please read and follow our [Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
22 |
23 | ## Found an Issue?
24 | If you find a bug in the source code or a mistake in the documentation, you can help us by
25 | [submitting an issue](#submit-issue) to the GitHub Repository. Even better, you can
26 | [submit a Pull Request](#submit-pr) with a fix.
27 |
28 | ## Want a Feature?
29 | You can *request* a new feature by [submitting an issue](#submit-issue) to the GitHub
30 | Repository. If you would like to *implement* a new feature, please submit an issue with
31 | a proposal for your work first, to be sure that we can use it.
32 |
33 | * **Small Features** can be crafted and directly [submitted as a Pull Request](#submit-pr).
34 |
35 | ## Submission Guidelines
36 |
37 | ### Submitting an Issue
38 | Before you submit an issue, search the archive; your question may already have been answered.
39 |
40 | If your issue appears to be a bug, and hasn't been reported, open a new issue.
41 | Help us to maximize the effort we can spend fixing issues and adding new
42 | features, by not reporting duplicate issues. Providing the following information will increase the
43 | chances of your issue being dealt with quickly:
44 |
45 | * **Overview of the Issue** - if an error is being thrown a non-minified stack trace helps
46 | * **Version** - what version is affected (e.g. 0.1.2)
47 | * **Motivation for or Use Case** - explain what you are trying to do and why the current behavior is a bug for you
48 | * **Browsers and Operating System** - is this a problem with all browsers?
49 | * **Reproduce the Error** - provide a live example or an unambiguous set of steps
50 | * **Related Issues** - has a similar issue been reported before?
51 | * **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be
52 | causing the problem (line of code or commit)
53 |
54 | You can file new issues by providing the above information at the corresponding repository's issues link: https://github.com/[organization-name]/[repository-name]/issues/new.
55 |
56 | ### Submitting a Pull Request (PR)
57 | Before you submit your Pull Request (PR) consider the following guidelines:
58 |
59 | * Search the repository (https://github.com/[organization-name]/[repository-name]/pulls) for an open or closed PR
60 | that relates to your submission. You don't want to duplicate effort.
61 |
62 | * Make your changes in a new git fork:
63 |
64 | * Commit your changes using a descriptive commit message
65 | * Push your fork to GitHub:
66 | * In GitHub, create a pull request
67 | * If we suggest changes then:
68 | * Make the required updates.
69 | * Rebase your fork and force push to your GitHub repository (this will update your Pull Request):
70 |
71 | ```shell
72 | git rebase master -i
73 | git push -f
74 | ```
75 |
76 | That's it! Thank you for your contribution!
77 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: clean clean-test clean-pyc clean-build
2 | SHELL=/bin/bash
3 |
4 | ## remove Python file artifacts
5 | clean-pyc:
6 | find . -name '*.pyc' -exec rm -f {} +
7 | find . -name '*.pyo' -exec rm -f {} +
8 | find . -name '*~' -exec rm -f {} +
9 | find . -name '__pycache__' -exec rm -fr {} +
10 |
11 | ## remove test and coverage artifacts
12 | clean-test:
13 | rm -f .coverage
14 | rm -fr htmlcov/
15 | rm -fr .pytest_cache
16 |
17 | ## remove build artifacts
18 | clean-build:
19 | rm -fr build/
20 | rm -fr dist/
21 | rm -fr .eggs/
22 | find . -name '*.egg-info' -exec rm -fr {} +
23 | find . -name '*.egg' -exec rm -f {} +
24 |
25 | ## remove all build, test, coverage and Python artifacts
26 | clean: clean-build clean-pyc clean-test
27 |
28 | ## package ml
29 | dist-ml: clean
30 | python ml_source/src/setup.py bdist_wheel
31 | rm -fr build/
32 |
33 | ## package mlops
34 | dist-mlops: clean
35 | python ml_ops/src/setup.py bdist_wheel
36 | rm -fr build/
37 |
38 | ## package all
39 | dist: dist-ml dist-mlops
40 |
41 | ## install ml locally
42 | install-ml: clean
43 | python ml_source/src/setup.py install
44 | rm -fr build/
45 |
46 | ## install mlops locally
47 | install-mlops: clean
48 | python ml_ops/src/setup.py install
49 | rm -fr build/
50 |
51 | ## install all locally
52 | install: install-ml install-mlops
53 |
54 | ## unit test ml locally
55 | test-ml: install-ml
56 | cd ml_source && coverage run --source=taxi_fares,monitoring -m unittest discover
57 | cd ml_source && coverage report -m
58 |
59 | ## unit test mlops locally
60 | test-mlops: install-mlops
61 | cd ml_ops && coverage run --source=taxi_fares_mlops -m unittest discover
62 | cd ml_ops && coverage report -m
63 |
64 | ## unit test all locally
65 | test: test-ml test-mlops
66 | coverage combine ml_source/.coverage ml_ops/.coverage
67 | coverage report
68 |
69 | ## lint all python src and tests
70 | lint:
71 | flake8 --max-line-length=88 ml_ops/src ml_ops/tests ml_source/src ml_source/tests
72 |
73 | ## databricks authenticate
74 | databricks-authenticate:
75 | $(info Authenticate Databricks CLI)
76 | $(info Follow https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/ for getting Host and token value)
77 | databricks configure --token
78 | $(info Taking Backup of .databrickscfg file in .env/databrickscfg)
79 | mkdir -p .env
80 | cp ~/.databrickscfg .env/.databrickscfg
81 | $(info Creating env script file for mlflow)
82 | DATABRICKS_HOST="$$(cat ~/.databrickscfg | grep '^host' | cut -d' ' -f 3)"; \
83 | DATABRICKS_TOKEN="$$(cat ~/.databrickscfg | grep '^token' | cut -d' ' -f 3)"; \
84 | echo "export MLFLOW_TRACKING_URI=databricks"> .env/.databricks_env.sh; \
85 | echo "export DATABRICKS_HOST=$$DATABRICKS_HOST" >> .env/.databricks_env.sh; \
86 | echo "export DATABRICKS_TOKEN=$$DATABRICKS_TOKEN" >> .env/.databricks_env.sh
87 |
88 | ## databricks init (create cluster, base workspace, mlflow experiment, secret scope)
89 | databricks-init:
90 | echo "Creating databricks workspace root directory"; \
91 | databricks workspace mkdirs /azure-databricks-mlops-mlflow; \
92 | echo "Creating databricks dbfs root directory"; \
93 | databricks fs mkdirs dbfs:/FileStore/libraries/azure-databricks-mlops-mlflow; \
94 | CLUSTER_ID="$$(databricks clusters list --output json | \
95 | jq ".clusters[] | select(.cluster_name == \"azure-databricks-mlops-mlflow\") | .cluster_id")"; \
96 | echo "Got existing cluster azure-databricks-mlops-mlflow with id: $$CLUSTER_ID"; \
97 | if [[ $$CLUSTER_ID == "" ]]; then \
98 | echo "Creating databricks cluster azure-databricks-mlops-mlflow"; \
99 | databricks clusters create --json-file ml_ops/deployment/databricks/cluster_template.json; \
100 | fi; \
101 | SECRET_SCOPE_NAME="$$(databricks secrets list-scopes --output json | \
102 | jq ".scopes[] | select(.name == \"azure-databricks-mlops-mlflow\") | .name")"; \
103 | echo "Got existing secret scope $$SECRET_SCOPE_NAME"; \
104 | if [[ $$SECRET_SCOPE_NAME == "" ]]; then \
105 | echo "Creating databricks secret scope azure-databricks-mlops-mlflow"; \
106 | databricks secrets create-scope --scope azure-databricks-mlops-mlflow --initial-manage-principal users; \
107 | fi; \
108 | MLFLOW_EXPERIMENT_ID="$$(source .env/.databricks_env.sh && mlflow experiments list | \
109 | grep '/azure-databricks-mlops-mlflow/Experiment' | \
110 | cut -d' ' -f 1)"; \
111 | echo "Got existing mlflow experiment id: $$MLFLOW_EXPERIMENT_ID"; \
112 | if [[ "$$MLFLOW_EXPERIMENT_ID" == "" ]]; then \
113 | echo "Creating mlflow experiment in databricks workspace /azure-databricks-mlops-mlflow/Experiment directory"; \
114 | source .env/.databricks_env.sh && mlflow experiments create --experiment-name /azure-databricks-mlops-mlflow/Experiment; \
115 | fi; \
116 |
117 | ## databricks secrets put
118 | databricks-secrets-put:
119 | $(info Put databricks secret azure-blob-storage-account-name)
120 | @read -p "Enter Azure Blob storage Account Name: " stg_account_name; \
121 | databricks secrets put --scope azure-databricks-mlops-mlflow --key azure-blob-storage-account-name --string-value $$stg_account_name
122 | $(info Put databricks secret azure-blob-storage-container-name)
123 | @read -p "Enter Azure Blob storage Container Name: " stg_container_name; \
124 | databricks secrets put --scope azure-databricks-mlops-mlflow --key azure-blob-storage-container-name --string-value $$stg_container_name
125 | $(info Put databricks secret azure-shared-access-key)
126 | $(info Mount Blob Storage https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-storage)
127 | @read -p "Enter Azure Blob storage Shared Access Key: " shared_access_key; \
128 | databricks secrets put --scope azure-databricks-mlops-mlflow --key azure-blob-storage-shared-access-key --string-value $$shared_access_key
129 |
130 | ## databricks secrets put application insights key
131 | databricks-add-app-insights-key:
132 | $(info Put app insights key)
133 | @read -p "Enter App insights key: " app_insights_key; \
134 | if [[ "$$app_insights_key" != '' ]]; then \
135 | echo "Setting app insights key : $$app_insights_key "; \
136 | databricks secrets put --scope azure-databricks-mlops-mlflow --key app_insights_key --string-value "$$app_insights_key"; \
137 | fi; \
138 |
139 | ## databricks deploy (upload wheel packages to databricks DBFS workspace)
140 | databricks-deploy-code: dist
141 | $(info Upload wheel packages into databricks dbfs root directory)
142 | databricks fs cp --overwrite --recursive dist/ dbfs:/FileStore/libraries/azure-databricks-mlops-mlflow/
143 | $(info Importing orchestrator notebooks into databricks workspace root directory)
144 | databricks workspace import_dir --overwrite ml_ops/orchestrator/ /azure-databricks-mlops-mlflow/
145 | $(info Create or update databricks jobs)
146 |
147 | ## databricks deploy jobs (create databricks jobs)
148 | databricks-deploy-jobs: databricks-deploy-code
149 | $(info Getting required values from databricks)
150 | CLUSTER_ID="$$(databricks clusters list --output json | \
151 | jq ".clusters[] | select(.cluster_name == \"azure-databricks-mlops-mlflow\") | .cluster_id")"; \
152 | echo "Got existing cluster id: $$CLUSTER_ID"; \
153 | TRAINING_JOB_ID="$$(databricks jobs list --output json | \
154 | jq ".jobs[] | select(.settings.name == \"taxi_fares_model_training\") | .job_id")"; \
155 | echo "Got existing taxi_fares_model_training job id: $$TRAINING_JOB_ID"; \
156 | if [[ "$$TRAINING_JOB_ID" == "" ]]; then \
157 | databricks jobs create --json "{\"name\": \"taxi_fares_model_training\", \"existing_cluster_id\": $$CLUSTER_ID}"; \
158 | TRAINING_JOB_ID="$$(databricks jobs list --output json | \
159 | jq ".jobs[] | select(.settings.name == \"taxi_fares_model_training\") | .job_id")"; \
160 | echo "Created taxi_fares_model_training with job id: $$TRAINING_JOB_ID"; \
161 | fi; \
162 | BATCH_SCORING_JOB_ID="$$(databricks jobs list --output json | \
163 | jq ".jobs[] | select(.settings.name == \"taxi_fares_batch_scoring\") | .job_id")"; \
164 | echo "Got existing taxi_fares_batch_scoring job id: $$BATCH_SCORING_JOB_ID"; \
165 | if [[ "$$BATCH_SCORING_JOB_ID" == "" ]]; then \
166 | databricks jobs create --json "{\"name\": \"taxi_fares_batch_scoring\", \"existing_cluster_id\": $$CLUSTER_ID}"; \
167 | BATCH_SCORING_JOB_ID="$$(databricks jobs list --output json | \
168 | jq ".jobs[] | select(.settings.name == \"taxi_fares_batch_scoring\") | .job_id")"; \
169 | echo "Created taxi_fares_batch_scoring with job id: $$BATCH_SCORING_JOB_ID"; \
170 | fi; \
171 | MLFLOW_EXPERIMENT_ID="$$(source .env/.databricks_env.sh && mlflow experiments list | \
172 | grep '/azure-databricks-mlops-mlflow/Experiment' | \
173 | cut -d' ' -f 1)"; \
174 | echo "Got existing mlflow experiment id: $$MLFLOW_EXPERIMENT_ID"; \
175 | echo "Updating taxi_fares_model_training by using template ml_ops/deployment/databricks/job_template_taxi_fares_training.json"; \
176 | TRAINING_JOB_UPDATE_JSON="$$(cat ml_ops/deployment/databricks/job_template_taxi_fares_training.json | \
177 | sed "s/\"FILL_JOB_ID\"/$$TRAINING_JOB_ID/" | \
178 | sed "s/FILL_MLFLOW_EXPERIMENT_ID/$$MLFLOW_EXPERIMENT_ID/" | \
179 | sed "s/\"FILL_CLUSTER_ID\"/$$CLUSTER_ID/")"; \
180 | databricks jobs reset --job-id $$TRAINING_JOB_ID --json "$$TRAINING_JOB_UPDATE_JSON"; \
181 | echo "Updating taxi_fares_batch_scoring by using template ml_ops/deployment/databricks/job_template_taxi_fares_batch_scoring.json"; \
182 | BATCH_SCORING_JOB_UPDATE_JSON="$$(cat ml_ops/deployment/databricks/job_template_taxi_fares_batch_scoring.json | \
183 | sed "s/\"FILL_JOB_ID\"/$$BATCH_SCORING_JOB_ID/" | \
184 | sed "s/FILL_MLFLOW_EXPERIMENT_ID/$$MLFLOW_EXPERIMENT_ID/" | \
185 | sed "s/\"FILL_CLUSTER_ID\"/$$CLUSTER_ID/")"; \
186 | databricks jobs reset --job-id $$BATCH_SCORING_JOB_ID --json "$$BATCH_SCORING_JOB_UPDATE_JSON"; \
187 |
188 | ## deploy databricks all
189 | deploy: databricks-deploy-jobs
190 |
191 | ## run databricks taxi_fares_model_training job
192 | run-taxifares-model-training:
193 | $(info Triggering model training job)
194 | TRAINING_JOB_ID="$$(databricks jobs list --output json | \
195 | jq ".jobs[] | select(.settings.name == \"taxi_fares_model_training\") | .job_id")"; \
196 | RUN_ID="$$(databricks jobs run-now --job-id $$TRAINING_JOB_ID | \
197 | jq ".number_in_job")"; \
198 | DATABRICKS_HOST="$$(cat ~/.databrickscfg | grep '^host' | cut -d' ' -f 3)"; \
199 | DATABRICKS_ORG_ID="$$(echo $$DATABRICKS_HOST | cut -d'-' -f 2 | cut -d'.' -f 1)"; \
200 | echo "Open the following link in browser to check result -"; \
201 | echo "$$DATABRICKS_HOST/?o=$$DATABRICKS_ORG_ID/#job/$$TRAINING_JOB_ID/run/$$RUN_ID"; \
202 |
203 |
204 | ## run databricks taxi_fares_batch_scoring job
205 | run-taxifares-batch-scoring:
206 | $(info Triggering batch scoring job)
207 | BATCH_SCORING_JOB_ID="$$(databricks jobs list --output json | \
208 | jq ".jobs[] | select(.settings.name == \"taxi_fares_batch_scoring\") | .job_id")"; \
209 | RUN_ID="$$(databricks jobs run-now --job-id $$BATCH_SCORING_JOB_ID | \
210 | jq ".number_in_job")"; \
211 | DATABRICKS_HOST="$$(cat ~/.databrickscfg | grep '^host' | cut -d' ' -f 3)"; \
212 | DATABRICKS_ORG_ID="$$(echo $$DATABRICKS_HOST | cut -d'-' -f 2 | cut -d'.' -f 1)"; \
213 | echo "Open the following link in browser to check result -"; \
214 | echo "$$DATABRICKS_HOST/?o=$$DATABRICKS_ORG_ID/#job/$$BATCH_SCORING_JOB_ID/run/$$RUN_ID"; \
215 |
216 | # continuous integration (CI)
217 | ci: lint test dist
218 |
219 | # continuous deployment (CD)
220 | cd: deploy
221 |
222 | # train model
223 | train: run-taxifares-model-training
224 |
225 | # batch scoring
226 | score: run-taxifares-batch-scoring
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | page_type: sample
3 | ms.custom:
4 | - team=cse
5 | ms.contributors:
6 | - prdeb-12/21/2021
7 | - anchugh-12/21/2021
8 | languages:
9 | - python
10 | products:
11 | - azure-databricks
12 | - azure-blob-storage
13 | - azure-monitor
14 | ---
15 |
16 | # Azure Databricks MLOps using MLflow
17 |
18 | This is a template or sample for [MLOps](https://github.com/microsoft/mlops) for [Python](https://www.python.org) based source code in [Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/) using [MLflow](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/) without using [MLflow Project](https://mlflow.org/docs/latest/projects.html#).
19 |
20 | This template provides the following features:
21 |
22 | - A way to run Python based MLOps without using [MLflow Project](https://mlflow.org/docs/latest/projects.html#), but still using [MLflow](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/) for managing the end-to-end machine learning lifecycle.
23 | - Sample of machine learning source code structure along with Unit Test cases
24 | - Sample of MLOps code structure along with Unit Test cases
25 | - Demo setup to try on users subscription
26 |
27 | ## Problem Summary
28 |
29 | - This demonstrates deployment scenario of [Orchestrate MLOps on Azure Databricks using Databricks Notebook](https://docs.microsoft.com/en-us/azure/architecture/reference-architectures/ai/orchestrate-mlops-azure-databricks)
30 |
31 | ## Products/Technologies/Languages Used
32 |
33 | - Products & Technologies:
34 | - Azure Databricks
35 | - Azure Blob Storage
36 | - Azure Monitor
37 | - Languages:
38 | - Python
39 |
40 | ## Architecture
41 |
42 | ### Model Training
43 |
44 | 
45 |
46 | ### Batch Scoring
47 |
48 | 
49 |
50 | ## Individual Components
51 |
52 | - [ml_experiments](./ml_experiments/) - sample ML experiment notebook.
53 | - [ml_data](./ml_data/) - dummy data for sample model
54 | - [ml_ops](./ml_ops/) - sample MLOps code along with Unit Test cases, orchestrator, deployment setup.
55 | - [ml_source](./ml_source/) - sample ML code along with Unit Test cases
56 | - [Makefile](./Makefile) - for build, test in local environment
57 | - [requirements.txt](./requirements.txt) - python dependencies
58 |
59 | ## Getting Started
60 |
61 | ### Prerequisites
62 |
63 | - [Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/) workspace
64 | - [Azure Data Lake Storage Gen2](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction) account
65 | - [Visual Studio Code](https://code.visualstudio.com/) in local environment for development
66 | - [Docker](https://www.docker.com/) in local environment for development
67 |
68 | ### Development
69 |
70 | 1. `git clone https://github.com/Azure-Samples/azure-databricks-mlops-mlflow.git`
71 | 2. `cd azure-databricks-mlops-mlflow`
72 | 3. Open cloned repository in Visual Studio Code [Remote Container](https://code.visualstudio.com/docs/remote/containers)
73 | 4. Open a [terminal](https://code.visualstudio.com/docs/remote/containers#_opening-a-terminal) in Remote Container from Visual Studio Code
74 | 5. `make install` to install sample packages (`taxi_fares` and `taxi_fares_mlops`) locally
75 | 6. `make test` to unit test the code locally
76 |
77 | ### Package
78 |
79 | 1. `make dist` to build the ML and MLOps wheel packages (`taxi_fares` and `taxi_fares_mlops`) locally
80 |
81 | ### Deployment
82 |
83 | 1. `make databricks-deploy-code` to deploy Databricks orchestrator notebooks and the ML and MLOps Python wheel packages, whenever code changes.
84 | 2. `make databricks-deploy-jobs` to deploy Databricks Jobs, whenever the job specifications change.
85 |
86 | ### Run training and batch scoring
87 |
88 | 1. To trigger training, execute `make run-taxifares-model-training`
89 | 2. To trigger batch scoring, execute `make run-taxifares-batch-scoring`
90 |
91 | **NOTE:** the Databricks environment must be created before [deployment](#deployment) and [running](#run-training-and-batch-scoring); to create a demo environment, follow the [Demo](#demo) section.
92 |
93 | ### Observability
94 |
95 | Check logs, create alerts, etc. in [Application Insights](https://docs.microsoft.com/en-us/azure/azure-monitor/app/app-insights-overview). The following are a few sample [Kusto Queries](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/) for checking logs, traces, exceptions, etc.
96 |
97 | - Check for Error, Info, Debug Logs
98 |
99 | Kusto Query for checking general logs for a specific MLflow experiment, filtered by `mlflow_experiment_id`
100 |
101 | ```kusto
102 | traces
103 | | extend mlflow_experiment_id = customDimensions.mlflow_experiment_id
104 | | where timestamp > ago(30m)
105 | | where mlflow_experiment_id == "<mlflow_experiment_id>"
106 | | limit 1000
107 | ```
108 |
109 | Kusto Query for checking general logs for a specific Databricks job execution filtered by `mlflow_experiment_id` and `mlflow_run_id`
110 |
111 | ```kusto
112 | traces
113 | | extend mlflow_run_id = customDimensions.mlflow_run_id
114 | | extend mlflow_experiment_id = customDimensions.mlflow_experiment_id
115 | | where timestamp > ago(30m)
116 | | where mlflow_experiment_id == "<mlflow_experiment_id>"
117 | | where mlflow_run_id == "<mlflow_run_id>"
118 | | limit 1000
119 | ```
120 |
121 | - Check for Exceptions
122 |
123 | Kusto Query for checking exception logs, if any
124 |
125 | ```kusto
126 | exceptions
127 | | where timestamp > ago(30m)
128 | | limit 1000
129 | ```
130 |
131 | - Check for duration of different stages in MLOps
132 |
133 | Sample Kusto Query for checking duration of different stages in MLOps
134 |
135 | ```kusto
136 | dependencies
137 | | where timestamp > ago(30m)
138 | | where cloud_RoleName == 'TaxiFares_Training'
139 | | limit 1000
140 | ```
141 |
142 | To correlate dependencies, exceptions, and traces, `operation_Id` can be used as a filter in the above Kusto Queries.
143 |
144 | ## Demo
145 |
146 | 1. Create Databricks workspace, a storage account (Azure Data Lake Storage Gen2) and Application Insights
147 | 1. Create an [Azure Account](https://azure.microsoft.com/en-in/free/)
148 | 2. [Deploy resources](https://docs.microsoft.com/en-us/azure/azure-resource-manager/templates/deploy-portal#deploy-resources-from-custom-template) from [custom ARM template](ml_ops/deployment/arm_templates/databricks_and_storage.json)
149 | 2. Initialize Databricks (create cluster, base workspace, mlflow experiment, secret scope)
150 | 1. Get [Databricks CLI](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/) Host and Token
151 | 2. Authenticate Databricks CLI `make databricks-authenticate`
152 | 3. Execute `make databricks-init`
153 | 3. Create Azure Data Lake Storage Gen2 Container and upload data
154 | 1. [Create](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal#create-a-container) Azure Data Lake Storage Gen2 Container named - `taxifares`
155 | 2. [Upload](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal#upload-a-block-blob) as blob [taxi-fares data files](./ml_data/) into Azure Data Lake Storage Gen2 container named - `taxifares`
156 | 4. Put secrets to [Mount ADLS Gen2 Storage using Shared Access Key](https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-storage)
157 | 1. Get Azure Data Lake Storage Gen2 account name created in step 1
158 | 2. Get [Shared Key](https://docs.microsoft.com/en-us/rest/api/storageservices/authorize-with-shared-key) for Azure Data Lake Storage Gen2 account
159 | 3. Execute `make databricks-secrets-put` to put secret in Databricks secret scope
160 | 5. Put Application Insights Key as a secret in Databricks secret scope (optional)
161 | 1. Get [Application Insights Key](https://docs.microsoft.com/en-us/azure/azure-monitor/app/create-new-resource#copy-the-instrumentation-key) created in step 1
162 | 2. Execute `make databricks-add-app-insights-key` to put secret in Databricks secret scope
163 | 6. Package and deploy into Databricks (Databricks Jobs, Orchestrator Notebooks, ML and MLOps Python wheel packages)
164 | 1. Execute `make deploy`
165 | 7. Run Databricks Jobs
166 | 1. To trigger training, execute `make run-taxifares-model-training`
167 | 2. To trigger batch scoring, execute `make run-taxifares-batch-scoring`
168 | 8. Expected results
169 | 1. Azure resources
170 | 
171 | 2. Databricks jobs
172 | 
173 | 3. Databricks mlflow experiment
174 | 
175 | 4. Databricks mlflow model registry
176 | 
177 | 5. Output of batch scoring
178 | 
179 |
180 | ## Additional Details
181 |
182 | 1. [Continuous Integration (CI) & Continuous Deployment (CD)](docs/advance/cicd.md)
183 | 2. [Registered Models Stages and Transitioning](docs/advance/registered_model_stages.md)
184 |
185 | ## Related resources
186 |
187 | 1. [Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/)
188 | 2. [MLflow](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/)
189 | 3. [MLflow Project](https://mlflow.org/docs/latest/projects.html#)
190 | 4. [Run MLflow Projects on Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/projects)
191 | 5. [Databricks Widgets](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/databricks-utils#--widget-utilities)
192 | 6. [Databricks Notebook-scoped Python libraries](https://docs.microsoft.com/en-us/azure/databricks/libraries/notebooks-python-libraries)
193 | 7. [Databricks CLI](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/)
194 | 8. [Azure Data Lake Storage Gen2](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction)
195 | 9. [Application Insights](https://docs.microsoft.com/en-us/azure/azure-monitor/app/app-insights-overview)
196 | 10. [Kusto Query Language](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/)
197 |
198 | ## Glossaries
199 |
200 | 1. [Application developer](https://docs.microsoft.com/en-us/azure/machine-learning/team-data-science-process/overview) : a role that works mainly on operationalizing machine learning.
201 | 2. [Data scientist](https://docs.microsoft.com/en-us/azure/machine-learning/team-data-science-process/roles-tasks#structure-of-data-science-groups-and-teams) : a role that performs the data science parts of the project.
202 |
203 | ## Contributors
204 |
205 | - [Julien Chomarat](https://github.com/jchomarat)
206 | - [Benjamin Guinebertière](https://github.com/benjguin)
207 | - [Ankit Sinha](https://github.com/ankitbko)
208 | - [Prabal Deb](https://github.com/prabdeb)
209 | - [Megha Patil](https://github.com/meghapatilcode)
210 | - [Srikantan Sankaran](https://github.com/ssrikantan)
211 | - [Frédéric Le Coquil](https://github.com/flecoqui)
212 | - [Anand Chugh](https://github.com/anandchugh)
213 |
--------------------------------------------------------------------------------
/docs/advance/cicd.md:
--------------------------------------------------------------------------------
1 | # Continuous Integration (CI) & Continuous Deployment (CD)
2 |
3 | CI and CD can be performed on any platform, such as `Azure DevOps Pipelines` or `GitHub Actions`, where the following `make` commands from the [Makefile](../../Makefile) are useful.
4 |
5 | - CI: execute `make ci` from the Pipeline/Action stage.
6 | - CD: execute `make cd` from the Pipeline/Action stage.
7 |
8 | **NOTE:** Set the environment variables `DATABRICKS_HOST` and `DATABRICKS_TOKEN` before executing the CD stage.
9 |
10 | ## Reference
11 |
12 | - [Design a CI/CD pipeline using Azure DevOps](https://docs.microsoft.com/en-us/azure/architecture/example-scenario/apps/devops-dotnet-webapp)
13 | - [GitHub Actions](https://docs.github.com/en/actions)
--------------------------------------------------------------------------------
/docs/advance/registered_model_stages.md:
--------------------------------------------------------------------------------
1 | # Registered Models Stages and Transitioning
2 |
3 | This document describes a possible way of transitioning a model through the different stages available in the [MLflow Model Registry](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-registry#model-registry-concepts).
4 |
5 | 1. In this demo setup, the [Continuous Integration (CI)](cicd.md) step currently [registers](../../ml_ops/src/taxi_fares_mlops/publish_model.py) the model in the MLflow Model Registry in the `None` stage.
6 | 2. The registered model can then be [transitioned](https://www.mlflow.org/docs/latest/model-registry.html#transitioning-an-mlflow-models-stage) to the next stage, `Staging`, after the integration test step.
7 | 3. Finally, the model can be transitioned to the `Production` stage during the Continuous Deployment (CD) step (a sketch of such a transition follows below).
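
A minimal sketch of a stage transition using the MLflow client; the registered model name and version below are illustrative placeholders, not values taken from this repository:

```py
from mlflow.tracking import MlflowClient

client = MlflowClient()  # uses the Databricks MLflow tracking URI configured in the environment

# Promote a registered model version to the next stage
# ("taxi_fares" and version 1 are placeholders).
client.transition_model_version_stage(
    name="taxi_fares",
    version=1,
    stage="Staging",  # or "Production" during the CD step
    archive_existing_versions=True,  # archive older versions already in this stage
)
```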
8 |
9 | ## References
10 |
11 | - [MLflow Model Registry on Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-registry)
12 | - [MLflow Model Registry](https://www.mlflow.org/docs/latest/model-registry.html)
13 |
--------------------------------------------------------------------------------
/docs/images/batch_scoring.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/batch_scoring.png
--------------------------------------------------------------------------------
/docs/images/model_training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/model_training.png
--------------------------------------------------------------------------------
/docs/images/result_azure_resources.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_azure_resources.png
--------------------------------------------------------------------------------
/docs/images/result_batch_scoring.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_batch_scoring.png
--------------------------------------------------------------------------------
/docs/images/result_databricks_job.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_databricks_job.png
--------------------------------------------------------------------------------
/docs/images/result_mlflow_experiment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_mlflow_experiment.png
--------------------------------------------------------------------------------
/docs/images/result_mlflow_model_registry.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_mlflow_model_registry.png
--------------------------------------------------------------------------------
/ml_ops/README.md:
--------------------------------------------------------------------------------
1 | # MLOps
2 |
3 | ## Overview
4 |
5 | This contains the MLOps code, which is developed, unit tested, packaged, and delivered independently, and is typically maintained by an application developer in an organization.
6 |
7 | ## Contents
8 |
9 | 1. [src](src/) : MLOps source code, that will be packaged as Python `wheel`.
10 | 2. [tests](tests/) : unit test cases for `src`.
11 | 3. [orchestrator](orchestrator/) : Databricks Python Notebooks for MLOps orchestrator.
12 | 4. [deployment](deployment/) : deployment templates ([ARM](https://docs.microsoft.com/en-us/azure/azure-resource-manager/templates/overview) and Databricks Jobs, Cluster).
13 |
--------------------------------------------------------------------------------
/ml_ops/deployment/README.md:
--------------------------------------------------------------------------------
1 | # Deployment
2 |
3 | ## Overview
4 |
5 | This document covers the deployment guide for MLOps.
6 |
7 | ## Databricks Cluster
8 |
9 | For the orchestrator job, either an existing cluster can be used or a new cluster can be created. In either case, make sure the following [properties](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/clusters#--request-structure-of-the-cluster-definition) are set on the cluster.
10 |
11 | - Cluster Mode: High Concurrency
12 | - DataBricks Runtime Version : 8.1 LTS ML (includes Apache Spark 3.0.1, Scala 2.12)
13 | - Enable Autoscaling: True
14 | - Worker Type: Standard_F4s
15 | - Driver Type: Standard_F4s
16 | - Spark Settings under “Spark Config” (Edit > Advanced Options > Spark)
17 |
18 | ```configuration
19 | spark.databricks.cluster.profile serverless
20 | spark.databricks.repl.allowedLanguages sql,python,r
21 | spark.databricks.conda.condaMagic.enabled true
22 | ```
23 |
24 | ## Databricks Job
25 |
26 | An orchestrator Databricks Job can be created from a [Databricks Job create template](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs#--create) using the following example CLI command -
27 |
28 | ```sh
29 | databricks jobs create --json-file <job_template>.json
30 | ```
31 |
32 | An orchestrator Databricks Job can be updated from a [Databricks Job reset template](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs#--reset) using the following example CLI command -
33 |
34 | ```sh
35 | databricks jobs reset --job-id <job_id> --json-file <job_template>.json
36 | ```
37 |
38 | ## Databricks MLflow Experiment
39 |
40 | An MLflow experiment can be created using the [Databricks Workspace Portal](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/tracking#workspace-experiments) or the following CLI commands -
41 |
42 | ```sh
43 | export MLFLOW_TRACKING_URI=databricks
44 | export DATABRICKS_HOST=<databricks_host>
45 | export DATABRICKS_TOKEN=<databricks_token>
46 | mlflow experiments create --experiment-name /<workspace_folder>/<experiment_name>
47 | ```
48 |
49 | Get `DATABRICKS_HOST` and `DATABRICKS_TOKEN` from [Databricks CLI Reference](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/)
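
The experiment can also be created from Python with the MLflow client; a minimal sketch, assuming the same environment variables are already exported and reusing the workspace path created by the `databricks-init` Makefile target:

```py
import mlflow

mlflow.set_tracking_uri("databricks")  # relies on DATABRICKS_HOST / DATABRICKS_TOKEN

# Create the experiment under a Databricks workspace path
experiment_id = mlflow.create_experiment("/azure-databricks-mlops-mlflow/Experiment")
print(f"Created MLflow experiment with id: {experiment_id}")
```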
50 |
51 | ## Databricks DBFS Upload
52 |
53 | The following CLI command can be used to upload a wheel package into Databricks DBFS.
54 |
55 | ```sh
56 | databricks fs cp --overwrite python-package.whl dbfs:/<dbfs_path>
57 | ```
58 |
59 | ## Databricks Notebook Import
60 |
61 | The following CLI command can be used to import an orchestrator Python file as a Databricks notebook into the Databricks workspace.
62 |
63 | ```sh
64 | databricks workspace import -l PYTHON -f SOURCE -o <python_file>.py <workspace_path>
65 | ```
66 |
67 | ## Orchestrator Databricks Job trigger
68 |
69 | The orchestrator Databricks job can be triggered in the following ways (an API sketch follows the list) -
70 |
71 | - Scheduled:
72 |   - Cron-based scheduling.
73 | - Manual:
74 |   - From the Databricks workspace portal, by clicking `Run Now With Different Parameters`.
75 |   - Via [Databricks-CLI](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/jobs-cli).
76 |   - Via [Databricks-API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs#--run-now).
77 |
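For the API route, a minimal Python sketch that triggers a run through the Jobs `run-now` endpoint; the job id is a placeholder and the host/token are read from the environment:

```py
import os

import requests

host = os.environ["DATABRICKS_HOST"]    # e.g. https://adb-0000000000000000.0.azuredatabricks.net
token = os.environ["DATABRICKS_TOKEN"]

# Trigger a run of an existing job (job_id is a placeholder)
response = requests.post(
    f"{host}/api/2.1/jobs/run-now",
    headers={"Authorization": f"Bearer {token}"},
    json={"job_id": 123},
    timeout=30,
)
response.raise_for_status()
print(f"Triggered run_id: {response.json()['run_id']}")
```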
--------------------------------------------------------------------------------
/ml_ops/deployment/arm_templates/databricks_and_storage.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
3 | "contentVersion": "1.0.0.1",
4 | "parameters": {
5 | "location": {
6 | "type": "string",
7 | "defaultValue": "[resourceGroup().location]",
8 | "metadata": {
9 | "description": "Resources location."
10 | }
11 | },
12 | "dbWorkspaceName": {
13 | "type": "string",
14 | "defaultValue": "databricks-mlops-mlflow",
15 | "metadata": {
16 | "description": "The name of the Azure Databricks workspace to create."
17 | }
18 | },
19 | "dbTier": {
20 | "defaultValue": "standard",
21 | "type": "string",
22 | "allowedValues": [
23 | "standard",
24 | "premium"
25 | ],
26 | "metadata": {
27 | "description": "The pricing tier of Databricks workspace."
28 | }
29 | },
30 | "stgAccountName": {
31 | "type": "string",
32 | "defaultValue": "[concat('storage', uniqueString(parameters('location'), resourceGroup().id))]",
33 | "metadata": {
34 | "description": "Storage account name."
35 | }
36 | },
37 | "stgAccountType": {
38 | "type": "string",
39 | "defaultValue": "Standard_RAGRS",
40 | "metadata": {
41 | "description": "Storage account type."
42 | }
43 | },
44 | "stgKind": {
45 | "type": "string",
46 | "defaultValue": "StorageV2",
47 | "metadata": {
48 | "description": "Storage account kind."
49 | }
50 | },
51 | "stgAccessTier": {
52 | "type": "string",
53 | "defaultValue": "Cool",
54 | "metadata": {
55 | "description": "Storage account tier."
56 | }
57 | },
58 | "stgIsHnsEnabled": {
59 | "type": "bool",
60 | "defaultValue": true,
61 | "metadata": {
62 | "description": "Enable ADLS Gen2."
63 | }
64 | },
65 | "aiName": {
66 | "type": "string",
67 | "defaultValue": "[concat('ai', uniqueString(parameters('location'), resourceGroup().id))]",
68 | "metadata": {
69 | "description": "Application Insights name."
70 | }
71 | }
72 | },
73 | "variables": {
74 | "managedResourceGroupName": "[concat('databricks-rg-', parameters('dbWorkspaceName'), '-', uniqueString(parameters('dbWorkspaceName'), resourceGroup().id))]",
75 | "managedResourceGroupId": "[concat(subscription().id, '/resourceGroups/', variables('managedResourceGroupName'))]"
76 | },
77 | "resources": [
78 | {
79 | "type": "Microsoft.Databricks/workspaces",
80 | "apiVersion": "2018-04-01",
81 | "name": "[parameters('dbWorkspaceName')]",
82 | "location": "[parameters('location')]",
83 | "sku": {
84 | "name": "[parameters('dbTier')]"
85 | },
86 | "comments": "Please do not use an existing resource group for ManagedResourceGroupId.",
87 | "properties": {
88 | "ManagedResourceGroupId": "[variables('managedResourceGroupId')]",
89 | "parameters": {}
90 | },
91 | "dependsOn": [],
92 | "tags": {
93 | "Purpose": "Demo",
94 | "Project": "azure-databricks-mlops-mlflow"
95 | }
96 | },
97 | {
98 | "type": "Microsoft.Storage/storageAccounts",
99 | "apiVersion": "2019-06-01",
100 | "name": "[parameters('stgAccountName')]",
101 | "location": "[parameters('location')]",
102 | "properties": {
103 | "accessTier": "[parameters('stgAccessTier')]",
104 | "isHnsEnabled": "[parameters('stgIsHnsEnabled')]"
105 | },
106 | "dependsOn": [],
107 | "sku": {
108 | "name": "[parameters('stgAccountType')]"
109 | },
110 | "kind": "[parameters('stgKind')]",
111 | "tags": {
112 | "Purpose": "Demo",
113 | "Project": "azure-databricks-mlops-mlflow"
114 | }
115 | },
116 | {
117 | "type": "Microsoft.Insights/components",
118 | "apiVersion": "2020-02-02",
119 | "name": "[parameters('aiName')]",
120 | "location": "[parameters('location')]",
121 | "kind": "other",
122 | "tags": {
123 | "Purpose": "Demo",
124 | "Project": "azure-databricks-mlops-mlflow"
125 | },
126 | "properties": {
127 | "Application_Type": "web",
128 | "Flow_Type": "Bluefield",
129 | "Request_Source": "CustomDeployment"
130 | }
131 | }
132 | ],
133 | "outputs": {}
134 | }
--------------------------------------------------------------------------------
/ml_ops/deployment/databricks/cluster_template.json:
--------------------------------------------------------------------------------
1 | {
2 | "cluster_name": "azure-databricks-mlops-mlflow",
3 | "spark_version": "10.4.x-cpu-ml-scala2.12",
4 | "num_workers": 0,
5 | "spark_conf": {
6 | "spark.databricks.cluster.profile": "singleNode",
7 | "spark.databricks.conda.condaMagic.enabled": "true",
8 | "spark.master": "local[*]"
9 | },
10 | "node_type_id": "Standard_F4",
11 | "driver_node_type_id": "Standard_F4",
12 | "custom_tags": {
13 | "ResourceClass": "SingleNode"
14 | },
15 | "autotermination_minutes": 30,
16 | "enable_elastic_disk": true
17 | }
--------------------------------------------------------------------------------
/ml_ops/deployment/databricks/job_template_taxi_fares_batch_scoring.json:
--------------------------------------------------------------------------------
1 | {
2 | "job_id": "FILL_JOB_ID",
3 | "name": "taxi_fares_batch_scoring",
4 | "existing_cluster_id": "FILL_CLUSTER_ID",
5 | "notebook_task": {
6 | "notebook_path": "/azure-databricks-mlops-mlflow/taxi_fares_orchestrator_batch_score",
7 | "base_parameters": {
8 | "taxi_fares_raw_data": "/databricks-datasets/nyctaxi-with-zipcodes/subsampled",
9 | "taxi_fares_mount_point": "/mnt/data_batch",
10 | "mlflow_experiment_id": "FILL_MLFLOW_EXPERIMENT_ID",
11 | "execute_feature_engineering": "true",
12 | "scoring_data_start_date": "2016-02-01",
13 | "scoring_data_end_date": "2016-02-29",
14 | "trained_model_version": "",
15 | "wheel_package_dbfs_base_path": "/dbfs/FileStore/libraries/azure-databricks-mlops-mlflow",
16 | "wheel_package_taxi_fares_version": "0.0.1",
17 | "wheel_package_taxi_fares_mlops_version": "0.0.1"
18 | }
19 | },
20 | "timeout_seconds": 86400,
21 | "email_notifications": {
22 | "on_start": [],
23 | "on_success": [],
24 | "on_failure": []
25 | }
26 | }
--------------------------------------------------------------------------------
/ml_ops/deployment/databricks/job_template_taxi_fares_training.json:
--------------------------------------------------------------------------------
1 | {
2 | "job_id": "FILL_JOB_ID",
3 | "name": "taxi_fares_model_training",
4 | "existing_cluster_id": "FILL_CLUSTER_ID",
5 | "notebook_task": {
6 | "notebook_path": "/azure-databricks-mlops-mlflow/taxi_fares_orchestrator_train",
7 | "base_parameters": {
8 | "taxi_fares_raw_data": "/databricks-datasets/nyctaxi-with-zipcodes/subsampled",
9 | "mlflow_experiment_id": "FILL_MLFLOW_EXPERIMENT_ID",
10 | "wheel_package_dbfs_base_path": "/dbfs/FileStore/libraries/azure-databricks-mlops-mlflow",
11 | "wheel_package_taxi_fares_version": "0.0.1",
12 | "wheel_package_taxi_fares_mlops_version": "0.0.1",
13 | "execute_feature_engineering": "true",
14 | "training_data_end_date": "2016-01-31",
15 | "training_data_start_date": "2016-01-01",
16 | "training_num_leaves": "32",
17 | "training_objective": "regression",
18 | "training_metric": "rmse",
19 | "training_num_rounds": "100"
20 | }
21 | },
22 | "timeout_seconds": 86400,
23 | "email_notifications": {
24 | "on_start": [],
25 | "on_success": [],
26 | "on_failure": []
27 | }
28 | }
--------------------------------------------------------------------------------
/ml_ops/orchestrator/README.md:
--------------------------------------------------------------------------------
1 | # Orchestrator
2 |
3 | ## Overview
4 |
5 | This document covers the design guidelines for the following orchestrators -
6 |
7 | 1. [taxi_fares_orchestrator_train.py](taxi_fares_orchestrator_train.py)
8 | 2. [taxi_fares_orchestrator_batch_score.py](taxi_fares_orchestrator_batch_score.py)
9 |
10 | ## Considerations
11 |
12 | - It will be a Databricks notebook in the Databricks workspace.
13 | - It will be stored in Git as a Python file.
14 | - It will use `dbutils` widgets for parametrization.
15 | - It will use `pip` magic commands for managing libraries.
16 | - It will be executed from a Databricks Job.
17 | - It will perform logging in Application Insights.
18 | - It will log artifacts, metrics, parameters, and the trained model into MLflow.
19 |
20 | ## Parameters
21 |
22 | ### Define Parameters
23 |
24 | Parameters are defined using `dbutils.widgets.text`, for example:
25 |
26 | ```py
27 | dbutils.widgets.text("<parameter_name>", "<default_value>")
28 | ```
29 |
30 | ### Read Parameters
31 |
32 | Parameters are read using `dbutils.widgets.get`, for example:
33 |
34 | ```py
35 | param_value = dbutils.widgets.get("<parameter_name>")
36 | ```
37 |
38 | ## Installation of libraries
39 |
40 | ### How to enable %pip magic commands
41 |
42 | Starting with Databricks Runtime for Machine Learning 6.4, this feature can be enabled when creating a cluster.
43 | To do this, set `spark.databricks.conda.condaMagic.enabled` to `true` under “Spark Config” (Edit > Advanced Options > Spark).
44 |
45 | ### How to install libraries using pip
46 |
47 | Libraries are installed as [Notebook-scoped Python libraries](https://docs.microsoft.com/en-us/azure/databricks/libraries/notebooks-python-libraries), for example:
48 |
49 | ```sh
50 | %pip install dbfs/<dbfs_path>/<wheel_package_name>.whl
51 | ```
52 |
53 | ## Calling MLOps Python Functions
54 |
55 | MLOps Python functions are packaged as a wheel package, and the orchestrator notebook calls the Python functions from that wheel package (a consolidated sketch of this pattern follows this file).
56 |
57 | ## Execution of Orchestrator
58 |
59 | Orchestrators are executed from a Databricks Job.
60 |
61 | ## Error handling
62 |
63 | For error handling, a `try..except` block is used to handle exceptions, for example -
64 |
65 | ```py
66 | try:
67 | model = run_training()
68 | except Exception as ex:
69 |     logger.error(f"Encountered error: {ex}") # To log exception in Application Insights
70 | raise Exception(f"Encountered error - {ex}") from ex # To fail the Databricks Job Run
71 | ```
72 |
73 | ## Observability
74 |
75 | The [OpenCensus](https://docs.microsoft.com/en-us/azure/azure-monitor/app/opencensus-python) library is used to capture logs and metrics and send them to Application Insights.
76 |
77 | ## Secret Management
78 |
79 | The following secrets need to be stored in [Databricks Secret Scope](https://docs.microsoft.com/en-us/azure/databricks/security/secrets/):
80 |
81 | - Application Insights Instrumentation Key
82 | - Azure ADLS Gen2 Storage Details (account name, container name, shared access key)
83 |
84 | Secrets are read using `dbutils.secrets.get`, for example:
85 |
86 | ```py
87 | secret_value = dbutils.secrets.get(scope="<scope_name>", key="<secret_key_name>")
88 | ```
89 |
90 | ## References
91 |
92 | 1. [Enable pip magic commands](https://databricks.com/blog/2020/06/17/simplify-python-environment-management-on-databricks-runtime-for-machine-learning-using-pip-and-conda.html)
93 | 2. [OpenCensus](https://docs.microsoft.com/en-us/azure/azure-monitor/app/opencensus-python)
94 | 3. [Databricks Jobs API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs)
95 | 4. [Databricks Clusters API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/clusters)
96 | 5. [Databricks CLI](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/)
97 |
--------------------------------------------------------------------------------
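A stripped-down sketch consolidating the snippets from the README above, assuming it runs as a Databricks notebook (so `dbutils` is available) and that the `monitoring` wheel is already installed. The component name and the `do_work` placeholder are illustrative only; the repository's actual orchestrator notebooks follow below.

```py
# Databricks notebook source
# Minimal sketch of the orchestrator pattern described in the README above.
# "Example_Orchestrator" and do_work are placeholders, not repository code.
import mlflow
from monitoring.app_logger import AppLogger

# Define and read a parameter via dbutils widgets.
dbutils.widgets.text("mlflow_experiment_id", "")
mlflow_experiment_id = dbutils.widgets.get("mlflow_experiment_id")

# Read a secret from the Databricks secret scope used by this sample.
app_insights_key = dbutils.secrets.get(
    scope="azure-databricks-mlops-mlflow", key="app_insights_key"
)
app_logger = AppLogger(config={"app_insights_key": app_insights_key})
logger = app_logger.get_logger(component_name="Example_Orchestrator")


def do_work():
    """Placeholder for a call into the MLOps wheel, e.g. taxi_fares_mlops.training.run."""
    mlflow.log_param("example_param", "example_value")


mlflow.start_run(experiment_id=int(mlflow_experiment_id))
try:
    do_work()
except Exception as ex:
    logger.exception(f"Encountered error - {ex}")         # log to Application Insights
    raise Exception(f"Encountered error - {ex}") from ex  # fail the Databricks job run
finally:
    mlflow.end_run()
```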
/ml_ops/orchestrator/taxi_fares_orchestrator_batch_score.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | """Orchestrator notebook for taxifares training."""
3 | # Initialization of dbutils to avoid linting errors during developing in vscode
4 | from pyspark.sql import SparkSession
5 |
6 |
7 | def get_dbutils(spark):
8 | """Return dbutils for databricks."""
9 | if spark.conf.get("spark.databricks.service.client.enabled") == "true":
10 | from pyspark.dbutils import DBUtils
11 |
12 | return DBUtils(spark)
13 | else:
14 | import IPython
15 |
16 | return IPython.get_ipython().user_ns["dbutils"]
17 |
18 |
19 | spark = SparkSession.builder.appName("Pipeline").getOrCreate()
20 | dbutils = get_dbutils(spark)
21 |
22 | # COMMAND ----------
23 |
24 | # Define parameters
25 | dbutils.widgets.text(
26 | "taxi_fares_raw_data", "/databricks-datasets/nyctaxi-with-zipcodes/subsampled"
27 | )
28 | dbutils.widgets.text("taxi_fares_mount_point", "/mnt/data")
29 | dbutils.widgets.text("mlflow_experiment_id", "")
30 | dbutils.widgets.text("wheel_package_dbfs_base_path", "")
31 | dbutils.widgets.text("wheel_package_taxi_fares_version", "")
32 | dbutils.widgets.text("wheel_package_taxi_fares_mlops_version", "")
33 | dbutils.widgets.text("execute_feature_engineering", "true")
34 | dbutils.widgets.text("trained_model_version", "")
35 | dbutils.widgets.text("scoring_data_start_date", "2016-02-01")
36 | dbutils.widgets.text("training_data_end_date", "2016-02-29")
37 |
38 | # COMMAND ----------
39 |
40 | # Get wheel package parameters
41 | wheel_package_dbfs_base_path = dbutils.widgets.get(
42 | "wheel_package_dbfs_base_path")
43 | wheel_package_taxi_fares_version = dbutils.widgets.get(
44 | "wheel_package_taxi_fares_version"
45 | )
46 | wheel_package_taxi_fares_mlops_version = dbutils.widgets.get(
47 | "wheel_package_taxi_fares_mlops_version"
48 | )
49 |
50 | # COMMAND ----------
51 |
52 | # MAGIC %pip install $wheel_package_dbfs_base_path/taxi_fares-$wheel_package_taxi_fares_version-py3-none-any.whl # noqa: E501
53 | # MAGIC %pip install $wheel_package_dbfs_base_path/taxi_fares_mlops-$wheel_package_taxi_fares_mlops_version-py3-none-any.whl # noqa: E501
54 |
55 | # COMMAND ----------
56 |
57 | # Imports
58 | import os # noqa: E402
59 | import shutil # noqa: E402
60 | from datetime import datetime # noqa: E402
61 | from pathlib import Path # noqa: E402
62 |
63 | import mlflow # noqa: E402
64 | from databricks import feature_store # noqa: E402
65 | from monitoring.app_logger import AppLogger, get_disabled_logger # noqa: E402
66 | from taxi_fares.utils.pyspark_utils import rounded_taxi_data # noqa: E402
67 | from taxi_fares_mlops.feature_engineering import run as run_feature_engineering # noqa
68 | from taxi_fares_mlops.scoring_batch import run as run_scoring_batch # noqa: E402
69 |
70 | # COMMAND ----------
71 |
72 | # Get other parameters
73 | mlflow_experiment_id = dbutils.widgets.get("mlflow_experiment_id")
74 | execute_feature_engineering = dbutils.widgets.get(
75 | "execute_feature_engineering")
76 | taxi_fares_raw_data = dbutils.widgets.get("taxi_fares_raw_data")
77 | taxi_fares_mount_point = dbutils.widgets.get("taxi_fares_mount_point")
78 | trained_model_version = dbutils.widgets.get("trained_model_version")
79 | scoring_data_start_date = dbutils.widgets.get("scoring_data_start_date")
80 | scoring_data_end_date = dbutils.widgets.get("scoring_data_end_date")
81 |
82 | # COMMAND ----------
83 |
84 | # Initiate mlflow experiment
85 | mlflow.start_run(experiment_id=int(mlflow_experiment_id),
86 | run_name="batch_scoring")
87 | mlflow_run = mlflow.active_run()
88 | mlflow_run_id = mlflow_run.info.run_id
89 | mlflow_log_tmp_dir = "/tmp/" + str(mlflow_run_id) # nosec: B108
90 | Path(mlflow_log_tmp_dir).mkdir(parents=True, exist_ok=True)
91 |
92 | # initiate app logger
93 | if any(
94 | [
95 | True
96 | for secret in dbutils.secrets.list(scope="azure-databricks-mlops-mlflow")
97 | if "app_insights_key" in secret.key
98 | ]
99 | ):
100 | app_insights_key = dbutils.secrets.get(
101 | scope="azure-databricks-mlops-mlflow", key="app_insights_key"
102 | )
103 | config = {"app_insights_key": app_insights_key}
104 | app_logger = AppLogger(config=config)
105 | else:
106 | app_logger = get_disabled_logger()
107 | try:
108 | logger = app_logger.get_logger(
109 | component_name="Batch_Score_Orchestrator",
110 | custom_dimensions={
111 | "mlflow_run_id": mlflow_run_id,
112 | "mlflow_experiment_id": int(mlflow_experiment_id),
113 | },
114 | )
115 | tracer = app_logger.get_tracer(
116 | component_name="Batch_Score_Orchestrator",
117 | )
118 | except Exception as ex:
119 | print(ex)
120 | mlflow.end_run()
121 | shutil.rmtree(mlflow_log_tmp_dir, ignore_errors=True)
122 | raise Exception(f"ERROR - in initializing app logger - {ex}") from ex
123 |
124 | logger.info(f"Stating batch scoring with mlflow run id {mlflow_run_id}")
125 |
126 | # COMMAND ----------
127 |
128 | # Mount ADLS Gen2 storage container
129 | try:
130 | logger.info(f"Mounting {taxi_fares_mount_point}")
131 | if any(mount.mountPoint == taxi_fares_mount_point for mount in dbutils.fs.mounts()):
132 | logger.info(f"Mount point exists {taxi_fares_mount_point}")
133 | else:
134 | storage_account_name = dbutils.secrets.get(
135 | scope="azure-databricks-mlops-mlflow", key="azure-blob-storage-account-name"
136 | )
137 | storage_container_name = dbutils.secrets.get(
138 | scope="azure-databricks-mlops-mlflow",
139 | key="azure-blob-storage-container-name",
140 | )
141 | storage_shared_key_name = dbutils.secrets.get(
142 | scope="azure-databricks-mlops-mlflow",
143 | key="azure-blob-storage-shared-access-key",
144 | )
145 | dbutils.fs.mount(
146 | source=f"wasbs://{storage_container_name}@{storage_account_name}.blob.core.windows.net", # noqa: E501
147 | mount_point=taxi_fares_mount_point,
148 | extra_configs={
149 | f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net": storage_shared_key_name # noqa: E501
150 | },
151 | )
152 | except Exception as ex:
153 | print(ex)
154 | mlflow.end_run()
155 | shutil.rmtree(mlflow_log_tmp_dir, ignore_errors=True)
156 | logger.exception(f"ERROR - in mounting adls - {ex}")
157 | raise Exception(f"ERROR - in mounting adls - {ex}") from ex
158 |
159 | # COMMAND ----------
160 |
161 | # Clean up function
162 |
163 |
164 | def clean():
165 | dbutils.fs.unmount(taxi_fares_mount_point)
166 | mlflow.log_artifacts(mlflow_log_tmp_dir)
167 | shutil.rmtree(mlflow_log_tmp_dir)
168 | mlflow.end_run()
169 |
170 |
171 | # COMMAND ----------
172 |
173 | # Get batch scoring raw data
174 | try:
175 | logger.info("Reading training raw data")
176 | raw_data_file = taxi_fares_raw_data
177 | raw_data = spark.read.format("delta").load(raw_data_file)
178 | mlflow.log_param("data_raw_rows", raw_data.count())
179 | mlflow.log_param("data_raw_cols", len(raw_data.columns))
180 | except Exception as ex:
181 | clean()
182 | logger.exception(f"ERROR - in reading raw data - {ex}")
183 | raise Exception(f"ERROR - in reading raw data - {ex}") from ex
184 |
185 | # COMMAND ----------
186 |
187 |
188 | # Run feature engineering on batch scoring raw data
189 | if execute_feature_engineering == "true":
190 | try:
191 | logger.info("Starting feature engineering")
192 | with tracer.span("run_feature_engineering"):
193 | feature_engineered_data = run_feature_engineering(
194 | df_input=raw_data,
195 | start_date=datetime.strptime(
196 | scoring_data_start_date, "%Y-%m-%d"),
197 | end_date=datetime.strptime(scoring_data_end_date, "%Y-%m-%d"),
198 | mlflow=mlflow,
199 | mlflow_log_tmp_dir=mlflow_log_tmp_dir,
200 | explain_features=True,
201 | app_logger=app_logger,
202 | parent_tracer=tracer,
203 | )
204 | except Exception as ex:
205 | clean()
206 | logger.exception(f"ERROR - in feature engineering - {ex}")
207 | raise Exception(f"ERROR - in feature engineering - {ex}") from ex
208 | else:
209 | logger.info("Skipping feature engineering")
210 |
211 | # COMMAND ----------
212 |
213 | # MAGIC %sql
214 | # MAGIC CREATE DATABASE IF NOT EXISTS feature_store_taxi_example;
215 |
216 | # COMMAND ----------
217 |
218 | # Save features to feature store
219 | fs = feature_store.FeatureStoreClient()
220 | if execute_feature_engineering == "true":
221 | try:
222 | spark.conf.set("spark.sql.shuffle.partitions", "5")
223 |
224 | fs.create_table(
225 | name="feature_store_taxi_example.trip_pickup_features",
226 | primary_keys=["zip", "ts"],
227 | df=feature_engineered_data[0],
228 | partition_columns="yyyy_mm",
229 | description="Taxi Fares. Pickup Features",
230 | )
231 | fs.create_table(
232 | name="feature_store_taxi_example.trip_dropoff_features",
233 | primary_keys=["zip", "ts"],
234 | df=feature_engineered_data[1],
235 | partition_columns="yyyy_mm",
236 | description="Taxi Fares. Dropoff Features",
237 | )
238 |
239 | # Write the pickup features DataFrame to the feature store table
240 | fs.write_table(
241 | name="feature_store_taxi_example.trip_pickup_features",
242 | df=feature_engineered_data[0],
243 | mode="merge",
244 | )
245 | # Write the dropoff features DataFrame to the feature store table
246 | fs.write_table(
247 | name="feature_store_taxi_example.trip_dropoff_features",
248 | df=feature_engineered_data[1],
249 | mode="merge",
250 | )
251 | except Exception as ex:
252 | clean()
253 | logger.exception(
254 | f"ERROR - in feature saving into feature store - {ex}")
255 | raise Exception(
256 | f"ERROR - in feature saving into feature store - {ex}") from ex
257 | else:
258 | logger.info("Skipping feature saving into feature store")
259 |
260 | # COMMAND ----------
261 |
262 | # Batch scoring
263 | try:
264 | logger.info("Starting batch scoring")
265 | with tracer.span("run_scoring_batch"):
266 | run_scoring_batch(
267 | trained_model_name="taxi_fares",
268 | score_df=rounded_taxi_data(raw_data),
269 | mlflow=mlflow,
270 | mlflow_log_tmp_dir=mlflow_log_tmp_dir,
271 | trained_model_version=trained_model_version,
272 | app_logger=app_logger,
273 | parent_tracer=tracer,
274 | )
275 | except Exception as ex:
276 | clean()
277 | logger.exception(f"ERROR - in batch scoring - {ex}")
278 | raise Exception(f"ERROR - in batch scoring - {ex}") from ex
279 |
280 |
281 | # COMMAND ----------
282 |
283 | # Batch scoring result publish
284 | try:
285 | logger.info("Starting batch scoring result publish to adls")
286 | with tracer.span("publish_result"):
287 | result_path = "/".join(
288 | [
289 | "/dbfs",
290 | taxi_fares_mount_point,
291 | "batch_scoring_result",
292 | str(mlflow_run_id),
293 | ]
294 | )
295 | Path(result_path).mkdir(parents=True, exist_ok=True)
296 | shutil.copyfile(
297 | os.path.join(mlflow_log_tmp_dir, "batch_scoring_result.html"),
298 | os.path.join(
299 | result_path,
300 | "batch_scoring_result.html",
301 | ),
302 | )
303 | shutil.copyfile(
304 | os.path.join(mlflow_log_tmp_dir, "batch_scoring_result.csv"),
305 | os.path.join(
306 | result_path,
307 | "batch_scoring_result.csv",
308 | ),
309 | )
310 | logger.info(f"Published score result in {result_path}")
311 | except Exception as ex:
312 | clean()
313 | logger.exception(f"ERROR - in batch scoring result publish to adls - {ex}")
314 | raise Exception(
315 | f"ERROR - in batch scoring result publish to adls - {ex}") from ex
316 |
317 |
318 | # COMMAND ----------
319 |
320 | # End
321 | logger.info(f"Completed batch scoring with mlflow run id {mlflow_run_id}")
322 | clean()
323 |
--------------------------------------------------------------------------------
/ml_ops/orchestrator/taxi_fares_orchestrator_train.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | """Orchestrator notebook for taxifares training."""
3 | # Initialization of dbutils to avoid linting errors during developing in vscode
4 | from pyspark.sql import SparkSession
5 |
6 |
7 | def get_dbutils(spark):
8 | """Return dbutils for databricks."""
9 | if spark.conf.get("spark.databricks.service.client.enabled") == "true":
10 | from pyspark.dbutils import DBUtils
11 |
12 | return DBUtils(spark)
13 | else:
14 | import IPython
15 |
16 | return IPython.get_ipython().user_ns["dbutils"]
17 |
18 |
19 | spark = SparkSession.builder.appName("Pipeline").getOrCreate()
20 | dbutils = get_dbutils(spark)
21 |
22 | # COMMAND ----------
23 |
24 | # Define parameters
25 | dbutils.widgets.text(
26 | "taxi_fares_raw_data", "/databricks-datasets/nyctaxi-with-zipcodes/subsampled"
27 | )
28 | dbutils.widgets.text("mlflow_experiment_id", "")
29 | dbutils.widgets.text("wheel_package_dbfs_base_path", "")
30 | dbutils.widgets.text("wheel_package_taxi_fares_version", "")
31 | dbutils.widgets.text("wheel_package_taxi_fares_mlops_version", "")
32 | dbutils.widgets.text("execute_feature_engineering", "true")
33 | dbutils.widgets.text("training_data_start_date", "2016-01-01")
34 | dbutils.widgets.text("training_data_end_date", "2016-01-31")
35 | dbutils.widgets.text("training_num_leaves", "32")
36 | dbutils.widgets.text("training_objective", "regression")
37 | dbutils.widgets.text("training_metric", "rmse")
38 | dbutils.widgets.text("training_num_rounds", "100")
39 |
40 | # COMMAND ----------
41 |
42 | # Get wheel package parameters
43 | wheel_package_dbfs_base_path = dbutils.widgets.get(
44 | "wheel_package_dbfs_base_path")
45 | wheel_package_taxi_fares_version = dbutils.widgets.get(
46 | "wheel_package_taxi_fares_version"
47 | )
48 | wheel_package_taxi_fares_mlops_version = dbutils.widgets.get(
49 | "wheel_package_taxi_fares_mlops_version"
50 | )
51 |
52 | # COMMAND ----------
53 |
54 | # MAGIC %pip install $wheel_package_dbfs_base_path/taxi_fares-$wheel_package_taxi_fares_version-py3-none-any.whl # noqa: E501
55 | # MAGIC %pip install $wheel_package_dbfs_base_path/taxi_fares_mlops-$wheel_package_taxi_fares_mlops_version-py3-none-any.whl # noqa: E501
56 |
57 | # COMMAND ----------
58 |
59 | # Imports
60 | import shutil # noqa: E402
61 | from datetime import datetime # noqa: E402
62 | from pathlib import Path # noqa: E402
63 |
64 | import mlflow # noqa: E402
65 | from databricks import feature_store # noqa: E402
66 | from databricks.feature_store import FeatureLookup # noqa: E402
67 | from monitoring.app_logger import AppLogger, get_disabled_logger # noqa: E402
68 | from taxi_fares.utils.pyspark_utils import rounded_taxi_data # noqa: E402
69 | from taxi_fares_mlops.feature_engineering import run as run_feature_engineering # noqa
70 | from taxi_fares_mlops.publish_model import run as run_publish_model # noqa: E402
71 | from taxi_fares_mlops.training import run as run_training # noqa: E402
72 |
73 | # COMMAND ----------
74 |
75 | # Get other parameters
76 | mlflow_experiment_id = dbutils.widgets.get("mlflow_experiment_id")
77 | execute_feature_engineering = dbutils.widgets.get(
78 | "execute_feature_engineering")
79 | training_data_start_date = dbutils.widgets.get("training_data_start_date")
80 | training_data_end_date = dbutils.widgets.get("training_data_end_date")
81 | taxi_fares_raw_data = dbutils.widgets.get("taxi_fares_raw_data")
82 | training_num_leaves = int(dbutils.widgets.get("training_num_leaves"))
83 | training_objective = dbutils.widgets.get("training_objective")
84 | training_metric = dbutils.widgets.get("training_metric")
85 | training_num_rounds = int(dbutils.widgets.get("training_num_rounds"))
86 |
87 | # COMMAND ----------
88 |
89 | # Initiate mlflow experiment
90 | mlflow.start_run(experiment_id=int(mlflow_experiment_id), run_name="training")
91 | mlflow_run = mlflow.active_run()
92 | mlflow_run_id = mlflow_run.info.run_id
93 | mlflow_log_tmp_dir = "/tmp/" + str(mlflow_run_id) # nosec: B108
94 | Path(mlflow_log_tmp_dir).mkdir(parents=True, exist_ok=True)
95 |
96 | # initiate app logger
97 | if any(
98 | [
99 | True
100 | for secret in dbutils.secrets.list(scope="azure-databricks-mlops-mlflow")
101 | if "app_insights_key" in secret.key
102 | ]
103 | ):
104 | app_insights_key = dbutils.secrets.get(
105 | scope="azure-databricks-mlops-mlflow", key="app_insights_key"
106 | )
107 | config = {"app_insights_key": app_insights_key}
108 | app_logger = AppLogger(config=config)
109 | else:
110 | app_logger = get_disabled_logger()
111 | try:
112 | logger = app_logger.get_logger(
113 | component_name="Train_Orchestrator",
114 | custom_dimensions={
115 | "mlflow_run_id": mlflow_run_id,
116 | "mlflow_experiment_id": int(mlflow_experiment_id),
117 | },
118 | )
119 | tracer = app_logger.get_tracer(
120 | component_name="Train_Orchestrator",
121 | )
122 | except Exception as ex:
123 | print(ex)
124 | mlflow.end_run()
125 | shutil.rmtree(mlflow_log_tmp_dir, ignore_errors=True)
126 | raise Exception(f"ERROR - in initializing app logger - {ex}") from ex
127 |
128 | logger.info(f"Stating training with mlflow run id {mlflow_run_id}")
129 |
130 | # COMMAND ----------
131 |
132 | # Clean up function
133 |
134 |
135 | def clean():
136 | mlflow.log_artifacts(mlflow_log_tmp_dir)
137 | shutil.rmtree(mlflow_log_tmp_dir)
138 | mlflow.end_run()
139 |
140 |
141 | # COMMAND ----------
142 |
143 | # Get training raw data
144 | try:
145 | logger.info("Reading training raw data")
146 | raw_data_file = taxi_fares_raw_data
147 | raw_data = spark.read.format("delta").load(raw_data_file)
148 | mlflow.log_param("data_raw_rows", raw_data.count())
149 | mlflow.log_param("data_raw_cols", len(raw_data.columns))
150 | except Exception as ex:
151 | clean()
152 | logger.exception(f"ERROR - in reading raw data - {ex}")
153 | raise Exception(f"ERROR - in reading raw data - {ex}") from ex
154 |
155 | # COMMAND ----------
156 |
157 | # Run feature engineering
158 | if execute_feature_engineering == "true":
159 | try:
160 | logger.info("Starting feature engineering")
161 | with tracer.span("run_feature_engineering"):
162 | feature_engineered_data = run_feature_engineering(
163 | df_input=raw_data,
164 | start_date=datetime.strptime(
165 | training_data_start_date, "%Y-%m-%d"),
166 | end_date=datetime.strptime(training_data_end_date, "%Y-%m-%d"),
167 | mlflow=mlflow,
168 | mlflow_log_tmp_dir=mlflow_log_tmp_dir,
169 | explain_features=True,
170 | app_logger=app_logger,
171 | parent_tracer=tracer,
172 | )
173 | except Exception as ex:
174 | clean()
175 | logger.exception(f"ERROR - in feature engineering - {ex}")
176 | raise Exception(f"ERROR - in feature engineering - {ex}") from ex
177 | else:
178 | logger.info("Skipping feature engineering")
179 |
180 | # COMMAND ----------
181 |
182 | # MAGIC %sql
183 | # MAGIC CREATE DATABASE IF NOT EXISTS feature_store_taxi_example;
184 |
185 | # COMMAND ----------
186 |
187 | # Save features to feature store
188 | fs = feature_store.FeatureStoreClient()
189 | if execute_feature_engineering == "true":
190 | try:
191 | spark.conf.set("spark.sql.shuffle.partitions", "5")
192 |
193 | fs.create_table(
194 | name="feature_store_taxi_example.trip_pickup_features",
195 | primary_keys=["zip", "ts"],
196 | df=feature_engineered_data[0],
197 | partition_columns="yyyy_mm",
198 | description="Taxi Fares. Pickup Features",
199 | )
200 | fs.create_table(
201 | name="feature_store_taxi_example.trip_dropoff_features",
202 | primary_keys=["zip", "ts"],
203 | df=feature_engineered_data[1],
204 | partition_columns="yyyy_mm",
205 | description="Taxi Fares. Dropoff Features",
206 | )
207 |
208 | # Write the pickup features DataFrame to the feature store table
209 | fs.write_table(
210 | name="feature_store_taxi_example.trip_pickup_features",
211 | df=feature_engineered_data[0],
212 | mode="merge",
213 | )
214 | # Write the dropoff features DataFrame to the feature store table
215 | fs.write_table(
216 | name="feature_store_taxi_example.trip_dropoff_features",
217 | df=feature_engineered_data[1],
218 | mode="merge",
219 | )
220 | except Exception as ex:
221 | clean()
222 | logger.exception(
223 | f"ERROR - in feature saving into feature store - {ex}")
224 | raise Exception(
225 | f"ERROR - in feature saving into feature store - {ex}") from ex
226 | else:
227 | logger.info("Skipping feature saving into feature store")
228 |
229 | # COMMAND ----------
230 |
231 | # Load features from feature store
232 | try:
233 | pickup_features_table = "feature_store_taxi_example.trip_pickup_features"
234 | dropoff_features_table = "feature_store_taxi_example.trip_dropoff_features"
235 |
236 | pickup_feature_lookups = [
237 | FeatureLookup(
238 | table_name=pickup_features_table,
239 | feature_names=[
240 | "mean_fare_window_1h_pickup_zip",
241 | "count_trips_window_1h_pickup_zip",
242 | ],
243 | lookup_key=["pickup_zip", "rounded_pickup_datetime"],
244 | ),
245 | ]
246 |
247 | dropoff_feature_lookups = [
248 | FeatureLookup(
249 | table_name=dropoff_features_table,
250 | feature_names=["count_trips_window_30m_dropoff_zip",
251 | "dropoff_is_weekend"],
252 | lookup_key=["dropoff_zip", "rounded_dropoff_datetime"],
253 | ),
254 | ]
255 |
256 | # unless additional feature engineering was performed,
257 | # exclude them to avoid training on them.
258 | exclude_columns = ["rounded_pickup_datetime", "rounded_dropoff_datetime"]
259 |
260 | # Create the training set that includes the raw input data merged with
261 | # corresponding features from both feature tables
262 | with tracer.span("create_training_set"):
263 | training_set = fs.create_training_set(
264 | rounded_taxi_data(raw_data),
265 | feature_lookups=pickup_feature_lookups + dropoff_feature_lookups,
266 | label="fare_amount",
267 | exclude_columns=exclude_columns,
268 | )
269 |
270 | # Load the TrainingSet into a dataframe which can be passed into
271 | # sklearn for training a model
272 | training_df = training_set.load_df()
273 |
274 | logger.info(
275 | f"Shape of training dataframe, rows: {training_df.count()}, cols: {len(training_df.columns)}" # noqa: E501
276 | )
277 | mlflow.log_param("training_data_rows", training_df.count())
278 | mlflow.log_param("training_data_columns", len(training_df.columns))
279 | except Exception as ex:
280 | clean()
281 | logger.exception(f"ERROR - in feature loading from feature store - {ex}")
282 | raise Exception(
283 | f"ERROR - in feature loading from feature store - {ex}") from ex
284 |
285 | # COMMAND ----------
286 |
287 | # Run training
288 | try:
289 | logger.info("Starting model training")
290 | params = {
291 | "num_leaves": training_num_leaves,
292 | "objective": training_objective,
293 | "metric": training_metric,
294 | }
295 | num_rounds = training_num_rounds
296 | with tracer.span("run_training"):
297 | trained_model = run_training(
298 | training_df,
299 | mlflow,
300 | params=params,
301 | num_rounds=num_rounds,
302 | app_logger=app_logger,
303 | parent_tracer=tracer,
304 | )
305 | except Exception as ex:
306 | clean()
307 | logger.exception(f"ERROR - in model training - {ex}")
308 | raise Exception(f"ERROR - in model training - {ex}") from ex
309 |
310 | # COMMAND ----------
311 |
312 | # Publish trained model
313 | try:
314 | logger.info("Starting publish model")
315 | with tracer.span("run_publish_model"):
316 | run_publish_model(
317 | trained_model=trained_model,
318 | training_set=training_set,
319 | mlflow=mlflow,
320 | model_name="taxi_fares",
321 | app_logger=app_logger,
322 | parent_tracer=tracer,
323 | )
324 | except Exception as ex:
325 | clean()
326 | logger.exception(f"ERROR - in publish trained model - {ex}")
327 | raise Exception(f"ERROR - in publish trained model - {ex}") from ex
328 |
329 | # COMMAND ----------
330 |
331 | # End
332 | logger.info(f"Completed training with mlflow run id {mlflow_run_id}")
333 | clean()
334 |
--------------------------------------------------------------------------------
/ml_ops/src/README.md:
--------------------------------------------------------------------------------
1 | # SRC
2 |
3 | ## Overview
4 |
5 | Source code for MLOps, based on the following -
6 |
7 | 1. `taxi_fares_mlops` contains the MLOps source code for the `taxi_fares` machine learning code.
8 | 2. The MLOps Python functions are called from the orchestrator Databricks notebooks.
9 | 3. Ops-related integrations (MLflow tracking, Application Insights metrics, tracing, etc.) happen in the MLOps source code.
10 | 4. Machine learning (data science) logic is generally not written in the MLOps code.
11 | 5. DataFrame I/O happens in the orchestrator Databricks notebooks, not in the MLOps source code.
12 |
--------------------------------------------------------------------------------
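As a minimal sketch of the convention the modules in this package follow (each exposes a `run(...)` entry point that receives the active `mlflow` module, an optional `AppLogger`, and an optional parent tracer), a hypothetical new MLOps step might look like the following; the step name and logged parameter are illustrative only, not part of the repository:

```py
# Hypothetical MLOps step following the same run(...) convention as
# feature_engineering.py, training.py, and scoring_batch.py below.
import logging

import mlflow
from monitoring.app_logger import AppLogger, get_disabled_logger
from opencensus.trace.tracer import Tracer


def run(
    mlflow: mlflow,
    app_logger: AppLogger = get_disabled_logger(),
    parent_tracer: Tracer = None,
) -> None:
    """Entry point called from the orchestrator Databricks notebook."""
    logger = logging.getLogger(__name__)
    try:
        component_name = "Taxi_Fares_Example_Step"
        mlflow_run = mlflow.active_run()
        logger = app_logger.get_logger(
            component_name=component_name,
            custom_dimensions={"mlflow_run_id": mlflow_run.info.run_id},
        )
        tracer = app_logger.get_tracer(
            component_name=component_name, parent_tracer=parent_tracer
        )
        with tracer.span("example_step"):
            mlflow.log_param("example_param", "example_value")
        logger.info("Completed example MLOps step")
    except Exception as exp:
        logger.error("an exception occurred in example step")
        raise Exception("an exception occurred in example step") from exp
```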
/ml_ops/src/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from setuptools import find_packages, setup
4 |
5 |
6 | def read(fname):
7 | return open(os.path.join(os.path.dirname(__file__), fname)).read()
8 |
9 |
10 | requirements_file_name = "requirements.txt"
11 | with open(requirements_file_name) as f:
12 | required_packages = f.read().splitlines()
13 | required_packages = [
14 | package.strip(" ")
15 | for package in required_packages
16 | if package.strip(" ") and "#" not in package
17 | ]
18 |
19 | setup(
20 | name="taxi_fares_mlops",
21 | version="0.0.1",
22 | author="",
23 | author_email="",
24 | description=(""),
25 | license="",
26 | keywords="",
27 | url="",
28 | package_dir={"": "ml_ops/src"},
29 | packages=find_packages(where="ml_ops/src"),
30 | classifiers=[],
31 | install_requires=required_packages,
32 | )
33 |
--------------------------------------------------------------------------------
/ml_ops/src/taxi_fares_mlops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_ops/src/taxi_fares_mlops/__init__.py
--------------------------------------------------------------------------------
/ml_ops/src/taxi_fares_mlops/feature_engineering.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import math
3 | from datetime import datetime
4 | from pathlib import Path
5 | from typing import Tuple
6 |
7 | import matplotlib.pyplot as plt
8 | import mlflow
9 | import seaborn as sns
10 | from monitoring.app_logger import AppLogger, get_disabled_logger
11 | from opencensus.trace.tracer import Tracer
12 | from pyspark.sql.dataframe import DataFrame
13 | from taxi_fares.feature_eng.features import dropoff_features_fn, pickup_features_fn
14 |
15 |
16 | def run(
17 | df_input: DataFrame,
18 | start_date: datetime,
19 | end_date: datetime,
20 | mlflow: mlflow,
21 | mlflow_log_tmp_dir: str,
22 | explain_features: bool = True,
23 | app_logger: AppLogger = get_disabled_logger(),
24 | parent_tracer: Tracer = None,
25 | ) -> Tuple[DataFrame, DataFrame]:
26 | """MLOps feature engineering entry point.
27 |
28 | Args:
29 |         df_input (DataFrame): raw input data (Spark DataFrame)
30 | mlflow (mlflow): mlflow object that is having an active run
31 | initiated by mlflow.start_run
32 | mlflow_log_tmp_dir (str): directory for putting files to be logged
33 | in mlflow artifacts
34 | explain_features (bool, optional): explain features, possible only with
35 | training data. Defaults to True.
36 | app_logger (monitoring.app_logger.AppLogger): AppLogger object default
37 | to monitoring.app_logger.get_disabled_logger
38 | parent_tracer (Tracer): OpenCensus parent tracer for correlation
39 | Returns:
40 |         Tuple[DataFrame, DataFrame]: feature engineered (pickup_features, dropoff_features)
41 | """
42 | logger = logging.getLogger(__name__)
43 | try:
44 | component_name = "Taxi_Fares_Feature_Eng"
45 | # mlflow tracking
46 | mlflow_run = mlflow.active_run()
47 | mlflow_run_id = mlflow_run.info.run_id
48 | mlflow_experiment_id = mlflow_run.info.experiment_id
49 |
50 | logger = app_logger.get_logger(
51 | component_name=component_name,
52 | custom_dimensions={
53 | "mlflow_run_id": mlflow_run_id,
54 | "mlflow_experiment_id": mlflow_experiment_id,
55 | },
56 | )
57 | tracer = app_logger.get_tracer(
58 | component_name=component_name, parent_tracer=parent_tracer
59 | )
60 |
61 | logger.info("Running MLOps feature engineering")
62 | logger.info(
63 | f"Shape of input dataframe, rows: {df_input.count()}, cols: {len(df_input.columns)}" # noqa: E501
64 | )
65 |
66 | logger.info("Getting pickup features")
67 | with tracer.span("pickup_features"):
68 | pickup_features = pickup_features_fn(
69 | df_input,
70 | ts_column="tpep_pickup_datetime",
71 | start_date=start_date,
72 | end_date=end_date,
73 | )
74 | logger.info(
75 | f"Shape of pickup features dataframe, rows: {pickup_features.count()}, cols: {len(pickup_features.columns)}" # noqa: E501
76 | )
77 | mlflow.log_param(
78 | "feature_engineering_pickup_features",
79 | (pickup_features.count(), len(pickup_features.columns)),
80 | )
81 |
82 | logger.info("Getting drop off features")
83 | with tracer.span("dropoff_features"):
84 | dropoff_features = dropoff_features_fn(
85 | df_input,
86 | ts_column="tpep_dropoff_datetime",
87 | start_date=start_date,
88 | end_date=end_date,
89 | )
90 | logger.info(
91 | f"Shape of dropoff features dataframe, rows: {dropoff_features.count()}, cols: {len(dropoff_features.columns)}" # noqa: E501
92 | )
93 | mlflow.log_param(
94 | "feature_engineering_dropoff_features",
95 | (dropoff_features.count(), len(dropoff_features.columns)),
96 | )
97 |
98 | with tracer.span("explain_features"):
99 | if explain_features:
100 | logger.info("Getting feature explanations - statistics")
101 | feature_statistic_pickup_features = (
102 | pickup_features.describe().toPandas()
103 | )
104 | feature_statistic_pickup_features.to_html(
105 | Path(
106 | mlflow_log_tmp_dir,
107 | "feature_statistic_pickup_features.html",
108 | ),
109 | justify="center",
110 | na_rep="",
111 | )
112 | feature_statistic_dropoff_features = (
113 | dropoff_features.describe().toPandas()
114 | )
115 | feature_statistic_dropoff_features.to_html(
116 | Path(
117 | mlflow_log_tmp_dir,
118 | "feature_statistic_dropoff_features.html",
119 | ),
120 | justify="center",
121 | na_rep="",
122 | )
123 | logger.info("Getting feature explanations - box plot")
124 | pickup_features_pandas = pickup_features.toPandas()[
125 | [
126 | "mean_fare_window_1h_pickup_zip",
127 | "count_trips_window_1h_pickup_zip",
128 | ]
129 | ]
130 | numeric_cols = pickup_features_pandas.columns
131 | plot_data = pickup_features_pandas.copy()
132 | select_top_k = len(numeric_cols)
133 | n_col = 2
134 | n_row = math.ceil(select_top_k / n_col)
135 | s_col = 5
136 | s_row = 3
137 | fig, axs = plt.subplots(
138 | n_row, n_col, figsize=(s_col * n_col, s_row * n_row), sharey=False
139 | )
140 | axs = axs.flatten()
141 | for index, col in enumerate(numeric_cols[:select_top_k]):
142 | ax = sns.boxplot(
143 | x="count_trips_window_1h_pickup_zip",
144 | y=col,
145 | data=plot_data,
146 | ax=axs[index],
147 | )
148 | ax.set(title=col, ylabel="")
149 | fig.tight_layout()
150 | fig.savefig(
151 | Path(mlflow_log_tmp_dir, "feature_pickup_features_boxplot.png")
152 | )
153 | dropoff_features_pandas = dropoff_features.toPandas()[
154 | ["count_trips_window_30m_dropoff_zip", "dropoff_is_weekend"]
155 | ]
156 | numeric_cols = dropoff_features_pandas.columns
157 | plot_data = dropoff_features_pandas.copy()
158 | select_top_k = len(numeric_cols)
159 | n_col = 2
160 | n_row = math.ceil(select_top_k / n_col)
161 | s_col = 5
162 | s_row = 3
163 | fig, axs = plt.subplots(
164 | n_row, n_col, figsize=(s_col * n_col, s_row * n_row), sharey=False
165 | )
166 | axs = axs.flatten()
167 | for index, col in enumerate(numeric_cols[:select_top_k]):
168 | ax = sns.boxplot(
169 | x="dropoff_is_weekend", y=col, data=plot_data, ax=axs[index]
170 | )
171 | ax.set(title=col, ylabel="")
172 | fig.tight_layout()
173 | fig.savefig(
174 | Path(mlflow_log_tmp_dir, "feature_dropoff_features_boxplot.png")
175 | )
176 |
177 | logger.info("Completed MLOps feature engineering")
178 | return (pickup_features, dropoff_features)
179 | except Exception as exp:
180 | logger.error("an exception occurred in Feature Eng")
181 | raise Exception("an exception occurred in Feature Eng") from exp
182 |
--------------------------------------------------------------------------------
/ml_ops/src/taxi_fares_mlops/publish_model.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import lightgbm as lgb
4 | import mlflow
5 | from databricks import feature_store
6 | from databricks.feature_store.training_set import TrainingSet
7 | from mlflow.entities.model_registry import ModelVersion
8 | from monitoring.app_logger import AppLogger, get_disabled_logger
9 | from opencensus.trace.tracer import Tracer
10 |
11 | from taxi_fares_mlops.utils import get_latest_model_version
12 |
13 |
14 | def run(
15 | trained_model: lgb.Booster,
16 | training_set: TrainingSet,
17 | mlflow: mlflow,
18 | model_name: str = "taxi_fares",
19 | app_logger: AppLogger = get_disabled_logger(),
20 | parent_tracer: Tracer = None,
21 | ) -> ModelVersion:
22 | """MLOps publish model in mlflow model registry - entry point.
23 |
24 | Args:
25 |         trained_model (lgb.Booster): trained LightGBM model
26 | mlflow (mlflow): mlflow object that is having an active run
27 | initiated by mlflow.start_run
28 | model_name (str, optional): model name in mlflow model registry.
29 | Defaults to "taxi_fares".
30 |         app_logger (monitoring.app_logger.AppLogger): AppLogger object default
31 | to monitoring.app_logger.get_disabled_logger
32 | parent_tracer (Tracer): OpenCensus parent tracer for correlation
33 | Returns:
34 | mlflow.entities.model_registry.ModelVersion: registered model details
35 | """
36 | logger = logging.getLogger(__name__)
37 | try:
38 | component_name = "Taxi_Fares_Publish_Model"
39 |
40 | # mlflow tracking
41 | mlflow_run = mlflow.active_run()
42 | mlflow_run_id = mlflow_run.info.run_id
43 | mlflow_experiment_id = mlflow_run.info.experiment_id
44 |
45 | logger = app_logger.get_logger(
46 | component_name=component_name,
47 | custom_dimensions={
48 | "mlflow_run_id": mlflow_run_id,
49 | "mlflow_experiment_id": mlflow_experiment_id,
50 | },
51 | )
52 | tracer = app_logger.get_tracer(
53 | component_name=component_name, parent_tracer=parent_tracer
54 | )
55 |
56 | logger.info("Publishing trained model into mlflow model registry")
57 | with tracer.span("register_model"):
58 | fs = feature_store.FeatureStoreClient()
59 | fs.log_model(
60 | trained_model,
61 | artifact_path="model_packaged",
62 | flavor=mlflow.lightgbm,
63 | training_set=training_set,
64 | registered_model_name=model_name,
65 | )
66 | model_version = get_latest_model_version(model_name)
67 | mlflow.log_param("model_version", model_version)
68 | mlflow.log_param("model_name", model_name)
69 |
70 | logger.info(f"published model name: {model_name}, version: {model_version}")
71 | logger.info("Completed MLOps publish model")
72 | except Exception as exp:
73 | logger.error("an exception occurred in publish model")
74 | raise Exception("an exception occurred in publish model") from exp
75 |
--------------------------------------------------------------------------------
/ml_ops/src/taxi_fares_mlops/scoring_batch.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 |
4 | import mlflow
5 | import pandas as pd
6 | import pyspark.sql.functions as func
7 | from databricks import feature_store
8 | from monitoring.app_logger import AppLogger, get_disabled_logger
9 | from opencensus.trace.tracer import Tracer
10 |
11 | from taxi_fares_mlops.utils import get_latest_model_version
12 |
13 |
14 | def run(
15 | trained_model_name: str,
16 | score_df: pd.DataFrame,
17 | mlflow: mlflow,
18 | mlflow_log_tmp_dir: str,
19 | trained_model_version: str = None,
20 | app_logger: AppLogger = get_disabled_logger(),
21 | parent_tracer: Tracer = None,
22 | ) -> None:
23 | """[summary]
24 |
25 | Args:
26 |         trained_model_name (str): name of the trained model in mlflow model registry
27 |         score_df (DataFrame): input dataframe for batch scoring; features are
28 |             looked up from the feature store during scoring.
29 | mlflow (mlflow): mlflow object that is having an active run
30 | initiated by mlflow.start_run
31 |         mlflow_log_tmp_dir (str): directory for putting files to be logged
32 |             in mlflow artifacts
33 |         app_logger (monitoring.app_logger.AppLogger): AppLogger object default
34 |             to monitoring.app_logger.get_disabled_logger
35 | parent_tracer (Tracer): OpenCensus parent tracer for correlation
36 | """
37 | logger = logging.getLogger(__name__)
38 | try:
39 | component_name = "Taxi_Fares_Scoring_Batch"
40 | # mlflow tracking
41 | mlflow_run = mlflow.active_run()
42 | mlflow_run_id = mlflow_run.info.run_id
43 | mlflow_experiment_id = mlflow_run.info.experiment_id
44 |
45 | logger = app_logger.get_logger(
46 | component_name=component_name,
47 | custom_dimensions={
48 | "mlflow_run_id": mlflow_run_id,
49 | "mlflow_experiment_id": mlflow_experiment_id,
50 | },
51 | )
52 | tracer = app_logger.get_tracer(
53 | component_name=component_name, parent_tracer=parent_tracer
54 | )
55 |
56 | logger.info("Running MLOps batch scoring")
57 | with tracer.span("batch_scoring"):
58 | cols = [
59 | "fare_amount",
60 | "trip_distance",
61 | "pickup_zip",
62 | "dropoff_zip",
63 | "rounded_pickup_datetime",
64 | "rounded_dropoff_datetime",
65 | ]
66 | score_df_reordered = score_df.select(cols)
67 | if trained_model_version is None or trained_model_version == "":
68 | trained_model_version = get_latest_model_version(
69 | trained_model_name)
70 | else:
71 | trained_model_version = int(trained_model_version)
72 | model_uri = f"models:/{trained_model_name}/{trained_model_version}"
73 | mlflow.log_param("trained_model_version", trained_model_version)
74 | logger.info(f"trained model version {trained_model_version}")
75 | fs = feature_store.FeatureStoreClient()
76 | predictions = fs.score_batch(model_uri, score_df_reordered)
77 | cols = [
78 | "prediction",
79 | "fare_amount",
80 | "trip_distance",
81 | "pickup_zip",
82 | "dropoff_zip",
83 | "rounded_pickup_datetime",
84 | "rounded_dropoff_datetime",
85 | "mean_fare_window_1h_pickup_zip",
86 | "count_trips_window_1h_pickup_zip",
87 | "count_trips_window_30m_dropoff_zip",
88 | "dropoff_is_weekend",
89 | ]
90 |
91 | with_predictions_reordered = (
92 | predictions.select(
93 | cols,
94 | )
95 | .withColumnRenamed(
96 | "prediction",
97 | "predicted_fare_amount",
98 | )
99 | .withColumn(
100 | "predicted_fare_amount",
101 | func.round("predicted_fare_amount", 2),
102 | )
103 | )
104 | with_predictions_reordered.toPandas().to_html(
105 | Path(
106 | mlflow_log_tmp_dir,
107 | "batch_scoring_result.html",
108 | ),
109 | justify="center",
110 | na_rep="",
111 | )
112 | with_predictions_reordered.toPandas().to_csv(
113 | Path(
114 | mlflow_log_tmp_dir,
115 | "batch_scoring_result.csv",
116 | ),
117 | index=False,
118 | )
119 | logger.info("Completed MLOps batch scoring")
120 | except Exception as exp:
121 | logger.error("an exception occurred in scoring batch")
122 | raise Exception("an exception occurred in scoring batch") from exp
123 |
--------------------------------------------------------------------------------
/ml_ops/src/taxi_fares_mlops/training.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import Dict
3 |
4 | import lightgbm as lgb
5 | import mlflow
6 | import pandas as pd
7 | from monitoring.app_logger import AppLogger, get_disabled_logger
8 | from opencensus.trace.tracer import Tracer
9 | from taxi_fares.training.evaluate import get_model_metrics, split_data
10 | from taxi_fares.training.train import train
11 |
12 |
13 | def run(
14 | train_df: pd.DataFrame,
15 | mlflow: mlflow,
16 | params: Dict = {"num_leaves": 32,
17 | "objective": "regression", "metric": "rmse"},
18 | num_rounds: int = 100,
19 | app_logger: AppLogger = get_disabled_logger(),
20 | parent_tracer: Tracer = None,
21 | ) -> lgb.Booster:
22 | """MLOps training entry point.
23 |
24 | Args:
25 | train_df (pd.DataFrame): data for training, output of feature engineering
26 | mlflow (mlflow): mlflow object that is having an active run
27 | initiated by mlflow.start_run
28 |         app_logger (monitoring.app_logger.AppLogger): AppLogger object default
29 | to monitoring.app_logger.get_disabled_logger
30 | parent_tracer (Tracer): OpenCensus parent tracer for correlation
31 | Returns:
32 |         lgb.Booster: trained LightGBM model
33 | """
34 | logger = logging.getLogger(__name__)
35 | try:
36 | component_name = "Taxi_Fares_Training"
37 |
38 | # mlflow tracking
39 | mlflow_run = mlflow.active_run()
40 | mlflow_run_id = mlflow_run.info.run_id
41 | mlflow_experiment_id = mlflow_run.info.experiment_id
42 |
43 | logger = app_logger.get_logger(
44 | component_name=component_name,
45 | custom_dimensions={
46 | "mlflow_run_id": mlflow_run_id,
47 | "mlflow_experiment_id": mlflow_experiment_id,
48 | },
49 | )
50 | tracer = app_logger.get_tracer(
51 | component_name=component_name, parent_tracer=parent_tracer
52 | )
53 |
54 | logger.info("Running MLOps training")
55 |
56 | params = {"num_leaves": 32,
57 | "objective": "regression", "metric": "rmse"}
58 | num_rounds = 100
59 | for k, v in params.items():
60 | logger.info(f"Training parameter {k}: {v}")
61 | logger.info(f"Training parameter num_rounds: {num_rounds}")
62 |
63 | logger.info("Splitting data for train and test")
64 | data = split_data(train_df)
65 |
66 | logger.info("Train the model")
67 | with tracer.span("train_model"):
68 | mlflow.lightgbm.autolog()
69 | model = train(data["train"], params, num_rounds)
70 |
71 | logger.info("Log the metrics for the model")
72 | metrics = get_model_metrics(model, data["test"])
73 | for (k, v) in metrics.items():
74 | logger.info(f"Metric {k}: {v}")
75 | mlflow.log_metric(k, v)
76 |
77 | logger.info("Completed MLOps training")
78 | return model
79 | except Exception as exp:
80 | logger.error("an exception occurred in training")
81 | raise Exception("an exception occurred in training") from exp
82 |
--------------------------------------------------------------------------------
/ml_ops/src/taxi_fares_mlops/utils.py:
--------------------------------------------------------------------------------
1 | from mlflow.tracking import MlflowClient
2 |
3 |
4 | def get_latest_model_version(model_name: str) -> int:
5 | latest_version = 1
6 | mlflow_client = MlflowClient()
7 | for mv in mlflow_client.search_model_versions(f"name='{model_name}'"):
8 | version_int = int(mv.version)
9 | if version_int > latest_version:
10 | latest_version = version_int
11 | return latest_version
12 |
--------------------------------------------------------------------------------
/ml_ops/tests/README.md:
--------------------------------------------------------------------------------
1 | # TESTS
2 |
3 | Unit test cases for the `taxi_fares_mlops` MLOps source code.
4 |
--------------------------------------------------------------------------------
/ml_ops/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_ops/tests/__init__.py
--------------------------------------------------------------------------------
/ml_ops/tests/taxi_fares/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_ops/tests/taxi_fares/__init__.py
--------------------------------------------------------------------------------
/ml_ops/tests/taxi_fares/data/taxi_fares_unit_test_training.csv:
--------------------------------------------------------------------------------
1 | trip_distance,pickup_zip,dropoff_zip,mean_fare_window_1h_pickup_zip,count_trips_window_1h_pickup_zip,count_trips_window_30m_dropoff_zip,dropoff_is_weekend,fare_amount
2 | 4.94,10282,10171,13,2,1,1,19
3 | 0.28,10110,10110,3.5,1,2,0,3.5
4 | 0.7,10103,10023,7.5,2,1,0,5
5 | 0.8,10022,10017,6,1,1,0,6
6 | 4.51,10110,10282,17,1,1,0,17
7 | 1.8,10009,10065,8,2,1,0,7
8 | 2.58,10153,10199,7.75,2,2,0,12
9 | 1.4,10112,10069,11,1,1,0,11
10 | 1.21,10023,10153,7.75,2,1,1,7.5
11 | 0.6,10012,10003,7.5,2,2,1,6
--------------------------------------------------------------------------------
/ml_ops/tests/taxi_fares/test_publish_model.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import unittest
3 | from unittest.mock import MagicMock, patch
4 |
5 | from taxi_fares_mlops.publish_model import run
6 |
7 |
8 | class TestEvaluateMethods(unittest.TestCase):
9 | logger = logging.getLogger(__name__)
10 | logging.basicConfig(
11 | format="%(asctime)s %(module)s %(levelname)s: %(message)s",
12 | datefmt="%m/%d/%Y %I:%M:%S %p",
13 | level=logging.INFO,
14 | )
15 |
16 | @patch("taxi_fares_mlops.publish_model.feature_store")
17 | @patch("taxi_fares_mlops.publish_model.get_latest_model_version")
18 |     def test_publish_model(self, mock_get_latest_model_version, mock_feature_store):
19 | self.logger.info("unittest test_publish_model")
20 | run(MagicMock(), MagicMock(), MagicMock())
21 | assert True
22 |
23 | def test_publish_model_exception(self):
24 | self.logger.info("unittest test_publish_model exception")
25 | with self.assertRaises(Exception):
26 | run(None, None, None)
27 | assert True
28 |
29 |
30 | if __name__ == "__main__":
31 | unittest.main()
32 |
--------------------------------------------------------------------------------
/ml_ops/tests/taxi_fares/test_training.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import unittest
4 | from unittest.mock import MagicMock
5 |
6 | import lightgbm as lgb
7 | import pandas as pd
8 | from pyspark.sql import SparkSession
9 | from taxi_fares_mlops.training import run
10 |
11 |
12 | class TestEvaluateMethods(unittest.TestCase):
13 | @classmethod
14 | def setUpClass(cls):
15 | cls.spark = (
16 | SparkSession.builder.master("local[*]").appName("Unit-tests").getOrCreate()
17 | )
18 |
19 | logger = logging.getLogger(__name__)
20 | logging.basicConfig(
21 | format="%(asctime)s %(module)s %(levelname)s: %(message)s",
22 | datefmt="%m/%d/%Y %I:%M:%S %p",
23 | level=logging.INFO,
24 | )
25 |
26 | def test_training(self):
27 | self.logger.info("unittest test_training")
28 | data_file = os.path.join(
29 | "tests/taxi_fares/data", "taxi_fares_unit_test_training.csv"
30 | )
31 | train_df_pandas = pd.read_csv(data_file)
32 | train_df = self.spark.createDataFrame(train_df_pandas)
33 | model = run(train_df, MagicMock())
34 |
35 | assert isinstance(model, lgb.Booster)
36 |
37 | def test_training_exception(self):
38 | self.logger.info("unittest test_training exception")
39 | with self.assertRaises(Exception):
40 | model = run(MagicMock(), MagicMock())
41 | assert model is not None
42 |
43 |
44 | if __name__ == "__main__":
45 | unittest.main()
46 |
--------------------------------------------------------------------------------
/ml_ops/tests/taxi_fares/test_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from unittest.mock import patch
3 |
4 | from taxi_fares_mlops.utils import get_latest_model_version
5 |
6 |
7 | class TestUtils(unittest.TestCase):
8 | @patch("taxi_fares_mlops.utils.MlflowClient")
9 | def test_get_latest_model_version(self, mock_mlflow_client):
10 | assert get_latest_model_version("taxi_fares") == 1
11 |
--------------------------------------------------------------------------------
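The test above works because `MagicMock` iteration defaults to an empty sequence, so `get_latest_model_version` falls back to `1`. A hedged sketch of an additional case that configures the mock to exercise the version comparison as well (the version numbers below are arbitrary):

```py
# Sketch of an additional test case; the version numbers are arbitrary.
import unittest
from types import SimpleNamespace
from unittest.mock import patch

from taxi_fares_mlops.utils import get_latest_model_version


class TestUtilsLatestVersion(unittest.TestCase):
    @patch("taxi_fares_mlops.utils.MlflowClient")
    def test_get_latest_model_version_multiple(self, mock_mlflow_client):
        # search_model_versions returns objects with a string `version` field.
        mock_mlflow_client.return_value.search_model_versions.return_value = [
            SimpleNamespace(version="1"),
            SimpleNamespace(version="3"),
            SimpleNamespace(version="2"),
        ]
        assert get_latest_model_version("taxi_fares") == 3


if __name__ == "__main__":
    unittest.main()
```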
/ml_source/README.md:
--------------------------------------------------------------------------------
1 | # ML Source
2 |
3 | ## Overview
4 |
5 | This contains the machine learning code, which is developed, unit tested, packaged, and delivered independently, and is typically maintained by data scientists in an organization.
6 |
7 | ## Contents
8 |
9 | 1. [src](src/) : machine learning source code, that will be packaged as Python `wheel`.
10 | 2. [tests](tests/) : unit test cases for `src`.
11 |
--------------------------------------------------------------------------------
/ml_source/src/README.md:
--------------------------------------------------------------------------------
1 | # SRC
2 |
3 | ## Overview
4 |
5 | Source code for machine learning, based on the following -
6 |
7 | 1. [taxi_fares](taxi_fares/) contains the machine learning source code.
8 | 2. `monitoring` contains the logging class used for logging into Application Insights.
9 | 3. The machine learning Python functions are called from the MLOps Python functions.
10 |
--------------------------------------------------------------------------------
/ml_source/src/monitoring/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/monitoring/__init__.py
--------------------------------------------------------------------------------
/ml_source/src/monitoring/app_logger.py:
--------------------------------------------------------------------------------
1 | """This module is used to log traces into Azure Application Insights."""
2 | import logging
3 | import uuid
4 | from os import getenv
5 |
6 | from opencensus.ext.azure.common import utils
7 | from opencensus.ext.azure.log_exporter import AzureLogHandler
8 | from opencensus.ext.azure.trace_exporter import AzureExporter
9 | from opencensus.trace import config_integration
10 | from opencensus.trace.samplers import AlwaysOffSampler, AlwaysOnSampler
11 | from opencensus.trace.tracer import Tracer
12 |
13 |
14 | class CustomDimensionsFilter(logging.Filter):
15 | """Add custom-dimensions like run_id in each log by using filters."""
16 |
17 | def __init__(self, custom_dimensions=None):
18 | """Initialize CustomDimensionsFilter."""
19 | self.custom_dimensions = custom_dimensions or {}
20 |
21 | def filter(self, record):
22 | """Add the default custom_dimensions into the current log record."""
23 | dim = {**self.custom_dimensions, **
24 | getattr(record, "custom_dimensions", {})}
25 | record.custom_dimensions = dim
26 | return True
27 |
28 |
29 | class AppLogger:
30 | """Logger wrapper that attach the handler to Application Insights."""
31 |
32 | HANDLER_NAME = "Azure Application Insights Handler"
33 |
34 | def __init__(self, config=None):
35 | """Create an instance of the Logger class.
36 |
37 | Args:
38 | config:([dict], optional):
39 | Contains the setting for logger {"log_level": logging.debug,"env":"dev",
40 | "app_insights_key":""}
41 |             Note: a parent tracer for correlation is supplied to get_tracer,
42 |                 not to this constructor.
43 | """
44 | self.config = {"log_level": logging.INFO, "logging_enabled": "true"}
45 | self.APPINSIGHTS_INSTRUMENTATION_KEY = "APPINSIGHTS_INSTRUMENTATION_KEY"
46 | self.update_config(config)
47 | pass
48 |
49 | def _initialize_azure_log_handler(self, component_name, custom_dimensions):
50 | """Initialize azure log handler."""
51 | # Adding logging to trace_integrations
52 | # This will help in adding trace and span ids to logs
53 | # https://github.com/census-instrumentation/opencensus-python/tree/master/contrib/opencensus-ext-logging
54 |
55 | config_integration.trace_integrations(["logging"])
56 | logging.basicConfig(
57 | format="%(asctime)s name=%(name)s level=%(levelname)s "
58 | "traceId=%(traceId)s spanId=%(spanId)s %(message)s"
59 | )
60 | app_insights_cs = "InstrumentationKey=" + self._get_app_insights_key()
61 | log_handler = AzureLogHandler(
62 | connection_string=app_insights_cs, export_interval=0.0
63 | )
64 | log_handler.add_telemetry_processor(self._get_callback(component_name))
65 | log_handler.name = self.HANDLER_NAME
66 | log_handler.addFilter(CustomDimensionsFilter(custom_dimensions))
67 | return log_handler
68 |
69 | def _initialize_azure_log_exporter(self, component_name):
70 | """Initialize azure log exporter."""
71 | app_insights_cs = "InstrumentationKey=" + self._get_app_insights_key()
72 | log_exporter = AzureExporter(
73 | connection_string=app_insights_cs, export_interval=0.0
74 | )
75 | log_exporter.add_telemetry_processor(
76 | self._get_callback(component_name))
77 | return log_exporter
78 |
79 | def _initialize_logger(self, log_handler, component_name):
80 | """Initialize Logger."""
81 | logger = logging.getLogger(component_name)
82 | logger.setLevel(self.log_level)
83 | if self.config.get("logging_enabled") == "true":
84 | if not any(x for x in logger.handlers if x.name == self.HANDLER_NAME):
85 | logger.addHandler(log_handler)
86 | return logger
87 |
88 | def get_logger(self, component_name="TaxiFaresMlOps", custom_dimensions={}):
89 | """Get Logger Object.
90 |
91 | Args:
92 | component_name (str, optional): Name of logger. Defaults to "TaxiFaresMlOps".
93 | custom_dimensions (dict, optional): {"key":"value"}
94 | to capture with every log.
95 | Defaults to {}.
96 |
97 | Returns:
98 | Logger: A logger.
99 | """
100 | self.update_config(self.config)
101 | handler = self._initialize_azure_log_handler(
102 | component_name, custom_dimensions)
103 | return self._initialize_logger(handler, component_name)
104 |
105 | def get_tracer(self, component_name="TaxiFaresMlOps", parent_tracer=None):
106 | """Get Tracer Object.
107 |
108 | Args:
109 | component_name (str, optional): Name of logger. Defaults to "TaxiFaresMlOps".
110 | parent_tracer([opencensus.trace.tracer], optional):
111 | Contains parent tracer required for setting coorelation.
112 |
113 | Returns:
114 | opencensus.trace.tracer: A Tracer.
115 | """
116 | self.update_config(self.config)
117 | sampler = AlwaysOnSampler()
118 | exporter = self._initialize_azure_log_exporter(component_name)
119 | if self.config.get("logging_enabled") != "true":
120 | sampler = AlwaysOffSampler()
121 | if parent_tracer is None:
122 | tracer = Tracer(exporter=exporter, sampler=sampler)
123 | else:
124 | tracer = Tracer(
125 | span_context=parent_tracer.span_context,
126 | exporter=exporter,
127 | sampler=sampler,
128 | )
129 | return tracer
130 |
131 | def _get_app_insights_key(self):
132 | """Get Application Insights Key."""
133 | try:
134 | if self.app_insights_key is None:
135 | self.app_insights_key = getenv(
136 | self.APPINSIGHTS_INSTRUMENTATION_KEY, None
137 | )
138 | if self.app_insights_key is not None:
139 | utils.validate_instrumentation_key(self.app_insights_key)
140 | return self.app_insights_key
141 | else:
142 | raise Exception("ApplicationInsights Key is not set")
143 | except Exception as exp:
144 |             raise Exception(f"Exception in getting app insights key -> {exp}")
145 |
146 | def _get_callback(self, component_name):
147 | def _callback_add_role_name(envelope):
148 | """Add role name for logger."""
149 | envelope.tags["ai.cloud.role"] = component_name
150 | envelope.tags["ai.cloud.roleInstance"] = component_name
151 |
152 | return _callback_add_role_name
153 |
154 | def update_config(self, config=None):
155 | """Update logger configuration."""
156 | if config is not None:
157 | self.config.update(config)
158 | self.app_insights_key = self.config.get("app_insights_key")
159 | self.log_level = self.config.get("log_level")
160 |
161 |
162 | def get_disabled_logger():
163 | """Get a disabled AppLogger.
164 |
165 | Returns:
166 | AppLogger: A disabled AppLogger
167 | """
168 | return AppLogger(
169 | config={"logging_enabled": "false",
170 | "app_insights_key": str(uuid.uuid1())}
171 | )
172 |
--------------------------------------------------------------------------------
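A minimal usage sketch for AppLogger, assuming a valid Application Insights instrumentation key is supplied either in the config dict or through the APPINSIGHTS_INSTRUMENTATION_KEY environment variable; the component name, span name, and custom dimensions below are illustrative only.

import logging

from monitoring.app_logger import AppLogger

# Illustrative config: replace the placeholder with a real instrumentation key
# (a GUID), or drop the key and export APPINSIGHTS_INSTRUMENTATION_KEY instead.
config = {
    "log_level": logging.DEBUG,
    "logging_enabled": "true",
    "app_insights_key": "00000000-0000-0000-0000-000000000000",
}

app_logger = AppLogger(config=config)
logger = app_logger.get_logger(component_name="TaxiFaresMlOps")
tracer = app_logger.get_tracer(component_name="TaxiFaresMlOps")

with tracer.span(name="feature_engineering"):
    # custom_dimensions passed via "extra" are merged into each record by
    # CustomDimensionsFilter and surface as customDimensions in Application Insights.
    logger.info("Job started", extra={"custom_dimensions": {"run_id": "example"}})
--------------------------------------------------------------------------------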
/ml_source/src/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from setuptools import find_packages, setup
4 |
5 |
6 | # Utility function to read the README file.
7 | # Used for the long_description. It's nice, because now 1) we have a top level
8 | # README file and 2) it's easier to type in the README file than to put a raw
9 | # string in below ...
10 | def read(fname):
11 | return open(os.path.join(os.path.dirname(__file__), fname)).read()
12 |
13 |
14 | requirements_file_name = "requirements.txt"
15 | with open(requirements_file_name) as f:
16 | required_packages = f.read().splitlines()
17 | required_packages = [
18 | package.strip(" ")
19 | for package in required_packages
20 | if package.strip(" ") and "#" not in package
21 | ]
22 | setup(
23 | name="taxi_fares",
24 | version="0.0.1",
25 | author="",
26 | author_email="",
27 | description=(""),
28 | license="",
29 | keywords="",
30 | url="",
31 | package_dir={"": "ml_source/src"},
32 | packages=find_packages(where="ml_source/src"),
33 | classifiers=[],
34 | install_requires=required_packages,
35 | )
36 |
--------------------------------------------------------------------------------
/ml_source/src/taxi_fares/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/taxi_fares/__init__.py
--------------------------------------------------------------------------------
/ml_source/src/taxi_fares/feature_eng/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/taxi_fares/feature_eng/__init__.py
--------------------------------------------------------------------------------
/ml_source/src/taxi_fares/feature_eng/features.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from pyspark.sql.dataframe import DataFrame
4 | from pyspark.sql.functions import col, count, mean, to_timestamp, unix_timestamp, window
5 | from pyspark.sql.types import FloatType, IntegerType
6 | from taxi_fares.utils.pyspark_utils import filter_df_by_ts, is_weekend, partition_id
7 |
8 |
9 | def pickup_features_fn(
10 | df: DataFrame, ts_column: str, start_date: datetime, end_date: datetime
11 | ) -> DataFrame:
12 |     """
13 |     Compute the pickup_features feature group.
14 |     Rows are restricted to the time range [start_date, end_date) on
15 |     ts_column before the hourly pickup features are computed.
16 |     """
17 | df = filter_df_by_ts(df, ts_column, start_date, end_date)
18 | pickupzip_features = (
19 | df.groupBy(
20 | "pickup_zip", window("tpep_pickup_datetime", "1 hour", "15 minutes")
21 | ) # 1 hour window, sliding every 15 minutes
22 | .agg(
23 | mean("fare_amount").alias("mean_fare_window_1h_pickup_zip"),
24 | count("*").alias("count_trips_window_1h_pickup_zip"),
25 | )
26 | .select(
27 | col("pickup_zip").alias("zip"),
28 | unix_timestamp(col("window.end")).alias("ts").cast(IntegerType()),
29 | partition_id(to_timestamp(col("window.end"))).alias("yyyy_mm"),
30 | col("mean_fare_window_1h_pickup_zip").cast(FloatType()),
31 | col("count_trips_window_1h_pickup_zip").cast(IntegerType()),
32 | )
33 | )
34 | return pickupzip_features
35 |
36 |
37 | def dropoff_features_fn(
38 | df: DataFrame, ts_column: str, start_date: datetime, end_date: datetime
39 | ) -> DataFrame:
40 |     """
41 |     Compute the dropoff_features feature group.
42 |     Rows are restricted to the time range [start_date, end_date) on
43 |     ts_column before the 30-minute dropoff features are computed.
44 |     """
45 | df = filter_df_by_ts(df, ts_column, start_date, end_date)
46 | dropoffzip_features = (
47 | df.groupBy("dropoff_zip", window("tpep_dropoff_datetime", "30 minute"))
48 | .agg(count("*").alias("count_trips_window_30m_dropoff_zip"))
49 | .select(
50 | col("dropoff_zip").alias("zip"),
51 | unix_timestamp(col("window.end")).alias("ts").cast(IntegerType()),
52 | partition_id(to_timestamp(col("window.end"))).alias("yyyy_mm"),
53 | col("count_trips_window_30m_dropoff_zip").cast(IntegerType()),
54 | is_weekend(col("window.end")).alias("dropoff_is_weekend"),
55 | )
56 | )
57 | return dropoffzip_features
58 |
--------------------------------------------------------------------------------
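A minimal sketch of invoking the two feature functions on a raw trips DataFrame. The sample rows and date strings are illustrative; like the repo's own unit test, the date bounds are passed as ISO strings even though the signature hints at datetime. Writing the resulting features to the Databricks Feature Store is handled elsewhere in the sample (ml_ops), not here.

from pyspark.sql import SparkSession
from taxi_fares.feature_eng.features import dropoff_features_fn, pickup_features_fn

spark = SparkSession.builder.master("local[*]").appName("features-sketch").getOrCreate()

# Illustrative raw trips with the columns the feature functions expect.
raw_df = spark.createDataFrame(
    [
        ("2019-01-01 00:10:00", "2019-01-01 00:25:00", 2.5, 9.5, 10001, 10002),
        ("2019-01-01 00:40:00", "2019-01-01 01:05:00", 1.2, 6.0, 10001, 10003),
    ],
    [
        "tpep_pickup_datetime", "tpep_dropoff_datetime",
        "trip_distance", "fare_amount", "pickup_zip", "dropoff_zip",
    ],
)

pickup_features = pickup_features_fn(
    raw_df, "tpep_pickup_datetime", "2019-01-01 00:00:00", "2019-02-01 00:00:00"
)
dropoff_features = dropoff_features_fn(
    raw_df, "tpep_dropoff_datetime", "2019-01-01 00:00:00", "2019-02-01 00:00:00"
)
pickup_features.show()
dropoff_features.show()
--------------------------------------------------------------------------------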
/ml_source/src/taxi_fares/training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/taxi_fares/training/__init__.py
--------------------------------------------------------------------------------
/ml_source/src/taxi_fares/training/evaluate.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import lightgbm as lgb
4 | import numpy as np
5 | from pyspark.sql.dataframe import DataFrame
6 | from sklearn.metrics import mean_squared_error
7 | from sklearn.model_selection import train_test_split
8 |
9 |
10 | def split_data(df: DataFrame) -> dict:
11 | """Split the dataframe into test and train data.
12 |
13 |     Args:
14 |         df (DataFrame): processed Spark dataframe to split for training and evaluation
15 | 
16 |     Returns:
17 |         dict: split data for train and test -
18 | {
19 |             "train": {
20 |                 "X": np.array,
21 |                 "y": np.array,
22 |             },
23 |             "test": {
24 | "X": np.array,
25 | "y": np.array,
26 | }
27 | }
28 | """
29 | features_and_label = df.columns
30 |
31 | # Collect data into a Pandas array for training
32 | data = df.toPandas()[features_and_label]
33 |
34 | train, test = train_test_split(data, random_state=123)
35 | X_train = train.drop(["fare_amount"], axis=1)
36 | y_train = train.fare_amount
37 | X_test = test.drop(["fare_amount"], axis=1)
38 | y_test = test.fare_amount
39 |
40 | data = {"train": {"X": X_train, "y": y_train}, "test": {"X": X_test, "y": y_test}}
41 | return data
42 |
43 |
44 | def get_model_metrics(model: lgb.Booster, test_data: Dict[str, np.ndarray]) -> dict:
45 | """Evaluate the metrics for the model.
46 |
47 | Args:
48 |         model (lgb.Booster): trained LightGBM model
49 |         test_data (Dict[str, np.ndarray]): test data with X key for features and y key for labels
50 |
51 | Returns:
52 | dict: mse metrics
53 | """
54 | preds = model.predict(test_data["X"])
55 | mse = mean_squared_error(preds, test_data["y"])
56 | metrics = {"mse": mse}
57 | return metrics
58 |
--------------------------------------------------------------------------------
/ml_source/src/taxi_fares/training/train.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import lightgbm as lgb
4 | import numpy as np
5 |
6 |
7 | def train(
8 | train_data: Dict[str, np.ndarray], params: dict, num_rounds: int
9 | ) -> lgb.Booster:
10 | train_lgb_dataset = lgb.Dataset(train_data["X"], label=train_data["y"].values)
11 |
12 | # Train a lightGBM model
13 | model = lgb.train(params, train_lgb_dataset, num_rounds)
14 | return model
15 |
--------------------------------------------------------------------------------
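A minimal sketch that ties split_data, train, and get_model_metrics together. The synthetic DataFrame and the LightGBM parameters are illustrative stand-ins for what the ml_ops training orchestration actually supplies.

from pyspark.sql import SparkSession
from taxi_fares.training.evaluate import get_model_metrics, split_data
from taxi_fares.training.train import train

spark = SparkSession.builder.master("local[*]").appName("training-sketch").getOrCreate()

# Synthetic training frame: every column other than fare_amount is a feature.
df = spark.createDataFrame(
    [(float(d), c, float(3 * d + c)) for d in range(20) for c in range(5)],
    ["trip_distance", "count_trips_window_1h_pickup_zip", "fare_amount"],
)

data = split_data(df)  # {"train": {"X", "y"}, "test": {"X", "y"}}

# Illustrative LightGBM parameters.
params = {
    "objective": "regression",
    "metric": "rmse",
    "num_leaves": 8,
    "learning_rate": 0.1,
    "verbose": -1,
}
model = train(data["train"], params, num_rounds=50)

print(get_model_metrics(model, data["test"]))  # -> {"mse": ...}
--------------------------------------------------------------------------------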
/ml_source/src/taxi_fares/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/taxi_fares/utils/__init__.py
--------------------------------------------------------------------------------
/ml_source/src/taxi_fares/utils/pyspark_utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | from datetime import datetime, timedelta
3 |
4 | from pyspark.sql.column import Column
5 | from pyspark.sql.dataframe import DataFrame
6 | from pyspark.sql.functions import col, lit, udf
7 | from pyspark.sql.types import IntegerType, StringType
8 | from pytz import timezone
9 |
10 |
11 | @udf(returnType=IntegerType())
12 | def is_weekend(dt: Column) -> Column:
13 | tz = "America/New_York"
14 | return int(dt.astimezone(timezone(tz)).weekday() >= 5) # 5 = Saturday, 6 = Sunday
15 |
16 |
17 | @udf(returnType=StringType())
18 | def partition_id(dt: Column) -> Column:
19 | # datetime -> "YYYY-MM"
20 | return f"{dt.year:04d}-{dt.month:02d}"
21 |
22 |
23 | def filter_df_by_ts(
24 | df: DataFrame, ts_column: str, start_date: datetime, end_date: datetime
25 | ) -> DataFrame:
26 | if ts_column and start_date:
27 | df = df.filter(col(ts_column) >= start_date)
28 | if ts_column and end_date:
29 | df = df.filter(col(ts_column) < end_date)
30 | return df
31 |
32 |
33 | def rounded_unix_timestamp(dt, num_minutes=15):
34 | """
35 |     Round datetime dt up to the next num_minutes interval, then return the unix timestamp.
36 | """
37 | nsecs = dt.minute * 60 + dt.second + dt.microsecond * 1e-6
38 | delta = math.ceil(nsecs / (60 * num_minutes)) * (60 * num_minutes) - nsecs
39 | return int((dt + timedelta(seconds=delta)).timestamp())
40 |
41 |
42 | rounded_unix_timestamp_udf = udf(rounded_unix_timestamp, IntegerType())
43 |
44 |
45 | def rounded_taxi_data(taxi_data_df):
46 | # Round the taxi data timestamp to 15 and 30 minute intervals so we can join with
47 | # the pickup and dropoff features
48 | # respectively.
49 | taxi_data_df = (
50 | taxi_data_df.withColumn(
51 | "rounded_pickup_datetime",
52 | rounded_unix_timestamp_udf(taxi_data_df["tpep_pickup_datetime"], lit(15)),
53 | )
54 | .withColumn(
55 | "rounded_dropoff_datetime",
56 | rounded_unix_timestamp_udf(taxi_data_df["tpep_dropoff_datetime"], lit(30)),
57 | )
58 | .drop("tpep_pickup_datetime")
59 | .drop("tpep_dropoff_datetime")
60 | )
61 | taxi_data_df.createOrReplaceTempView("taxi_data")
62 | return taxi_data_df
63 |
--------------------------------------------------------------------------------
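A quick worked example for rounded_unix_timestamp, which needs no Spark session; the timestamps are illustrative. 00:07:30 is ceiled to the next 15-minute boundary (00:15:00), while a timestamp already on a boundary comes back unchanged.

from datetime import datetime, timezone

from taxi_fares.utils.pyspark_utils import rounded_unix_timestamp

dt = datetime(2019, 1, 1, 0, 7, 30, tzinfo=timezone.utc)
ts = rounded_unix_timestamp(dt, num_minutes=15)
print(datetime.fromtimestamp(ts, tz=timezone.utc))  # 2019-01-01 00:15:00+00:00

on_boundary = datetime(2019, 1, 1, 0, 30, tzinfo=timezone.utc)
print(rounded_unix_timestamp(on_boundary, num_minutes=30)
      == int(on_boundary.timestamp()))  # True
--------------------------------------------------------------------------------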
/ml_source/tests/README.md:
--------------------------------------------------------------------------------
1 | # TESTS
2 |
3 | Unit test cases for `taxi_fares` machine learning source code.
4 |
--------------------------------------------------------------------------------
/ml_source/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/__init__.py
--------------------------------------------------------------------------------
/ml_source/tests/monitoring/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/monitoring/__init__.py
--------------------------------------------------------------------------------
/ml_source/tests/monitoring/test_app_logger.py:
--------------------------------------------------------------------------------
1 | """Test src/monitoring/app_logger.py."""
2 |
3 | import logging
4 | import unittest
5 | import uuid
6 |
7 | from monitoring.app_logger import AppLogger, get_disabled_logger
8 |
9 | test_instrumentation_key = str(uuid.uuid1())
10 | test_invalid_instrumentation_key = "invalid_instrumentation_key"
11 |
12 |
13 | class TestAppLogger(unittest.TestCase):
14 | @classmethod
15 | def setUpClass(cls):
16 | cls.valid_config = {
17 | "log_level": "DEBUG",
18 | "logging_enabled": "true",
19 | "app_insights_key": test_instrumentation_key,
20 | }
21 | cls.invalid_config = {
22 | "log_level": "DEBUG",
23 | "logging_enabled": "false",
24 | "app_insights_key": test_invalid_instrumentation_key,
25 | }
26 |
27 | def test_logger_creation_valid_instrumentation_key(self):
28 | """Test with valid formatted instrumentation key."""
29 | global test_instrumentation_key
30 | try:
31 | app_logger = AppLogger(
32 | config=self.valid_config,
33 | )
34 | assert app_logger is not None
35 | except Exception:
36 | assert False
37 |
38 | def test_logger_creation_invalid_instrumentation_key(self):
39 | """Test with invalid instrumentation key."""
40 | global test_invalid_instrumentation_key
41 | with self.assertRaises(Exception):
42 | logging.disable(logging.CRITICAL)
43 | app_logger = AppLogger(
44 | config=self.invalid_config,
45 | )
46 | app_logger.get_logger()
47 | assert app_logger is not None
48 |
49 | def test_logger_creation_no_instrumentation_key(self):
50 | """Test with no instrumentation key."""
51 | with self.assertRaises(Exception):
52 | logging.disable(logging.CRITICAL)
53 | config = {"log_level": logging.DEBUG, "logging_enabled": "false"}
54 | app_logger = AppLogger(config=config)
55 | app_logger.get_logger()
56 | assert app_logger is not None
57 |
58 | def test_logging(self):
59 | """Test to use logging functions."""
60 | global test_instrumentation_key
61 | try:
62 | component_name = "TestComponent"
63 | app_logger = AppLogger(config=self.valid_config)
64 | assert app_logger is not None
65 | test_logger = app_logger.get_logger(
66 | component_name=component_name,
67 | )
68 |
69 | assert test_logger is not None
70 | test_logger.info("Test Logging")
71 | except Exception:
72 | assert False
73 |
74 | def test_tracing(self):
75 | """Test for Tracer."""
76 | global test_instrumentation_key
77 | try:
78 | component_name = "TestComponent"
79 | app_logger = AppLogger(config=self.valid_config)
80 | assert app_logger is not None
81 |
82 | tracer = app_logger.get_tracer(
83 | component_name=component_name,
84 | )
85 | tracer_with_parent = app_logger.get_tracer(
86 | component_name=component_name, parent_tracer=tracer
87 | )
88 | test_logger = app_logger.get_logger(
89 | component_name=component_name,
90 | )
91 |
92 | assert test_logger is not None
93 | assert tracer is not None
94 | assert tracer_with_parent is not None
95 |
96 | with tracer.span(name="testspan"):
97 | test_logger.info("in test span")
98 | except Exception:
99 | assert False
100 |
101 | def test_tracing_with_disabled_logger(self):
102 |         """Test getting a tracer from a disabled logger."""
103 | app_logger = get_disabled_logger()
104 | tracer = app_logger.get_tracer()
105 | assert tracer is not None
106 |
107 | def test_exception(self):
108 | """Test for calling logger.exception method."""
109 | global test_instrumentation_key
110 | try:
111 | component_name = "TestComponent"
112 | app_logger = AppLogger(
113 | config=self.valid_config,
114 | )
115 | assert app_logger is not None
116 |
117 | test_logger = app_logger.get_logger(
118 | component_name=component_name,
119 | )
120 | assert test_logger is not None
121 | try:
122 | raise Exception("Testing exception logging")
123 | except Exception as exp:
124 | test_logger.exception(exp)
125 | except Exception:
126 | assert False
127 |
128 | def test_logging_level(self):
129 | """Test for changing logger level in config."""
130 | try:
131 | global test_instrumentation_key
132 | component_name = "TestComponent"
133 | valid_config = self.valid_config.copy()
134 | valid_config["log_level"] = logging.ERROR
135 | app_logger = AppLogger(
136 | config=valid_config,
137 | )
138 | assert app_logger.config["log_level"] == logging.ERROR
139 | test_logger = app_logger.get_logger(
140 | component_name=component_name,
141 | )
142 |
143 | test_logger.error("Testing logging level")
144 | except Exception:
145 | assert False
146 |
147 | def test_logging_extra_params(self):
148 | """Test logging extra params."""
149 | try:
150 | global test_instrumentation_key
151 | component_name = "TestComponent"
152 | app_logger = AppLogger(
153 | config=self.valid_config,
154 | )
155 | test_logger = app_logger.get_logger(
156 | component_name=component_name,
157 | )
158 | extra_params = {"custom_dimensions": {"key1": "value1"}}
159 | test_logger.info("Logging extra params", extra=extra_params)
160 | except Exception:
161 | assert False
162 |
163 | def test_disabled_logger(self):
164 | """Test disabled logger."""
165 | try:
166 |
167 | def do_work(app_logger=get_disabled_logger()):
168 | component_name = "TestComponent"
169 | test_logger = app_logger.get_logger(
170 | component_name=component_name,
171 | )
172 | extra_params = {"custom_dimensions": {"key1": "value1"}}
173 | test_logger.info("Logging extra params", extra=extra_params)
174 |
175 | do_work()
176 | except Exception:
177 | assert False
178 |
179 |
180 | if __name__ == "__main__":
181 | unittest.main()
182 |
--------------------------------------------------------------------------------
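One illustrative way to run these monitoring tests together with the taxi_fares tests via plain unittest discovery; it assumes the working directory is ml_source/ and that the packages under src/ are importable (the repo's Makefile likely provides equivalent targets), so treat it as a sketch rather than the project's canonical test entry point.

import unittest

# Discover every test_*.py module under ml_source/tests and run it.
suite = unittest.defaultTestLoader.discover(start_dir="tests", pattern="test_*.py")
result = unittest.TextTestRunner(verbosity=2).run(suite)
raise SystemExit(0 if result.wasSuccessful() else 1)
--------------------------------------------------------------------------------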
/ml_source/tests/taxi_fares/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/taxi_fares/__init__.py
--------------------------------------------------------------------------------
/ml_source/tests/taxi_fares/feature_eng/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/taxi_fares/feature_eng/__init__.py
--------------------------------------------------------------------------------
/ml_source/tests/taxi_fares/feature_eng/test_features.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from pyspark.sql import SparkSession
4 | from src.taxi_fares.feature_eng.features import pickup_features_fn
5 |
6 |
7 | class TestFeatures(unittest.TestCase):
8 | @classmethod
9 | def setUpClass(cls):
10 | cls.spark = (
11 | SparkSession.builder.master("local[*]").appName("Unit-tests").getOrCreate()
12 | )
13 |
14 | @classmethod
15 | def tearDownClass(cls):
16 | cls.spark.stop()
17 |
18 | def test_if_pickup_features_are_computed(self):
19 | df = self.spark.createDataFrame(
20 | [
21 | ("2019-01-01 00:00:00", "2019-01-01 01:00:00", 1.0, 1, 10000, 10001),
22 | ("2019-01-01 00:15:00", "2019-01-01 01:15:00", 2.0, 2, 10002, 10003),
23 | ("2019-01-01 00:30:00", "2019-01-01 01:30:00", 3.0, 3, 10004, 10005),
24 | ("2019-01-01 00:45:00", "2019-01-01 01:45:00", 4.0, 4, 10006, 10007),
25 | ("2019-01-01 01:00:00", "2019-01-01 02:00:00", 5.0, 5, 10008, 10009),
26 | ("2019-01-01 01:15:00", "2019-01-01 02:15:00", 6.0, 6, 10010, 10011),
27 | ("2019-01-01 01:30:00", "2019-01-01 02:30:00", 7.0, 7, 10012, 10013),
28 | ("2019-01-01 01:45:00", "2019-01-01 02:45:00", 8.0, 8, 10014, 10015),
29 | ("2019-01-01 02:00:00", "2019-01-01 03:00:00", 9.0, 9, 10016, 10017),
30 | ("2019-01-01 02:15:00", "2019-01-01 03:15:00", 10.0, 10, 10018, 10019),
31 | ("2019-01-01 02:30:00", "2019-01-01 03:30:00", 11.0, 11, 10020, 10021),
32 | ("2019-01-01 02:45:00", "2019-01-01 03:45:00", 12.0, 12, 10022, 10023),
33 | ],
34 | [
35 | "tpep_pickup_datetime",
36 | "tpep_dropoff_datetime",
37 | "trip_distance",
38 | "fare_amount",
39 | "pickup_zip",
40 | "dropoff_zip",
41 | ],
42 | )
43 | df = pickup_features_fn(
44 | df, "tpep_pickup_datetime", "2019-01-01 00:00:00", "2019-01-01 01:45:00"
45 | )
46 | self.assertEqual(df.count(), 28)
47 |
--------------------------------------------------------------------------------
/ml_source/tests/taxi_fares/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/taxi_fares/utils/__init__.py
--------------------------------------------------------------------------------
/ml_source/tests/taxi_fares/utils/test_pyspark_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from pyspark.sql import SparkSession
4 | from src.taxi_fares.utils.pyspark_utils import filter_df_by_ts
5 |
6 |
7 | class TestPysparkUtils(unittest.TestCase):
8 | @classmethod
9 | def setUpClass(cls):
10 | cls.spark = (
11 | SparkSession.builder.master("local[*]").appName("Unit-tests").getOrCreate()
12 | )
13 |
14 | @classmethod
15 | def tearDownClass(cls):
16 | cls.spark.stop()
17 |
18 | def test_if_df_is_getting_filtered_by_ts(self):
19 | df = self.spark.createDataFrame(
20 | [
21 | ("2019-01-01 00:00:00", 1),
22 | ("2019-01-01 00:15:00", 2),
23 | ("2019-01-01 00:30:00", 3),
24 | ("2019-01-01 00:45:00", 4),
25 | ("2019-01-01 01:00:00", 5),
26 | ("2019-01-01 01:15:00", 6),
27 | ("2019-01-01 01:30:00", 7),
28 | ("2019-01-01 01:45:00", 8),
29 | ("2019-01-01 02:00:00", 9),
30 | ("2019-01-01 02:15:00", 10),
31 | ("2019-01-01 02:30:00", 11),
32 | ("2019-01-01 02:45:00", 12),
33 | ],
34 | ["tpep_pickup_datetime", "fare_amount"],
35 | )
36 | df = filter_df_by_ts(
37 | df, "tpep_pickup_datetime", "2019-01-01 00:00:00", "2019-01-01 01:45:00"
38 | )
39 | self.assertEqual(df.count(), 7)
40 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-learn==1.5.0
2 | pandas==1.2.4
3 | black==24.3.0
4 | coverage==5.5
5 | databricks-cli==0.14.3
6 | mlflow==2.21.0
7 | opencensus-ext-azure==1.0.7
8 | opencensus-ext-logging==0.1.0
9 | protobuf==3.18.3
10 | lightgbm==4.6.0
11 | isort==5.10.1
12 |
--------------------------------------------------------------------------------