├── .devcontainer ├── Dockerfile └── devcontainer.json ├── .github ├── CODE_OF_CONDUCT.md ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .vscode └── settings.json ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE.md ├── Makefile ├── README.md ├── docs ├── advance │ ├── cicd.md │ └── registered_model_stages.md └── images │ ├── batch_scoring.png │ ├── model_training.png │ ├── result_azure_resources.png │ ├── result_batch_scoring.png │ ├── result_databricks_job.png │ ├── result_mlflow_experiment.png │ └── result_mlflow_model_registry.png ├── ml_experiments └── Feature Store Taxi example notebook.ipynb ├── ml_ops ├── README.md ├── deployment │ ├── README.md │ ├── arm_templates │ │ └── databricks_and_storage.json │ └── databricks │ │ ├── cluster_template.json │ │ ├── job_template_taxi_fares_batch_scoring.json │ │ └── job_template_taxi_fares_training.json ├── orchestrator │ ├── README.md │ ├── taxi_fares_orchestrator_batch_score.py │ └── taxi_fares_orchestrator_train.py ├── src │ ├── README.md │ ├── setup.py │ └── taxi_fares_mlops │ │ ├── __init__.py │ │ ├── feature_engineering.py │ │ ├── publish_model.py │ │ ├── scoring_batch.py │ │ ├── training.py │ │ └── utils.py └── tests │ ├── README.md │ ├── __init__.py │ └── taxi_fares │ ├── __init__.py │ ├── data │ └── taxi_fares_unit_test_training.csv │ ├── test_publish_model.py │ ├── test_training.py │ └── test_utils.py ├── ml_source ├── README.md ├── src │ ├── README.md │ ├── monitoring │ │ ├── __init__.py │ │ └── app_logger.py │ ├── setup.py │ └── taxi_fares │ │ ├── __init__.py │ │ ├── feature_eng │ │ ├── __init__.py │ │ └── features.py │ │ ├── training │ │ ├── __init__.py │ │ ├── evaluate.py │ │ └── train.py │ │ └── utils │ │ ├── __init__.py │ │ └── pyspark_utils.py └── tests │ ├── README.md │ ├── __init__.py │ ├── monitoring │ ├── __init__.py │ └── test_app_logger.py │ └── taxi_fares │ ├── __init__.py │ ├── feature_eng │ ├── __init__.py │ └── test_features.py │ └── utils │ ├── __init__.py │ └── test_pyspark_utils.py └── requirements.txt /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/blob/master/containers/python-3-anaconda/.devcontainer/base.Dockerfile 2 | ARG VARIANT="3" 3 | FROM mcr.microsoft.com/vscode/devcontainers/anaconda:0-${VARIANT} 4 | 5 | # Additional packages 6 | RUN sudo apt-get update 7 | RUN sudo apt-get install --reinstall build-essential -y 8 | RUN sudo apt-get install default-jdk -y 9 | 10 | # Get local user 11 | ARG USERNAME=vscode 12 | 13 | # Change conda to be owned by the local user 14 | RUN chown -R $USERNAME:$USERNAME /opt/conda 15 | 16 | # Activate local user 17 | USER $USERNAME 18 | 19 | # Conda init 20 | RUN conda init bash 21 | 22 | # [Optional] If your pip requirements rarely change, uncomment this section to add them to the image. 23 | COPY requirements.txt /tmp/pip-tmp/ 24 | RUN pip --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \ 25 | && sudo rm -rf /tmp/pip-tmp 26 | RUN pip --disable-pip-version-check --no-cache-dir install databricks-feature-store -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. 
For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.155.1/containers/python-3 3 | { 4 | "name": "Anaconda (Python 3)", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | "context": "..", 8 | "args": { 9 | // Update 'VARIANT' 10 | "VARIANT": "3", 11 | // Options 12 | "INSTALL_NODE": "false", 13 | } 14 | }, 15 | "mounts": [ 16 | "source=${localEnv:HOME}/.ssh,target=/home/vscode/.ssh,type=bind", 17 | "source=${localEnv:HOME}/.gitconfig,target=/home/vscode/.gitconfig,type=bind", 18 | ], 19 | // Set *default* container specific settings.json values on container create. 20 | "settings": { 21 | "terminal.integrated.shell.linux": "/bin/bash", 22 | "python.pythonPath": "/opt/conda/bin/python", 23 | "python.linting.enabled": true, 24 | "python.linting.pylintEnabled": false, 25 | "python.linting.flake8Enabled": true, 26 | "python.linting.flake8Path": "/opt/conda/bin/flake8", 27 | "python.linting.flake8Args": [ 28 | "--max-line-length=88" 29 | ], 30 | "python.formatting.provider": "black", 31 | "python.formatting.blackPath": "/opt/conda/bin/black", 32 | "python.testing.promptToConfigure": false, 33 | "[python]": { 34 | "editor.formatOnSave": true, 35 | "editor.codeActionsOnSave": { 36 | "source.organizeImports": true 37 | }, 38 | "files.trimTrailingWhitespace": true 39 | }, 40 | }, 41 | // Add the IDs of extensions you want installed when the container is created. 42 | "extensions": [ 43 | "ms-python.python", 44 | "yzhang.markdown-all-in-one", 45 | "streetsidesoftware.code-spell-checker", 46 | "njpwerner.autodocstring", 47 | "GitHub.copilot" 48 | ], 49 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 50 | "forwardPorts": [ 51 | 5000 52 | ], 53 | // Use 'postCreateCommand' to run commands after the container is created. 54 | // "postCreateCommand": "python --version", 55 | // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 56 | "remoteUser": "vscode" 57 | } -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 4 | > Please provide us with the following information: 5 | > --------------------------------------------------------------- 6 | 7 | ### This issue is for a: (mark with an `x`) 8 | ``` 9 | - [ ] bug report -> please search issues before submitting 10 | - [ ] feature request 11 | - [ ] documentation issue or request 12 | - [ ] regression (a behavior that used to work and stopped in a new release) 13 | ``` 14 | 15 | ### Minimal steps to reproduce 16 | > 17 | 18 | ### Any log messages given by the failure 19 | > 20 | 21 | ### Expected/desired behavior 22 | > 23 | 24 | ### OS and Version? 25 | > Windows 7, 8 or 10. Linux (which distribution). macOS (Yosemite? 
El Capitan? Sierra?) 26 | 27 | ### Versions 28 | > 29 | 30 | ### Mention any other details that might be useful 31 | 32 | > --------------------------------------------------------------- 33 | > Thanks! We'll be in touch soon. 34 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Purpose 2 | 3 | * ... 4 | 5 | ## Does this introduce a breaking change? 6 | 7 | ``` 8 | [ ] Yes 9 | [ ] No 10 | ``` 11 | 12 | ## Pull Request Type 13 | What kind of change does this Pull Request introduce? 14 | 15 | 16 | ``` 17 | [ ] Bugfix 18 | [ ] Feature 19 | [ ] Code style update (formatting, local variables) 20 | [ ] Refactoring (no functional changes, no api changes) 21 | [ ] Documentation content changes 22 | [ ] Other... Please describe: 23 | ``` 24 | 25 | ## How to Test 26 | * Get the code 27 | 28 | ``` 29 | git clone [repo-address] 30 | cd [repo-name] 31 | git checkout [branch-name] 32 | npm install 33 | ``` 34 | 35 | * Test the code 36 | 37 | ``` 38 | ``` 39 | 40 | ## What to Check 41 | Verify that the following are valid 42 | * ... 43 | 44 | ## Other Information 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | # Local files 141 | **/.DS_Store 142 | .vscode/settings.json 143 | mlruns 144 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/opt/conda/bin/python", 3 | "python.testing.unittestArgs": [ 4 | "-v", 5 | "-s", 6 | "./ml_source", 7 | "-p", 8 | "test*.py" 9 | ], 10 | "python.testing.pytestEnabled": false, 11 | "python.testing.unittestEnabled": true, 12 | "python.sortImports.path": "/opt/conda/bin/isort" 13 | } -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [project-title] Changelog 2 | 3 | 4 | # x.y.z (yyyy-mm-dd) 5 | 6 | *Features* 7 | * ... 8 | 9 | *Bug Fixes* 10 | * ... 11 | 12 | *Breaking Changes* 13 | * ... 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to [project-title] 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 6 | 7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 10 | 11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 14 | 15 | - [Code of Conduct](#coc) 16 | - [Issues and Bugs](#issue) 17 | - [Feature Requests](#feature) 18 | - [Submission Guidelines](#submit) 19 | 20 | ## Code of Conduct 21 | Help us keep this project open and inclusive. Please read and follow our [Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 22 | 23 | ## Found an Issue? 24 | If you find a bug in the source code or a mistake in the documentation, you can help us by 25 | [submitting an issue](#submit-issue) to the GitHub Repository. Even better, you can 26 | [submit a Pull Request](#submit-pr) with a fix. 27 | 28 | ## Want a Feature? 29 | You can *request* a new feature by [submitting an issue](#submit-issue) to the GitHub 30 | Repository. 
If you would like to *implement* a new feature, please submit an issue with 31 | a proposal for your work first, to be sure that we can use it. 32 | 33 | * **Small Features** can be crafted and directly [submitted as a Pull Request](#submit-pr). 34 | 35 | ## Submission Guidelines 36 | 37 | ### Submitting an Issue 38 | Before you submit an issue, search the archive, maybe your question was already answered. 39 | 40 | If your issue appears to be a bug, and hasn't been reported, open a new issue. 41 | Help us to maximize the effort we can spend fixing issues and adding new 42 | features, by not reporting duplicate issues. Providing the following information will increase the 43 | chances of your issue being dealt with quickly: 44 | 45 | * **Overview of the Issue** - if an error is being thrown a non-minified stack trace helps 46 | * **Version** - what version is affected (e.g. 0.1.2) 47 | * **Motivation for or Use Case** - explain what are you trying to do and why the current behavior is a bug for you 48 | * **Browsers and Operating System** - is this a problem with all browsers? 49 | * **Reproduce the Error** - provide a live example or a unambiguous set of steps 50 | * **Related Issues** - has a similar issue been reported before? 51 | * **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be 52 | causing the problem (line of code or commit) 53 | 54 | You can file new issues by providing the above information at the corresponding repository's issues link: https://github.com/[organization-name]/[repository-name]/issues/new]. 55 | 56 | ### Submitting a Pull Request (PR) 57 | Before you submit your Pull Request (PR) consider the following guidelines: 58 | 59 | * Search the repository (https://github.com/[organization-name]/[repository-name]/pulls) for an open or closed PR 60 | that relates to your submission. You don't want to duplicate effort. 61 | 62 | * Make your changes in a new git fork: 63 | 64 | * Commit your changes using a descriptive commit message 65 | * Push your fork to GitHub: 66 | * In GitHub, create a pull request 67 | * If we suggest changes then: 68 | * Make the required updates. 69 | * Rebase your fork and force push to your GitHub repository (this will update your Pull Request): 70 | 71 | ```shell 72 | git rebase master -i 73 | git push -f 74 | ``` 75 | 76 | That's it! Thank you for your contribution! 77 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean clean-test clean-pyc clean-build 2 | SHELL=/bin/bash 3 | 4 | ## remove Python file artifacts 5 | clean-pyc: 6 | find . -name '*.pyc' -exec rm -f {} + 7 | find . -name '*.pyo' -exec rm -f {} + 8 | find . -name '*~' -exec rm -f {} + 9 | find . -name '__pycache__' -exec rm -fr {} + 10 | 11 | ## remove test and coverage artifacts 12 | clean-test: 13 | rm -f .coverage 14 | rm -fr htmlcov/ 15 | rm -fr .pytest_cache 16 | 17 | ## remove build artifacts 18 | clean-build: 19 | rm -fr build/ 20 | rm -fr dist/ 21 | rm -fr .eggs/ 22 | find . -name '*.egg-info' -exec rm -fr {} + 23 | find . -name '*.egg' -exec rm -f {} + 24 | 25 | ## remove all build, test, coverage and Python artifacts 26 | clean: clean-build clean-pyc clean-test 27 | 28 | ## pcakage ml 29 | dist-ml: clean 30 | python ml_source/src/setup.py bdist_wheel 31 | rm -fr build/ 32 | 33 | ## pcakage mlops 34 | dist-mlops: clean 35 | python ml_ops/src/setup.py bdist_wheel 36 | rm -fr build/ 37 | 38 | ## pcakage all 39 | dist: dist-ml dist-mlops 40 | 41 | ## install ml locally 42 | install-ml: clean 43 | python ml_source/src/setup.py install 44 | rm -fr build/ 45 | 46 | ## install mlops locally 47 | install-mlops: clean 48 | python ml_ops/src/setup.py install 49 | rm -fr build/ 50 | 51 | ## install all locally 52 | install: install-ml install-mlops 53 | 54 | ## unit test ml locally 55 | test-ml: install-ml 56 | cd ml_source && coverage run --source=taxi_fares,monitoring -m unittest discover 57 | cd ml_source && coverage report -m 58 | 59 | ## unit test mlops locally 60 | test-mlops: install-mlops 61 | cd ml_ops && coverage run --source=taxi_fares_mlops -m unittest discover 62 | cd ml_ops && coverage report -m 63 | 64 | ## unit test all locally 65 | test: test-ml test-mlops 66 | coverage combine ml_source/.coverage ml_ops/.coverage 67 | coverage report 68 | 69 | ## lint all python src and tests 70 | lint: 71 | flake8 --max-line-length=88 ml_ops/src ml_ops/tests ml_source/src ml_source/tests 72 | 73 | ## databricks authenticate 74 | databricks-authenticate: 75 | $(info Authenticate Databricks CLI) 76 | $(info Follow https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/ for getting Host and token value) 77 | databricks configure --token 78 | $(info Taking Backup of .databrickscfg file in .env/databrickscfg) 79 | mkdir -p .env 80 | cp ~/.databrickscfg .env/.databrickscfg 81 | $(info Creating env script file for mlflow) 82 | DATABRICKS_HOST="$$(cat ~/.databrickscfg | grep '^host' | cut -d' ' -f 3)"; \ 83 | DATABRICKS_TOKEN="$$(cat ~/.databrickscfg | grep '^token' | cut -d' ' -f 3)"; \ 84 | echo "export MLFLOW_TRACKING_URI=databricks"> .env/.databricks_env.sh; \ 85 | echo "export DATABRICKS_HOST=$$DATABRICKS_HOST" >> .env/.databricks_env.sh; \ 86 | echo "export DATABRICKS_TOKEN=$$DATABRICKS_TOKEN" >> .env/.databricks_env.sh 87 | 88 | ## databricks init (create cluster, base workspace, mlflow experiment, secret scope) 89 | databricks-init: 90 | echo "Creating databricks workspace root directory"; \ 91 | databricks workspace mkdirs /azure-databricks-mlops-mlflow; \ 92 | echo 
"Creating databricks dbfs root directory"; \ 93 | databricks fs mkdirs dbfs:/FileStore/libraries/azure-databricks-mlops-mlflow; \ 94 | CLUSTER_ID="$$(databricks clusters list --output json | \ 95 | jq ".clusters[] | select(.cluster_name == \"azure-databricks-mlops-mlflow\") | .cluster_id")"; \ 96 | echo "Got existing cluster azure-databricks-mlops-mlflow with id: $$CLUSTER_ID"; \ 97 | if [[ $$CLUSTER_ID == "" ]]; then \ 98 | echo "Creating databricks cluster azure-databricks-mlops-mlflow"; \ 99 | databricks clusters create --json-file ml_ops/deployment/databricks/cluster_template.json; \ 100 | fi; \ 101 | SECRET_SCOPE_NAME="$$(databricks secrets list-scopes --output json | \ 102 | jq ".scopes[] | select(.name == \"azure-databricks-mlops-mlflow\") | .name")"; \ 103 | echo "Got existing secret scope $$SECRET_SCOPE_NAME"; \ 104 | if [[ $$SECRET_SCOPE_NAME == "" ]]; then \ 105 | echo "Creating databricks secret scope azure-databricks-mlops-mlflow"; \ 106 | databricks secrets create-scope --scope azure-databricks-mlops-mlflow --initial-manage-principal users; \ 107 | fi; \ 108 | MLFLOW_EXPERIMENT_ID="$$(source .env/.databricks_env.sh && mlflow experiments list | \ 109 | grep '/azure-databricks-mlops-mlflow/Experiment' | \ 110 | cut -d' ' -f 1)"; \ 111 | echo "Got existing mlflow experiment id: $$MLFLOW_EXPERIMENT_ID"; \ 112 | if [[ "$$MLFLOW_EXPERIMENT_ID" == "" ]]; then \ 113 | echo "Creating mlflow experiment in databricks workspace /azure-databricks-mlops-mlflow/Experiment directory"; \ 114 | source .env/.databricks_env.sh && mlflow experiments create --experiment-name /azure-databricks-mlops-mlflow/Experiment; \ 115 | fi; \ 116 | 117 | ## databricks secrets put 118 | databricks-secrets-put: 119 | $(info Put databricks secret azure-blob-storage-account-name) 120 | @read -p "Enter Azure Blob storage Account Name: " stg_account_name; \ 121 | databricks secrets put --scope azure-databricks-mlops-mlflow --key azure-blob-storage-account-name --string-value $$stg_account_name 122 | $(info Put databricks secret azure-blob-storage-container-name) 123 | @read -p "Enter Azure Blob storage Container Name: " stg_container_name; \ 124 | databricks secrets put --scope azure-databricks-mlops-mlflow --key azure-blob-storage-container-name --string-value $$stg_container_name 125 | $(info Put databricks secret azure-shared-access-key) 126 | $(info Mount Blob Storage https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-storage) 127 | @read -p "Enter Azure Blob storage Shared Access Key: " shared_access_key; \ 128 | databricks secrets put --scope azure-databricks-mlops-mlflow --key azure-blob-storage-shared-access-key --string-value $$shared_access_key 129 | 130 | ## databricks secrets put application insights key 131 | databricks-add-app-insights-key: 132 | $(info Put app insights key) 133 | @read -p "Enter App insights key: " app_insights_key; \ 134 | if [[ "$$app_insights_key" != '' ]]; then \ 135 | echo "Setting app insights key : $$app_insights_key "; \ 136 | databricks secrets put --scope azure-databricks-mlops-mlflow --key app_insights_key --string-value "$$app_insights_key"; \ 137 | fi; \ 138 | 139 | ## databricks deploy (upload wheel pacakges to databricks DBFS workspace) 140 | databricks-deploy-code: dist 141 | $(info Upload wheel packages into databricks dbfs root directory) 142 | databricks fs cp --overwrite --recursive dist/ dbfs:/FileStore/libraries/azure-databricks-mlops-mlflow/ 143 | $(info Importing orchestrator notebooks into databricks workspace root directory) 144 
| databricks workspace import_dir --overwrite ml_ops/orchestrator/ /azure-databricks-mlops-mlflow/ 145 | $(info Create or update databricks jobs) 146 | 147 | ## databricks deploy jobs (create databricks jobs) 148 | databricks-deploy-jobs: databricks-deploy-code 149 | $(info Getting required values from databricks) 150 | CLUSTER_ID="$$(databricks clusters list --output json | \ 151 | jq ".clusters[] | select(.cluster_name == \"azure-databricks-mlops-mlflow\") | .cluster_id")"; \ 152 | echo "Got existing cluster id: $$CLUSTER_ID"; \ 153 | TRAINING_JOB_ID="$$(databricks jobs list --output json | \ 154 | jq ".jobs[] | select(.settings.name == \"taxi_fares_model_training\") | .job_id")"; \ 155 | echo "Got existing taxi_fares_model_training job id: $$TRAINING_JOB_ID"; \ 156 | if [[ "$$TRAINING_JOB_ID" == "" ]]; then \ 157 | databricks jobs create --json "{\"name\": \"taxi_fares_model_training\", \"existing_cluster_id\": $$CLUSTER_ID}"; \ 158 | TRAINING_JOB_ID="$$(databricks jobs list --output json | \ 159 | jq ".jobs[] | select(.settings.name == \"taxi_fares_model_training\") | .job_id")"; \ 160 | echo "Created taxi_fares_model_training with job id: $$TRAINING_JOB_ID"; \ 161 | fi; \ 162 | BATCH_SCORING_JOB_ID="$$(databricks jobs list --output json | \ 163 | jq ".jobs[] | select(.settings.name == \"taxi_fares_batch_scoring\") | .job_id")"; \ 164 | echo "Got existing taxi_fares_batch_scoring job id: $$BATCH_SCORING_JOB_ID"; \ 165 | if [[ "$$BATCH_SCORING_JOB_ID" == "" ]]; then \ 166 | databricks jobs create --json "{\"name\": \"taxi_fares_batch_scoring\", \"existing_cluster_id\": $$CLUSTER_ID}"; \ 167 | BATCH_SCORING_JOB_ID="$$(databricks jobs list --output json | \ 168 | jq ".jobs[] | select(.settings.name == \"taxi_fares_batch_scoring\") | .job_id")"; \ 169 | echo "Created taxi_fares_batch_scoring with job id: $$BATCH_SCORING_JOB_ID"; \ 170 | fi; \ 171 | MLFLOW_EXPERIMENT_ID="$$(source .env/.databricks_env.sh && mlflow experiments list | \ 172 | grep '/azure-databricks-mlops-mlflow/Experiment' | \ 173 | cut -d' ' -f 1)"; \ 174 | echo "Got existing mlflow experiment id: $$MLFLOW_EXPERIMENT_ID"; \ 175 | echo "Updating taxi_fares_model_training by using template ml_ops/deployment/databricks/job_template_taxi_fares_training.json"; \ 176 | TRAINING_JOB_UPDATE_JSON="$$(cat ml_ops/deployment/databricks/job_template_taxi_fares_training.json | \ 177 | sed "s/\"FILL_JOB_ID\"/$$TRAINING_JOB_ID/" | \ 178 | sed "s/FILL_MLFLOW_EXPERIMENT_ID/$$MLFLOW_EXPERIMENT_ID/" | \ 179 | sed "s/\"FILL_CLUSTER_ID\"/$$CLUSTER_ID/")"; \ 180 | databricks jobs reset --job-id $$TRAINING_JOB_ID --json "$$TRAINING_JOB_UPDATE_JSON"; \ 181 | echo "Updating taxi_fares_batch_scoring by using template ml_ops/deployment/databricks/job_template_taxi_fares_batch_scoring.json"; \ 182 | BATCH_SCORING_JOB_UPDATE_JSON="$$(cat ml_ops/deployment/databricks/job_template_taxi_fares_batch_scoring.json | \ 183 | sed "s/\"FILL_JOB_ID\"/$$BATCH_SCORING_JOB_ID/" | \ 184 | sed "s/FILL_MLFLOW_EXPERIMENT_ID/$$MLFLOW_EXPERIMENT_ID/" | \ 185 | sed "s/\"FILL_CLUSTER_ID\"/$$CLUSTER_ID/")"; \ 186 | databricks jobs reset --job-id $$BATCH_SCORING_JOB_ID --json "$$BATCH_SCORING_JOB_UPDATE_JSON"; \ 187 | 188 | ## deploy databricks all 189 | deploy: databricks-deploy-jobs 190 | 191 | ## run databricks taxi_fares_model_training job 192 | run-taxifares-model-training: 193 | $(info Triggering model training job) 194 | TRAINING_JOB_ID="$$(databricks jobs list --output json | \ 195 | jq ".jobs[] | select(.settings.name == \"taxi_fares_model_training\") | .job_id")"; \ 
196 | RUN_ID="$$(databricks jobs run-now --job-id $$TRAINING_JOB_ID | \ 197 | jq ".number_in_job")"; \ 198 | DATABRICKS_HOST="$$(cat ~/.databrickscfg | grep '^host' | cut -d' ' -f 3)"; \ 199 | DATABRICKS_ORG_ID="$$(echo $$DATABRICKS_HOST | cut -d'-' -f 2 | cut -d'.' -f 1)"; \ 200 | echo "Open the following link in browser to check result -"; \ 201 | echo "$$DATABRICKS_HOST/?o=$$DATABRICKS_ORG_ID/#job/$$TRAINING_JOB_ID/run/$$RUN_ID"; \ 202 | 203 | 204 | ## run databricks taxi_fares_batch_scoring job 205 | run-taxifares-batch-scoring: 206 | $(info Triggering batch scoring job) 207 | BATCH_SCORING_JOB_ID="$$(databricks jobs list --output json | \ 208 | jq ".jobs[] | select(.settings.name == \"taxi_fares_batch_scoring\") | .job_id")"; \ 209 | RUN_ID="$$(databricks jobs run-now --job-id $$BATCH_SCORING_JOB_ID | \ 210 | jq ".number_in_job")"; \ 211 | DATABRICKS_HOST="$$(cat ~/.databrickscfg | grep '^host' | cut -d' ' -f 3)"; \ 212 | DATABRICKS_ORG_ID="$$(echo $$DATABRICKS_HOST | cut -d'-' -f 2 | cut -d'.' -f 1)"; \ 213 | echo "Open the following link in browser to check result -"; \ 214 | echo "$$DATABRICKS_HOST/?o=$$DATABRICKS_ORG_ID/#job/$$BATCH_SCORING_JOB_ID/run/$$RUN_ID"; \ 215 | 216 | # continuous integration (CI) 217 | ci: lint test dist 218 | 219 | # continuous deployment (CD) 220 | cd: deploy 221 | 222 | # train model 223 | train: run-taxifares-model-training 224 | 225 | # batch scoring 226 | score: run-taxifares-batch-scoring -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | page_type: sample 3 | ms.custom: 4 | - team=cse 5 | ms.contributors: 6 | - prdeb-12/21/2021 7 | - anchugh-12/21/2021 8 | languages: 9 | - python 10 | products: 11 | - azure-databricks 12 | - azure-blob-storage 13 | - azure-monitor 14 | --- 15 | 16 | # Azure Databricks MLOps using MLflow 17 | 18 | This is a template or sample for [MLOps](https://github.com/microsoft/mlops) for [Python](https://www.python.org) based source code in [Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/) using [MLflow](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/) without using [MLflow Project](https://mlflow.org/docs/latest/projects.html#). 19 | 20 | This template provides the following features: 21 | 22 | - A way to run Python based MLOps without using [MLflow Project](https://mlflow.org/docs/latest/projects.html#), but still using [MLflow](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/) for managing the end-to-end machine learning lifecycle. 
23 | - Sample of machine learning source code structure along with Unit Test cases
24 | - Sample of MLOps code structure along with Unit Test cases
25 | - Demo setup to try on a user's subscription
26 |
27 | ## Problem Summary
28 |
29 | - This demonstrates the deployment scenario of [Orchestrate MLOps on Azure Databricks using Databricks Notebook](https://docs.microsoft.com/en-us/azure/architecture/reference-architectures/ai/orchestrate-mlops-azure-databricks)
30 |
31 | ## Products/Technologies/Languages Used
32 |
33 | - Products & Technologies:
34 |   - Azure Databricks
35 |   - Azure Blob Storage
36 |   - Azure Monitor
37 | - Languages:
38 |   - Python
39 |
40 | ## Architecture
41 |
42 | ### Model Training
43 |
44 | ![Model Training](docs/images/model_training.png)
45 |
46 | ### Batch Scoring
47 |
48 | ![Batch Scoring](docs/images/batch_scoring.png)
49 |
50 | ## Individual Components
51 |
52 | - [ml_experiment](./ml_experiments/experiment_notebook.ipynb) - sample ML experiment notebook.
53 | - [ml_data](./ml_data/) - dummy data for the sample model
54 | - [ml_ops](./ml_ops/) - sample MLOps code along with Unit Test cases, orchestrator, deployment setup.
55 | - [ml_source](./ml_source/) - sample ML code along with Unit Test cases
56 | - [Makefile](./Makefile) - for build and test in the local environment
57 | - [requirements.txt](./requirements.txt) - Python dependencies
58 |
59 | ## Getting Started
60 |
61 | ### Prerequisites
62 |
63 | - [Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/) workspace
64 | - [Azure Data Lake Storage Gen2](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction) account
65 | - [Visual Studio Code](https://code.visualstudio.com/) in the local environment for development
66 | - [Docker](https://www.docker.com/) in the local environment for development
67 |
68 | ### Development
69 |
70 | 1. `git clone https://github.com/Azure-Samples/azure-databricks-mlops-mlflow.git`
71 | 2. `cd azure-databricks-mlops-mlflow`
72 | 3. Open the cloned repository in a Visual Studio Code [Remote Container](https://code.visualstudio.com/docs/remote/containers)
73 | 4. Open a [terminal](https://code.visualstudio.com/docs/remote/containers#_opening-a-terminal) in the Remote Container from Visual Studio Code
74 | 5. `make install` to install the sample packages (`taxi_fares` and `taxi_fares_mlops`) locally
75 | 6. `make test` to unit test the code locally
76 |
77 | ### Package
78 |
79 | 1. `make dist` to build the ML and MLOps wheel packages (`taxi_fares` and `taxi_fares_mlops`) locally
80 |
81 | ### Deployment
82 |
83 | 1. `make databricks-deploy-code` to deploy Databricks Orchestrator Notebooks, ML and MLOps Python wheel packages, whenever there are code changes.
84 | 2. `make databricks-deploy-jobs` to deploy Databricks Jobs, whenever there are changes in the job specs.
85 |
86 | ### Run training and batch scoring
87 |
88 | 1. To trigger training, execute `make run-taxifares-model-training`
89 | 2. To trigger batch scoring, execute `make run-taxifares-batch-scoring`
90 |
91 | **NOTE:** for [deployment](#deployment) and [running](#run-training-and-batch-scoring), the Databricks environment should be created first; to create a demo environment, follow the [Demo](#demo) chapter.
92 |
93 | ### Observability
94 |
95 | Check logs, create alerts, etc. in [Application Insights](https://docs.microsoft.com/en-us/azure/azure-monitor/app/app-insights-overview). Following are a few sample [Kusto Queries](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/) to check logs, traces, exceptions, etc.
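For example, to confirm that telemetry is arriving in Application Insights at all before drilling into a specific scenario, a minimal query such as the following can be used (a sketch that assumes only the standard `traces` table, which the OpenCensus handler used by the orchestrators writes to):

```kusto
traces
| where timestamp > ago(30m)
| order by timestamp desc
| limit 50
```

If this returns no rows, verify that the Application Insights key secret has been added to the Databricks secret scope (see the [Demo](#demo) section) before moving on to the scenario-specific queries below.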
96 | 97 | - Check for Error, Info, Debug Logs 98 | 99 | Kusto Query for checking general logs for a specific MLflow experiment, filtered by `mlflow_experiment_id` 100 | 101 | ```kusto 102 | traces 103 | | extend mlflow_experiment_id = customDimensions.mlflow_experiment_id 104 | | where timestamp > ago(30m) 105 | | where mlflow_experiment_id == 106 | | limit 1000 107 | ``` 108 | 109 | Kusto Query for checking general logs for a specific Databricks job execution filtered by `mlflow_experiment_id` and `mlflow_run_id` 110 | 111 | ```kusto 112 | traces 113 | | extend mlflow_run_id = customDimensions.mlflow_run_id 114 | | extend mlflow_experiment_id = customDimensions.mlflow_experiment_id 115 | | where timestamp > ago(30m) 116 | | where mlflow_experiment_id == 117 | | where mlflow_run_id == "" 118 | | limit 1000 119 | ``` 120 | 121 | - Check for Exceptions 122 | 123 | Kusto Query for checking exception log if any 124 | 125 | ```kusto 126 | exceptions 127 | | where timestamp > ago(30m) 128 | | limit 1000 129 | ``` 130 | 131 | - Check for duration of different stages in MLOps 132 | 133 | Sample Kusto Query for checking duration of different stages in MLOps 134 | 135 | ```kusto 136 | dependencies 137 | | where timestamp > ago(30m) 138 | | where cloud_RoleName == 'TaxiFares_Training' 139 | | limit 1000 140 | ``` 141 | 142 | To correlate dependencies, exceptions and traces, `operation_Id` can be used a filter to above Kusto Queries. 143 | 144 | ## Demo 145 | 146 | 1. Create Databricks workspace, a storage account (Azure Data Lake Storage Gen2) and Application Insights 147 | 1. Create an [Azure Account](https://azure.microsoft.com/en-in/free/) 148 | 2. [Deploy resources](https://docs.microsoft.com/en-us/azure/azure-resource-manager/templates/deploy-portal#deploy-resources-from-custom-template) from [custom ARM template](ml_ops/deployment/arm_templates/databricks_and_storage.json) 149 | 2. Initialize Databricks (create cluster, base workspace, mlflow experiment, secret scope) 150 | 1. Get [Databricks CLI](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/) Host and Token 151 | 2. Authenticate Databricks CLI `make databricks-authenticate` 152 | 3. Execute `make databricks-init` 153 | 3. Create Azure Data Lake Storage Gen2 Container and upload data 154 | 1. [Create](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal#create-a-container) Azure Data Lake Storage Gen2 Container named - `taxifares` 155 | 2. [Upload](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal#upload-a-block-blob) as blob [taxi-fares data files](./ml_data/) into Azure Data Lake Storage Gen2 container named - `taxifares` 156 | 4. Put secrets to [Mount ADLS Gen2 Storage using Shared Access Key](https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-storage) 157 | 1. Get Azure Data Lake Storage Gen2 account name created in step 1 158 | 2. Get [Shared Key](https://docs.microsoft.com/en-us/rest/api/storageservices/authorize-with-shared-key) for Azure Data Lake Storage Gen2 account 159 | 3. Execute `make databricks-secrets-put` to put secret in Databricks secret scope 160 | 5. Put Application Insights Key as a secret in Databricks secret scope (optional) 161 | 1. Get [Application Insights Key](https://docs.microsoft.com/en-us/azure/azure-monitor/app/create-new-resource#copy-the-instrumentation-key) created in step 1 162 | 2. Execute `make databricks-add-app-insights-key` to put secret in Databricks secret scope 163 | 6. 
Package and deploy into Databricks (Databricks Jobs, Orchestrator Notebooks, ML and MLOps Python wheel packages) 164 | 1. Execute `make deploy` 165 | 7. Run Databricks Jobs 166 | 1. To trigger training, execute `make run-taxifares-model-training` 167 | 2. To trigger batch scoring, execute `make run-taxifares-batch-scoring` 168 | 8. Expected results 169 | 1. Azure resources 170 | ![Azure resources](docs/images/result_azure_resources.png) 171 | 2. Databricks jobs 172 | ![Databricks jobs](docs/images/result_databricks_job.png) 173 | 3. Databricks mlflow experiment 174 | ![Databricks mlflow experiment](docs/images/result_mlflow_experiment.png) 175 | 4. Databricks mlflow model registry 176 | ![Databricks mlflow model registry](docs/images/result_mlflow_model_registry.png) 177 | 5. Output of batch scoring 178 | ![Output of batch scoring](docs/images/result_batch_scoring.png) 179 | 180 | ## Additional Details 181 | 182 | 1. [Continuous Integration (CI) & Continuous Deployment (CD)](docs/advance/cicd.md) 183 | 2. [Registered Models Stages and Transitioning](docs/advance/registered_model_stages.md) 184 | 185 | ## Related resources 186 | 187 | 1. [Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/) 188 | 2. [MLflow](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/) 189 | 3. [MLflow Project](https://mlflow.org/docs/latest/projects.html#) 190 | 4. [Run MLflow Projects on Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/projects) 191 | 5. [Databricks Widgets](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/databricks-utils#--widget-utilities) 192 | 6. [Databricks Notebook-scoped Python libraries](https://docs.microsoft.com/en-us/azure/databricks/libraries/notebooks-python-libraries) 193 | 7. [Databricks CLI](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/) 194 | 8. [Azure Data Lake Storage Gen2](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction) 195 | 9. [Application Insights](https://docs.microsoft.com/en-us/azure/azure-monitor/app/app-insights-overview) 196 | 10. [Kusto Query Language](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/) 197 | 198 | ## Glossaries 199 | 200 | 1. [Application developer](https://docs.microsoft.com/en-us/azure/machine-learning/team-data-science-process/overview) : It is a role that work mainly towards operationalize of machine learning. 201 | 2. [Data scientist](https://docs.microsoft.com/en-us/azure/machine-learning/team-data-science-process/roles-tasks#structure-of-data-science-groups-and-teams) : It is a role to perform the data science parts of the project 202 | 203 | ## Contributors 204 | 205 | - [Julien Chomarat](https://github.com/jchomarat) 206 | - [Benjamin Guinebertière](https://github.com/benjguin) 207 | - [Ankit Sinha](https://github.com/ankitbko) 208 | - [Prabal Deb](https://github.com/prabdeb) 209 | - [Megha Patil](https://github.com/meghapatilcode) 210 | - [Srikantan Sankaran](https://github.com/ssrikantan) 211 | - [Frédéric Le Coquil](https://github.com/flecoqui) 212 | - [Anand Chugh](https://github.com/anandchugh) 213 | -------------------------------------------------------------------------------- /docs/advance/cicd.md: -------------------------------------------------------------------------------- 1 | # Continuous Integration (CI) & Continuous Deployment (CD) 2 | 3 | CI and CD can be performed using any platform like `Azure DevOps Pipeline` or `GitHub Actions`, etc. 
where the following `make` commands in [Makefile](../../Makefile) might be useful.
4 |
5 | - CI: execute `make ci` from the Pipeline/Action stage.
6 | - CD: execute `make cd` from the Pipeline/Action stage.
7 |
8 | **NOTE:** Set the env variables `DATABRICKS_HOST` and `DATABRICKS_TOKEN` in the environment prior to executing the CD stage.
9 |
10 | ## Reference
11 |
12 | - [Design a CI/CD pipeline using Azure DevOps](https://docs.microsoft.com/en-us/azure/architecture/example-scenario/apps/devops-dotnet-webapp)
13 | - [GitHub Actions](https://docs.github.com/en/actions)
--------------------------------------------------------------------------------
/docs/advance/registered_model_stages.md:
--------------------------------------------------------------------------------
1 | # Registered Models Stages and Transitioning
2 |
3 | This document describes a possible way of transitioning a model through the different stages available in the [MLflow Model Registry](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-registry#model-registry-concepts).
4 |
5 | 1. In this demo setup, the [Continuous Integration (CI)](cicd.md) step currently [registers](../ml_ops/src/taxi_fares_mlops/../../../../ml_ops/src/taxi_fares_mlops/publish_model.py) the model in the MLflow model registry in the `None` stage.
6 | 2. The registered model can then be [transitioned](https://www.mlflow.org/docs/latest/model-registry.html#transitioning-an-mlflow-models-stage) to the next stage, `Staging`, after the integration test step.
7 | 3. Finally, the model can be transitioned to the `Production` stage during the Continuous Deployment (CD) step.
8 |
9 | ## References
10 |
11 | - [MLflow Model Registry on Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-registry)
12 | - [MLflow Model Registry](https://www.mlflow.org/docs/latest/model-registry.html)
13 |
--------------------------------------------------------------------------------
/docs/images/batch_scoring.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/batch_scoring.png
--------------------------------------------------------------------------------
/docs/images/model_training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/model_training.png
--------------------------------------------------------------------------------
/docs/images/result_azure_resources.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_azure_resources.png
--------------------------------------------------------------------------------
/docs/images/result_batch_scoring.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_batch_scoring.png
--------------------------------------------------------------------------------
/docs/images/result_databricks_job.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_databricks_job.png -------------------------------------------------------------------------------- /docs/images/result_mlflow_experiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_mlflow_experiment.png -------------------------------------------------------------------------------- /docs/images/result_mlflow_model_registry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/docs/images/result_mlflow_model_registry.png -------------------------------------------------------------------------------- /ml_ops/README.md: -------------------------------------------------------------------------------- 1 | # MLOps 2 | 3 | ## Overview 4 | 5 | This contains MLOps code. That will be developed, unit tested, packaged and delivered independently and typically maintained by Application developer in an organization. 6 | 7 | ## Contents 8 | 9 | 1. [src](src/) : MLOps source code, that will be packaged as Python `wheel`. 10 | 2. [tests](tests/) : unit test cases for `src`. 11 | 3. [orchestrator](orchestrator/) : Databricks Python Notebooks for MLOps orchestrator. 12 | 4. [deployment](deployment/) : deployment templates ([ARM](https://docs.microsoft.com/en-us/azure/azure-resource-manager/templates/overview) and Databricks Jobs, Cluster). 13 | -------------------------------------------------------------------------------- /ml_ops/deployment/README.md: -------------------------------------------------------------------------------- 1 | # Deployment 2 | 3 | ## Overview 4 | 5 | This document covers the deployment guide for MLOps. 6 | 7 | ## Databricks Cluster 8 | 9 | For Orchestrator job, either an existing cluster can be used or a new cluster can be created. However, we need to be sure to set following [properties](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/clusters#--request-structure-of-the-cluster-definition) in the cluster. 
10 | 11 | - Cluster Mode: High Concurrency 12 | - DataBricks Runtime Version : 8.1 LTS ML (includes Apache Spark 3.0.1, Scala 2.12) 13 | - Enable Autoscaling: True 14 | - Worker Type: Standard_F4s 15 | - Driver Type: Standard_F4s 16 | - Spark Settings under “Spark Config” (Edit > Advanced Options > Spark) 17 | 18 | ```configuration 19 | spark.databricks.cluster.profile serverless 20 | spark.databricks.repl.allowedLanguages sql,python,r 21 | spark.databricks.conda.condaMagic.enabled true 22 | ``` 23 | 24 | ## Databricks Job 25 | 26 | Orchestrator DataBricks Job from a [Databricks Job create template](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs#--create) can be created using following example CLI command - 27 | 28 | ```sh 29 | databricks jobs create --json-file .json 30 | ``` 31 | 32 | Orchestrator DataBricks Job from a [Databricks Job reset template](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs#--reset) can be updated using following example CLI command - 33 | 34 | ```sh 35 | databricks jobs reset --job-id --json-file .json 36 | ``` 37 | 38 | ## Databricks MLflow Experiment 39 | 40 | MLflow Experiment can be created using [Databricks Workspace Portal](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/tracking#workspace-experiments) or using following CLI commands - 41 | 42 | ```sh 43 | export MLFLOW_TRACKING_URI=databricks 44 | export DATABRICKS_HOST= 45 | export DATABRICKS_TOKEN= 46 | mlflow experiments create --experiment-name // 47 | ``` 48 | 49 | Get `DATABRICKS_HOST` and `DATABRICKS_TOKEN` from [Databricks CLI Reference](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/) 50 | 51 | ## Databricks DBFS Upload 52 | 53 | The following CLI command can be used to upload Wheel package into DataBricks DBFS. 54 | 55 | ```sh 56 | databricks fs cp --overwrite python-package.whl 57 | ``` 58 | 59 | ## Databricks Notebook Import 60 | 61 | The following CLI command can be used to import orchestrator python file as a DataBricks notebook into DataBricks workspace. 62 | 63 | ```sh 64 | databricks workspace import -l PYTHON -f SOURCE -o .py 65 | ``` 66 | 67 | ## Orchestrator DataBricks Job trigger 68 | 69 | Orchestrator databricks job can be triggered using following ways - 70 | 71 | - Scheduled : 72 | - Cron based scheduling. 73 | - Manual : 74 | - Databricks workspace portal but clicking on `Run Now With Different Parameters`. 75 | - Via [Databricks-CLI](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/jobs-cli). 76 | - Via [Databricks-API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs#--run-now). 77 | -------------------------------------------------------------------------------- /ml_ops/deployment/arm_templates/databricks_and_storage.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.1", 4 | "parameters": { 5 | "location": { 6 | "type": "string", 7 | "defaultValue": "[resourceGroup().location]", 8 | "metadata": { 9 | "description": "Resources location." 10 | } 11 | }, 12 | "dbWorkspaceName": { 13 | "type": "string", 14 | "defaultValue": "databricks-mlops-mlflow", 15 | "metadata": { 16 | "description": "The name of the Azure Databricks workspace to create." 
17 | } 18 | }, 19 | "dbTier": { 20 | "defaultValue": "standard", 21 | "type": "string", 22 | "allowedValues": [ 23 | "standard", 24 | "premium" 25 | ], 26 | "metadata": { 27 | "description": "The pricing tier of Databricks workspace." 28 | } 29 | }, 30 | "stgAccountName": { 31 | "type": "string", 32 | "defaultValue": "[concat('storage', uniqueString(parameters('location'), resourceGroup().id))]", 33 | "metadata": { 34 | "description": "Storage account name." 35 | } 36 | }, 37 | "stgAccountType": { 38 | "type": "string", 39 | "defaultValue": "Standard_RAGRS", 40 | "metadata": { 41 | "description": "Storage account type." 42 | } 43 | }, 44 | "stgKind": { 45 | "type": "string", 46 | "defaultValue": "StorageV2", 47 | "metadata": { 48 | "description": "Storage account kind." 49 | } 50 | }, 51 | "stgAccessTier": { 52 | "type": "string", 53 | "defaultValue": "Cool", 54 | "metadata": { 55 | "description": "Storage account tier." 56 | } 57 | }, 58 | "stgIsHnsEnabled": { 59 | "type": "bool", 60 | "defaultValue": true, 61 | "metadata": { 62 | "description": "Enable ADLS Gen2." 63 | } 64 | }, 65 | "aiName": { 66 | "type": "string", 67 | "defaultValue": "[concat('ai', uniqueString(parameters('location'), resourceGroup().id))]", 68 | "metadata": { 69 | "description": "Application Insights name." 70 | } 71 | } 72 | }, 73 | "variables": { 74 | "managedResourceGroupName": "[concat('databricks-rg-', parameters('dbWorkspaceName'), '-', uniqueString(parameters('dbWorkspaceName'), resourceGroup().id))]", 75 | "managedResourceGroupId": "[concat(subscription().id, '/resourceGroups/', variables('managedResourceGroupName'))]" 76 | }, 77 | "resources": [ 78 | { 79 | "type": "Microsoft.Databricks/workspaces", 80 | "apiVersion": "2018-04-01", 81 | "name": "[parameters('dbWorkspaceName')]", 82 | "location": "[parameters('location')]", 83 | "sku": { 84 | "name": "[parameters('dbTier')]" 85 | }, 86 | "comments": "Please do not use an existing resource group for ManagedResourceGroupId.", 87 | "properties": { 88 | "ManagedResourceGroupId": "[variables('managedResourceGroupId')]", 89 | "parameters": {} 90 | }, 91 | "dependsOn": [], 92 | "tags": { 93 | "Purpose": "Demo", 94 | "Project": "azure-databricks-mlops-mlflow" 95 | } 96 | }, 97 | { 98 | "type": "Microsoft.Storage/storageAccounts", 99 | "apiVersion": "2019-06-01", 100 | "name": "[parameters('stgAccountName')]", 101 | "location": "[parameters('location')]", 102 | "properties": { 103 | "accessTier": "[parameters('stgAccessTier')]", 104 | "isHnsEnabled": "[parameters('stgIsHnsEnabled')]" 105 | }, 106 | "dependsOn": [], 107 | "sku": { 108 | "name": "[parameters('stgAccountType')]" 109 | }, 110 | "kind": "[parameters('stgKind')]", 111 | "tags": { 112 | "Purpose": "Demo", 113 | "Project": "azure-databricks-mlops-mlflow" 114 | } 115 | }, 116 | { 117 | "type": "Microsoft.Insights/components", 118 | "apiVersion": "2020-02-02", 119 | "name": "[parameters('aiName')]", 120 | "location": "[parameters('location')]", 121 | "kind": "other", 122 | "tags": { 123 | "Purpose": "Demo", 124 | "Project": "azure-databricks-mlops-mlflow" 125 | }, 126 | "properties": { 127 | "Application_Type": "web", 128 | "Flow_Type": "Bluefield", 129 | "Request_Source": "CustomDeployment" 130 | } 131 | } 132 | ], 133 | "outputs": {} 134 | } -------------------------------------------------------------------------------- /ml_ops/deployment/databricks/cluster_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "cluster_name": 
"azure-databricks-mlops-mlflow", 3 | "spark_version": "10.4.x-cpu-ml-scala2.12", 4 | "num_workers": 0, 5 | "spark_conf": { 6 | "spark.databricks.cluster.profile": "singleNode", 7 | "spark.databricks.conda.condaMagic.enabled": "true", 8 | "spark.master": "local[*]" 9 | }, 10 | "node_type_id": "Standard_F4", 11 | "driver_node_type_id": "Standard_F4", 12 | "custom_tags": { 13 | "ResourceClass": "SingleNode" 14 | }, 15 | "autotermination_minutes": 30, 16 | "enable_elastic_disk": true 17 | } -------------------------------------------------------------------------------- /ml_ops/deployment/databricks/job_template_taxi_fares_batch_scoring.json: -------------------------------------------------------------------------------- 1 | { 2 | "job_id": "FILL_JOB_ID", 3 | "name": "taxi_fares_batch_scoring", 4 | "existing_cluster_id": "FILL_CLUSTER_ID", 5 | "notebook_task": { 6 | "notebook_path": "/azure-databricks-mlops-mlflow/taxi_fares_orchestrator_batch_score", 7 | "base_parameters": { 8 | "taxi_fares_raw_data": "/databricks-datasets/nyctaxi-with-zipcodes/subsampled", 9 | "taxi_fares_mount_point": "/mnt/data_batch", 10 | "mlflow_experiment_id": "FILL_MLFLOW_EXPERIMENT_ID", 11 | "execute_feature_engineering": "true", 12 | "scoring_data_start_date": "2016-02-01", 13 | "scoring_data_end_date": "2016-02-29", 14 | "trained_model_version": "", 15 | "wheel_package_dbfs_base_path": "/dbfs/FileStore/libraries/azure-databricks-mlops-mlflow", 16 | "wheel_package_taxi_fares_version": "0.0.1", 17 | "wheel_package_taxi_fares_mlops_version": "0.0.1" 18 | } 19 | }, 20 | "timeout_seconds": 86400, 21 | "email_notifications": { 22 | "on_start": [], 23 | "on_success": [], 24 | "on_failure": [] 25 | } 26 | } -------------------------------------------------------------------------------- /ml_ops/deployment/databricks/job_template_taxi_fares_training.json: -------------------------------------------------------------------------------- 1 | { 2 | "job_id": "FILL_JOB_ID", 3 | "name": "taxi_fares_model_training", 4 | "existing_cluster_id": "FILL_CLUSTER_ID", 5 | "notebook_task": { 6 | "notebook_path": "/azure-databricks-mlops-mlflow/taxi_fares_orchestrator_train", 7 | "base_parameters": { 8 | "taxi_fares_raw_data": "/databricks-datasets/nyctaxi-with-zipcodes/subsampled", 9 | "mlflow_experiment_id": "FILL_MLFLOW_EXPERIMENT_ID", 10 | "wheel_package_dbfs_base_path": "/dbfs/FileStore/libraries/azure-databricks-mlops-mlflow", 11 | "wheel_package_taxi_fares_version": "0.0.1", 12 | "wheel_package_taxi_fares_mlops_version": "0.0.1", 13 | "execute_feature_engineering": "true", 14 | "training_data_end_date": "2016-01-31", 15 | "training_data_start_date": "2016-01-01", 16 | "training_num_leaves": "32", 17 | "training_objective": "regression", 18 | "training_metric": "rmse", 19 | "training_num_rounds": "100" 20 | } 21 | }, 22 | "timeout_seconds": 86400, 23 | "email_notifications": { 24 | "on_start": [], 25 | "on_success": [], 26 | "on_failure": [] 27 | } 28 | } -------------------------------------------------------------------------------- /ml_ops/orchestrator/README.md: -------------------------------------------------------------------------------- 1 | # Orchestrator 2 | 3 | ## Overview 4 | 5 | This document covers the design guide of the following orchestrators - 6 | 7 | 1. [taxi_fares_orchestrator_train.py](taxi_fares_orchestrator_train.py) 8 | 2. [taxi_fares_orchestrator_batch_score.py](taxi_fares_orchestrator_batch_score.py) 9 | 10 | ## Considerations 11 | 12 | - It will be a Databricks notebook in Databricks workspace. 
13 | - It will be stored in Git as a Python file. 14 | - It will use `dbutils` widgets for parametrization. 15 | - It will use `pip magic commands` for managing libraries. 16 | - It will be executed from a Databricks Job. 17 | - It will perform logging in Application Insights. 18 | - It will log artifacts, metrics, parameters, and the trained model into MLflow. 19 | 20 | ## Parameters 21 | 22 | ### Define Parameters 23 | 24 | Parameters are defined using `dbutils.widgets.text`, for example: 25 | 26 | ```py 27 | dbutils.widgets.text("", "") 28 | ``` 29 | 30 | ### Read Parameters 31 | 32 | Parameters are read using `dbutils.widgets.get`, for example: 33 | 34 | ```py 35 | param_value = dbutils.widgets.get("") 36 | ``` 37 | 38 | ## Installation of libraries 39 | 40 | ### How to enable %pip magic commands 41 | 42 | Starting with Databricks Runtime ML version 6.4, this feature can be enabled when creating a cluster. 43 | To do this, set `spark.databricks.conda.condaMagic.enabled` to `true` under "Spark Config" (Edit > Advanced Options > Spark). 44 | 45 | ### How to install libraries using pip 46 | 47 | Libraries are installed as [Notebook-scoped Python libraries](https://docs.microsoft.com/en-us/azure/databricks/libraries/notebooks-python-libraries), for example: 48 | 49 | ```sh 50 | %pip install dbfs//.whl 51 | ``` 52 | 53 | ## Calling MLOps Python Functions 54 | 55 | MLOps Python functions are packaged as a wheel package, and the orchestrator notebook calls the Python functions from that wheel package. 56 | 57 | ## Execution of Orchestrator 58 | 59 | Orchestrators are executed from a Databricks Job. 60 | 61 | ## Error handling 62 | 63 | For error handling, a `try..except` block is used to handle exceptions - 64 | 65 | ```py 66 | try: 67 |     model = run_training() 68 | except Exception as ex: 69 |     logger.error(f"Encountered error: {ex}")  # To log exception in Application Insights 70 |     raise Exception(f"Encountered error - {ex}") from ex  # To fail the Databricks Job Run 71 | ``` 72 | 73 | ## Observability 74 | 75 | The [OpenCensus](https://docs.microsoft.com/en-us/azure/azure-monitor/app/opencensus-python) library is used to capture logs and metrics and send them to Application Insights. 76 | 77 | ## Secret Management 78 | 79 | The following secrets need to be stored in a [Databricks Secret Scope](https://docs.microsoft.com/en-us/azure/databricks/security/secrets/): 80 | 81 | - Application Insights Instrumentation Key 82 | - Azure ADLS Gen2 Storage Details (account name, container name, shared access key) 83 | 84 | Secrets are read using `dbutils.secrets.get`, for example: 85 | 86 | ```py 87 | secret_value = dbutils.secrets.get(scope = "", key = "") 88 | ``` 89 | 90 | ## References 91 | 92 | 1. [Enable pip magic commands](https://databricks.com/blog/2020/06/17/simplify-python-environment-management-on-databricks-runtime-for-machine-learning-using-pip-and-conda.html) 93 | 2. [OpenCensus](https://docs.microsoft.com/en-us/azure/azure-monitor/app/opencensus-python) 94 | 3. [DataBricks Job API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs) 95 | 4. [DataBricks Cluster API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/clusters) 96 | 5. 
[DataBricks CLI](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/) 97 | -------------------------------------------------------------------------------- /ml_ops/orchestrator/taxi_fares_orchestrator_batch_score.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | """Orchestrator notebook for taxifares training.""" 3 | # Initialization of dbutils to avoid linting errors during developing in vscode 4 | from pyspark.sql import SparkSession 5 | 6 | 7 | def get_dbutils(spark): 8 | """Return dbutils for databricks.""" 9 | if spark.conf.get("spark.databricks.service.client.enabled") == "true": 10 | from pyspark.dbutils import DBUtils 11 | 12 | return DBUtils(spark) 13 | else: 14 | import IPython 15 | 16 | return IPython.get_ipython().user_ns["dbutils"] 17 | 18 | 19 | spark = SparkSession.builder.appName("Pipeline").getOrCreate() 20 | dbutils = get_dbutils(spark) 21 | 22 | # COMMAND ---------- 23 | 24 | # Define parameters 25 | dbutils.widgets.text( 26 | "taxi_fares_raw_data", "/databricks-datasets/nyctaxi-with-zipcodes/subsampled" 27 | ) 28 | dbutils.widgets.text("taxi_fares_mount_point", "/mnt/data") 29 | dbutils.widgets.text("mlflow_experiment_id", "") 30 | dbutils.widgets.text("wheel_package_dbfs_base_path", "") 31 | dbutils.widgets.text("wheel_package_taxi_fares_version", "") 32 | dbutils.widgets.text("wheel_package_taxi_fares_mlops_version", "") 33 | dbutils.widgets.text("execute_feature_engineering", "true") 34 | dbutils.widgets.text("trained_model_version", "") 35 | dbutils.widgets.text("scoring_data_start_date", "2016-02-01") 36 | dbutils.widgets.text("training_data_end_date", "2016-02-29") 37 | 38 | # COMMAND ---------- 39 | 40 | # Get wheel package parameters 41 | wheel_package_dbfs_base_path = dbutils.widgets.get( 42 | "wheel_package_dbfs_base_path") 43 | wheel_package_taxi_fares_version = dbutils.widgets.get( 44 | "wheel_package_taxi_fares_version" 45 | ) 46 | wheel_package_taxi_fares_mlops_version = dbutils.widgets.get( 47 | "wheel_package_taxi_fares_mlops_version" 48 | ) 49 | 50 | # COMMAND ---------- 51 | 52 | # MAGIC %pip install $wheel_package_dbfs_base_path/taxi_fares-$wheel_package_taxi_fares_version-py3-none-any.whl # noqa: E501 53 | # MAGIC %pip install $wheel_package_dbfs_base_path/taxi_fares_mlops-$wheel_package_taxi_fares_mlops_version-py3-none-any.whl # noqa: E501 54 | 55 | # COMMAND ---------- 56 | 57 | # Imports 58 | import os # noqa: E402 59 | import shutil # noqa: E402 60 | from datetime import datetime # noqa: E402 61 | from pathlib import Path # noqa: E402 62 | 63 | import mlflow # noqa: E402 64 | from databricks import feature_store # noqa: E402 65 | from monitoring.app_logger import AppLogger, get_disabled_logger # noqa: E402 66 | from taxi_fares.utils.pyspark_utils import rounded_taxi_data # noqa: E402 67 | from taxi_fares_mlops.feature_engineering import run as run_feature_engineering # noqa 68 | from taxi_fares_mlops.scoring_batch import run as run_scoring_batch # noqa: E402 69 | 70 | # COMMAND ---------- 71 | 72 | # Get other parameters 73 | mlflow_experiment_id = dbutils.widgets.get("mlflow_experiment_id") 74 | execute_feature_engineering = dbutils.widgets.get( 75 | "execute_feature_engineering") 76 | taxi_fares_raw_data = dbutils.widgets.get("taxi_fares_raw_data") 77 | taxi_fares_mount_point = dbutils.widgets.get("taxi_fares_mount_point") 78 | trained_model_version = dbutils.widgets.get("trained_model_version") 79 | scoring_data_start_date = 
dbutils.widgets.get("scoring_data_start_date") 80 | scoring_data_end_date = dbutils.widgets.get("scoring_data_end_date") 81 | 82 | # COMMAND ---------- 83 | 84 | # Initiate mlflow experiment 85 | mlflow.start_run(experiment_id=int(mlflow_experiment_id), 86 | run_name="batch_scoring") 87 | mlflow_run = mlflow.active_run() 88 | mlflow_run_id = mlflow_run.info.run_id 89 | mlflow_log_tmp_dir = "/tmp/" + str(mlflow_run_id) # nosec: B108 90 | Path(mlflow_log_tmp_dir).mkdir(parents=True, exist_ok=True) 91 | 92 | # initiate app logger 93 | if any( 94 | [ 95 | True 96 | for secret in dbutils.secrets.list(scope="azure-databricks-mlops-mlflow") 97 | if "app_insights_key" in secret.key 98 | ] 99 | ): 100 | app_insights_key = dbutils.secrets.get( 101 | scope="azure-databricks-mlops-mlflow", key="app_insights_key" 102 | ) 103 | config = {"app_insights_key": app_insights_key} 104 | app_logger = AppLogger(config=config) 105 | else: 106 | app_logger = get_disabled_logger() 107 | try: 108 | logger = app_logger.get_logger( 109 | component_name="Batch_Score_Orchestrator", 110 | custom_dimensions={ 111 | "mlflow_run_id": mlflow_run_id, 112 | "mlflow_experiment_id": int(mlflow_experiment_id), 113 | }, 114 | ) 115 | tracer = app_logger.get_tracer( 116 | component_name="Batch_Score_Orchestrator", 117 | ) 118 | except Exception as ex: 119 | print(ex) 120 | mlflow.end_run() 121 | shutil.rmtree(mlflow_log_tmp_dir, ignore_errors=True) 122 | raise Exception(f"ERROR - in initializing app logger - {ex}") from ex 123 | 124 | logger.info(f"Stating batch scoring with mlflow run id {mlflow_run_id}") 125 | 126 | # COMMAND ---------- 127 | 128 | # Mount ADLS Gen2 storage container 129 | try: 130 | logger.info(f"Mounting {taxi_fares_mount_point}") 131 | if any(mount.mountPoint == taxi_fares_mount_point for mount in dbutils.fs.mounts()): 132 | logger.info(f"Mount point exists {taxi_fares_mount_point}") 133 | else: 134 | storage_account_name = dbutils.secrets.get( 135 | scope="azure-databricks-mlops-mlflow", key="azure-blob-storage-account-name" 136 | ) 137 | storage_container_name = dbutils.secrets.get( 138 | scope="azure-databricks-mlops-mlflow", 139 | key="azure-blob-storage-container-name", 140 | ) 141 | storage_shared_key_name = dbutils.secrets.get( 142 | scope="azure-databricks-mlops-mlflow", 143 | key="azure-blob-storage-shared-access-key", 144 | ) 145 | dbutils.fs.mount( 146 | source=f"wasbs://{storage_container_name}@{storage_account_name}.blob.core.windows.net", # noqa: E501 147 | mount_point=taxi_fares_mount_point, 148 | extra_configs={ 149 | f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net": storage_shared_key_name # noqa: E501 150 | }, 151 | ) 152 | except Exception as ex: 153 | print(ex) 154 | mlflow.end_run() 155 | shutil.rmtree(mlflow_log_tmp_dir, ignore_errors=True) 156 | logger.exception(f"ERROR - in mounting adls - {ex}") 157 | raise Exception(f"ERROR - in mounting adls - {ex}") from ex 158 | 159 | # COMMAND ---------- 160 | 161 | # Clean up function 162 | 163 | 164 | def clean(): 165 | dbutils.fs.unmount(taxi_fares_mount_point) 166 | mlflow.log_artifacts(mlflow_log_tmp_dir) 167 | shutil.rmtree(mlflow_log_tmp_dir) 168 | mlflow.end_run() 169 | 170 | 171 | # COMMAND ---------- 172 | 173 | # Get batch scoring raw data 174 | try: 175 | logger.info("Reading training raw data") 176 | raw_data_file = taxi_fares_raw_data 177 | raw_data = spark.read.format("delta").load(raw_data_file) 178 | mlflow.log_param("data_raw_rows", raw_data.count()) 179 | mlflow.log_param("data_raw_cols", 
len(raw_data.columns)) 180 | except Exception as ex: 181 | clean() 182 | logger.exception(f"ERROR - in reading raw data - {ex}") 183 | raise Exception(f"ERROR - in reading raw data - {ex}") from ex 184 | 185 | # COMMAND ---------- 186 | 187 | 188 | # Run feature engineering on batch scoring raw data 189 | if execute_feature_engineering == "true": 190 | try: 191 | logger.info("Starting feature engineering") 192 | with tracer.span("run_feature_engineering"): 193 | feature_engineered_data = run_feature_engineering( 194 | df_input=raw_data, 195 | start_date=datetime.strptime( 196 | scoring_data_start_date, "%Y-%m-%d"), 197 | end_date=datetime.strptime(scoring_data_end_date, "%Y-%m-%d"), 198 | mlflow=mlflow, 199 | mlflow_log_tmp_dir=mlflow_log_tmp_dir, 200 | explain_features=True, 201 | app_logger=app_logger, 202 | parent_tracer=tracer, 203 | ) 204 | except Exception as ex: 205 | clean() 206 | logger.exception(f"ERROR - in feature engineering - {ex}") 207 | raise Exception(f"ERROR - in feature engineering - {ex}") from ex 208 | else: 209 | logger.info("Skipping feature engineering") 210 | 211 | # COMMAND ---------- 212 | 213 | # MAGIC %sql 214 | # MAGIC CREATE DATABASE IF NOT EXISTS feature_store_taxi_example; 215 | 216 | # COMMAND ---------- 217 | 218 | # Save features to feature store 219 | fs = feature_store.FeatureStoreClient() 220 | if execute_feature_engineering == "true": 221 | try: 222 | spark.conf.set("spark.sql.shuffle.partitions", "5") 223 | 224 | fs.create_table( 225 | name="feature_store_taxi_example.trip_pickup_features", 226 | primary_keys=["zip", "ts"], 227 | df=feature_engineered_data[0], 228 | partition_columns="yyyy_mm", 229 | description="Taxi Fares. Pickup Features", 230 | ) 231 | fs.create_table( 232 | name="feature_store_taxi_example.trip_dropoff_features", 233 | primary_keys=["zip", "ts"], 234 | df=feature_engineered_data[1], 235 | partition_columns="yyyy_mm", 236 | description="Taxi Fares. 
Dropoff Features", 237 | ) 238 | 239 | # Write the pickup features DataFrame to the feature store table 240 | fs.write_table( 241 | name="feature_store_taxi_example.trip_pickup_features", 242 | df=feature_engineered_data[0], 243 | mode="merge", 244 | ) 245 | # Write the dropoff features DataFrame to the feature store table 246 | fs.write_table( 247 | name="feature_store_taxi_example.trip_dropoff_features", 248 | df=feature_engineered_data[1], 249 | mode="merge", 250 | ) 251 | except Exception as ex: 252 | clean() 253 | logger.exception( 254 | f"ERROR - in feature saving into feature store - {ex}") 255 | raise Exception( 256 | f"ERROR - in feature saving into feature store - {ex}") from ex 257 | else: 258 | logger.info("Skipping feature saving into feature store") 259 | 260 | # COMMAND ---------- 261 | 262 | # Batch scoring 263 | try: 264 | logger.info("Starting batch scoring") 265 | with tracer.span("run_scoring_batch"): 266 | run_scoring_batch( 267 | trained_model_name="taxi_fares", 268 | score_df=rounded_taxi_data(raw_data), 269 | mlflow=mlflow, 270 | mlflow_log_tmp_dir=mlflow_log_tmp_dir, 271 | trained_model_version=trained_model_version, 272 | app_logger=app_logger, 273 | parent_tracer=tracer, 274 | ) 275 | except Exception as ex: 276 | clean() 277 | logger.exception(f"ERROR - in batch scoring - {ex}") 278 | raise Exception(f"ERROR - in batch scoring - {ex}") from ex 279 | 280 | 281 | # COMMAND ---------- 282 | 283 | # Batch scoring result publish 284 | try: 285 | logger.info("Starting batch scoring result publish to adls") 286 | with tracer.span("publish_result"): 287 | result_path = "/".join( 288 | [ 289 | "/dbfs", 290 | taxi_fares_mount_point, 291 | "batch_scoring_result", 292 | str(mlflow_run_id), 293 | ] 294 | ) 295 | Path(result_path).mkdir(parents=True, exist_ok=True) 296 | shutil.copyfile( 297 | os.path.join(mlflow_log_tmp_dir, "batch_scoring_result.html"), 298 | os.path.join( 299 | result_path, 300 | "batch_scoring_result.html", 301 | ), 302 | ) 303 | shutil.copyfile( 304 | os.path.join(mlflow_log_tmp_dir, "batch_scoring_result.csv"), 305 | os.path.join( 306 | result_path, 307 | "batch_scoring_result.csv", 308 | ), 309 | ) 310 | logger.info(f"Published score result in {result_path}") 311 | except Exception as ex: 312 | clean() 313 | logger.exception(f"ERROR - in batch scoring result publish to adls - {ex}") 314 | raise Exception( 315 | f"ERROR - in batch scoring result publish to adls - {ex}") from ex 316 | 317 | 318 | # COMMAND ---------- 319 | 320 | # End 321 | logger.info(f"Completed batch scoring with mlflow run id {mlflow_run_id}") 322 | clean() 323 | -------------------------------------------------------------------------------- /ml_ops/orchestrator/taxi_fares_orchestrator_train.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | """Orchestrator notebook for taxifares training.""" 3 | # Initialization of dbutils to avoid linting errors during developing in vscode 4 | from pyspark.sql import SparkSession 5 | 6 | 7 | def get_dbutils(spark): 8 | """Return dbutils for databricks.""" 9 | if spark.conf.get("spark.databricks.service.client.enabled") == "true": 10 | from pyspark.dbutils import DBUtils 11 | 12 | return DBUtils(spark) 13 | else: 14 | import IPython 15 | 16 | return IPython.get_ipython().user_ns["dbutils"] 17 | 18 | 19 | spark = SparkSession.builder.appName("Pipeline").getOrCreate() 20 | dbutils = get_dbutils(spark) 21 | 22 | # COMMAND ---------- 23 | 24 | # Define parameters 25 | 
dbutils.widgets.text( 26 | "taxi_fares_raw_data", "/databricks-datasets/nyctaxi-with-zipcodes/subsampled" 27 | ) 28 | dbutils.widgets.text("mlflow_experiment_id", "") 29 | dbutils.widgets.text("wheel_package_dbfs_base_path", "") 30 | dbutils.widgets.text("wheel_package_taxi_fares_version", "") 31 | dbutils.widgets.text("wheel_package_taxi_fares_mlops_version", "") 32 | dbutils.widgets.text("execute_feature_engineering", "true") 33 | dbutils.widgets.text("training_data_start_date", "2016-01-01") 34 | dbutils.widgets.text("training_data_end_date", "2016-01-31") 35 | dbutils.widgets.text("training_num_leaves", "32") 36 | dbutils.widgets.text("training_objective", "regression") 37 | dbutils.widgets.text("training_metric", "rmse") 38 | dbutils.widgets.text("training_num_rounds", "100") 39 | 40 | # COMMAND ---------- 41 | 42 | # Get wheel package parameters 43 | wheel_package_dbfs_base_path = dbutils.widgets.get( 44 | "wheel_package_dbfs_base_path") 45 | wheel_package_taxi_fares_version = dbutils.widgets.get( 46 | "wheel_package_taxi_fares_version" 47 | ) 48 | wheel_package_taxi_fares_mlops_version = dbutils.widgets.get( 49 | "wheel_package_taxi_fares_mlops_version" 50 | ) 51 | 52 | # COMMAND ---------- 53 | 54 | # MAGIC %pip install $wheel_package_dbfs_base_path/taxi_fares-$wheel_package_taxi_fares_version-py3-none-any.whl # noqa: E501 55 | # MAGIC %pip install $wheel_package_dbfs_base_path/taxi_fares_mlops-$wheel_package_taxi_fares_mlops_version-py3-none-any.whl # noqa: E501 56 | 57 | # COMMAND ---------- 58 | 59 | # Imports 60 | import shutil # noqa: E402 61 | from datetime import datetime # noqa: E402 62 | from pathlib import Path # noqa: E402 63 | 64 | import mlflow # noqa: E402 65 | from databricks import feature_store # noqa: E402 66 | from databricks.feature_store import FeatureLookup # noqa: E402 67 | from monitoring.app_logger import AppLogger, get_disabled_logger # noqa: E402 68 | from taxi_fares.utils.pyspark_utils import rounded_taxi_data # noqa: E402 69 | from taxi_fares_mlops.feature_engineering import run as run_feature_engineering # noqa 70 | from taxi_fares_mlops.publish_model import run as run_publish_model # noqa: E402 71 | from taxi_fares_mlops.training import run as run_training # noqa: E402 72 | 73 | # COMMAND ---------- 74 | 75 | # Get other parameters 76 | mlflow_experiment_id = dbutils.widgets.get("mlflow_experiment_id") 77 | execute_feature_engineering = dbutils.widgets.get( 78 | "execute_feature_engineering") 79 | training_data_start_date = dbutils.widgets.get("training_data_start_date") 80 | training_data_end_date = dbutils.widgets.get("training_data_end_date") 81 | taxi_fares_raw_data = dbutils.widgets.get("taxi_fares_raw_data") 82 | training_num_leaves = int(dbutils.widgets.get("training_num_leaves")) 83 | training_objective = dbutils.widgets.get("training_objective") 84 | training_metric = dbutils.widgets.get("training_metric") 85 | training_num_rounds = int(dbutils.widgets.get("training_num_rounds")) 86 | 87 | # COMMAND ---------- 88 | 89 | # Initiate mlflow experiment 90 | mlflow.start_run(experiment_id=int(mlflow_experiment_id), run_name="training") 91 | mlflow_run = mlflow.active_run() 92 | mlflow_run_id = mlflow_run.info.run_id 93 | mlflow_log_tmp_dir = "/tmp/" + str(mlflow_run_id) # nosec: B108 94 | Path(mlflow_log_tmp_dir).mkdir(parents=True, exist_ok=True) 95 | 96 | # initiate app logger 97 | if any( 98 | [ 99 | True 100 | for secret in dbutils.secrets.list(scope="azure-databricks-mlops-mlflow") 101 | if "app_insights_key" in secret.key 102 | ] 103 | ): 
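# The secret scope and key read below are assumed to already exist in the workspace.
# A minimal setup sketch using the (legacy) Databricks CLI, with the scope and key
# names that this notebook expects (adjust to your environment):
#
#   databricks secrets create-scope --scope azure-databricks-mlops-mlflow
#   databricks secrets put --scope azure-databricks-mlops-mlflow --key app_insights_key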
104 | app_insights_key = dbutils.secrets.get( 105 | scope="azure-databricks-mlops-mlflow", key="app_insights_key" 106 | ) 107 | config = {"app_insights_key": app_insights_key} 108 | app_logger = AppLogger(config=config) 109 | else: 110 | app_logger = get_disabled_logger() 111 | try: 112 | logger = app_logger.get_logger( 113 | component_name="Train_Orchestrator", 114 | custom_dimensions={ 115 | "mlflow_run_id": mlflow_run_id, 116 | "mlflow_experiment_id": int(mlflow_experiment_id), 117 | }, 118 | ) 119 | tracer = app_logger.get_tracer( 120 | component_name="Train_Orchestrator", 121 | ) 122 | except Exception as ex: 123 | print(ex) 124 | mlflow.end_run() 125 | shutil.rmtree(mlflow_log_tmp_dir, ignore_errors=True) 126 | raise Exception(f"ERROR - in initializing app logger - {ex}") from ex 127 | 128 | logger.info(f"Stating training with mlflow run id {mlflow_run_id}") 129 | 130 | # COMMAND ---------- 131 | 132 | # Clean up function 133 | 134 | 135 | def clean(): 136 | mlflow.log_artifacts(mlflow_log_tmp_dir) 137 | shutil.rmtree(mlflow_log_tmp_dir) 138 | mlflow.end_run() 139 | 140 | 141 | # COMMAND ---------- 142 | 143 | # Get training raw data 144 | try: 145 | logger.info("Reading training raw data") 146 | raw_data_file = taxi_fares_raw_data 147 | raw_data = spark.read.format("delta").load(raw_data_file) 148 | mlflow.log_param("data_raw_rows", raw_data.count()) 149 | mlflow.log_param("data_raw_cols", len(raw_data.columns)) 150 | except Exception as ex: 151 | clean() 152 | logger.exception(f"ERROR - in reading raw data - {ex}") 153 | raise Exception(f"ERROR - in reading raw data - {ex}") from ex 154 | 155 | # COMMAND ---------- 156 | 157 | # Run feature engineering 158 | if execute_feature_engineering == "true": 159 | try: 160 | logger.info("Starting feature engineering") 161 | with tracer.span("run_feature_engineering"): 162 | feature_engineered_data = run_feature_engineering( 163 | df_input=raw_data, 164 | start_date=datetime.strptime( 165 | training_data_start_date, "%Y-%m-%d"), 166 | end_date=datetime.strptime(training_data_end_date, "%Y-%m-%d"), 167 | mlflow=mlflow, 168 | mlflow_log_tmp_dir=mlflow_log_tmp_dir, 169 | explain_features=True, 170 | app_logger=app_logger, 171 | parent_tracer=tracer, 172 | ) 173 | except Exception as ex: 174 | clean() 175 | logger.exception(f"ERROR - in feature engineering - {ex}") 176 | raise Exception(f"ERROR - in feature engineering - {ex}") from ex 177 | else: 178 | logger.info("Skipping feature engineering") 179 | 180 | # COMMAND ---------- 181 | 182 | # MAGIC %sql 183 | # MAGIC CREATE DATABASE IF NOT EXISTS feature_store_taxi_example; 184 | 185 | # COMMAND ---------- 186 | 187 | # Save features to feature store 188 | fs = feature_store.FeatureStoreClient() 189 | if execute_feature_engineering == "true": 190 | try: 191 | spark.conf.set("spark.sql.shuffle.partitions", "5") 192 | 193 | fs.create_table( 194 | name="feature_store_taxi_example.trip_pickup_features", 195 | primary_keys=["zip", "ts"], 196 | df=feature_engineered_data[0], 197 | partition_columns="yyyy_mm", 198 | description="Taxi Fares. Pickup Features", 199 | ) 200 | fs.create_table( 201 | name="feature_store_taxi_example.trip_dropoff_features", 202 | primary_keys=["zip", "ts"], 203 | df=feature_engineered_data[1], 204 | partition_columns="yyyy_mm", 205 | description="Taxi Fares. 
Dropoff Features", 206 | ) 207 | 208 | # Write the pickup features DataFrame to the feature store table 209 | fs.write_table( 210 | name="feature_store_taxi_example.trip_pickup_features", 211 | df=feature_engineered_data[0], 212 | mode="merge", 213 | ) 214 | # Write the dropoff features DataFrame to the feature store table 215 | fs.write_table( 216 | name="feature_store_taxi_example.trip_dropoff_features", 217 | df=feature_engineered_data[1], 218 | mode="merge", 219 | ) 220 | except Exception as ex: 221 | clean() 222 | logger.exception( 223 | f"ERROR - in feature saving into feature store - {ex}") 224 | raise Exception( 225 | f"ERROR - in feature saving into feature store - {ex}") from ex 226 | else: 227 | logger.info("Skipping feature saving into feature store") 228 | 229 | # COMMAND ---------- 230 | 231 | # Load features from feature store 232 | try: 233 | pickup_features_table = "feature_store_taxi_example.trip_pickup_features" 234 | dropoff_features_table = "feature_store_taxi_example.trip_dropoff_features" 235 | 236 | pickup_feature_lookups = [ 237 | FeatureLookup( 238 | table_name=pickup_features_table, 239 | feature_names=[ 240 | "mean_fare_window_1h_pickup_zip", 241 | "count_trips_window_1h_pickup_zip", 242 | ], 243 | lookup_key=["pickup_zip", "rounded_pickup_datetime"], 244 | ), 245 | ] 246 | 247 | dropoff_feature_lookups = [ 248 | FeatureLookup( 249 | table_name=dropoff_features_table, 250 | feature_names=["count_trips_window_30m_dropoff_zip", 251 | "dropoff_is_weekend"], 252 | lookup_key=["dropoff_zip", "rounded_dropoff_datetime"], 253 | ), 254 | ] 255 | 256 | # unless additional feature engineering was performed, 257 | # exclude them to avoid training on them. 258 | exclude_columns = ["rounded_pickup_datetime", "rounded_dropoff_datetime"] 259 | 260 | # Create the training set that includes the raw input data merged with 261 | # corresponding features from both feature tables 262 | with tracer.span("create_training_set"): 263 | training_set = fs.create_training_set( 264 | rounded_taxi_data(raw_data), 265 | feature_lookups=pickup_feature_lookups + dropoff_feature_lookups, 266 | label="fare_amount", 267 | exclude_columns=exclude_columns, 268 | ) 269 | 270 | # Load the TrainingSet into a dataframe which can be passed into 271 | # sklearn for training a model 272 | training_df = training_set.load_df() 273 | 274 | logger.info( 275 | f"Shape of training dataframe, rows: {training_df.count()}, cols: {len(training_df.columns)}" # noqa: E501 276 | ) 277 | mlflow.log_param("training_data_rows", training_df.count()) 278 | mlflow.log_param("training_data_columns", len(training_df.columns)) 279 | except Exception as ex: 280 | clean() 281 | logger.exception(f"ERROR - in feature loading from feature store - {ex}") 282 | raise Exception( 283 | f"ERROR - in feature loading from feature store - {ex}") from ex 284 | 285 | # COMMAND ---------- 286 | 287 | # Run training 288 | try: 289 | logger.info("Starting model training") 290 | params = { 291 | "num_leaves": training_num_leaves, 292 | "objective": training_objective, 293 | "metric": training_metric, 294 | } 295 | num_rounds = training_num_rounds 296 | with tracer.span("run_training"): 297 | trained_model = run_training( 298 | training_df, 299 | mlflow, 300 | params=params, 301 | num_rounds=num_rounds, 302 | app_logger=app_logger, 303 | parent_tracer=tracer, 304 | ) 305 | except Exception as ex: 306 | clean() 307 | logger.exception(f"ERROR - in model training - {ex}") 308 | raise Exception(f"ERROR - in model training - {ex}") from ex 309 | 310 | 
# COMMAND ---------- 311 | 312 | # Publish trained model 313 | try: 314 | logger.info("Starting publish model") 315 | with tracer.span("run_publish_model"): 316 | run_publish_model( 317 | trained_model=trained_model, 318 | training_set=training_set, 319 | mlflow=mlflow, 320 | model_name="taxi_fares", 321 | app_logger=app_logger, 322 | parent_tracer=tracer, 323 | ) 324 | except Exception as ex: 325 | clean() 326 | logger.exception(f"ERROR - in publish trained model - {ex}") 327 | raise Exception(f"ERROR - in publish trained model - {ex}") from ex 328 | 329 | # COMMAND ---------- 330 | 331 | # End 332 | logger.info(f"Completed training with mlflow run id {mlflow_run_id}") 333 | clean() 334 | -------------------------------------------------------------------------------- /ml_ops/src/README.md: -------------------------------------------------------------------------------- 1 | # SRC 2 | 3 | ## Overview 4 | 5 | Source code for MLOps, based on - 6 | 7 | 1. `taxifares_mlops` contains MLOps source code for `taxifares` machine learning code. 8 | 2. The MLOps Python functions will be called from orchestrator Databricks Notebook. 9 | 3. Ops related integrations (MLflow and Application Insights Metrics, Tracing, etc.) may happen in MLOps source code. 10 | 4. Mostly no Machine Learning (data science) related logics will be written in MLOps. 11 | 5. DataFrame I/O will happen in orchestrator Databricks Notebook, not in MLOps source code. 12 | -------------------------------------------------------------------------------- /ml_ops/src/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | 6 | def read(fname): 7 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 8 | 9 | 10 | requirements_file_name = "requirements.txt" 11 | with open(requirements_file_name) as f: 12 | required_packages = f.read().splitlines() 13 | required_packages = [ 14 | package.strip(" ") 15 | for package in required_packages 16 | if package.strip(" ") and "#" not in package 17 | ] 18 | 19 | setup( 20 | name="taxi_fares_mlops", 21 | version="0.0.1", 22 | author="", 23 | author_email="", 24 | description=(""), 25 | license="", 26 | keywords="", 27 | url="", 28 | package_dir={"": "ml_ops/src"}, 29 | packages=find_packages(where="ml_ops/src"), 30 | classifiers=[], 31 | install_requires=required_packages, 32 | ) 33 | -------------------------------------------------------------------------------- /ml_ops/src/taxi_fares_mlops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_ops/src/taxi_fares_mlops/__init__.py -------------------------------------------------------------------------------- /ml_ops/src/taxi_fares_mlops/feature_engineering.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | from datetime import datetime 4 | from pathlib import Path 5 | from typing import Tuple 6 | 7 | import matplotlib.pyplot as plt 8 | import mlflow 9 | import seaborn as sns 10 | from monitoring.app_logger import AppLogger, get_disabled_logger 11 | from opencensus.trace.tracer import Tracer 12 | from pyspark.sql.dataframe import DataFrame 13 | from taxi_fares.feature_eng.features import dropoff_features_fn, pickup_features_fn 14 | 15 | 16 | def run( 17 | df_input: DataFrame, 18 | start_date: 
datetime, 19 | end_date: datetime, 20 | mlflow: mlflow, 21 | mlflow_log_tmp_dir: str, 22 | explain_features: bool = True, 23 | app_logger: AppLogger = get_disabled_logger(), 24 | parent_tracer: Tracer = None, 25 | ) -> Tuple[DataFrame, DataFrame]: 26 | """MLOps feature engineering entry point. 27 | 28 | Args: 29 | df_input (pd.DataFrame): input data - raw 30 | mlflow (mlflow): mlflow object that is having an active run 31 | initiated by mlflow.start_run 32 | mlflow_log_tmp_dir (str): directory for putting files to be logged 33 | in mlflow artifacts 34 | explain_features (bool, optional): explain features, possible only with 35 | training data. Defaults to True. 36 | app_logger (monitoring.app_logger.AppLogger): AppLogger object default 37 | to monitoring.app_logger.get_disabled_logger 38 | parent_tracer (Tracer): OpenCensus parent tracer for correlation 39 | Returns: 40 | pd.DataFrame: clean and feature engineered data 41 | """ 42 | logger = logging.getLogger(__name__) 43 | try: 44 | component_name = "Taxi_Fares_Feature_Eng" 45 | # mlflow tracking 46 | mlflow_run = mlflow.active_run() 47 | mlflow_run_id = mlflow_run.info.run_id 48 | mlflow_experiment_id = mlflow_run.info.experiment_id 49 | 50 | logger = app_logger.get_logger( 51 | component_name=component_name, 52 | custom_dimensions={ 53 | "mlflow_run_id": mlflow_run_id, 54 | "mlflow_experiment_id": mlflow_experiment_id, 55 | }, 56 | ) 57 | tracer = app_logger.get_tracer( 58 | component_name=component_name, parent_tracer=parent_tracer 59 | ) 60 | 61 | logger.info("Running MLOps feature engineering") 62 | logger.info( 63 | f"Shape of input dataframe, rows: {df_input.count()}, cols: {len(df_input.columns)}" # noqa: E501 64 | ) 65 | 66 | logger.info("Getting pickup features") 67 | with tracer.span("pickup_features"): 68 | pickup_features = pickup_features_fn( 69 | df_input, 70 | ts_column="tpep_pickup_datetime", 71 | start_date=start_date, 72 | end_date=end_date, 73 | ) 74 | logger.info( 75 | f"Shape of pickup features dataframe, rows: {pickup_features.count()}, cols: {len(pickup_features.columns)}" # noqa: E501 76 | ) 77 | mlflow.log_param( 78 | "feature_engineering_pickup_features", 79 | (pickup_features.count(), len(pickup_features.columns)), 80 | ) 81 | 82 | logger.info("Getting drop off features") 83 | with tracer.span("dropoff_features"): 84 | dropoff_features = dropoff_features_fn( 85 | df_input, 86 | ts_column="tpep_dropoff_datetime", 87 | start_date=start_date, 88 | end_date=end_date, 89 | ) 90 | logger.info( 91 | f"Shape of dropoff features dataframe, rows: {dropoff_features.count()}, cols: {len(dropoff_features.columns)}" # noqa: E501 92 | ) 93 | mlflow.log_param( 94 | "feature_engineering_dropoff_features", 95 | (dropoff_features.count(), len(dropoff_features.columns)), 96 | ) 97 | 98 | with tracer.span("explain_features"): 99 | if explain_features: 100 | logger.info("Getting feature explanations - statistics") 101 | feature_statistic_pickup_features = ( 102 | pickup_features.describe().toPandas() 103 | ) 104 | feature_statistic_pickup_features.to_html( 105 | Path( 106 | mlflow_log_tmp_dir, 107 | "feature_statistic_pickup_features.html", 108 | ), 109 | justify="center", 110 | na_rep="", 111 | ) 112 | feature_statistic_dropoff_features = ( 113 | dropoff_features.describe().toPandas() 114 | ) 115 | feature_statistic_dropoff_features.to_html( 116 | Path( 117 | mlflow_log_tmp_dir, 118 | "feature_statistic_dropoff_features.html", 119 | ), 120 | justify="center", 121 | na_rep="", 122 | ) 123 | logger.info("Getting feature explanations 
- box plot") 124 | pickup_features_pandas = pickup_features.toPandas()[ 125 | [ 126 | "mean_fare_window_1h_pickup_zip", 127 | "count_trips_window_1h_pickup_zip", 128 | ] 129 | ] 130 | numeric_cols = pickup_features_pandas.columns 131 | plot_data = pickup_features_pandas.copy() 132 | select_top_k = len(numeric_cols) 133 | n_col = 2 134 | n_row = math.ceil(select_top_k / n_col) 135 | s_col = 5 136 | s_row = 3 137 | fig, axs = plt.subplots( 138 | n_row, n_col, figsize=(s_col * n_col, s_row * n_row), sharey=False 139 | ) 140 | axs = axs.flatten() 141 | for index, col in enumerate(numeric_cols[:select_top_k]): 142 | ax = sns.boxplot( 143 | x="count_trips_window_1h_pickup_zip", 144 | y=col, 145 | data=plot_data, 146 | ax=axs[index], 147 | ) 148 | ax.set(title=col, ylabel="") 149 | fig.tight_layout() 150 | fig.savefig( 151 | Path(mlflow_log_tmp_dir, "feature_pickup_features_boxplot.png") 152 | ) 153 | dropoff_features_pandas = dropoff_features.toPandas()[ 154 | ["count_trips_window_30m_dropoff_zip", "dropoff_is_weekend"] 155 | ] 156 | numeric_cols = dropoff_features_pandas.columns 157 | plot_data = dropoff_features_pandas.copy() 158 | select_top_k = len(numeric_cols) 159 | n_col = 2 160 | n_row = math.ceil(select_top_k / n_col) 161 | s_col = 5 162 | s_row = 3 163 | fig, axs = plt.subplots( 164 | n_row, n_col, figsize=(s_col * n_col, s_row * n_row), sharey=False 165 | ) 166 | axs = axs.flatten() 167 | for index, col in enumerate(numeric_cols[:select_top_k]): 168 | ax = sns.boxplot( 169 | x="dropoff_is_weekend", y=col, data=plot_data, ax=axs[index] 170 | ) 171 | ax.set(title=col, ylabel="") 172 | fig.tight_layout() 173 | fig.savefig( 174 | Path(mlflow_log_tmp_dir, "feature_dropoff_features_boxplot.png") 175 | ) 176 | 177 | logger.info("Completed MLOps feature engineering") 178 | return (pickup_features, dropoff_features) 179 | except Exception as exp: 180 | logger.error("an exception occurred in Feature Eng") 181 | raise Exception("an exception occurred in Feature Eng") from exp 182 | -------------------------------------------------------------------------------- /ml_ops/src/taxi_fares_mlops/publish_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import lightgbm as lgb 4 | import mlflow 5 | from databricks import feature_store 6 | from databricks.feature_store.training_set import TrainingSet 7 | from mlflow.entities.model_registry import ModelVersion 8 | from monitoring.app_logger import AppLogger, get_disabled_logger 9 | from opencensus.trace.tracer import Tracer 10 | 11 | from taxi_fares_mlops.utils import get_latest_model_version 12 | 13 | 14 | def run( 15 | trained_model: lgb.Booster, 16 | training_set: TrainingSet, 17 | mlflow: mlflow, 18 | model_name: str = "taxi_fares", 19 | app_logger: AppLogger = get_disabled_logger(), 20 | parent_tracer: Tracer = None, 21 | ) -> ModelVersion: 22 | """MLOps publish model in mlflow model registry - entry point. 23 | 24 | Args: 25 | trained_model (Ridge): trained Ridge model 26 | mlflow (mlflow): mlflow object that is having an active run 27 | initiated by mlflow.start_run 28 | model_name (str, optional): model name in mlflow model registry. 29 | Defaults to "taxi_fares". 
30 | app_logger (monitoring.app_logger.AppLogger): AppLogger object deafult 31 | to monitoring.app_logger.get_disabled_logger 32 | parent_tracer (Tracer): OpenCensus parent tracer for correlation 33 | Returns: 34 | mlflow.entities.model_registry.ModelVersion: registered model details 35 | """ 36 | logger = logging.getLogger(__name__) 37 | try: 38 | component_name = "Taxi_Fares_Publish_Model" 39 | 40 | # mlflow tracking 41 | mlflow_run = mlflow.active_run() 42 | mlflow_run_id = mlflow_run.info.run_id 43 | mlflow_experiment_id = mlflow_run.info.experiment_id 44 | 45 | logger = app_logger.get_logger( 46 | component_name=component_name, 47 | custom_dimensions={ 48 | "mlflow_run_id": mlflow_run_id, 49 | "mlflow_experiment_id": mlflow_experiment_id, 50 | }, 51 | ) 52 | tracer = app_logger.get_tracer( 53 | component_name=component_name, parent_tracer=parent_tracer 54 | ) 55 | 56 | logger.info("Publishing trained model into mlflow model registry") 57 | with tracer.span("register_model"): 58 | fs = feature_store.FeatureStoreClient() 59 | fs.log_model( 60 | trained_model, 61 | artifact_path="model_packaged", 62 | flavor=mlflow.lightgbm, 63 | training_set=training_set, 64 | registered_model_name=model_name, 65 | ) 66 | model_version = get_latest_model_version(model_name) 67 | mlflow.log_param("model_version", model_version) 68 | mlflow.log_param("model_name", model_name) 69 | 70 | logger.info(f"published model name: {model_name}, version: {model_version}") 71 | logger.info("Completed MLOps publish model") 72 | except Exception as exp: 73 | logger.error("an exception occurred in publish model") 74 | raise Exception("an exception occurred in publish model") from exp 75 | -------------------------------------------------------------------------------- /ml_ops/src/taxi_fares_mlops/scoring_batch.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | import mlflow 5 | import pandas as pd 6 | import pyspark.sql.functions as func 7 | from databricks import feature_store 8 | from monitoring.app_logger import AppLogger, get_disabled_logger 9 | from opencensus.trace.tracer import Tracer 10 | 11 | from taxi_fares_mlops.utils import get_latest_model_version 12 | 13 | 14 | def run( 15 | trained_model_name: str, 16 | score_df: pd.DataFrame, 17 | mlflow: mlflow, 18 | mlflow_log_tmp_dir: str, 19 | trained_model_version: str = None, 20 | app_logger: AppLogger = get_disabled_logger(), 21 | parent_tracer: Tracer = None, 22 | ) -> None: 23 | """[summary] 24 | 25 | Args: 26 | trained_model (Ridge): trained Ridge model 27 | df_input (pd.DataFrame): input dataframe for batch scoring, 28 | feature engineeringered. 
29 | mlflow (mlflow): mlflow object that is having an active run 30 | initiated by mlflow.start_run 31 | mlflow_log_tmp_dir (str): directory for puting files to be logged 32 | in mlflow artifacts 33 | app_logger (monitoring.app_logger.AppLogger): AppLogger object deafult 34 | to monitoring.app_logger.get_disabled_logger 35 | parent_tracer (Tracer): OpenCensus parent tracer for correlation 36 | """ 37 | logger = logging.getLogger(__name__) 38 | try: 39 | component_name = "Taxi_Fares_Scoring_Batch" 40 | # mlflow tracking 41 | mlflow_run = mlflow.active_run() 42 | mlflow_run_id = mlflow_run.info.run_id 43 | mlflow_experiment_id = mlflow_run.info.experiment_id 44 | 45 | logger = app_logger.get_logger( 46 | component_name=component_name, 47 | custom_dimensions={ 48 | "mlflow_run_id": mlflow_run_id, 49 | "mlflow_experiment_id": mlflow_experiment_id, 50 | }, 51 | ) 52 | tracer = app_logger.get_tracer( 53 | component_name=component_name, parent_tracer=parent_tracer 54 | ) 55 | 56 | logger.info("Running MLOps batch scoring") 57 | with tracer.span("batch_scoring"): 58 | cols = [ 59 | "fare_amount", 60 | "trip_distance", 61 | "pickup_zip", 62 | "dropoff_zip", 63 | "rounded_pickup_datetime", 64 | "rounded_dropoff_datetime", 65 | ] 66 | score_df_reordered = score_df.select(cols) 67 | if trained_model_version is None or trained_model_version == "": 68 | trained_model_version = get_latest_model_version( 69 | trained_model_name) 70 | else: 71 | trained_model_version = int(trained_model_version) 72 | model_uri = f"models:/{trained_model_name}/{trained_model_version}" 73 | mlflow.log_param("trained_model_version", trained_model_version) 74 | logger.info(f"trained model version {trained_model_version}") 75 | fs = feature_store.FeatureStoreClient() 76 | predictions = fs.score_batch(model_uri, score_df_reordered) 77 | cols = [ 78 | "prediction", 79 | "fare_amount", 80 | "trip_distance", 81 | "pickup_zip", 82 | "dropoff_zip", 83 | "rounded_pickup_datetime", 84 | "rounded_dropoff_datetime", 85 | "mean_fare_window_1h_pickup_zip", 86 | "count_trips_window_1h_pickup_zip", 87 | "count_trips_window_30m_dropoff_zip", 88 | "dropoff_is_weekend", 89 | ] 90 | 91 | with_predictions_reordered = ( 92 | predictions.select( 93 | cols, 94 | ) 95 | .withColumnRenamed( 96 | "prediction", 97 | "predicted_fare_amount", 98 | ) 99 | .withColumn( 100 | "predicted_fare_amount", 101 | func.round("predicted_fare_amount", 2), 102 | ) 103 | ) 104 | with_predictions_reordered.toPandas().to_html( 105 | Path( 106 | mlflow_log_tmp_dir, 107 | "batch_scoring_result.html", 108 | ), 109 | justify="center", 110 | na_rep="", 111 | ) 112 | with_predictions_reordered.toPandas().to_csv( 113 | Path( 114 | mlflow_log_tmp_dir, 115 | "batch_scoring_result.csv", 116 | ), 117 | index=False, 118 | ) 119 | logger.info("Completed MLOps batch scoring") 120 | except Exception as exp: 121 | logger.error("an exception occurred in scoring batch") 122 | raise Exception("an exception occurred in scoring batch") from exp 123 | -------------------------------------------------------------------------------- /ml_ops/src/taxi_fares_mlops/training.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict 3 | 4 | import lightgbm as lgb 5 | import mlflow 6 | import pandas as pd 7 | from monitoring.app_logger import AppLogger, get_disabled_logger 8 | from opencensus.trace.tracer import Tracer 9 | from taxi_fares.training.evaluate import get_model_metrics, split_data 10 | from taxi_fares.training.train 
import train 11 | 12 | 13 | def run( 14 |     train_df: pd.DataFrame, 15 |     mlflow: mlflow, 16 |     params: Dict = {"num_leaves": 32, 17 |                     "objective": "regression", "metric": "rmse"}, 18 |     num_rounds: int = 100, 19 |     app_logger: AppLogger = get_disabled_logger(), 20 |     parent_tracer: Tracer = None, 21 | ) -> lgb.Booster: 22 |     """MLOps training entry point. 23 | 24 |     Args: 25 |         train_df (pd.DataFrame): data for training, output of feature engineering 26 |         mlflow (mlflow): mlflow object that is having an active run 27 |             initiated by mlflow.start_run 28 |         app_logger (monitoring.app_logger.AppLogger): AppLogger object default 29 |             to monitoring.app_logger.get_disabled_logger 30 |         parent_tracer (Tracer): OpenCensus parent tracer for correlation 31 |     Returns: 32 |         lgb.Booster: trained LightGBM model 33 |     """ 34 |     logger = logging.getLogger(__name__) 35 |     try: 36 |         component_name = "Taxi_Fares_Training" 37 | 38 |         # mlflow tracking 39 |         mlflow_run = mlflow.active_run() 40 |         mlflow_run_id = mlflow_run.info.run_id 41 |         mlflow_experiment_id = mlflow_run.info.experiment_id 42 | 43 |         logger = app_logger.get_logger( 44 |             component_name=component_name, 45 |             custom_dimensions={ 46 |                 "mlflow_run_id": mlflow_run_id, 47 |                 "mlflow_experiment_id": mlflow_experiment_id, 48 |             }, 49 |         ) 50 |         tracer = app_logger.get_tracer( 51 |             component_name=component_name, parent_tracer=parent_tracer 52 |         ) 53 | 54 |         logger.info("Running MLOps training") 55 | 56 |         # params and num_rounds come from the function arguments 57 |         # (set by the caller, e.g. the training orchestrator widgets) 58 |         # instead of being hard-coded here 59 |         for k, v in params.items(): 60 |             logger.info(f"Training parameter {k}: {v}") 61 |         logger.info(f"Training parameter num_rounds: {num_rounds}") 62 | 63 |         logger.info("Splitting data for train and test") 64 |         data = split_data(train_df) 65 | 66 |         logger.info("Train the model") 67 |         with tracer.span("train_model"): 68 |             mlflow.lightgbm.autolog() 69 |             model = train(data["train"], params, num_rounds) 70 | 71 |         logger.info("Log the metrics for the model") 72 |         metrics = get_model_metrics(model, data["test"]) 73 |         for (k, v) in metrics.items(): 74 |             logger.info(f"Metric {k}: {v}") 75 |             mlflow.log_metric(k, v) 76 | 77 |         logger.info("Completed MLOps training") 78 |         return model 79 |     except Exception as exp: 80 |         logger.error("an exception occurred in training") 81 |         raise Exception("an exception occurred in training") from exp 82 | -------------------------------------------------------------------------------- /ml_ops/src/taxi_fares_mlops/utils.py: -------------------------------------------------------------------------------- 1 | from mlflow.tracking import MlflowClient 2 | 3 | 4 | def get_latest_model_version(model_name: str) -> int: 5 |     latest_version = 1 6 |     mlflow_client = MlflowClient() 7 |     for mv in mlflow_client.search_model_versions(f"name='{model_name}'"): 8 |         version_int = int(mv.version) 9 |         if version_int > latest_version: 10 |             latest_version = version_int 11 |     return latest_version 12 | -------------------------------------------------------------------------------- /ml_ops/tests/README.md: -------------------------------------------------------------------------------- 1 | # TESTS 2 | 3 | Unit test cases for `taxi_fares_mlops` MLOps source code. 
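A minimal sketch for running these tests locally, assuming `pytest` and the project requirements are installed (for example inside the dev container) and that the `taxi_fares` and `taxi_fares_mlops` packages are importable. Run from the `ml_ops` folder so that the relative test data path in `test_training.py` resolves; the repository `Makefile` may already provide an equivalent target:

```sh
cd ml_ops
python -m pytest tests
```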
4 | -------------------------------------------------------------------------------- /ml_ops/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_ops/tests/__init__.py -------------------------------------------------------------------------------- /ml_ops/tests/taxi_fares/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_ops/tests/taxi_fares/__init__.py -------------------------------------------------------------------------------- /ml_ops/tests/taxi_fares/data/taxi_fares_unit_test_training.csv: -------------------------------------------------------------------------------- 1 | trip_distance,pickup_zip,dropoff_zip,mean_fare_window_1h_pickup_zip,count_trips_window_1h_pickup_zip,count_trips_window_30m_dropoff_zip,dropoff_is_weekend,fare_amount 2 | 4.94,10282,10171,13,2,1,1,19 3 | 0.28,10110,10110,3.5,1,2,0,3.5 4 | 0.7,10103,10023,7.5,2,1,0,5 5 | 0.8,10022,10017,6,1,1,0,6 6 | 4.51,10110,10282,17,1,1,0,17 7 | 1.8,10009,10065,8,2,1,0,7 8 | 2.58,10153,10199,7.75,2,2,0,12 9 | 1.4,10112,10069,11,1,1,0,11 10 | 1.21,10023,10153,7.75,2,1,1,7.5 11 | 0.6,10012,10003,7.5,2,2,1,6 -------------------------------------------------------------------------------- /ml_ops/tests/taxi_fares/test_publish_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import unittest 3 | from unittest.mock import MagicMock, patch 4 | 5 | from taxi_fares_mlops.publish_model import run 6 | 7 | 8 | class TestEvaluateMethods(unittest.TestCase): 9 | logger = logging.getLogger(__name__) 10 | logging.basicConfig( 11 | format="%(asctime)s %(module)s %(levelname)s: %(message)s", 12 | datefmt="%m/%d/%Y %I:%M:%S %p", 13 | level=logging.INFO, 14 | ) 15 | 16 | @patch("taxi_fares_mlops.publish_model.feature_store") 17 | @patch("taxi_fares_mlops.publish_model.get_latest_model_version") 18 | def test_publish_model(self, mock_feature_store, mock_get_latest_model_version): 19 | self.logger.info("unittest test_publish_model") 20 | run(MagicMock(), MagicMock(), MagicMock()) 21 | assert True 22 | 23 | def test_publish_model_exception(self): 24 | self.logger.info("unittest test_publish_model exception") 25 | with self.assertRaises(Exception): 26 | run(None, None, None) 27 | assert True 28 | 29 | 30 | if __name__ == "__main__": 31 | unittest.main() 32 | -------------------------------------------------------------------------------- /ml_ops/tests/taxi_fares/test_training.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | from unittest.mock import MagicMock 5 | 6 | import lightgbm as lgb 7 | import pandas as pd 8 | from pyspark.sql import SparkSession 9 | from taxi_fares_mlops.training import run 10 | 11 | 12 | class TestEvaluateMethods(unittest.TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | cls.spark = ( 16 | SparkSession.builder.master("local[*]").appName("Unit-tests").getOrCreate() 17 | ) 18 | 19 | logger = logging.getLogger(__name__) 20 | logging.basicConfig( 21 | format="%(asctime)s %(module)s %(levelname)s: %(message)s", 22 | datefmt="%m/%d/%Y %I:%M:%S %p", 23 | level=logging.INFO, 24 | ) 25 | 26 | def test_training(self): 27 | 
self.logger.info("unittest test_training") 28 |         data_file = os.path.join( 29 |             "tests/taxi_fares/data", "taxi_fares_unit_test_training.csv" 30 |         ) 31 |         train_df_pandas = pd.read_csv(data_file) 32 |         train_df = self.spark.createDataFrame(train_df_pandas) 33 |         model = run(train_df, MagicMock()) 34 | 35 |         assert isinstance(model, lgb.Booster) 36 | 37 |     def test_training_exception(self): 38 |         self.logger.info("unittest test_training exception") 39 |         with self.assertRaises(Exception): 40 |             model = run(MagicMock(), MagicMock()) 41 |             assert model is not None 42 | 43 | 44 | if __name__ == "__main__": 45 |     unittest.main() 46 | -------------------------------------------------------------------------------- /ml_ops/tests/taxi_fares/test_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | 4 | from taxi_fares_mlops.utils import get_latest_model_version 5 | 6 | 7 | class TestUtils(unittest.TestCase): 8 |     @patch("taxi_fares_mlops.utils.MlflowClient") 9 |     def test_get_latest_model_version(self, mock_mlflow_client): 10 |         assert get_latest_model_version("taxi_fares") == 1 11 | -------------------------------------------------------------------------------- /ml_source/README.md: -------------------------------------------------------------------------------- 1 | # ML Source 2 | 3 | ## Overview 4 | 5 | This contains the machine learning code, which will be developed, unit tested, packaged, and delivered independently, and is typically maintained by data scientists in an organization. 6 | 7 | ## Contents 8 | 9 | 1. [src](src/) : machine learning source code, which will be packaged as a Python `wheel`. 10 | 2. [tests](tests/) : unit test cases for `src`. 11 | -------------------------------------------------------------------------------- /ml_source/src/README.md: -------------------------------------------------------------------------------- 1 | # SRC 2 | 3 | ## Overview 4 | 5 | Source code for machine learning, based on - 6 | 7 | 1. [taxi_fares](taxi_fares/) contains the machine learning source code. 8 | 2. `monitoring` contains the logging class for logging into Application Insights. 9 | 3. The machine learning Python functions will be called from the MLOps Python functions. 
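For illustration, this calling pattern can be sketched as below, mirroring `ml_ops/src/taxi_fares_mlops/training.py`; `training_df` is assumed to be the feature-engineered Spark DataFrame prepared by the orchestrator:

```py
from taxi_fares.training.evaluate import get_model_metrics, split_data
from taxi_fares.training.train import train

params = {"num_leaves": 32, "objective": "regression", "metric": "rmse"}
num_rounds = 100

data = split_data(training_df)                    # split the Spark DataFrame into train and test sets
model = train(data["train"], params, num_rounds)  # returns a trained lgb.Booster
metrics = get_model_metrics(model, data["test"])  # dict of evaluation metrics
```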
10 | -------------------------------------------------------------------------------- /ml_source/src/monitoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/monitoring/__init__.py -------------------------------------------------------------------------------- /ml_source/src/monitoring/app_logger.py: -------------------------------------------------------------------------------- 1 | """This module is used to log traces into Azure Application Insights.""" 2 | import logging 3 | import uuid 4 | from os import getenv 5 | 6 | from opencensus.ext.azure.common import utils 7 | from opencensus.ext.azure.log_exporter import AzureLogHandler 8 | from opencensus.ext.azure.trace_exporter import AzureExporter 9 | from opencensus.trace import config_integration 10 | from opencensus.trace.samplers import AlwaysOffSampler, AlwaysOnSampler 11 | from opencensus.trace.tracer import Tracer 12 | 13 | 14 | class CustomDimensionsFilter(logging.Filter): 15 | """Add custom-dimensions like run_id in each log by using filters.""" 16 | 17 | def __init__(self, custom_dimensions=None): 18 | """Initialize CustomDimensionsFilter.""" 19 | self.custom_dimensions = custom_dimensions or {} 20 | 21 | def filter(self, record): 22 | """Add the default custom_dimensions into the current log record.""" 23 | dim = {**self.custom_dimensions, ** 24 | getattr(record, "custom_dimensions", {})} 25 | record.custom_dimensions = dim 26 | return True 27 | 28 | 29 | class AppLogger: 30 | """Logger wrapper that attach the handler to Application Insights.""" 31 | 32 | HANDLER_NAME = "Azure Application Insights Handler" 33 | 34 | def __init__(self, config=None): 35 | """Create an instance of the Logger class. 36 | 37 | Args: 38 | config:([dict], optional): 39 | Contains the setting for logger {"log_level": logging.debug,"env":"dev", 40 | "app_insights_key":""} 41 | parent:tracer([opencensus.trace.tracer], optional): 42 | Contains parent tracer required for setting coorelation. 
43 | """ 44 | self.config = {"log_level": logging.INFO, "logging_enabled": "true"} 45 | self.APPINSIGHTS_INSTRUMENTATION_KEY = "APPINSIGHTS_INSTRUMENTATION_KEY" 46 | self.update_config(config) 47 | pass 48 | 49 | def _initialize_azure_log_handler(self, component_name, custom_dimensions): 50 | """Initialize azure log handler.""" 51 | # Adding logging to trace_integrations 52 | # This will help in adding trace and span ids to logs 53 | # https://github.com/census-instrumentation/opencensus-python/tree/master/contrib/opencensus-ext-logging 54 | 55 | config_integration.trace_integrations(["logging"]) 56 | logging.basicConfig( 57 | format="%(asctime)s name=%(name)s level=%(levelname)s " 58 | "traceId=%(traceId)s spanId=%(spanId)s %(message)s" 59 | ) 60 | app_insights_cs = "InstrumentationKey=" + self._get_app_insights_key() 61 | log_handler = AzureLogHandler( 62 | connection_string=app_insights_cs, export_interval=0.0 63 | ) 64 | log_handler.add_telemetry_processor(self._get_callback(component_name)) 65 | log_handler.name = self.HANDLER_NAME 66 | log_handler.addFilter(CustomDimensionsFilter(custom_dimensions)) 67 | return log_handler 68 | 69 | def _initialize_azure_log_exporter(self, component_name): 70 | """Initialize azure log exporter.""" 71 | app_insights_cs = "InstrumentationKey=" + self._get_app_insights_key() 72 | log_exporter = AzureExporter( 73 | connection_string=app_insights_cs, export_interval=0.0 74 | ) 75 | log_exporter.add_telemetry_processor( 76 | self._get_callback(component_name)) 77 | return log_exporter 78 | 79 | def _initialize_logger(self, log_handler, component_name): 80 | """Initialize Logger.""" 81 | logger = logging.getLogger(component_name) 82 | logger.setLevel(self.log_level) 83 | if self.config.get("logging_enabled") == "true": 84 | if not any(x for x in logger.handlers if x.name == self.HANDLER_NAME): 85 | logger.addHandler(log_handler) 86 | return logger 87 | 88 | def get_logger(self, component_name="TaxiFaresMlOps", custom_dimensions={}): 89 | """Get Logger Object. 90 | 91 | Args: 92 | component_name (str, optional): Name of logger. Defaults to "TaxiFaresMlOps". 93 | custom_dimensions (dict, optional): {"key":"value"} 94 | to capture with every log. 95 | Defaults to {}. 96 | 97 | Returns: 98 | Logger: A logger. 99 | """ 100 | self.update_config(self.config) 101 | handler = self._initialize_azure_log_handler( 102 | component_name, custom_dimensions) 103 | return self._initialize_logger(handler, component_name) 104 | 105 | def get_tracer(self, component_name="TaxiFaresMlOps", parent_tracer=None): 106 | """Get Tracer Object. 107 | 108 | Args: 109 | component_name (str, optional): Name of logger. Defaults to "TaxiFaresMlOps". 110 | parent_tracer([opencensus.trace.tracer], optional): 111 | Contains parent tracer required for setting coorelation. 112 | 113 | Returns: 114 | opencensus.trace.tracer: A Tracer. 
115 | """ 116 | self.update_config(self.config) 117 | sampler = AlwaysOnSampler() 118 | exporter = self._initialize_azure_log_exporter(component_name) 119 | if self.config.get("logging_enabled") != "true": 120 | sampler = AlwaysOffSampler() 121 | if parent_tracer is None: 122 | tracer = Tracer(exporter=exporter, sampler=sampler) 123 | else: 124 | tracer = Tracer( 125 | span_context=parent_tracer.span_context, 126 | exporter=exporter, 127 | sampler=sampler, 128 | ) 129 | return tracer 130 | 131 | def _get_app_insights_key(self): 132 | """Get Application Insights Key.""" 133 | try: 134 | if self.app_insights_key is None: 135 | self.app_insights_key = getenv( 136 | self.APPINSIGHTS_INSTRUMENTATION_KEY, None 137 | ) 138 | if self.app_insights_key is not None: 139 | utils.validate_instrumentation_key(self.app_insights_key) 140 | return self.app_insights_key 141 | else: 142 | raise Exception("ApplicationInsights Key is not set") 143 | except Exception as exp: 144 | raise Exception(f"Exception is getting app insights key-> {exp}") 145 | 146 | def _get_callback(self, component_name): 147 | def _callback_add_role_name(envelope): 148 | """Add role name for logger.""" 149 | envelope.tags["ai.cloud.role"] = component_name 150 | envelope.tags["ai.cloud.roleInstance"] = component_name 151 | 152 | return _callback_add_role_name 153 | 154 | def update_config(self, config=None): 155 | """Update logger configuration.""" 156 | if config is not None: 157 | self.config.update(config) 158 | self.app_insights_key = self.config.get("app_insights_key") 159 | self.log_level = self.config.get("log_level") 160 | 161 | 162 | def get_disabled_logger(): 163 | """Get a disabled AppLogger. 164 | 165 | Returns: 166 | AppLogger: A disabled AppLogger 167 | """ 168 | return AppLogger( 169 | config={"logging_enabled": "false", 170 | "app_insights_key": str(uuid.uuid1())} 171 | ) 172 | -------------------------------------------------------------------------------- /ml_source/src/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | 6 | # Utility function to read the README file. 7 | # Used for the long_description. It's nice, because now 1) we have a top level 8 | # README file and 2) it's easier to type in the README file than to put a raw 9 | # string in below ... 
10 | def read(fname): 11 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 12 | 13 | 14 | requirements_file_name = "requirements.txt" 15 | with open(requirements_file_name) as f: 16 | required_packages = f.read().splitlines() 17 | required_packages = [ 18 | package.strip(" ") 19 | for package in required_packages 20 | if package.strip(" ") and "#" not in package 21 | ] 22 | setup( 23 | name="taxi_fares", 24 | version="0.0.1", 25 | author="", 26 | author_email="", 27 | description=(""), 28 | license="", 29 | keywords="", 30 | url="", 31 | package_dir={"": "ml_source/src"}, 32 | packages=find_packages(where="ml_source/src"), 33 | classifiers=[], 34 | install_requires=required_packages, 35 | ) 36 | -------------------------------------------------------------------------------- /ml_source/src/taxi_fares/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/taxi_fares/__init__.py -------------------------------------------------------------------------------- /ml_source/src/taxi_fares/feature_eng/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/taxi_fares/feature_eng/__init__.py -------------------------------------------------------------------------------- /ml_source/src/taxi_fares/feature_eng/features.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from pyspark.sql.dataframe import DataFrame 4 | from pyspark.sql.functions import col, count, mean, to_timestamp, unix_timestamp, window 5 | from pyspark.sql.types import FloatType, IntegerType 6 | from taxi_fares.utils.pyspark_utils import filter_df_by_ts, is_weekend, partition_id 7 | 8 | 9 | def pickup_features_fn( 10 | df: DataFrame, ts_column: str, start_date: datetime, end_date: datetime 11 | ) -> DataFrame: 12 | """ 13 | Computes the pickup_features feature group. 14 | To restrict features to a time range, pass in ts_column, start_date, 15 | and/or end_date as kwargs. 16 | """ 17 | df = filter_df_by_ts(df, ts_column, start_date, end_date) 18 | pickupzip_features = ( 19 | df.groupBy( 20 | "pickup_zip", window("tpep_pickup_datetime", "1 hour", "15 minutes") 21 | ) # 1 hour window, sliding every 15 minutes 22 | .agg( 23 | mean("fare_amount").alias("mean_fare_window_1h_pickup_zip"), 24 | count("*").alias("count_trips_window_1h_pickup_zip"), 25 | ) 26 | .select( 27 | col("pickup_zip").alias("zip"), 28 | unix_timestamp(col("window.end")).alias("ts").cast(IntegerType()), 29 | partition_id(to_timestamp(col("window.end"))).alias("yyyy_mm"), 30 | col("mean_fare_window_1h_pickup_zip").cast(FloatType()), 31 | col("count_trips_window_1h_pickup_zip").cast(IntegerType()), 32 | ) 33 | ) 34 | return pickupzip_features 35 | 36 | 37 | def dropoff_features_fn( 38 | df: DataFrame, ts_column: str, start_date: datetime, end_date: datetime 39 | ) -> DataFrame: 40 | """ 41 | Computes the dropoff_features feature group. 42 | To restrict features to a time range, pass in ts_column, start_date, 43 | and/or end_date as kwargs. 
44 | """ 45 | df = filter_df_by_ts(df, ts_column, start_date, end_date) 46 | dropoffzip_features = ( 47 | df.groupBy("dropoff_zip", window("tpep_dropoff_datetime", "30 minute")) 48 | .agg(count("*").alias("count_trips_window_30m_dropoff_zip")) 49 | .select( 50 | col("dropoff_zip").alias("zip"), 51 | unix_timestamp(col("window.end")).alias("ts").cast(IntegerType()), 52 | partition_id(to_timestamp(col("window.end"))).alias("yyyy_mm"), 53 | col("count_trips_window_30m_dropoff_zip").cast(IntegerType()), 54 | is_weekend(col("window.end")).alias("dropoff_is_weekend"), 55 | ) 56 | ) 57 | return dropoffzip_features 58 | -------------------------------------------------------------------------------- /ml_source/src/taxi_fares/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/taxi_fares/training/__init__.py -------------------------------------------------------------------------------- /ml_source/src/taxi_fares/training/evaluate.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import lightgbm as lgb 4 | import numpy as np 5 | from pyspark.sql.dataframe import DataFrame 6 | from sklearn.metrics import mean_squared_error 7 | from sklearn.model_selection import train_test_split 8 | 9 | 10 | def split_data(df: DataFrame) -> dict: 11 | """Split the dataframe into test and train data. 12 | 13 | Args: 14 | df (pd.DataFrame): processed dataframe for train and evaluate 15 | 16 | Returns: 17 | dict: splitted data for train and test - 18 | { 19 | "train": 20 | "X": np.array, 21 | "y": np.array, 22 | }, 23 | "test": 24 | "X": np.array, 25 | "y": np.array, 26 | } 27 | } 28 | """ 29 | features_and_label = df.columns 30 | 31 | # Collect data into a Pandas array for training 32 | data = df.toPandas()[features_and_label] 33 | 34 | train, test = train_test_split(data, random_state=123) 35 | X_train = train.drop(["fare_amount"], axis=1) 36 | y_train = train.fare_amount 37 | X_test = test.drop(["fare_amount"], axis=1) 38 | y_test = test.fare_amount 39 | 40 | data = {"train": {"X": X_train, "y": y_train}, "test": {"X": X_test, "y": y_test}} 41 | return data 42 | 43 | 44 | def get_model_metrics(model: lgb.Booster, test_data: Dict[str, np.ndarray]) -> dict: 45 | """Evaluate the metrics for the model. 
46 | 47 | Args: 48 | model (lgb.Booster): trained LightGBM model 49 | test_data (Dict[str, np.ndarray]): test data with X key for features and y key labels 50 | 51 | Returns: 52 | dict: mse metrics 53 | """ 54 | preds = model.predict(test_data["X"]) 55 | mse = mean_squared_error(preds, test_data["y"]) 56 | metrics = {"mse": mse} 57 | return metrics 58 | -------------------------------------------------------------------------------- /ml_source/src/taxi_fares/training/train.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import lightgbm as lgb 4 | import numpy as np 5 | 6 | 7 | def train( 8 | train_data: Dict[str, np.ndarray], params: dict, num_rounds: int 9 | ) -> lgb.Booster: 10 | train_lgb_dataset = lgb.Dataset(train_data["X"], label=train_data["y"].values) 11 | 12 | # Train a LightGBM model 13 | model = lgb.train(params, train_lgb_dataset, num_rounds) 14 | return model 15 | -------------------------------------------------------------------------------- /ml_source/src/taxi_fares/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/src/taxi_fares/utils/__init__.py -------------------------------------------------------------------------------- /ml_source/src/taxi_fares/utils/pyspark_utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | from datetime import datetime, timedelta 3 | 4 | from pyspark.sql.column import Column 5 | from pyspark.sql.dataframe import DataFrame 6 | from pyspark.sql.functions import col, lit, udf 7 | from pyspark.sql.types import IntegerType, StringType 8 | from pytz import timezone 9 | 10 | 11 | @udf(returnType=IntegerType()) 12 | def is_weekend(dt: Column) -> Column: 13 | tz = "America/New_York" 14 | return int(dt.astimezone(timezone(tz)).weekday() >= 5) # 5 = Saturday, 6 = Sunday 15 | 16 | 17 | @udf(returnType=StringType()) 18 | def partition_id(dt: Column) -> Column: 19 | # datetime -> "YYYY-MM" 20 | return f"{dt.year:04d}-{dt.month:02d}" 21 | 22 | 23 | def filter_df_by_ts( 24 | df: DataFrame, ts_column: str, start_date: datetime, end_date: datetime 25 | ) -> DataFrame: 26 | if ts_column and start_date: 27 | df = df.filter(col(ts_column) >= start_date) 28 | if ts_column and end_date: 29 | df = df.filter(col(ts_column) < end_date) 30 | return df 31 | 32 | 33 | def rounded_unix_timestamp(dt, num_minutes=15): 34 | """ 35 | Rounds datetime dt up to the next num_minutes boundary, then returns the unix timestamp. 36 | """ 37 | nsecs = dt.minute * 60 + dt.second + dt.microsecond * 1e-6 38 | delta = math.ceil(nsecs / (60 * num_minutes)) * (60 * num_minutes) - nsecs 39 | return int((dt + timedelta(seconds=delta)).timestamp()) 40 | 41 | 42 | rounded_unix_timestamp_udf = udf(rounded_unix_timestamp, IntegerType()) 43 | 44 | 45 | def rounded_taxi_data(taxi_data_df): 46 | # Round the taxi data timestamp to 15 and 30 minute intervals so we can join with 47 | # the pickup and dropoff features 48 | # respectively. 
49 | taxi_data_df = ( 50 | taxi_data_df.withColumn( 51 | "rounded_pickup_datetime", 52 | rounded_unix_timestamp_udf(taxi_data_df["tpep_pickup_datetime"], lit(15)), 53 | ) 54 | .withColumn( 55 | "rounded_dropoff_datetime", 56 | rounded_unix_timestamp_udf(taxi_data_df["tpep_dropoff_datetime"], lit(30)), 57 | ) 58 | .drop("tpep_pickup_datetime") 59 | .drop("tpep_dropoff_datetime") 60 | ) 61 | taxi_data_df.createOrReplaceTempView("taxi_data") 62 | return taxi_data_df 63 | -------------------------------------------------------------------------------- /ml_source/tests/README.md: -------------------------------------------------------------------------------- 1 | # TESTS 2 | 3 | Unit test cases for `taxifares` machine learning source code. 4 | -------------------------------------------------------------------------------- /ml_source/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/__init__.py -------------------------------------------------------------------------------- /ml_source/tests/monitoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/monitoring/__init__.py -------------------------------------------------------------------------------- /ml_source/tests/monitoring/test_app_logger.py: -------------------------------------------------------------------------------- 1 | """Test src/monitoring/app_logger.py.""" 2 | 3 | import logging 4 | import unittest 5 | import uuid 6 | 7 | from monitoring.app_logger import AppLogger, get_disabled_logger 8 | 9 | test_instrumentation_key = str(uuid.uuid1()) 10 | test_invalid_instrumentation_key = "invalid_instrumentation_key" 11 | 12 | 13 | class TestAppLogger(unittest.TestCase): 14 | @classmethod 15 | def setUpClass(cls): 16 | cls.valid_config = { 17 | "log_level": "DEBUG", 18 | "logging_enabled": "true", 19 | "app_insights_key": test_instrumentation_key, 20 | } 21 | cls.invalid_config = { 22 | "log_level": "DEBUG", 23 | "logging_enabled": "false", 24 | "app_insights_key": test_invalid_instrumentation_key, 25 | } 26 | 27 | def test_logger_creation_valid_instrumentation_key(self): 28 | """Test with valid formatted instrumentation key.""" 29 | global test_instrumentation_key 30 | try: 31 | app_logger = AppLogger( 32 | config=self.valid_config, 33 | ) 34 | assert app_logger is not None 35 | except Exception: 36 | assert False 37 | 38 | def test_logger_creation_invalid_instrumentation_key(self): 39 | """Test with invalid instrumentation key.""" 40 | global test_invalid_instrumentation_key 41 | with self.assertRaises(Exception): 42 | logging.disable(logging.CRITICAL) 43 | app_logger = AppLogger( 44 | config=self.invalid_config, 45 | ) 46 | app_logger.get_logger() 47 | assert app_logger is not None 48 | 49 | def test_logger_creation_no_instrumentation_key(self): 50 | """Test with no instrumentation key.""" 51 | with self.assertRaises(Exception): 52 | logging.disable(logging.CRITICAL) 53 | config = {"log_level": logging.DEBUG, "logging_enabled": "false"} 54 | app_logger = AppLogger(config=config) 55 | app_logger.get_logger() 56 | assert app_logger is not None 57 | 58 | def test_logging(self): 59 | """Test to use logging functions.""" 60 | global test_instrumentation_key 61 | try: 62 | 
component_name = "TestComponent" 63 | app_logger = AppLogger(config=self.valid_config) 64 | assert app_logger is not None 65 | test_logger = app_logger.get_logger( 66 | component_name=component_name, 67 | ) 68 | 69 | assert test_logger is not None 70 | test_logger.info("Test Logging") 71 | except Exception: 72 | assert False 73 | 74 | def test_tracing(self): 75 | """Test for Tracer.""" 76 | global test_instrumentation_key 77 | try: 78 | component_name = "TestComponent" 79 | app_logger = AppLogger(config=self.valid_config) 80 | assert app_logger is not None 81 | 82 | tracer = app_logger.get_tracer( 83 | component_name=component_name, 84 | ) 85 | tracer_with_parent = app_logger.get_tracer( 86 | component_name=component_name, parent_tracer=tracer 87 | ) 88 | test_logger = app_logger.get_logger( 89 | component_name=component_name, 90 | ) 91 | 92 | assert test_logger is not None 93 | assert tracer is not None 94 | assert tracer_with_parent is not None 95 | 96 | with tracer.span(name="testspan"): 97 | test_logger.info("in test span") 98 | except Exception: 99 | assert False 100 | 101 | def test_tracing_with_disabled_logger(self): 102 | """Test with no instrumentation key.""" 103 | app_logger = get_disabled_logger() 104 | tracer = app_logger.get_tracer() 105 | assert tracer is not None 106 | 107 | def test_exception(self): 108 | """Test for calling logger.exception method.""" 109 | global test_instrumentation_key 110 | try: 111 | component_name = "TestComponent" 112 | app_logger = AppLogger( 113 | config=self.valid_config, 114 | ) 115 | assert app_logger is not None 116 | 117 | test_logger = app_logger.get_logger( 118 | component_name=component_name, 119 | ) 120 | assert test_logger is not None 121 | try: 122 | raise Exception("Testing exception logging") 123 | except Exception as exp: 124 | test_logger.exception(exp) 125 | except Exception: 126 | assert False 127 | 128 | def test_logging_level(self): 129 | """Test for changing logger level in config.""" 130 | try: 131 | global test_instrumentation_key 132 | component_name = "TestComponent" 133 | valid_config = self.valid_config.copy() 134 | valid_config["log_level"] = logging.ERROR 135 | app_logger = AppLogger( 136 | config=valid_config, 137 | ) 138 | assert app_logger.config["log_level"] == logging.ERROR 139 | test_logger = app_logger.get_logger( 140 | component_name=component_name, 141 | ) 142 | 143 | test_logger.error("Testing logging level") 144 | except Exception: 145 | assert False 146 | 147 | def test_logging_extra_params(self): 148 | """Test logging extra params.""" 149 | try: 150 | global test_instrumentation_key 151 | component_name = "TestComponent" 152 | app_logger = AppLogger( 153 | config=self.valid_config, 154 | ) 155 | test_logger = app_logger.get_logger( 156 | component_name=component_name, 157 | ) 158 | extra_params = {"custom_dimensions": {"key1": "value1"}} 159 | test_logger.info("Logging extra params", extra=extra_params) 160 | except Exception: 161 | assert False 162 | 163 | def test_disabled_logger(self): 164 | """Test disabled logger.""" 165 | try: 166 | 167 | def do_work(app_logger=get_disabled_logger()): 168 | component_name = "TestComponent" 169 | test_logger = app_logger.get_logger( 170 | component_name=component_name, 171 | ) 172 | extra_params = {"custom_dimensions": {"key1": "value1"}} 173 | test_logger.info("Logging extra params", extra=extra_params) 174 | 175 | do_work() 176 | except Exception: 177 | assert False 178 | 179 | 180 | if __name__ == "__main__": 181 | unittest.main() 182 | 
-------------------------------------------------------------------------------- /ml_source/tests/taxi_fares/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/taxi_fares/__init__.py -------------------------------------------------------------------------------- /ml_source/tests/taxi_fares/feature_eng/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/taxi_fares/feature_eng/__init__.py -------------------------------------------------------------------------------- /ml_source/tests/taxi_fares/feature_eng/test_features.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pyspark.sql import SparkSession 4 | from src.taxi_fares.feature_eng.features import pickup_features_fn 5 | 6 | 7 | class TestFeatures(unittest.TestCase): 8 | @classmethod 9 | def setUpClass(cls): 10 | cls.spark = ( 11 | SparkSession.builder.master("local[*]").appName("Unit-tests").getOrCreate() 12 | ) 13 | 14 | @classmethod 15 | def tearDownClass(cls): 16 | cls.spark.stop() 17 | 18 | def test_if_pickup_features_are_computed(self): 19 | df = self.spark.createDataFrame( 20 | [ 21 | ("2019-01-01 00:00:00", "2019-01-01 01:00:00", 1.0, 1, 10000, 10001), 22 | ("2019-01-01 00:15:00", "2019-01-01 01:15:00", 2.0, 2, 10002, 10003), 23 | ("2019-01-01 00:30:00", "2019-01-01 01:30:00", 3.0, 3, 10004, 10005), 24 | ("2019-01-01 00:45:00", "2019-01-01 01:45:00", 4.0, 4, 10006, 10007), 25 | ("2019-01-01 01:00:00", "2019-01-01 02:00:00", 5.0, 5, 10008, 10009), 26 | ("2019-01-01 01:15:00", "2019-01-01 02:15:00", 6.0, 6, 10010, 10011), 27 | ("2019-01-01 01:30:00", "2019-01-01 02:30:00", 7.0, 7, 10012, 10013), 28 | ("2019-01-01 01:45:00", "2019-01-01 02:45:00", 8.0, 8, 10014, 10015), 29 | ("2019-01-01 02:00:00", "2019-01-01 03:00:00", 9.0, 9, 10016, 10017), 30 | ("2019-01-01 02:15:00", "2019-01-01 03:15:00", 10.0, 10, 10018, 10019), 31 | ("2019-01-01 02:30:00", "2019-01-01 03:30:00", 11.0, 11, 10020, 10021), 32 | ("2019-01-01 02:45:00", "2019-01-01 03:45:00", 12.0, 12, 10022, 10023), 33 | ], 34 | [ 35 | "tpep_pickup_datetime", 36 | "tpep_dropoff_datetime", 37 | "trip_distance", 38 | "fare_amount", 39 | "pickup_zip", 40 | "dropoff_zip", 41 | ], 42 | ) 43 | df = pickup_features_fn( 44 | df, "tpep_pickup_datetime", "2019-01-01 00:00:00", "2019-01-01 01:45:00" 45 | ) 46 | self.assertEqual(df.count(), 28) 47 | -------------------------------------------------------------------------------- /ml_source/tests/taxi_fares/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-databricks-mlops-mlflow/eeacd5223d21973628bda972dbaf476ddad14a28/ml_source/tests/taxi_fares/utils/__init__.py -------------------------------------------------------------------------------- /ml_source/tests/taxi_fares/utils/test_pyspark_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pyspark.sql import SparkSession 4 | from src.taxi_fares.utils.pyspark_utils import filter_df_by_ts 5 | 6 | 7 | class TestPysparkUtils(unittest.TestCase): 8 | @classmethod 9 | def setUpClass(cls): 10 | cls.spark = ( 
11 | SparkSession.builder.master("local[*]").appName("Unit-tests").getOrCreate() 12 | ) 13 | 14 | @classmethod 15 | def tearDownClass(cls): 16 | cls.spark.stop() 17 | 18 | def test_if_df_is_getting_filtered_by_ts(self): 19 | df = self.spark.createDataFrame( 20 | [ 21 | ("2019-01-01 00:00:00", 1), 22 | ("2019-01-01 00:15:00", 2), 23 | ("2019-01-01 00:30:00", 3), 24 | ("2019-01-01 00:45:00", 4), 25 | ("2019-01-01 01:00:00", 5), 26 | ("2019-01-01 01:15:00", 6), 27 | ("2019-01-01 01:30:00", 7), 28 | ("2019-01-01 01:45:00", 8), 29 | ("2019-01-01 02:00:00", 9), 30 | ("2019-01-01 02:15:00", 10), 31 | ("2019-01-01 02:30:00", 11), 32 | ("2019-01-01 02:45:00", 12), 33 | ], 34 | ["tpep_pickup_datetime", "fare_amount"], 35 | ) 36 | df = filter_df_by_ts( 37 | df, "tpep_pickup_datetime", "2019-01-01 00:00:00", "2019-01-01 01:45:00" 38 | ) 39 | self.assertEqual(df.count(), 7) 40 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==1.5.0 2 | pandas==1.2.4 3 | black==24.3.0 4 | coverage==5.5 5 | databricks-cli==0.14.3 6 | mlflow==2.21.0 7 | opencensus-ext-azure==1.0.7 8 | opencensus-ext-logging==0.1.0 9 | protobuf==3.18.3 10 | lightgbm==4.6.0 11 | isort==5.10.1 12 | --------------------------------------------------------------------------------
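A minimal local sketch of how split_data, train, and get_model_metrics from taxi_fares.training could be wired together outside Databricks. The feature columns other than fare_amount and the LightGBM parameters are illustrative assumptions, not values taken from this repository.

# Minimal local sketch: assumes a local Spark session and a toy DataFrame whose
# only required column is "fare_amount" (the label that split_data drops from X).
# The feature columns and LightGBM parameters here are illustrative assumptions.
from pyspark.sql import SparkSession

from taxi_fares.training.evaluate import get_model_metrics, split_data
from taxi_fares.training.train import train

spark = SparkSession.builder.master("local[*]").appName("taxi-fares-sketch").getOrCreate()
rows = [(float(i), float(i % 7), 3.0 + 0.5 * i) for i in range(40)]
df = spark.createDataFrame(rows, ["trip_distance", "pickup_weekday", "fare_amount"])

data = split_data(df)  # {"train": {"X", "y"}, "test": {"X", "y"}}
model = train(data["train"], params={"objective": "regression"}, num_rounds=10)
print(get_model_metrics(model, data["test"]))  # e.g. {"mse": ...}
spark.stop()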