├── .cloud └── .gitkeep ├── .github ├── labels.yaml └── workflows │ ├── cleanup.yml │ ├── run-workflows.yml │ └── smoke.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── cleanup.py ├── notebooks └── Untitled.ipynb ├── requirements.txt ├── setup-workspace.py └── workflows └── basic ├── job.py ├── requirements.txt └── src └── train.py /.cloud/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.github/labels.yaml: -------------------------------------------------------------------------------- 1 | - name: Ask 2 | description: Define and scope problem and solution 3 | color: #c9ecff 4 | 5 | - name: Explore 6 | description: Explore and document data to increase understanding 7 | color: #f0f29b 8 | 9 | - name: Experiment 10 | description: Build features and train models 11 | color: #8569c6 12 | 13 | - name: Data 14 | description: Get and transform data 15 | color: #1c587c 16 | 17 | - name: Model 18 | description: Prepare model for deployment 19 | color: #0b4e82 20 | 21 | - name: Deploy 22 | description: Register, package, and deploy model 23 | color: #f79499 24 | 25 | - name: Communicate 26 | description: Write reports, create dashboards, summarize findings, etc. 27 | color: #f9f345 28 | 29 | - name: succeeded 30 | description: This was successful 31 | color: #67d157 32 | 33 | - name: failed 34 | description: This didn't go as hoped 35 | color: #c2021c 36 | 37 | - name: on hold 38 | description: Still seems promising, but let's revisit later 39 | color: #ffd04f 40 | 41 | - name: blocked - need access 42 | description: Blocked due to lack of access to data, resources, environment, etc. 
43 | color: #ed9a53 44 | -------------------------------------------------------------------------------- /.github/workflows/cleanup.yml: -------------------------------------------------------------------------------- 1 | name: cleanup 2 | on: 3 | schedule: 4 | - cron: "0 8 * * *" 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: check out repo 10 | uses: actions/checkout@v2 11 | - name: setup python 12 | uses: actions/setup-python@v2 13 | with: 14 | python-version: "3.8" 15 | - name: pip install 16 | run: pip install -r requirements.txt 17 | - name: azure login 18 | uses: azure/login@v1 19 | with: 20 | creds: ${{secrets.AZ_CREDS}} 21 | - name: install azmlcli 22 | run: az extension add -n azure-cli-ml 23 | - name: attach to workspace 24 | run: az ml folder attach -w default -g azureml-template 25 | - name: run cleanup script 26 | run: python cleanup.py 27 | -------------------------------------------------------------------------------- /.github/workflows/run-workflows.yml: -------------------------------------------------------------------------------- 1 | name: run-workflows 2 | on: 3 | push: 4 | branches: 5 | - main 6 | paths: 7 | - workflows/** 8 | pull_request: 9 | branches: 10 | - main 11 | paths: 12 | - workflows/** 13 | schedule: 14 | - cron: "0 0/2 * * *" 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: check out repo 20 | uses: actions/checkout@v2 21 | - name: setup python 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: "3.8" 25 | - name: pip install 26 | run: pip install -r requirements.txt 27 | - name: azure login 28 | uses: azure/login@v1 29 | with: 30 | creds: ${{secrets.AZ_CREDS}} 31 | - name: install azmlcli 32 | run: az extension add -n azure-cli-ml 33 | - name: attach to workspace 34 | run: az ml folder attach -w default -g azureml-template 35 | - name: run basic job 36 | run: python workflows/basic/job.py 37 | 
-------------------------------------------------------------------------------- /.github/workflows/smoke.yml: -------------------------------------------------------------------------------- 1 | name: smoke 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | branches: 8 | - main 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: check out repo 14 | uses: actions/checkout@v2 15 | - name: setup python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: "3.8" 19 | - name: pip install 20 | run: pip install -r requirements.txt 21 | - name: check code format 22 | run: black --check . 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .azureml 2 | .vscode 3 | pythonenv* -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Microsoft Azure 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Azure Machine Learning (AML) Template 2 | 3 | [![run-workflows-badge](https://github.com/Azure/azureml-template/workflows/run-workflows/badge.svg)](https://github.com/Azure/azureml-template/actions?query=workflow%3Arun-workflows) 4 | [![cleanup](https://github.com/Azure/azureml-template/workflows/cleanup/badge.svg)](https://github.com/Azure/azureml-template/actions?query=workflow%3Acleanup) 5 | [![smoke](https://github.com/Azure/azureml-template/workflows/smoke/badge.svg)](https://github.com/Azure/azureml-template/actions?query=workflow%3Asmoke) 6 | [![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 7 | [![license: MIT](https://img.shields.io/badge/License-MIT-purple.svg)](LICENSE) 8 | 9 | Welcome to the Azure Machine Learning (AML) template repository! 10 | 11 | ## Prerequisites 12 | 13 | 1. An Azure subscription. If you don't have an Azure subscription, [create a free account](https://aka.ms/AMLFree) before you begin. 14 | 2. A terminal and Python >=3.6,[\<3.9](https://pypi.org/project/azureml-core). 15 | 16 | ## Getting started 17 | 18 | Click "Use this template" above and create a repository. 19 | 20 | Follow the setup guide below to add your Azure credentials and create required Azure resources. 
At the end, you will have a repository with: 21 | 22 | - simple LightGBM training workflow running every 2 hours and on push/PR 23 | - code format check on push/PR 24 | - resource cleanup script running nightly 25 | 26 | ## Setup 27 | 28 | First, export your Azure subscription id as an environment variable: 29 | 30 | ```console 31 | export ID= 32 | ``` 33 | 34 | Second, create the Azure resource group and required AML resources: 35 | 36 | ```console 37 | python setup-workspace.py --subscription-id $ID 38 | ``` 39 | 40 | This will create a resource group named `azureml-template`, a workspace named `default`, and a cluster named `cpu-cluster`. Edit `setup-workspace.py` as needed. If you change the names, ensure you change corresponding names in the `.github/workflows` files and in the third step below. 41 | 42 | Third, create a service principal for the resource group: 43 | 44 | ```console 45 | az ad sp create-for-rbac --name "azureml-template" \ 46 | --role contributor \ 47 | --scopes /subscriptions/$ID/resourceGroups/azureml-template \ 48 | --sdk-auth 49 | ``` 50 | 51 | Copy the output json, which looks like this: 52 | 53 | ```console 54 | { 55 | "clientId": "", 56 | "clientSecret": "", 57 | "subscriptionId": "", 58 | "tenantId": "", 59 | (...) 60 | } 61 | ``` 62 | 63 | In your repository, navigate to "Settings > Secrets > New Secret". Name the secret `AZ_CREDS` and paste the json output from above. This is used in the Azure login action in the GitHub Actions. If you use a different name for the secret, ensure you change the corresponding names in the `.github/workflows` files. 64 | 65 | ## Contents 66 | 67 | Adapt this template to automate the entire ML lifecycle on GitHub, using AML for centralized tracking and scaling up/out on Azure compute. 
68 | 69 | |directory|description| 70 | |-|-| 71 | |`.cloud`|cloud templates| 72 | |`.github`|GitHub specific files like Actions workflow yaml definitions and issue templates| 73 | |`notebooks`|interactive jupyter notebooks for iterative ML development| 74 | |`workflows`|self-contained directories of job/workflow to be run| 75 | 76 | ## GitHub Actions 77 | 78 | Modify all files as needed. 79 | 80 | **Actions**: 81 | 82 | - [`.github/workflows/smoke.yml`](.github/workflows/smoke.yml) runs on every PR and push to `main` to check code format 83 | - [`.github/workflows/cleanup.yml`](.github/workflows/cleanup.yml) runs daily and can be used to clean up AML resources 84 | - [`.github/workflows/run-workflows.yml`](.github/workflows/run-workflows.yml) runs an ML workflow every two hours and on push/PR to `main` 85 | 86 | **Other**: 87 | 88 | - [`requirements.txt`](requirements.txt) specifies required pip packages for GitHub actions 89 | - [`setup-workspace.py`](setup-workspace.py) can be modified for workspace and resource setup 90 | - [`cleanup.py`](cleanup.py) can be modified for nightly workspace cleanup tasks 91 | - [`workflows/basic/job.py`](workflows/basic/job.py) is the AML control code 92 | - [`workflows/basic/src/train.py`](workflows/basic/src/train.py) is the ML training script with mlflow tracking 93 | - [`workflows/basic/requirements.txt`](workflows/basic/requirements.txt) specifies required pip packages for the training script 94 | 95 | ## Reference 96 | 97 | - [Azure Machine Learning Examples](https://github.com/Azure/azureml-examples) 98 | - [Cheat Sheet, VSCode Snippets, and Templates](https://azure.github.io/azureml-web) 99 | - [Azure Machine Learning Documentation](https://docs.microsoft.com/azure/machine-learning) 100 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our 
software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /cleanup.py: -------------------------------------------------------------------------------- 1 | # imports 2 | import argparse 3 | from azureml.core import Workspace 4 | 5 | # setup argparse 6 | parser = argparse.ArgumentParser() 7 | args = parser.parse_args() 8 | 9 | # get workspace 10 | ws = Workspace.from_config() 11 | 12 | # process webservices 13 | for webservice in ws.webservices: 14 | pass 15 | 16 | # process compute targets 17 | for compute_target in ws.compute_targets: 18 | pass 19 | -------------------------------------------------------------------------------- /notebooks/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Interactive experimentation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": 
[], 15 | "source": [ 16 | "!pip install --upgrade lightgbm scikit-learn pandas adlfs" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Setup cloud tracking" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import mlflow\n", 33 | "from azureml.core import Workspace\n", 34 | "\n", 35 | "ws = Workspace.from_config()\n", 36 | "mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())\n", 37 | "mlflow.set_experiment(\"untitled\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Load data\n", 45 | "\n", 46 | "You can read directly from public URIs into Pandas. For private Blob or ADLS data, consider using [adlfs](https://github.com/dask/adlfs)." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "data_uri = \"https://azuremlexamples.blob.core.windows.net/datasets/iris.csv\"" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "import pandas as pd\n", 65 | "\n", 66 | "df = pd.read_csv(data_uri)\n", 67 | "df.head()" 68 | ] 69 | }, 70 | { 71 | "source": [ 72 | "## Define functions" 73 | ], 74 | "cell_type": "markdown", 75 | "metadata": {} 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# imports\n", 84 | "import time\n", 85 | "\n", 86 | "import lightgbm as lgb\n", 87 | "\n", 88 | "from sklearn.metrics import log_loss, accuracy_score\n", 89 | "from sklearn.preprocessing import LabelEncoder\n", 90 | "from sklearn.model_selection import train_test_split\n", 91 | "\n", 92 | "# define functions\n", 93 | "def preprocess_data(df):\n", 94 | " X = df.drop([\"species\"], axis=1)\n", 95 | " y = df[\"species\"]\n", 96 | "\n", 97 | " enc = 
LabelEncoder()\n", 98 | " y = enc.fit_transform(y)\n", 99 | "\n", 100 | " X_train, X_test, y_train, y_test = train_test_split(\n", 101 | " X, y, test_size=0.2, random_state=42\n", 102 | " )\n", 103 | "\n", 104 | " return X_train, X_test, y_train, y_test, enc\n", 105 | "\n", 106 | "\n", 107 | "def train_model(params, num_boost_round, X_train, X_test, y_train, y_test):\n", 108 | " t1 = time.time()\n", 109 | " train_data = lgb.Dataset(X_train, label=y_train)\n", 110 | " test_data = lgb.Dataset(X_test, label=y_test)\n", 111 | " model = lgb.train(\n", 112 | " params,\n", 113 | " train_data,\n", 114 | " num_boost_round=num_boost_round,\n", 115 | " valid_sets=[test_data],\n", 116 | " valid_names=[\"test\"],\n", 117 | " )\n", 118 | " t2 = time.time()\n", 119 | "\n", 120 | " return model, t2 - t1\n", 121 | "\n", 122 | "\n", 123 | "def evaluate_model(model, X_test, y_test):\n", 124 | " y_proba = model.predict(X_test)\n", 125 | " y_pred = y_proba.argmax(axis=1)\n", 126 | " loss = log_loss(y_test, y_proba)\n", 127 | " acc = accuracy_score(y_test, y_pred)\n", 128 | "\n", 129 | " return loss, acc" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## Run a trial" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# preprocess data\n", 146 | "X_train, X_test, y_train, y_test, enc = preprocess_data(df)\n", 147 | "\n", 148 | "# set training parameters\n", 149 | "params = {\n", 150 | " \"objective\": \"multiclass\",\n", 151 | " \"num_class\": 3,\n", 152 | " \"learning_rate\": 0.1,\n", 153 | " \"metric\": \"multi_logloss\",\n", 154 | " \"colsample_bytree\": 1.0,\n", 155 | " \"subsample\": 1.0,\n", 156 | " \"seed\": 42,\n", 157 | "}\n", 158 | "\n", 159 | "num_boost_round = 32\n", 160 | "\n", 161 | "# start run\n", 162 | "run = mlflow.start_run()\n", 163 | "\n", 164 | "# enable automatic logging\n", 165 | "mlflow.lightgbm.autolog()\n", 
166 | "\n", 167 | "# train model\n", 168 | "model, train_time = train_model(\n", 169 | " params, num_boost_round, X_train, X_test, y_train, y_test\n", 170 | ")\n", 171 | "mlflow.log_metric(\"training_time\", train_time)\n", 172 | "\n", 173 | "# evaluate model\n", 174 | "loss, acc = evaluate_model(model, X_test, y_test)\n", 175 | "mlflow.log_metrics({\"loss\": loss, \"accuracy\": acc})\n", 176 | "\n", 177 | "# end run\n", 178 | "mlflow.end_run()" 179 | ] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "name": "python3.8", 185 | "display_name": "Python 3.8", 186 | "language": "python" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.8.5-final" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 2 203 | } 204 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | azureml-core>=1.15.0 2 | azureml-dataprep 3 | azureml-mlflow 4 | papermill 5 | black 6 | -------------------------------------------------------------------------------- /setup-workspace.py: -------------------------------------------------------------------------------- 1 | # imports 2 | import argparse 3 | 4 | from azureml.core import Workspace 5 | from azureml.core.compute import ComputeTarget, AmlCompute, AksCompute 6 | 7 | # setup argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--subscription-id", type=str, default=None) 10 | parser.add_argument("--workspace-name", type=str, default="default") 11 | parser.add_argument("--resource-group", type=str, default="azureml-template") 12 | parser.add_argument("--location", type=str, default="eastus") 13 | args = parser.parse_args() 14 | 15 
| # define aml compute target(s) to create 16 | amlcomputes = { 17 | "cpu-cluster": { 18 | "vm_size": "STANDARD_DS3_V2", 19 | "min_nodes": 0, 20 | "max_nodes": 3, 21 | "idle_seconds_before_scaledown": 1200, 22 | } 23 | } 24 | 25 | # create workspace 26 | ws = Workspace.create( 27 | args.workspace_name, 28 | subscription_id=args.subscription_id, 29 | resource_group=args.resource_group, 30 | location=args.location, 31 | create_resource_group=True, 32 | exist_ok=True, 33 | show_output=True, 34 | ) 35 | ws.write_config() 36 | 37 | # create aml compute targets 38 | for ct_name in amlcomputes: 39 | if ct_name not in ws.compute_targets: 40 | compute_config = AmlCompute.provisioning_configuration(**amlcomputes[ct_name]) 41 | ct = ComputeTarget.create(ws, ct_name, compute_config) 42 | ct.wait_for_completion(show_output=True) 43 | -------------------------------------------------------------------------------- /workflows/basic/job.py: -------------------------------------------------------------------------------- 1 | # imports 2 | from pathlib import Path 3 | from azureml.core import Workspace, ScriptRunConfig, Experiment, Environment, Dataset 4 | 5 | # constants 6 | compute_name = "cpu-cluster" # use "local" for local execution 7 | source_dir = "src" 8 | entry_script = "train.py" 9 | environment_name = "myenv-template" 10 | environment_file = "requirements.txt" 11 | experiment_name = "template-workflow-base" 12 | data_uri = "https://azuremlexamples.blob.core.windows.net/datasets/iris.csv" 13 | 14 | # convert to relative paths 15 | prefix = Path(__file__).parent 16 | source_dir = str(prefix.joinpath(source_dir)) 17 | environment_file = str(prefix.joinpath(environment_file)) 18 | 19 | # get workspace 20 | ws = Workspace.from_config() 21 | 22 | # create dataset 23 | ds = Dataset.File.from_files(data_uri) 24 | 25 | # create environment 26 | env = Environment.from_pip_requirements(environment_name, environment_file) 27 | 28 | # setup entry script arguments 29 | args = 
["--data-dir", ds.as_mount()] 30 | 31 | # create a job configuration 32 | src = ScriptRunConfig( 33 | source_directory=source_dir, 34 | script=entry_script, 35 | arguments=args, 36 | environment=env, 37 | compute_target=compute_name, 38 | ) 39 | 40 | # run the job 41 | run = Experiment(ws, experiment_name).submit(src) 42 | run.wait_for_completion(show_output=True) 43 | -------------------------------------------------------------------------------- /workflows/basic/requirements.txt: -------------------------------------------------------------------------------- 1 | lightgbm 2 | matplotlib 3 | scikit-learn 4 | azureml-mlflow 5 | azureml-dataprep 6 | -------------------------------------------------------------------------------- /workflows/basic/src/train.py: -------------------------------------------------------------------------------- 1 | # imports 2 | import os 3 | import time 4 | import mlflow 5 | import argparse 6 | 7 | import pandas as pd 8 | import lightgbm as lgb 9 | import matplotlib.pyplot as plt 10 | 11 | from sklearn.metrics import log_loss, accuracy_score 12 | from sklearn.preprocessing import LabelEncoder 13 | from sklearn.model_selection import train_test_split 14 | 15 | # define functions 16 | def preprocess_data(df): 17 | X = df.drop(["species"], axis=1) 18 | y = df["species"] 19 | 20 | enc = LabelEncoder() 21 | y = enc.fit_transform(y) 22 | 23 | X_train, X_test, y_train, y_test = train_test_split( 24 | X, y, test_size=0.2, random_state=42 25 | ) 26 | 27 | return X_train, X_test, y_train, y_test, enc 28 | 29 | 30 | def train_model(params, num_boost_round, X_train, X_test, y_train, y_test): 31 | t1 = time.time() 32 | train_data = lgb.Dataset(X_train, label=y_train) 33 | test_data = lgb.Dataset(X_test, label=y_test) 34 | model = lgb.train( 35 | params, 36 | train_data, 37 | num_boost_round=num_boost_round, 38 | valid_sets=[test_data], 39 | valid_names=["test"], 40 | ) 41 | t2 = time.time() 42 | 43 | return model, t2 - t1 44 | 45 | 46 | def 
evaluate_model(model, X_test, y_test): 47 | y_proba = model.predict(X_test) 48 | y_pred = y_proba.argmax(axis=1) 49 | loss = log_loss(y_test, y_proba) 50 | acc = accuracy_score(y_test, y_pred) 51 | 52 | return loss, acc 53 | 54 | 55 | print("*" * 60) 56 | print("\n\n") 57 | 58 | # enable auto logging 59 | mlflow.lightgbm.autolog() 60 | 61 | # arg parser 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument("--data-dir", type=str) 64 | parser.add_argument("--num-boost-round", type=int, default=10) 65 | parser.add_argument("--boosting", type=str, default="gbdt") 66 | parser.add_argument("--num-iterations", type=int, default=16) 67 | parser.add_argument("--num-leaves", type=int, default=31) 68 | parser.add_argument("--num-threads", type=int, default=0) 69 | parser.add_argument("--learning-rate", type=float, default=0.1) 70 | parser.add_argument("--metric", type=str, default="multi_logloss") 71 | parser.add_argument("--seed", type=int, default=42) 72 | parser.add_argument("--verbose", type=int, default=0) 73 | args = parser.parse_args() 74 | 75 | # setup parameters 76 | num_boost_round = args.num_boost_round 77 | 78 | params = { 79 | "objective": "multiclass", 80 | "num_class": 3, 81 | "boosting": args.boosting, 82 | "num_iterations": args.num_iterations, 83 | "num_leaves": args.num_leaves, 84 | "num_threads": args.num_threads, 85 | "learning_rate": args.learning_rate, 86 | "metric": args.metric, 87 | "seed": args.seed, 88 | "verbose": args.verbose, 89 | } 90 | 91 | # read in data 92 | df = pd.read_csv(args.data_dir) 93 | 94 | # preprocess data 95 | X_train, X_test, y_train, y_test, enc = preprocess_data(df) 96 | 97 | # train model 98 | model, train_time = train_model( 99 | params, num_boost_round, X_train, X_test, y_train, y_test 100 | ) 101 | mlflow.log_metric("training_time", train_time) 102 | 103 | # evaluate model 104 | loss, acc = evaluate_model(model, X_test, y_test) 105 | mlflow.log_metrics({"loss": loss, "accuracy": acc}) 106 | 107 | print("\n\n") 
108 | print("*" * 60) 109 | --------------------------------------------------------------------------------