├── .cloud
    └── .gitkeep
├── .github
    ├── labels.yaml
    └── workflows
    │   ├── cleanup.yml
    │   ├── run-workflows.yml
    │   └── smoke.yml
├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
├── SECURITY.md
├── cleanup.py
├── notebooks
    └── Untitled.ipynb
├── requirements.txt
├── setup-workspace.py
└── workflows
    └── basic
        ├── job.py
        ├── requirements.txt
        └── src
            └── train.py


/.cloud/.gitkeep:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/.github/labels.yaml:
--------------------------------------------------------------------------------
 1 | - name: Ask
 2 |   description: Define and scope problem and solution
 3 |   color: "#c9ecff"
 4 | 
 5 | - name: Explore
 6 |   description: Explore and document data to increase understanding
 7 |   color: "#f0f29b"
 8 | 
 9 | - name: Experiment
10 |   description: Build features and train models
11 |   color: "#8569c6"
12 | 
13 | - name: Data
14 |   description: Get and transform data
15 |   color: "#1c587c"
16 | 
17 | - name: Model
18 |   description: Prepare model for deployment
19 |   color: "#0b4e82"
20 | 
21 | - name: Deploy
22 |   description: Register, package, and deploy model
23 |   color: "#f79499"
24 | 
25 | - name: Communicate
26 |   description: Write reports, create dashboards, summarize findings, etc.
27 |   color: "#f9f345"
28 | 
29 | - name: succeeded
30 |   description: This was successful
31 |   color: "#67d157"
32 | 
33 | - name: failed
34 |   description: This didn't go as hoped
35 |   color: "#c2021c"
36 | 
37 | - name: on hold
38 |   description: Still seems promising, but let's revisit later
39 |   color: "#ffd04f"
40 | 
41 | - name: blocked - need access
42 |   description: Blocked due to lack of access to data, resources, environment, etc.
43 |   color: "#ed9a53"
44 | 


--------------------------------------------------------------------------------
/.github/workflows/cleanup.yml:
--------------------------------------------------------------------------------
 1 | name: cleanup
 2 | on: 
 3 |   schedule:
 4 |     - cron: "0 8 * * *" 
 5 | jobs:
 6 |   build:
 7 |     runs-on: ubuntu-latest 
 8 |     steps:
 9 |     - name: check out repo
10 |       uses: actions/checkout@v2
11 |     - name: setup python
12 |       uses: actions/setup-python@v2
13 |       with:
14 |         python-version: "3.8"
15 |     - name: pip install
16 |       run: pip install -r requirements.txt
17 |     - name: azure login
18 |       uses: azure/login@v1
19 |       with:
20 |         creds: ${{secrets.AZ_CREDS}}
21 |     - name: install azmlcli
22 |       run: az extension add -n azure-cli-ml
23 |     - name: attach to workspace
24 |       run: az ml folder attach -w default -g azureml-template
25 |     - name: run cleanup script
26 |       run: python cleanup.py  
27 | 


--------------------------------------------------------------------------------
/.github/workflows/run-workflows.yml:
--------------------------------------------------------------------------------
 1 | name: run-workflows
 2 | on:
 3 |   push: 
 4 |     branches:
 5 |       - main
 6 |     paths:
 7 |       - workflows/**
 8 |   pull_request:
 9 |     branches:
10 |       - main
11 |     paths:
12 |       - workflows/**
13 |   schedule:
14 |       - cron: "0 0/2 * * *"
15 | jobs:
16 |   build:
17 |     runs-on: ubuntu-latest 
18 |     steps:
19 |     - name: check out repo
20 |       uses: actions/checkout@v2
21 |     - name: setup python
22 |       uses: actions/setup-python@v2
23 |       with: 
24 |         python-version: "3.8"
25 |     - name: pip install
26 |       run: pip install -r requirements.txt
27 |     - name: azure login
28 |       uses: azure/login@v1
29 |       with:
30 |         creds: ${{secrets.AZ_CREDS}}
31 |     - name: install azmlcli
32 |       run: az extension add -n azure-cli-ml
33 |     - name: attach to workspace
34 |       run: az ml folder attach -w default -g azureml-template
35 |     - name: run basic job
36 |       run: python workflows/basic/job.py
37 | 


--------------------------------------------------------------------------------
/.github/workflows/smoke.yml:
--------------------------------------------------------------------------------
 1 | name: smoke
 2 | on:
 3 |   push: 
 4 |     branches:
 5 |       - main
 6 |   pull_request:
 7 |     branches:
 8 |       - main
 9 | jobs:
10 |   build:
11 |     runs-on: ubuntu-latest 
12 |     steps:
13 |     - name: check out repo
14 |       uses: actions/checkout@v2
15 |     - name: setup python
16 |       uses: actions/setup-python@v2
17 |       with: 
18 |         python-version: "3.8"
19 |     - name: pip install
20 |       run: pip install -r requirements.txt
21 |     - name: check code format
22 |       run: black --check .
23 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .azureml
2 | .vscode
3 | pythonenv*


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Microsoft Azure
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Azure Machine Learning (AML) Template
  2 | 
  3 | [![run-workflows-badge](https://github.com/Azure/azureml-template/workflows/run-workflows/badge.svg)](https://github.com/Azure/azureml-template/actions?query=workflow%3Arun-workflows)
  4 | [![cleanup](https://github.com/Azure/azureml-template/workflows/cleanup/badge.svg)](https://github.com/Azure/azureml-template/actions?query=workflow%3Acleanup)
  5 | [![smoke](https://github.com/Azure/azureml-template/workflows/smoke/badge.svg)](https://github.com/Azure/azureml-template/actions?query=workflow%3Asmoke)
  6 | [![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
  7 | [![license: MIT](https://img.shields.io/badge/License-MIT-purple.svg)](LICENSE)
  8 | 
  9 | Welcome to the Azure Machine Learning (AML) template repository!
 10 | 
 11 | ## Prerequisites
 12 | 
 13 | 1. An Azure subscription. If you don't have an Azure subscription, [create a free account](https://aka.ms/AMLFree) before you begin.
 14 | 2. A terminal and Python >=3.6,[\<3.9](https://pypi.org/project/azureml-core).
 15 | 
 16 | ## Getting started
 17 | 
 18 | Click "Use this template" above and create a repository.
 19 | 
 20 | Follow the setup guide below to add your Azure credentials and create required Azure resources. At the end, you will have a repository with:
 21 | 
 22 | - simple LightGBM training workflow running every 2 hours and on push/PR
 23 | - code format check on push/PR
 24 | - resource cleanup script running nightly
 25 | 
 26 | ## Setup
 27 | 
 28 | First, export your Azure subscription id as an environment variable:
 29 | 
 30 | ```console
 31 | export ID=<your-subscription-id>
 32 | ```
 33 | 
 34 | Second, create the Azure resource group and required AML resources:
 35 | 
 36 | ```console
 37 | python setup-workspace.py --subscription-id $ID
 38 | ```
 39 | 
 40 | This will create a resource group named `azureml-template`, a workspace named `default`, and a cluster named `cpu-cluster`. Edit `setup-workspace.py` as needed. If you change the names, ensure you change corresponding names in the `.github/workflows` files and in the third step below.
 41 | 
 42 | Third, create a service principal for the resource group:
 43 | 
 44 | ```console
 45 | az ad sp create-for-rbac --name "azureml-template" \
 46 |                          --role contributor \
 47 |                          --scopes /subscriptions/$ID/resourceGroups/azureml-template \
 48 |                          --sdk-auth
 49 | ```
 50 | 
 51 | Copy the output json, which looks like this:
 52 | 
 53 | ```console
 54 | {
 55 |     "clientId": "<GUID>",
 56 |     "clientSecret": "<GUID>",
 57 |     "subscriptionId": "<GUID>",
 58 |     "tenantId": "<GUID>",
 59 |     (...)
 60 | }
 61 | ```
 62 | 
 63 | In your repository, navigate to "Settings > Secrets > New Secret". Name the secret `AZ_CREDS` and paste the json output from above. This is used in the Azure login action in the GitHub Actions. If you use a different name for the secret, ensure you change the corresponding names in the `.github/workflows` files.
 64 | 
 65 | ## Contents
 66 | 
 67 | Adapt this template to automate the entire ML lifecycle on GitHub, using AML for centralized tracking and scaling up/out on Azure compute.
 68 | 
 69 | |directory|description|
 70 | |-|-|
 71 | |`.cloud`|cloud templates|
 72 | |`.github`|GitHub specific files like Actions workflow yaml definitions and issue templates|
 73 | |`notebooks`|interactive jupyter notebooks for iterative ML development|
 74 | |`workflows`|self-contained directories of job/workflow to be run|
 75 | 
 76 | ## GitHub Actions
 77 | 
 78 | Modify all files as needed.
 79 | 
 80 | **Actions**:
 81 | 
 82 | - [`.github/workflows/smoke.yml`](.github/workflows/smoke.yml) runs on every PR and push to `main` to check code format
 83 | - [`.github/workflows/cleanup.yml`](.github/workflows/cleanup.yml) runs daily and can be used to clean up AML resources
 84 | - [`.github/workflows/run-workflows.yml`](.github/workflows/run-workflows.yml) runs an ML workflow every two hours and on every push/PR to `main` that changes files under `workflows/`
 85 | 
 86 | **Other**:
 87 | 
 88 | - [`requirements.txt`](requirements.txt) specifies required pip packages for GitHub actions
 89 | - [`setup-workspace.py`](setup-workspace.py) can be modified for workspace and resource setup
 90 | - [`cleanup.py`](cleanup.py) can be modified for nightly workspace cleanup tasks
 91 | - [`workflows/basic/job.py`](workflows/basic/job.py) is the AML control code
 92 | - [`workflows/basic/src/train.py`](workflows/basic/src/train.py) is the ML training script with mlflow tracking
 93 | - [`workflows/basic/requirements.txt`](workflows/basic/requirements.txt) specifies required pip packages for the training script
 94 | 
 95 | ## Reference
 96 | 
 97 | - [Azure Machine Learning Examples](https://github.com/Azure/azureml-examples)
 98 | - [Cheat Sheet, VSCode Snippets, and Templates](https://azure.github.io/azureml-web)
 99 | - [Azure Machine Learning Documentation](https://docs.microsoft.com/azure/machine-learning)
100 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.7 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->
42 | 


--------------------------------------------------------------------------------
/cleanup.py:
--------------------------------------------------------------------------------
 1 | # imports
 2 | import argparse
 3 | from azureml.core import Workspace
 4 | 
 5 | # setup argparse
 6 | parser = argparse.ArgumentParser()
 7 | args = parser.parse_args()
 8 | 
 9 | # get workspace
10 | ws = Workspace.from_config()
11 | 
12 | # process webservices
13 | for webservice in ws.webservices:
14 |     pass
15 | 
16 | # process compute targets
17 | for compute_target in ws.compute_targets:
18 |     pass
19 | 


--------------------------------------------------------------------------------
/notebooks/Untitled.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Interactive experimentation"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "!pip install --upgrade lightgbm scikit-learn pandas adlfs"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "## Setup cloud tracking"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "import mlflow\n",
 33 |     "from azureml.core import Workspace\n",
 34 |     "\n",
 35 |     "ws = Workspace.from_config()\n",
 36 |     "mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())\n",
 37 |     "mlflow.set_experiment(\"untitled\")"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "## Load data\n",
 45 |     "\n",
 46 |     "You can read directly from public URIs into Pandas. For private Blob or ADLS data, consider using [adlfs](https://github.com/dask/adlfs)."
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "data_uri = \"https://azuremlexamples.blob.core.windows.net/datasets/iris.csv\""
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": null,
 61 |    "metadata": {},
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "import pandas as pd\n",
 65 |     "\n",
 66 |     "df = pd.read_csv(data_uri)\n",
 67 |     "df.head()"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "source": [
 72 |     "## Define functions"
 73 |    ],
 74 |    "cell_type": "markdown",
 75 |    "metadata": {}
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": null,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "# imports\n",
 84 |     "import time\n",
 85 |     "\n",
 86 |     "import lightgbm as lgb\n",
 87 |     "\n",
 88 |     "from sklearn.metrics import log_loss, accuracy_score\n",
 89 |     "from sklearn.preprocessing import LabelEncoder\n",
 90 |     "from sklearn.model_selection import train_test_split\n",
 91 |     "\n",
 92 |     "# define functions\n",
 93 |     "def preprocess_data(df):\n",
 94 |     "    X = df.drop([\"species\"], axis=1)\n",
 95 |     "    y = df[\"species\"]\n",
 96 |     "\n",
 97 |     "    enc = LabelEncoder()\n",
 98 |     "    y = enc.fit_transform(y)\n",
 99 |     "\n",
100 |     "    X_train, X_test, y_train, y_test = train_test_split(\n",
101 |     "        X, y, test_size=0.2, random_state=42\n",
102 |     "    )\n",
103 |     "\n",
104 |     "    return X_train, X_test, y_train, y_test, enc\n",
105 |     "\n",
106 |     "\n",
107 |     "def train_model(params, num_boost_round, X_train, X_test, y_train, y_test):\n",
108 |     "    t1 = time.time()\n",
109 |     "    train_data = lgb.Dataset(X_train, label=y_train)\n",
110 |     "    test_data = lgb.Dataset(X_test, label=y_test)\n",
111 |     "    model = lgb.train(\n",
112 |     "        params,\n",
113 |     "        train_data,\n",
114 |     "        num_boost_round=num_boost_round,\n",
115 |     "        valid_sets=[test_data],\n",
116 |     "        valid_names=[\"test\"],\n",
117 |     "    )\n",
118 |     "    t2 = time.time()\n",
119 |     "\n",
120 |     "    return model, t2 - t1\n",
121 |     "\n",
122 |     "\n",
123 |     "def evaluate_model(model, X_test, y_test):\n",
124 |     "    y_proba = model.predict(X_test)\n",
125 |     "    y_pred = y_proba.argmax(axis=1)\n",
126 |     "    loss = log_loss(y_test, y_proba)\n",
127 |     "    acc = accuracy_score(y_test, y_pred)\n",
128 |     "\n",
129 |     "    return loss, acc"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "markdown",
134 |    "metadata": {},
135 |    "source": [
136 |     "## Run a trial"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": null,
142 |    "metadata": {},
143 |    "outputs": [],
144 |    "source": [
145 |     "# preprocess data\n",
146 |     "X_train, X_test, y_train, y_test, enc = preprocess_data(df)\n",
147 |     "\n",
148 |     "# set training parameters\n",
149 |     "params = {\n",
150 |     "    \"objective\": \"multiclass\",\n",
151 |     "    \"num_class\": 3,\n",
152 |     "    \"learning_rate\": 0.1,\n",
153 |     "    \"metric\": \"multi_logloss\",\n",
154 |     "    \"colsample_bytree\": 1.0,\n",
155 |     "    \"subsample\": 1.0,\n",
156 |     "    \"seed\": 42,\n",
157 |     "}\n",
158 |     "\n",
159 |     "num_boost_round = 32\n",
160 |     "\n",
161 |     "# start run\n",
162 |     "run = mlflow.start_run()\n",
163 |     "\n",
164 |     "# enable automatic logging\n",
165 |     "mlflow.lightgbm.autolog()\n",
166 |     "\n",
167 |     "# train model\n",
168 |     "model, train_time = train_model(\n",
169 |     "    params, num_boost_round, X_train, X_test, y_train, y_test\n",
170 |     ")\n",
171 |     "mlflow.log_metric(\"training_time\", train_time)\n",
172 |     "\n",
173 |     "# evaluate model\n",
174 |     "loss, acc = evaluate_model(model, X_test, y_test)\n",
175 |     "mlflow.log_metrics({\"loss\": loss, \"accuracy\": acc})\n",
176 |     "\n",
177 |     "# end run\n",
178 |     "mlflow.end_run()"
179 |    ]
180 |   }
181 |  ],
182 |  "metadata": {
183 |   "kernelspec": {
184 |    "name": "python3.8",
185 |    "display_name": "Python 3.8",
186 |    "language": "python"
187 |   },
188 |   "language_info": {
189 |    "codemirror_mode": {
190 |     "name": "ipython",
191 |     "version": 3
192 |    },
193 |    "file_extension": ".py",
194 |    "mimetype": "text/x-python",
195 |    "name": "python",
196 |    "nbconvert_exporter": "python",
197 |    "pygments_lexer": "ipython3",
198 |    "version": "3.8.5-final"
199 |   }
200 |  },
201 |  "nbformat": 4,
202 |  "nbformat_minor": 2
203 | }
204 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | azureml-core>=1.15.0
2 | azureml-dataprep
3 | azureml-mlflow
4 | papermill
5 | black
6 | 


--------------------------------------------------------------------------------
/setup-workspace.py:
--------------------------------------------------------------------------------
 1 | # imports
 2 | import argparse
 3 | 
 4 | from azureml.core import Workspace
 5 | from azureml.core.compute import ComputeTarget, AmlCompute, AksCompute
 6 | 
 7 | # setup argparse
 8 | parser = argparse.ArgumentParser()
 9 | parser.add_argument("--subscription-id", type=str, default=None)
10 | parser.add_argument("--workspace-name", type=str, default="default")
11 | parser.add_argument("--resource-group", type=str, default="azureml-template")
12 | parser.add_argument("--location", type=str, default="eastus")
13 | args = parser.parse_args()
14 | 
15 | # define aml compute target(s) to create
16 | amlcomputes = {
17 |     "cpu-cluster": {
18 |         "vm_size": "STANDARD_DS3_V2",
19 |         "min_nodes": 0,
20 |         "max_nodes": 3,
21 |         "idle_seconds_before_scaledown": 1200,
22 |     }
23 | }
24 | 
25 | # create workspace
26 | ws = Workspace.create(
27 |     args.workspace_name,
28 |     subscription_id=args.subscription_id,
29 |     resource_group=args.resource_group,
30 |     location=args.location,
31 |     create_resource_group=True,
32 |     exist_ok=True,
33 |     show_output=True,
34 | )
35 | ws.write_config()
36 | 
37 | # create aml compute targets
38 | for ct_name in amlcomputes:
39 |     if ct_name not in ws.compute_targets:
40 |         compute_config = AmlCompute.provisioning_configuration(**amlcomputes[ct_name])
41 |         ct = ComputeTarget.create(ws, ct_name, compute_config)
42 |         ct.wait_for_completion(show_output=True)
43 | 


--------------------------------------------------------------------------------
/workflows/basic/job.py:
--------------------------------------------------------------------------------
 1 | # imports
 2 | from pathlib import Path
 3 | from azureml.core import Workspace, ScriptRunConfig, Experiment, Environment, Dataset
 4 | 
 5 | # constants
 6 | compute_name = "cpu-cluster"  # use "local" for local execution
 7 | source_dir = "src"
 8 | entry_script = "train.py"
 9 | environment_name = "myenv-template"
10 | environment_file = "requirements.txt"
11 | experiment_name = "template-workflow-base"
12 | data_uri = "https://azuremlexamples.blob.core.windows.net/datasets/iris.csv"
13 | 
14 | # convert to relative paths
15 | prefix = Path(__file__).parent
16 | source_dir = str(prefix.joinpath(source_dir))
17 | environment_file = str(prefix.joinpath(environment_file))
18 | 
19 | # get workspace
20 | ws = Workspace.from_config()
21 | 
22 | # create dataset
23 | ds = Dataset.File.from_files(data_uri)
24 | 
25 | # create environment
26 | env = Environment.from_pip_requirements(environment_name, environment_file)
27 | 
28 | # setup entry script arguments
29 | args = ["--data-dir", ds.as_mount()]
30 | 
31 | # create a job configuration
32 | src = ScriptRunConfig(
33 |     source_directory=source_dir,
34 |     script=entry_script,
35 |     arguments=args,
36 |     environment=env,
37 |     compute_target=compute_name,
38 | )
39 | 
40 | # run the job
41 | run = Experiment(ws, experiment_name).submit(src)
42 | run.wait_for_completion(show_output=True)
43 | 


--------------------------------------------------------------------------------
/workflows/basic/requirements.txt:
--------------------------------------------------------------------------------
1 | lightgbm
2 | matplotlib
3 | scikit-learn
4 | azureml-mlflow 
5 | azureml-dataprep
6 | 


--------------------------------------------------------------------------------
/workflows/basic/src/train.py:
--------------------------------------------------------------------------------
  1 | # imports
  2 | import os
  3 | import time
  4 | import mlflow
  5 | import argparse
  6 | 
  7 | import pandas as pd
  8 | import lightgbm as lgb
  9 | import matplotlib.pyplot as plt
 10 | 
 11 | from sklearn.metrics import log_loss, accuracy_score
 12 | from sklearn.preprocessing import LabelEncoder
 13 | from sklearn.model_selection import train_test_split
 14 | 
 15 | # define functions
 16 | def preprocess_data(df):
 17 |     X = df.drop(["species"], axis=1)
 18 |     y = df["species"]
 19 | 
 20 |     enc = LabelEncoder()
 21 |     y = enc.fit_transform(y)
 22 | 
 23 |     X_train, X_test, y_train, y_test = train_test_split(
 24 |         X, y, test_size=0.2, random_state=42
 25 |     )
 26 | 
 27 |     return X_train, X_test, y_train, y_test, enc
 28 | 
 29 | 
 30 | def train_model(params, num_boost_round, X_train, X_test, y_train, y_test):
 31 |     t1 = time.time()
 32 |     train_data = lgb.Dataset(X_train, label=y_train)
 33 |     test_data = lgb.Dataset(X_test, label=y_test)
 34 |     model = lgb.train(
 35 |         params,
 36 |         train_data,
 37 |         num_boost_round=num_boost_round,
 38 |         valid_sets=[test_data],
 39 |         valid_names=["test"],
 40 |     )
 41 |     t2 = time.time()
 42 | 
 43 |     return model, t2 - t1
 44 | 
 45 | 
 46 | def evaluate_model(model, X_test, y_test):
 47 |     y_proba = model.predict(X_test)
 48 |     y_pred = y_proba.argmax(axis=1)
 49 |     loss = log_loss(y_test, y_proba)
 50 |     acc = accuracy_score(y_test, y_pred)
 51 | 
 52 |     return loss, acc
 53 | 
 54 | 
 55 | print("*" * 60)
 56 | print("\n\n")
 57 | 
 58 | # enable auto logging
 59 | mlflow.lightgbm.autolog()
 60 | 
 61 | # arg parser
 62 | parser = argparse.ArgumentParser()
 63 | parser.add_argument("--data-dir", type=str)
 64 | parser.add_argument("--num-boost-round", type=int, default=10)
 65 | parser.add_argument("--boosting", type=str, default="gbdt")
 66 | parser.add_argument("--num-iterations", type=int, default=16)
 67 | parser.add_argument("--num-leaves", type=int, default=31)
 68 | parser.add_argument("--num-threads", type=int, default=0)
 69 | parser.add_argument("--learning-rate", type=float, default=0.1)
 70 | parser.add_argument("--metric", type=str, default="multi_logloss")
 71 | parser.add_argument("--seed", type=int, default=42)
 72 | parser.add_argument("--verbose", type=int, default=0)
 73 | args = parser.parse_args()
 74 | 
 75 | # setup parameters
 76 | num_boost_round = args.num_boost_round
 77 | 
 78 | params = {
 79 |     "objective": "multiclass",
 80 |     "num_class": 3,
 81 |     "boosting": args.boosting,
 82 |     "num_iterations": args.num_iterations,
 83 |     "num_leaves": args.num_leaves,
 84 |     "num_threads": args.num_threads,
 85 |     "learning_rate": args.learning_rate,
 86 |     "metric": args.metric,
 87 |     "seed": args.seed,
 88 |     "verbose": args.verbose,
 89 | }
 90 | 
 91 | # read in data
 92 | df = pd.read_csv(args.data_dir)
 93 | 
 94 | # preprocess data
 95 | X_train, X_test, y_train, y_test, enc = preprocess_data(df)
 96 | 
 97 | # train model
 98 | model, train_time = train_model(
 99 |     params, num_boost_round, X_train, X_test, y_train, y_test
100 | )
101 | mlflow.log_metric("training_time", train_time)
102 | 
103 | # evaluate model
104 | loss, acc = evaluate_model(model, X_test, y_test)
105 | mlflow.log_metrics({"loss": loss, "accuracy": acc})
106 | 
107 | print("\n\n")
108 | print("*" * 60)
109 | 


--------------------------------------------------------------------------------