├── .github ├── amlarc-tool.sh ├── convert.py ├── run_pipeline.py └── workflows │ ├── kubernetes-compute-simple-examples.yml │ └── portal-provision-pr-gate.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── docs ├── 4. View metrics in Compute level and runs level.markdown ├── AKS-HCI │ ├── AML-ARC-Compute.md │ ├── README.md │ ├── Train-AzureArc.md │ ├── Yaml │ │ ├── kfserving_v1_4_1.yaml │ │ └── serving-default-domain-knative-1-4-0.yaml │ ├── cli │ │ ├── README.md │ │ └── mnist │ │ │ ├── README.md │ │ │ ├── deployment.yml │ │ │ ├── endpoint.yml │ │ │ ├── mnist_script │ │ │ ├── train.py │ │ │ └── utils.py │ │ │ ├── model │ │ │ └── conda.yml │ │ │ ├── sample-request.json │ │ │ ├── score.py │ │ │ ├── train_env │ │ │ └── conda.yml │ │ │ └── training.yml │ ├── imgs │ │ ├── Cstorage.png │ │ ├── Inner-compute.png │ │ ├── Inner-workspace.png │ │ ├── azureml_log.png │ │ ├── container.png │ │ ├── cors.png │ │ ├── datastore-set.png │ │ ├── datastore.png │ │ ├── kubernetes_arc.png │ │ ├── network.png │ │ ├── sas-token.png │ │ ├── sas.png │ │ ├── structure.png │ │ ├── studio-arc.png │ │ ├── studio-s.png │ │ ├── studio.png │ │ ├── url.png │ │ └── vid-img.png │ ├── nfs │ │ ├── README.md │ │ ├── Verify_NFS_Setup_in_AMLArc.ipynb │ │ ├── config.json │ │ ├── images │ │ │ ├── configure-public-ip.png │ │ │ ├── create-ubuntu-vm.png │ │ │ ├── reset-network.png │ │ │ ├── ssh-status.png │ │ │ ├── ubuntu-vm-created.png │ │ │ ├── ufw-nfs.png │ │ │ ├── ufw-ssh.png │ │ │ ├── verify-nfs-training.png │ │ │ └── verify-nfs-vm.png │ │ ├── mount-config.yaml │ │ ├── nfs-server-setup.sh │ │ └── nfs_script │ │ │ └── test.py │ ├── notebooks │ │ ├── README.md │ │ ├── distributed-cifar10 │ │ │ ├── README.md │ │ │ ├── config.json │ │ │ ├── data.json │ │ │ ├── distributed-pytorch-cifar10.ipynb │ │ │ ├── helpers.py │ │ │ ├── pt_deployment.yml │ │ │ ├── pt_endpoint.yml │ │ │ ├── pytorch-script │ │ │ │ ├── cifar_dist_main.py │ │ │ │ └── conda_dependencies.yml │ │ │ ├── score_pytorch.py │ │ │ ├── score_tf.py │ │ │ ├── test_imgs │ │ │ │ ├── test_img_0_cat.jpg │ │ │ │ ├── test_img_1_ship.jpg │ │ │ │ ├── test_img_2_ship.jpg │ │ │ │ ├── test_img_3_plane.jpg │ │ │ │ └── test_img_4_frog.jpg │ │ │ └── test_request_pytorch │ │ │ │ ├── cifar_test_input_img_0_cat_pytorch.json │ │ │ │ ├── cifar_test_input_img_1_ship_pytorch.json │ │ │ │ ├── cifar_test_input_img_2_ship_pytorch.json │ │ │ │ ├── cifar_test_input_img_3_airplane_pytorch.json │ │ │ │ ├── cifar_test_input_img_4_frog_pytorch.json │ │ │ │ └── cifar_test_input_img_first_5_pytorch.json │ │ ├── mnist │ │ │ ├── MNIST_Training_with_AKS-HCI_Cluster_and_NFS.ipynb │ │ │ ├── README.md │ │ │ ├── config.json │ │ │ ├── confusion.png │ │ │ ├── deployment.yml │ │ │ ├── digit_7.jpg │ │ │ ├── endpoint.yml │ │ │ ├── helpers.py │ │ │ ├── mnist_script │ │ │ │ ├── train.py │ │ │ │ └── utils.py │ │ │ ├── model │ │ │ │ └── conda.yml │ │ │ └── score.py │ │ ├── object-segmentation-on-azure-stack │ │ │ ├── FudanPed00001.png │ │ │ ├── README.md │ │ │ ├── aml_src │ │ │ │ ├── Dockerfile.gpu │ │ │ │ ├── coco_eval.py │ │ │ │ ├── coco_utils.py │ │ │ │ ├── conda-env.yaml │ │ │ │ ├── engine.py │ │ │ │ ├── obj_segment_step_data_process.py │ │ │ │ ├── obj_segment_step_training.py │ │ │ │ ├── transforms.py │ │ │ │ └── utils.py │ │ │ ├── config.json │ │ │ ├── deployment.yml │ │ │ ├── endpoint.yml │ │ │ ├── helpers.py │ │ │ ├── object_segmentation-akshci-nfs.ipynb │ │ │ ├── object_segmentation-akshci.ipynb │ │ │ └── score.py │ │ ├── pipeline │ │ │ ├── README.md │ │ │ ├── config.json │ │ │ 
├── deployment.yml │ │ │ ├── endpoint.yml │ │ │ ├── helpers.py │ │ │ ├── images │ │ │ │ ├── pipeline-using-dataflow.png │ │ │ │ └── pipeline-using-stepsequence.png │ │ │ ├── model │ │ │ │ └── conda.yml │ │ │ ├── nyc-taxi-data-regression-model-building-nfs.ipynb │ │ │ ├── nyc-taxi-data-regression-model-building.ipynb │ │ │ ├── score.py │ │ │ ├── scripts │ │ │ │ ├── prepdata │ │ │ │ │ ├── cleanse.py │ │ │ │ │ ├── filter.py │ │ │ │ │ ├── merge.py │ │ │ │ │ ├── normalize.py │ │ │ │ │ └── transform.py │ │ │ │ └── trainmodel │ │ │ │ │ ├── train_step.py │ │ │ │ │ └── train_test_split.py │ │ │ └── test_set.csv │ │ └── upload-download-model │ │ │ ├── AML-model-download-upload.ipynb │ │ │ ├── README.md │ │ │ └── config.json │ ├── test-data │ │ ├── cifar10_test_input.json │ │ └── flower_sample_test_input.json │ ├── troubleshooting.md │ └── video │ │ └── kfserving_tf_blob_structure.mp4 ├── application-gateway-ingress-controller.md ├── attach-compute-on-aks-v1-cluster.md ├── attach-compute.md ├── azureml-aks-ta-support.md ├── deploy-extension.md ├── deploy-on-ocp.md ├── faq.md ├── gke-setup.md ├── happy-path.md ├── how-to-debug-arc-kubernetes-training.md ├── inference-byoc.md ├── instance-type.md ├── limitations-and-known-issues.md ├── managed-identity.md ├── media │ ├── README.md │ ├── assign-role.png │ ├── attach-1.png │ ├── attach-4.png │ ├── attach.png │ ├── detach.png │ ├── edit-identity.png │ ├── gke-ssh.png │ ├── gkecreate.png │ ├── privatelink-networkflow-v3.png │ ├── privatelink │ │ ├── acr_subnet.png │ │ ├── acr_target.png │ │ ├── acr_trusted.png │ │ ├── aks.png │ │ ├── dns.png │ │ ├── kv_target.png │ │ ├── kv_trusted.png │ │ ├── kv_vnet.png │ │ ├── ml_compute.png │ │ ├── ml_computemsi.png │ │ ├── ml_disablepublicaccess.png │ │ ├── ml_privateendpoint.png │ │ ├── onprem.png │ │ ├── relay.png │ │ ├── relay_connectstring.png │ │ ├── relay_resourceid.png │ │ ├── storageaccount.png │ │ ├── ts_curl.png │ │ ├── ts_expected.png │ │ ├── ts_getpo.png │ │ ├── ts_nslookup.png │ │ └── ts_ws.png │ ├── profileConfig.png │ ├── update-identity2.png │ └── ws-msi.png ├── network-requirements.md ├── nginx-ingress-controller.md ├── private-link.md ├── pvc.md ├── release-notes.md ├── setup-ephemeral-nfs-volume.md ├── simple-flow.md ├── simple-train-cli.md ├── troubleshooting.md └── workflows.md ├── examples ├── inference │ ├── gpu-inferencing │ │ ├── blue-deployment.yml │ │ ├── endpoint.yml │ │ ├── model │ │ │ ├── checkpoint │ │ │ ├── conda.yml │ │ │ ├── mnist-tf.model.data-00000-of-00001 │ │ │ ├── mnist-tf.model.index │ │ │ └── mnist-tf.model.meta │ │ ├── sample-request.json │ │ └── script │ │ │ └── score.py │ └── simple-flow │ │ ├── blue-deployment.yml │ │ ├── endpoint.yml │ │ ├── model │ │ ├── conda.yml │ │ └── sklearn_mnist_model.pkl │ │ ├── sample-request.json │ │ ├── script │ │ └── score.py │ │ └── sklearn-model.yml └── training │ ├── additional-sdk-examples │ ├── 001-Tensorflow │ │ ├── tf_mnist_with_checkpoint.py │ │ ├── train-tensorflow-resume-training.ipynb │ │ └── utils.py │ ├── 002-SciKitLearn │ │ ├── img-classification-part1-training.ipynb │ │ └── utils.py │ └── 003-Distributed TensorFlow with parameter server │ │ ├── distributed-tensorflow-with-parameter-server.ipynb │ │ └── tf_mnist_replica.py │ ├── simple-train-cli │ ├── job.yml │ └── src │ │ ├── mnist-data │ │ ├── t10k-images-idx3-ubyte.gz │ │ ├── t10k-labels-idx1-ubyte.gz │ │ ├── train-images-idx3-ubyte.gz │ │ └── train-labels-idx1-ubyte.gz │ │ ├── train.py │ │ └── utils.py │ ├── simple-train-sdk │ ├── img-classification-training.ipynb │ └── 
utils.py │ └── train-using-nfs │ ├── amlarc-nfs-setup │ ├── README.md │ └── mount-config.yaml │ ├── pytorch-on-amlarc-with-nfs │ ├── pytorch-on-amlarc-with-nfs.ipynb │ └── scripts │ │ └── train.py │ └── scikit-learn-on-amlarc-with-nfs │ ├── iris.csv │ ├── scikit-learn-on-amlarc-with-nfs.ipynb │ └── scripts │ └── train.py ├── files ├── deploy-amlarc.sh ├── deploy.py ├── deployextension.json ├── deployextension.parameters.json ├── entry.sh ├── quota setting tool │ ├── get_quotaoverrides_cr.py │ ├── microsoft_graph.py │ ├── quotaoverridesCRTemplate.yaml │ ├── readme.md │ └── utils.py ├── sslsecret.yaml └── terraform-template.tf └── pics ├── check_scoringfe_v2_output.png ├── nvml_error.png └── permission_denied.png /.github/convert.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import os 4 | 5 | 6 | def convert(input_file, compute_target, instance_type, common_runtime, output_file): 7 | def _convert(input_file, data, job_schema): 8 | # check job type 9 | is_pipeline_job = False 10 | is_sweep_job = False 11 | if "pipelineJob" in job_schema or "jobs" in data: 12 | is_pipeline_job = True 13 | if "sweepJob" in job_schema or data.get("type") == "sweep": 14 | is_sweep_job = True 15 | 16 | print("Job type: pipelineJob", is_pipeline_job, "sweepJob:", is_sweep_job) 17 | 18 | # change compute target 19 | if compute_target: 20 | data["compute"] = "azureml:%s" % compute_target 21 | if is_pipeline_job: 22 | settings = data.get("settings", {}) 23 | settings["default_compute"] = "azureml:%s" % compute_target 24 | data["settings"] = settings 25 | 26 | # set instance type 27 | if not is_pipeline_job and instance_type: 28 | resources = data.get("resources", {}) 29 | resources["instance_type"] = instance_type 30 | data["resources"] = resources 31 | 32 | for field in ["trial", "component"]: 33 | if field not in data: 34 | continue 35 | 36 | file_field = data[field] 37 | if not isinstance(file_field, str): 38 | continue 39 | 40 | if file_field.startswith("file:"): 41 | file_field = file_field.split(":", 1)[1] 42 | 43 | print("Found sub job spec:", file_field) 44 | dirname = os.path.dirname(input_file) 45 | convert( 46 | os.path.join(dirname, file_field), 47 | compute_target, 48 | instance_type, 49 | common_runtime, 50 | "", 51 | ) 52 | 53 | if is_pipeline_job: 54 | jobs = data.get("jobs", {}) 55 | for step in jobs: 56 | print("Found step:", step) 57 | _convert(input_file, jobs[step], "") 58 | return 59 | 60 | print("Processing file:", input_file) 61 | if not os.path.exists(input_file): 62 | print("Warning: File doesn't exist: ", input_file) 63 | return 64 | with open(input_file, "r") as f: 65 | data = yaml.load(f, Loader=yaml.FullLoader) 66 | job_schema = data.get("$schema", "") 67 | _convert(input_file, data, job_schema) 68 | 69 | # write to output file if output file is specified, otherwise change inplace. 70 | if output_file: 71 | with open(output_file, "w") as f: 72 | yaml.dump(data, f) 73 | else: 74 | with open(input_file, "w") as f: 75 | yaml.dump(data, f) 76 | 77 | 78 | if __name__ == "__main__": 79 | # Parse command line arguments 80 | parser = argparse.ArgumentParser( 81 | description="Convert test case to AMLARC-compatible files." 
82 | ) 83 | parser.add_argument("-i", "--input", required=True, help="Input test case file") 84 | parser.add_argument( 85 | "-o", 86 | "--output", 87 | required=False, 88 | help="Output AMLARC-compatible file; if not provided, " "replace the file in place", 89 | ) 90 | parser.add_argument("-c", "--compute-target", required=False, help="Compute target") 91 | parser.add_argument("-it", "--instance-type", required=False, help="Instance type") 92 | parser.add_argument( 93 | "-cr", 94 | "--common-runtime", 95 | required=False, 96 | default=False, 97 | action="store_true", 98 | help='Enable common runtime explicitly, default is "false"', 99 | ) 100 | args = parser.parse_args() 101 | convert( 102 | args.input, 103 | args.compute_target, 104 | args.instance_type, 105 | args.common_runtime, 106 | args.output, 107 | ) -------------------------------------------------------------------------------- /.github/run_pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from msrest.authentication import BasicAuthentication 4 | from azure.devops.connection import Connection 5 | from azure.devops.v6_0.pipelines.models import RunPipelineParameters, RunResourcesParameters, RepositoryResourceParameters, Run, Variable 6 | import time 7 | 8 | def init_parser(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument( 11 | '-d', 12 | '--definition-id', 13 | type=int, 14 | required=False, 15 | help='pipeline definition ID' 16 | ) 17 | parser.add_argument( 18 | '--variables', 19 | '-v', 20 | type=str, 21 | nargs='+', 22 | help='variables to set on the pipeline run' 23 | ) 24 | 25 | return parser 26 | 27 | def init_clients(): 28 | token = os.environ["PAT_TOKEN"] 29 | credentials = BasicAuthentication('', token) 30 | 31 | organization_url = 'https://dev.azure.com/msdata' 32 | 33 | connection = Connection(base_url=organization_url, creds=credentials) 34 | clients = connection.clients_v6_0 35 | 36 | return clients 37 | 38 | def trigger_build(clients, branch, def_id, variables = {}) -> Run: 39 | prj = 'Vienna' 40 | branch = f'refs/heads/{branch}' 41 | repo = RepositoryResourceParameters(branch) 42 | res = RunResourcesParameters(repositories={'self': repo}) 43 | params = RunPipelineParameters(resources=res) 44 | if variables: 45 | params.variables = variables 46 | 47 | pipeline = clients.get_pipelines_client() 48 | run = pipeline.run_pipeline(params, prj, def_id) 49 | return run 50 | 51 | def wait_run_complete(clients, def_id, run_id, timeout_in_sec=3600) -> bool: 52 | pipeline = clients.get_pipelines_client() 53 | run = pipeline.get_run('Vienna', def_id, run_id) 54 | current = time.time() 55 | while run.state != 'completed' and time.time() - current < timeout_in_sec: 56 | time.sleep(30) 57 | for _ in range(3): 58 | run = pipeline.get_run('Vienna', def_id, run_id) 59 | if run: 60 | break 61 | if not run: 62 | print("failed to get pipeline status") 63 | return False 64 | if run.state != 'completed': 65 | return False 66 | if run.result == 'failed': 67 | return False 68 | return True 69 | 70 | if __name__ == '__main__': 71 | parser = init_parser() 72 | args = parser.parse_args() 73 | variables = {} 74 | if args.variables: 75 | for kv in args.variables: 76 | key, value = kv.split('=', 1) 77 | variables[key] = Variable(False, value) 78 | print(f'variables: {variables}') 79 | clients = init_clients() 80 | 81 | run = trigger_build(clients, 'master', args.definition_id, variables) 82 | if not run: 83 | exit(1) 84 | res = wait_run_complete(clients, args.definition_id, 
run.id) 85 | if not res: 86 | exit(1) 87 | -------------------------------------------------------------------------------- /.github/workflows/portal-provision-pr-gate.yaml: -------------------------------------------------------------------------------- 1 | name: Portal Provision PR Gate 2 | on: 3 | # Triggers the workflow on push or pull request events but only for the master branch 4 | push: 5 | branches: [ master ] 6 | # TODO: move these files into a folder 7 | paths: 8 | - files/deploy.py 9 | - files/entry.sh 10 | pull_request: 11 | branches: [ master ] 12 | # TODO: move these files into a folder 13 | paths: 14 | - files/deploy.py 15 | - files/entry.sh 16 | jobs: 17 | build: 18 | name: Call Azure Pipeline 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: check out repo 22 | uses: actions/checkout@v3 23 | - name: Extract branch name 24 | shell: bash 25 | run: | 26 | if [ $EVENT_NAME = "push" ] 27 | then 28 | echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" 29 | else 30 | echo "##[set-output name=branch;]$(echo $BASE_BRANCH)" 31 | fi 32 | env: 33 | BASE_BRANCH: ${{ github.head_ref }} 34 | EVENT_NAME: ${{ github.event_name }} 35 | id: extract_branch 36 | 37 | - uses: actions/setup-python@v3 38 | 39 | - name: Install dependencies 40 | shell: bash 41 | run: | 42 | pip install msrest 43 | pip install azure-devops 44 | id: install_dependencies 45 | 46 | - name: Run pipeline 47 | shell: bash 48 | env: 49 | PAT_TOKEN: ${{ secrets.AMLARC_PORTAL_PROVISION_PIPELINE_PAT }} 50 | BRANCH: ${{ steps.extract_branch.outputs.branch }} 51 | run: python .github/run_pipeline.py --definition-id 21230 --variables GITHUB_BRANCH=$BRANCH 52 | id: run_pipeline 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/settings.json 2 | .ipynb_checkpoints/ 3 | PennFudanPed 4 | mnist_data 5 | cifar10-data 6 | .idea/ 7 | data 8 | __pycache__ -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /docs/4. View metrics in Compute level and runs level.markdown: -------------------------------------------------------------------------------- 1 | 2 | For data scientists, job (run) level monitoring and reporting during training is important; it can help them to: 3 | 4 | - Understand the performance of their training script (does the job fully use the capacity of the GPUs and CPUs, what is the throughput of the model, etc.) 5 | - Find the bottleneck of the training job (run): GPU memory, network, etc. 6 | - Better understand how different training parameters (batch size, epochs, learning rate, etc.) impact their job 7 | 8 | For admins, compute level monitoring and reporting are important; they can help them to: 9 | - Optimize the quota (resource) allocation strategy 10 | - Understand resource utilization 11 | 12 | ## Enable custom metrics 13 | 14 | CMAKS uses the Application Insights instance of the AML workspace to show compute level and run level metrics. To enable this function, you need to enable custom metrics in ```appinsights > config > usage & estimated cost > custom metrics```. 15 | 16 | ![custom metrics](/pics/2.6custommetrics.png) 17 | 18 | ![enable custom metrics](/pics/2.7ennablecustommetrics.png) 19 | 20 | 21 | After the AML agent is successfully installed, you can [attach CMAKS compute](https://github.com/Azure/CMK8s-Samples/blob/master/docs/2.%20Attach%20CMAKS%20compute.markdown). 22 | 23 | ### Using flight 24 | Because this function is in preview, you need to use `flight=computeMetrics` to enable it manually. 25 | 26 | ### View compute level metrics 27 | 28 | To view the CMAKS compute level metrics, go to `attached compute > compute detail > Monitoring`. Note that only CMAKS compute targets support this page. 29 | 30 | ![compute level metrics](/pics/5.2computemetrics.png) 31 | 32 | ### View run level metrics 33 | 34 | To view the CMAKS run level metrics, you can use either of two workflows: 35 | 1. `compute detail > runs > Monitoring` 36 | 2. `experiment > runs > Monitoring` 37 | 38 | ![run level metrics](/pics/5.3runmetrics.png) 39 | 40 | ## Useful chart toolkit 41 | We provide some useful functions in the charts: 42 | - Zoom: zoom the time range by dragging on the charts 43 | - Pan: change the time range by panning along the x axis 44 | - Reset Axes: reset the axes to their default values by double-clicking on the chart 45 | - Switch chart type: switch the chart type between bar chart and line chart 46 | - Select/unselect legends: clicking on a legend selects or unselects it 47 | - Only select one legend: double-clicking on a selected legend shows only that legend's data; this is very helpful if you have many legends on the chart. 48 | 49 | 50 | -------------------------------------------------------------------------------- /docs/AKS-HCI/Yaml/serving-default-domain-knative-1-4-0.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The Knative Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: batch/v1 16 | kind: Job 17 | metadata: 18 | name: default-domain 19 | namespace: knative-serving 20 | labels: 21 | app: "default-domain" 22 | serving.knative.dev/release: "v0.14.0" 23 | spec: 24 | template: 25 | metadata: 26 | labels: 27 | app: "default-domain" 28 | serving.knative.dev/release: "v0.14.0" 29 | spec: 30 | serviceAccountName: controller 31 | containers: 32 | - name: default-domain 33 | # This is the Go import path for the binary that is containerized 34 | # and substituted here. 35 | image: gcr.io/knative-releases/knative.dev/serving/cmd/default-domain@sha256:3f9f0baebbb2ace4aaa6f38537f2a76aa9f02669d43b1a9d8386bf6497559257 36 | args: ["-magic-dns=xip.io"] 37 | ports: 38 | - name: http 39 | containerPort: 8080 40 | readinessProbe: &probe 41 | httpGet: 42 | port: 8080 43 | livenessProbe: *probe 44 | env: 45 | - name: POD_NAME 46 | valueFrom: 47 | fieldRef: 48 | fieldPath: metadata.name 49 | - name: SYSTEM_NAMESPACE 50 | valueFrom: 51 | fieldRef: 52 | fieldPath: metadata.namespace 53 | restartPolicy: OnFailure 54 | backoffLimit: 10 55 | --- 56 | apiVersion: v1 57 | kind: Service 58 | metadata: 59 | name: default-domain-service 60 | namespace: knative-serving 61 | labels: 62 | app: default-domain 63 | serving.knative.dev/release: "v0.14.0" 64 | spec: 65 | selector: 66 | app: default-domain 67 | ports: 68 | - name: http 69 | port: 80 70 | targetPort: 8080 71 | type: ClusterIP 72 | 73 | --- 74 | -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/README.md: -------------------------------------------------------------------------------- 1 | # Sample Azure Machine Learning CLI v2 examples 2 | 3 | After following the setup documents, you can go through the CLI examples linked below to get a better understanding of how the process works and the possibilities it can unlock: 4 | 5 | ## CLI v2 Examples 6 | 7 | ### Prerequisites 8 | 9 | Follow this [doc](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli?view=azure-devops#prerequisites) to setup the prerequisites of using Azure Machine CLI v2. 
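Before moving on to the example, here is a rough sketch of how the YAML specs in the mnist folder are typically submitted with the CLI v2 once the prerequisites are in place. This is only an illustration, assuming the `ml` CLI extension is installed and that `<resource-group>`, `<workspace>`, and `<endpoint-name>` are placeholders for your own values; exact flags can vary slightly across extension versions:

```bash
# One-time setup: install the Azure ML CLI v2 extension
az extension add -n ml -y

# Submit the training job defined in training.yml
az ml job create --file training.yml \
  --resource-group <resource-group> --workspace-name <workspace>

# Create the online endpoint and the Kubernetes deployment behind it
az ml online-endpoint create --file endpoint.yml --name <endpoint-name> \
  --resource-group <resource-group> --workspace-name <workspace>
az ml online-deployment create --file deployment.yml --endpoint-name <endpoint-name> --all-traffic \
  --resource-group <resource-group> --workspace-name <workspace>

# Smoke-test the deployed model with the sample request shipped with the example
az ml online-endpoint invoke --name <endpoint-name> --request-file sample-request.json \
  --resource-group <resource-group> --workspace-name <workspace>
```

The example README linked below walks through the same flow in detail; the commands above only show where each YAML file fits.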
10 | 11 | ### Examples 12 | 13 | * [Image Classification Using Scikit-learn](mnist/README.md) (Image Classification) 14 | 15 | This example serves as a "hello world" for training and inference with an AKS-HCI cluster, an on-premise NFS Server, and an Azure Machine Learning workspace, including 16 | * Training with AKS-HCI cluster and on-premise NFS Server 17 | * Register model 18 | * Inference with registered model on AKS-HCI cluster 19 | * Test model 20 | 21 | -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/deployment.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json 2 | type: kubernetes 3 | app_insights_enabled: true 4 | model: azureml:: 5 | code_configuration: 6 | code: 7 | local_path: ./ 8 | scoring_script: score.py 9 | instance_type: 10 | environment: 11 | name: sklearn-mnist-env 12 | version: 1 13 | conda_file: ./model/conda.yml 14 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 15 | instance_count: 1 16 | request_settings: 17 | request_timeout_ms: 1000 18 | max_concurrent_requests_per_instance: 1 19 | max_queue_wait_ms: 1000 20 | resources: 21 | requests: 22 | cpu: "1" 23 | memory: "1Gi" 24 | liveness_probe: 25 | initial_delay: 10 26 | period: 10 27 | timeout: 10 28 | success_threshold: 1 29 | failure_threshold: 1 30 | 31 | -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/endpoint.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json 2 | auth_mode: aml_token 3 | compute: -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/mnist_script/train.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import numpy as np 5 | import glob 6 | 7 | from sklearn.linear_model import LogisticRegression 8 | import joblib 9 | 10 | from azureml.core import Run 11 | from utils import load_data 12 | 13 | # let user feed in 2 parameters, the dataset to mount or download, and the regularization rate of the logistic regression model 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point') 16 | parser.add_argument('--regularization', type=float, dest='reg', default=0.01, help='regularization rate') 17 | args = parser.parse_args() 18 | 19 | data_folder = args.data_folder 20 | print('Data folder:', data_folder) 21 | 22 | # load train and test set into numpy arrays 23 | # note we scale the pixel intensity values to 0-1 (by dividing it with 255.0) so the model can converge faster.
24 | X_train = load_data(glob.glob(os.path.join(data_folder, '**/train-images-idx3-ubyte.gz'), recursive=True)[0], False) / 255.0 25 | X_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-images-idx3-ubyte.gz'), recursive=True)[0], False) / 255.0 26 | y_train = load_data(glob.glob(os.path.join(data_folder, '**/train-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) 27 | y_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) 28 | 29 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep = '\n') 30 | 31 | # get hold of the current run 32 | run = Run.get_context() 33 | 34 | print('Train a logistic regression model with regularization rate of', args.reg) 35 | clf = LogisticRegression(C=1.0/args.reg, solver="liblinear", multi_class="auto", random_state=42) 36 | clf.fit(X_train, y_train) 37 | 38 | print('Predict the test set') 39 | y_hat = clf.predict(X_test) 40 | 41 | # calculate accuracy on the prediction 42 | acc = np.average(y_hat == y_test) 43 | print('Accuracy is', acc) 44 | 45 | run.log('regularization rate', np.float(args.reg)) 46 | run.log('accuracy', np.float(acc)) 47 | 48 | os.makedirs('outputs', exist_ok=True) 49 | # note file saved in the outputs folder is automatically uploaded into experiment record 50 | joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl') 51 | -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/mnist_script/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import gzip 5 | import numpy as np 6 | import struct 7 | 8 | 9 | # load compressed MNIST gz files and return numpy arrays 10 | def load_data(filename, label=False): 11 | with gzip.open(filename) as gz: 12 | struct.unpack('I', gz.read(4)) 13 | n_items = struct.unpack('>I', gz.read(4)) 14 | if not label: 15 | n_rows = struct.unpack('>I', gz.read(4))[0] 16 | n_cols = struct.unpack('>I', gz.read(4))[0] 17 | res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8) 18 | res = res.reshape(n_items[0], n_rows * n_cols) 19 | else: 20 | res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8) 21 | res = res.reshape(n_items[0], 1) 22 | return res 23 | 24 | 25 | # one-hot encode a 1-D array 26 | def one_hot_encode(array, num_of_classes): 27 | return np.eye(num_of_classes)[array.reshape(-1)] 28 | -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/model/conda.yml: -------------------------------------------------------------------------------- 1 | name: mnist-demo-env 2 | 3 | dependencies: 4 | - python=3.6.2 5 | 6 | - pip: 7 | - azureml-dataset-runtime[pandas,fuse]~=1.24.0.0 8 | - azureml-defaults~=1.24.0.0 9 | - scikit-learn==0.22.1 10 | 11 | channels: 12 | - anaconda 13 | - conda-forge 14 | -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/score.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import pickle 5 | import joblib 6 | 7 | def init(): 8 | global model 9 | # AZUREML_MODEL_DIR is an environment variable created during deployment. 
10 | # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) 11 | # For multiple models, it points to the folder containing all deployed models (./azureml-models) 12 | model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'sklearn_mnist_model.pkl') 13 | model = joblib.load(model_path) 14 | 15 | def run(raw_data): 16 | data = np.array(json.loads(raw_data)['data']) 17 | # make prediction 18 | y_hat = model.predict(data) 19 | # you can return any data type as long as it is JSON-serializable 20 | return y_hat.tolist() -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/train_env/conda.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - anaconda 3 | - conda-forge 4 | dependencies: 5 | - python=3.6.2 6 | - pip: 7 | - 'azureml-dataset-runtime[pandas,fuse]~=1.34.0' 8 | - azureml-defaults~=1.34.0 9 | - scikit-learn==0.22.1 -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/training.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: 3 | local_path: mnist_script 4 | command: >- 5 | python train.py 6 | --data-folder /mnist 7 | --regularization 0.5 8 | environment: 9 | name: tutorial-env 10 | version: 1 11 | path: . 12 | conda_file: file:./train_env/conda.yml 13 | docker: 14 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210806.v1 15 | compute: 16 | target: azureml: 17 | instance_type: 18 | experiment_name: mnist-demo 19 | description: Image Classification Using Scikit-learn -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/Cstorage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/Cstorage.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/Inner-compute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/Inner-compute.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/Inner-workspace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/Inner-workspace.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/azureml_log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/azureml_log.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/container.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/container.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/cors.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/cors.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/datastore-set.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/datastore-set.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/datastore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/datastore.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/kubernetes_arc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/kubernetes_arc.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/network.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/sas-token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/sas-token.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/sas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/sas.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/structure.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/studio-arc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/studio-arc.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/studio-s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/studio-s.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/studio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/studio.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/url.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/url.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/vid-img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/vid-img.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/README.md: -------------------------------------------------------------------------------- 1 | # Verify the NFS Setup in AMLArc 2 | 3 | After configuring the NFS Setup in AMLArc, using this notebook to list the contents of the NFS Server on training pods. This could help you to verify, 4 | * The network access between AKS-HCI cluster to the NFS Server 5 | * The config map of NFS Setup used in AMLArc 6 | * Learn how to access the NFS data in training pods 7 | 8 | ## Notebooks 9 | 10 | * [Verify the NFS Setup in AMLArc](Verify_NFS_Setup_in_AMLArc.ipynb) 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "", 3 | "resource_group": "", 4 | "workspace_name": "" 5 | } -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/configure-public-ip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/configure-public-ip.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/create-ubuntu-vm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/create-ubuntu-vm.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/reset-network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/reset-network.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/ssh-status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/ssh-status.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/ubuntu-vm-created.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/ubuntu-vm-created.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/ufw-nfs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/ufw-nfs.png 
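To make the NFS setup described above concrete, here is a minimal sketch of the flow, assuming an Ubuntu VM that the AKS-HCI cluster can reach, that the `nfs-server-setup.sh` script shown further below is run on that VM, and that the placeholders in `mount-config.yaml` (mount path, name, exported path, and server address) have already been filled in with your own values:

```bash
# On the NFS server VM: install the NFS server and export the data directory
sudo bash nfs-server-setup.sh /data <aks-subnet-cidr>

# Still on the VM: confirm the directory is actually exported
showmount -e localhost

# From a machine with kubectl access to the AKS-HCI cluster:
# apply the mount ConfigMap so AMLArc training pods mount the NFS share
kubectl apply -f mount-config.yaml
kubectl get configmap mount-config -n azureml -o yaml
```

Once the ConfigMap is in place, the Verify_NFS_Setup_in_AMLArc.ipynb notebook above can be used to confirm that training pods can list the contents of the share.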
-------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/ufw-ssh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/ufw-ssh.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/verify-nfs-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/verify-nfs-training.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/verify-nfs-vm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/verify-nfs-vm.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/mount-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | mounts.yaml: | 4 | mountPoints: 5 | - mountPath: 6 | mountType: nfs 7 | name: 8 | path: 9 | server: 10 | kind: ConfigMap 11 | metadata: 12 | name: mount-config 13 | namespace: azureml -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/nfs-server-setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script should be executed on Linux Ubuntu Virtual Machine 4 | 5 | DATA_DIRECTORY=${1:-/data} 6 | AKS_SUBNET=${2:-*} 7 | 8 | EXPORT_DIRECTORY="/export/$(basename "$DATA_DIRECTORY")" 9 | 10 | echo "Updating packages" 11 | apt-get -y update 12 | 13 | echo "Installing NFS kernel server" 14 | 15 | apt-get -y install nfs-kernel-server 16 | 17 | echo "Making data directory ${DATA_DIRECTORY}" 18 | mkdir -p ${DATA_DIRECTORY} 19 | 20 | echo "Making new directory to be exported and linked to data directory: ${EXPORT_DIRECTORY}" 21 | mkdir -p ${EXPORT_DIRECTORY} 22 | 23 | echo "Mount binding ${DATA_DIRECTORY} to ${EXPORT_DIRECTORY}" 24 | mount --bind ${DATA_DIRECTORY} ${EXPORT_DIRECTORY} 25 | 26 | echo "Giving 777 permissions to ${EXPORT_DIRECTORY} directory" 27 | chmod 777 ${EXPORT_DIRECTORY} 28 | 29 | parentdir="$(dirname "$EXPORT_DIRECTORY")" 30 | echo "Giving 777 permissions to parent: ${parentdir} directory" 31 | chmod 777 $parentdir 32 | 33 | echo "Appending bound directories into fstab" 34 | echo "${DATA_DIRECTORY} ${EXPORT_DIRECTORY} none bind 0 0" >> /etc/fstab 35 | 36 | echo "Appending localhost and Kubernetes subnet address ${AKS_SUBNET} to exports configuration file" 37 | echo "/export ${AKS_SUBNET}(rw,async,insecure,fsid=0,crossmnt,no_subtree_check)" >> /etc/exports 38 | echo "/export localhost(rw,async,insecure,fsid=0,crossmnt,no_subtree_check)" >> /etc/exports 39 | 40 | nohup service nfs-kernel-server restart -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/nfs_script/test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import glob 4 | 5 | from pathlib import Path 6 | 7 | 8 | # let user feed in 2 parameters, the dataset to mount or download, and the regularization rate of the logistic regression model 9 | parser = 
argparse.ArgumentParser() 10 | parser.add_argument('--nfs-folder', type=str, dest='nfs_folder', help='NFS Server mounting point') 11 | args = parser.parse_args() 12 | 13 | nfs_folder = args.nfs_folder 14 | print('NFS folder:', nfs_folder) 15 | 16 | if (Path(nfs_folder).exists() != True): 17 | raise Exception(f"{nfs_folder} doesn't exist") 18 | if (Path(nfs_folder).is_dir() != True): 19 | raise Exception(f"{nfs_folder} is not a directory") 20 | 21 | from itertools import islice 22 | 23 | space = ' ' 24 | branch = '│ ' 25 | tee = '├── ' 26 | last = '└── ' 27 | 28 | def tree(dir_path: Path, level: int=-1, limit_to_directories: bool=False, 29 | length_limit: int=1000): 30 | """Given a directory Path object print a visual tree structure""" 31 | dir_path = Path(dir_path) # accept string coerceable to Path 32 | files = 0 33 | directories = 0 34 | def inner(dir_path: Path, prefix: str='', level=-1): 35 | nonlocal files, directories 36 | if not level: 37 | return # 0, stop iterating 38 | if limit_to_directories: 39 | contents = [d for d in dir_path.iterdir() if d.is_dir()] 40 | else: 41 | contents = list(dir_path.iterdir()) 42 | pointers = [tee] * (len(contents) - 1) + [last] 43 | for pointer, path in zip(pointers, contents): 44 | if path.is_dir(): 45 | yield prefix + pointer + path.name 46 | directories += 1 47 | extension = branch if pointer == tee else space 48 | yield from inner(path, prefix=prefix+extension, level=level-1) 49 | elif not limit_to_directories: 50 | yield prefix + pointer + path.name 51 | files += 1 52 | print(dir_path.name) 53 | iterator = inner(dir_path, level=level) 54 | for line in islice(iterator, length_limit): 55 | print(line) 56 | if next(iterator, None): 57 | print(f'... length_limit, {length_limit}, reached, counted:') 58 | print(f'\n{directories} directories' + (f', {files} files' if files else '')) 59 | 60 | tree(nfs_folder) -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Sample Notebooks 2 | 3 | After following the setup documents, you can go through the sample notebooks linked below to get a better understanding of how the process works and the possibilities it can unlock: 4 | 5 | * [Image Classification Using Scikit-learn](mnist/MNIST_Training_with_AKS-HCI_Cluster_and_NFS.ipynb) (Image Classification) 6 | 7 | This notebook serves as "hello world" of using for training and inference with AKS-HCI Cluster, on-premise NFS Server, and Azure Machine Learning, including 8 | * Training with AKS-HCI cluster and on-premise NFS Server 9 | * Register model 10 | * Inference with the registered model on AKS-HCI cluster 11 | * Test model 12 | 13 | * [Distributed PyTorch Training with DistributedDataParallel](distributed-cifar10/distributed-pytorch-cifar10.ipynb) (Image Classification) 14 | 15 | This notebook demonstrates an example of Image classification with PyTorch, including, 16 | * Distributed training using PyTorch with 2 worker nodes on AKS-HCI cluster and the training data is stored in on-premise NFS Server 17 | * Register model 18 | * Inference with the registered model on AKS-HCI cluster 19 | * Test model 20 | 21 | * [Object Segmentation with Transfer Learning](object-segmentation-on-azure-stack/object_segmentation-akshci.ipynb) (Object Segmentation) 22 | 23 | Object segmentation using pre-trained Mask R-CNN model on PyTorch. AML pipeline steps are used for data preprocessing. 
**Training data are stored in on-premise NFS server, and the intermediate data are stored in default datastore associated with the ML workspace.** The whole flow includes, 24 | * Use AML pipelines to read training data from on-premise NFS server, do data preprocessing and generate intermediate data to default datastore 25 | * Use AML pipelines to trigger train step on AKS-HCI cluster 26 | * Register model 27 | * Inference with the registered model on AKS-HCI cluster 28 | * Test model 29 | 30 | * [Object Segmentation with Transfer Learning with all data on NFS server](object-segmentation-on-azure-stack/object_segmentation-akshci-nfs.ipynb) (Object Segmentation) 31 | 32 | Object segmentation using pre-trained Mask R-CNN model on PyTorch. AML pipeline steps are used for data preprocessing. **Both the training and intermediate data are stored in on-prem NFS server.** The whole flow includes, 33 | * Use AML pipelines to read training data from on-premise NFS server, do data preprocessing and generate intermediate data to NFS server. 34 | * Use AML pipelines to trigger train step on AKS-HCI cluster 35 | * Register model 36 | * Inference with the registered model on AKS-HCI cluster 37 | * Test model 38 | 39 | * [AML Pipelines with NYC-TAXI-DATA](pipeline/nyc-taxi-data-regression-model-building.ipynb) (Structured Text Data Prediction) 40 | 41 | This notebook demonstrates an example of Structured Text Data Prediction, preparing / preprocessing / training data in **default datastore associated with the ML workspace**. The whole flow includes, 42 | * Download and upload training data to default datastore 43 | * Use AML pipelines to preprocess and train 44 | * Cleanse data in parallel 45 | * Merge cleansed data 46 | * Normalize data 47 | * Transform data 48 | * Split data 49 | * Train model 50 | * Register model 51 | * Inference with the registered model on AKS-HCI cluster 52 | * Test model 53 | 54 | * [AML Pipelines with NYC-TAXI-DATA with all data on NFS server](pipeline/nyc-taxi-data-regression-model-building-nfs.ipynb) (Structured Text Data Prediction) 55 | 56 | This notebook demonstrates an example of Structured Text Data Prediction, preparing / preprocessing / training data on **on-prem NFS server**. The whole flow includes, 57 | * Download and upload training data to default datastore 58 | * Use AML pipelines to preprocess and train 59 | * Cleanse data in parallel 60 | * Merge cleansed data 61 | * Normalize data 62 | * Transform data 63 | * Split data 64 | * Train model 65 | * Register model 66 | * Inference with the registered model on AKS-HCI cluster 67 | * Test model 68 | 69 | * [Model Download and Upload](upload-download-model/AML-model-download-upload.ipynb) -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/README.md: -------------------------------------------------------------------------------- 1 | # Distributed Training on AKS-HCI and on-premise NFS Server 2 | 3 | These sample notebooks guide you through a distributed training workload that trains an ML model on [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset hosted on on-premise NFS Server. We offer two notebooks taking advantage of the most popular deep learning frameworks PyTorch. 
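Before running the notebook, the CIFAR-10 data needs to be present on the NFS share. Below is a minimal sketch of one way to stage it, assuming the NFS export root is `/data` (the default used by `nfs-server-setup.sh`) and that the training script reads the standard `cifar-10-batches-py` layout produced by the official archive; adjust the paths to match your own mount configuration:

```bash
# On the NFS server VM: download the official CIFAR-10 (python version) archive
wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz -P /tmp

# Extract it into the exported data directory; this creates /data/cifar10/cifar-10-batches-py
mkdir -p /data/cifar10
tar -xzf /tmp/cifar-10-python.tar.gz -C /data/cifar10

# Sanity-check that the batch files are visible through the export
ls /data/cifar10/cifar-10-batches-py
```

The mount path configured for the cluster (see the NFS setup docs) is what the notebook passes to the training script as the data folder.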
4 | 5 | ## Notebooks 6 | 7 | * [Distributed PyTorch Training with DistributedDataParallel](distributed-pytorch-cifar10.ipynb) (Image Classification) 8 | 9 | This notebook demonstrates an example of Image classification with PyTorch, including, 10 | * Distributed training using PyTorch with 2 worker nodes on AKS-HCI cluster and the training data is stored in on-premise NFS Server 11 | * Register model 12 | * Inference with the registered model on AKS-HCI cluster 13 | * Test model -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "", 3 | "resource_group": "", 4 | "workspace_name": "" 5 | } -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/pt_deployment.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json 2 | type: kubernetes 3 | app_insights_enabled: true 4 | model: 5 | code_configuration: 6 | code: 7 | local_path: ./ 8 | scoring_script: score_pytorch.py 9 | instance_type: 10 | environment: 11 | name: pytorch-cifar 12 | version: 1 13 | conda_file: ./pytorch-script/conda_dependencies.yml 14 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 15 | instance_count: 1 16 | request_settings: 17 | request_timeout_ms: 1000 18 | max_concurrent_requests_per_instance: 1 19 | max_queue_wait_ms: 1000 20 | resources: 21 | requests: 22 | cpu: "1" 23 | memory: "1Gi" 24 | liveness_probe: 25 | initial_delay: 10 26 | period: 10 27 | timeout: 10 28 | success_threshold: 1 29 | failure_threshold: 1 -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/pt_endpoint.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json 2 | compute: 3 | auth_mode: aml_token -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/pytorch-script/conda_dependencies.yml: -------------------------------------------------------------------------------- 1 | 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.6.2 6 | - pip: 7 | - azureml-defaults 8 | - torch==1.6.0 9 | - torchvision==0.7.0 10 | - future==0.17.1 11 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/score_tf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import tensorflow as tf 4 | import json 5 | 6 | # Called when the deployed service starts 7 | def init(): 8 | global model 9 | 10 | # Get the path where the deployed model can be found. 11 | model_file_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), '001') 12 | # model_file_path = model_path + '/obj_segmentation.pkl' 13 | #model_file_path = "C:\\Users\\v-songshanli\projects\\ashexplore\BIG_FILES\\001" 14 | 15 | model = tf.saved_model.load(model_file_path) 16 | 17 | # Handle requests to the service 18 | def run(data): 19 | try: 20 | # Pick out the text property of the JSON request. 
21 | # This expects a request in the form of {"instances": [...], "signature_name": "..."} 22 | 23 | start_at = time.time() 24 | inputs = json.loads(data) 25 | img_data_list = inputs["instances"] 26 | 27 | signature_name = inputs["signature_name"] 28 | infer = model.signatures[signature_name] 29 | 30 | inputs_tensor = tf.constant(img_data_list, dtype=tf.float32) 31 | 32 | res = infer(tf.constant(inputs_tensor)) 33 | return {"predictions": res["dense_1"].numpy().tolist(), 34 | "elapsed_time": time.time() - start_at} 35 | except Exception as e: 36 | error = str(e) 37 | print(error) 38 | raise e 39 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_0_cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_0_cat.jpg -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_1_ship.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_1_ship.jpg -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_2_ship.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_2_ship.jpg -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_3_plane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_3_plane.jpg -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_4_frog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_4_frog.jpg -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/README.md: -------------------------------------------------------------------------------- 1 | # Image Classification Using Scikit-learn 2 | 3 | Using MNIST as an example, this sample notebook demonstrates how to train a machine learning model using AKS-HCI Arc compute and an on-premise NFS Server. Training data is stored on the on-premise NFS Server. The trained model is then registered and deployed on the AKS-HCI Arc compute for inference. 
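For orientation, the core of the notebook boils down to a training submission followed by a model registration, along the lines of the sketch below (AzureML SDK v1). The compute target name, datastore name, and environment packages are assumptions; only the `--data-folder`/`--regularization` arguments and the `outputs/sklearn_mnist_model.pkl` output path come from `mnist_script/train.py`.

```python
# A minimal sketch (AzureML SDK v1); compute, datastore, and environment details are assumed.
from azureml.core import Workspace, Experiment, Environment, ScriptRunConfig, Dataset
from azureml.core.conda_dependencies import CondaDependencies

ws = Workspace.from_config()
compute_target = ws.compute_targets["akshci-compute"]   # assumed Arc-attached compute name

# MNIST .gz files sitting on the NFS share, exposed through a registered datastore (assumed name)
datastore = ws.datastores["nfsdatastore"]
mnist_data = Dataset.File.from_files(path=(datastore, "mnist/**"))

env = Environment("sklearn-mnist-train")
env.python.conda_dependencies = CondaDependencies.create(
    pip_packages=["azureml-defaults", "scikit-learn==0.22.1", "joblib"]
)

src = ScriptRunConfig(
    source_directory="mnist_script",
    script="train.py",
    arguments=["--data-folder", mnist_data.as_mount(), "--regularization", 0.5],
    compute_target=compute_target,
    environment=env,
)

run = Experiment(ws, "mnist-akshci-nfs").submit(src)
run.wait_for_completion(show_output=True)

# train.py saves the model under outputs/, which is uploaded with the run automatically
model = run.register_model(
    model_name="sklearn_mnist", model_path="outputs/sklearn_mnist_model.pkl"
)
```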
4 | 5 | ## Notebooks 6 | 7 | * [Image Classification Using Scikit-learn](MNIST_Training_with_AKS-HCI_Cluster_and_NFS.ipynb) 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "", 3 | "resource_group": "", 4 | "workspace_name": "" 5 | } -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/confusion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/mnist/confusion.png -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/deployment.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json 2 | type: kubernetes 3 | app_insights_enabled: true 4 | model: 5 | code_configuration: 6 | code: 7 | local_path: ./ 8 | scoring_script: score.py 9 | instance_type: 10 | environment: 11 | name: sklearn-mnist-env 12 | version: 1 13 | conda_file: ./model/conda.yml 14 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 15 | instance_count: 1 16 | request_settings: 17 | request_timeout_ms: 1000 18 | max_concurrent_requests_per_instance: 1 19 | max_queue_wait_ms: 1000 20 | resources: 21 | requests: 22 | cpu: "1" 23 | memory: "1Gi" 24 | liveness_probe: 25 | initial_delay: 10 26 | period: 10 27 | timeout: 10 28 | success_threshold: 1 29 | failure_threshold: 1 30 | 31 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/digit_7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/mnist/digit_7.jpg -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/endpoint.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json 2 | auth_mode: aml_token 3 | compute: -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/mnist_script/train.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import numpy as np 5 | import glob 6 | 7 | from sklearn.linear_model import LogisticRegression 8 | import joblib 9 | 10 | from azureml.core import Run 11 | from utils import load_data 12 | 13 | # let user feed in 2 parameters, the dataset to mount or download, and the regularization rate of the logistic regression model 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point') 16 | parser.add_argument('--regularization', type=float, dest='reg', default=0.01, help='regularization rate') 17 | args = parser.parse_args() 18 | 19 | data_folder = args.data_folder 20 | print('Data folder:', data_folder) 21 | 22 | # load train and test set into numpy arrays 23 | # note we scale the pixel intensity values to 0-1 (by dividing it 
with 255.0) so the model can converge faster. 24 | X_train = load_data(glob.glob(os.path.join(data_folder, '**/train-images-idx3-ubyte.gz'), recursive=True)[0], False) / 255.0 25 | X_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-images-idx3-ubyte.gz'), recursive=True)[0], False) / 255.0 26 | y_train = load_data(glob.glob(os.path.join(data_folder, '**/train-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) 27 | y_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) 28 | 29 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep = '\n') 30 | 31 | # get hold of the current run 32 | run = Run.get_context() 33 | 34 | print('Train a logistic regression model with regularization rate of', args.reg) 35 | clf = LogisticRegression(C=1.0/args.reg, solver="liblinear", multi_class="auto", random_state=42) 36 | clf.fit(X_train, y_train) 37 | 38 | print('Predict the test set') 39 | y_hat = clf.predict(X_test) 40 | 41 | # calculate accuracy on the prediction 42 | acc = np.average(y_hat == y_test) 43 | print('Accuracy is', acc) 44 | 45 | run.log('regularization rate', np.float(args.reg)) 46 | run.log('accuracy', np.float(acc)) 47 | 48 | os.makedirs('outputs', exist_ok=True) 49 | # note file saved in the outputs folder is automatically uploaded into experiment record 50 | joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl') 51 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/mnist_script/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import gzip 5 | import numpy as np 6 | import struct 7 | 8 | 9 | # load compressed MNIST gz files and return numpy arrays 10 | def load_data(filename, label=False): 11 | with gzip.open(filename) as gz: 12 | struct.unpack('I', gz.read(4)) 13 | n_items = struct.unpack('>I', gz.read(4)) 14 | if not label: 15 | n_rows = struct.unpack('>I', gz.read(4))[0] 16 | n_cols = struct.unpack('>I', gz.read(4))[0] 17 | res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8) 18 | res = res.reshape(n_items[0], n_rows * n_cols) 19 | else: 20 | res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8) 21 | res = res.reshape(n_items[0], 1) 22 | return res 23 | 24 | 25 | # one-hot encode a 1-D array 26 | def one_hot_encode(array, num_of_classes): 27 | return np.eye(num_of_classes)[array.reshape(-1)] 28 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/model/conda.yml: -------------------------------------------------------------------------------- 1 | 2 | name: model-env 3 | 4 | dependencies: 5 | - python=3.6.2 6 | 7 | - pip: 8 | - azureml-dataset-runtime[pandas,fuse]~=1.24.0.0 9 | - azureml-defaults~=1.24.0.0 10 | - scikit-learn==0.22.1 11 | 12 | channels: 13 | - anaconda 14 | - conda-forge 15 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/score.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import pickle 5 | import joblib 6 | 7 | def init(): 8 | global model 9 | # AZUREML_MODEL_DIR is an environment variable created during deployment. 
10 | # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) 11 | # For multiple models, it points to the folder containing all deployed models (./azureml-models) 12 | model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'sklearn_mnist_model.pkl') 13 | model = joblib.load(model_path) 14 | 15 | def run(raw_data): 16 | data = np.array(json.loads(raw_data)['data']) 17 | # make prediction 18 | y_hat = model.predict(data) 19 | # you can return any data type as long as it is JSON-serializable 20 | return y_hat.tolist() -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/FudanPed00001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/FudanPed00001.png -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/README.md: -------------------------------------------------------------------------------- 1 | # Object Segmentation: Pipeline Training Run on AKS-HCI cluster and on-premise NFS server 2 | 3 | Using Object Segmentation as an example, this sample notebook demonstrates how to run [Azure Machine Learning Pipelines](https://aka.ms/aml-pipelines) using AKS-HCI cluster and on-premise NFS server 4 | 5 | ## Notebooks 6 | 7 | * [Object Segmentation with Transfer Learning](object_segmentation-akshci.ipynb) (Object Segmentation) 8 | 9 | Object segmentation using pre-trained Mask R-CNN model on PyTorch. AML pipeline steps are used for data preprocessing. **Training data are stored in on-premise NFS server, and the intermediate data are stored in default datastore associated with the ML workspace.** The whole flow includes, 10 | * Use AML pipelines to read training data from on-premise NFS server, do data preprocessing and generate intermediate data to default datastore 11 | * Use AML pipelines to trigger train step on AKS-HCI cluster 12 | * Register model 13 | * Inference with the registered model on AKS-HCI cluster 14 | * Test model 15 | 16 | 17 | * [Object Segmentation with Transfer Learning with all data on NFS server](object_segmentation-akshci-nfs.ipynb) (Object Segmentation) 18 | 19 | Object segmentation using pre-trained Mask R-CNN model on PyTorch. AML pipeline steps are used for data preprocessing. **Both the training and intermediate data are stored in on-prem NFS server.** The whole flow includes, 20 | * Use AML pipelines to read training data from on-premise NFS server, do data preprocessing and generate intermediate data to NFS server. 
21 | * Use AML pipelines to trigger train step on AKS-HCI cluster 22 | * Register model 23 | * Inference with the registered model on AKS-HCI cluster 24 | * Test model 25 | 26 | 27 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/aml_src/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.4.0rc3-gpu 2 | 3 | ARG CONDA_VERSION=4.7.12 4 | ARG PYTHON_VERSION=3.7 5 | ARG AZUREML_SDK_VERSION=1.13.0 6 | ARG INFERENCE_SCHEMA_VERSION=1.1.0 7 | 8 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 9 | ENV PATH /opt/miniconda/bin:$PATH 10 | ENV DEBIAN_FRONTEND=noninteractive 11 | 12 | RUN apt-get update --fix-missing && \ 13 | apt-get install -y wget bzip2 && \ 14 | apt-get install -y fuse && \ 15 | apt-get clean -y && \ 16 | rm -rf /var/lib/apt/lists/* 17 | 18 | RUN useradd --create-home dockeruser 19 | WORKDIR /home/dockeruser 20 | USER dockeruser 21 | 22 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh -O ~/miniconda.sh && \ 23 | /bin/bash ~/miniconda.sh -b -p ~/miniconda && \ 24 | rm ~/miniconda.sh && \ 25 | ~/miniconda/bin/conda clean -tipsy 26 | ENV PATH="/home/dockeruser/miniconda/bin/:${PATH}" 27 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/aml_src/conda-env.yaml: -------------------------------------------------------------------------------- 1 | name: pytorch 2 | dependencies: 3 | - python=3.7 4 | - python-graphviz 5 | - pip: 6 | - azureml-defaults 7 | - azure-storage-blob 8 | - Cython 9 | - torch==1.7.1 10 | - torchvision==0.8.2 11 | - pycocotools 12 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/aml_src/engine.py: -------------------------------------------------------------------------------- 1 | import math 2 | import sys 3 | import time 4 | import torch 5 | 6 | import torchvision.models.detection.mask_rcnn 7 | 8 | from coco_utils import get_coco_api_from_dataset 9 | from coco_eval import CocoEvaluator 10 | import utils 11 | 12 | 13 | def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq): 14 | model.train() 15 | metric_logger = utils.MetricLogger(delimiter=" ") 16 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) 17 | header = 'Epoch: [{}]'.format(epoch) 18 | 19 | lr_scheduler = None 20 | if epoch == 0: 21 | warmup_factor = 1. 
/ 1000 22 | warmup_iters = min(1000, len(data_loader) - 1) 23 | 24 | lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor) 25 | 26 | for images, targets in metric_logger.log_every(data_loader, print_freq, header): 27 | images = list(image.to(device) for image in images) 28 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 29 | 30 | loss_dict = model(images, targets) 31 | 32 | losses = sum(loss for loss in loss_dict.values()) 33 | 34 | # reduce losses over all GPUs for logging purposes 35 | loss_dict_reduced = utils.reduce_dict(loss_dict) 36 | losses_reduced = sum(loss for loss in loss_dict_reduced.values()) 37 | 38 | loss_value = losses_reduced.item() 39 | 40 | if not math.isfinite(loss_value): 41 | print("Loss is {}, stopping training".format(loss_value)) 42 | print(loss_dict_reduced) 43 | sys.exit(1) 44 | 45 | optimizer.zero_grad() 46 | losses.backward() 47 | optimizer.step() 48 | 49 | if lr_scheduler is not None: 50 | lr_scheduler.step() 51 | 52 | metric_logger.update(loss=losses_reduced, **loss_dict_reduced) 53 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 54 | 55 | return metric_logger 56 | 57 | 58 | def _get_iou_types(model): 59 | model_without_ddp = model 60 | if isinstance(model, torch.nn.parallel.DistributedDataParallel): 61 | model_without_ddp = model.module 62 | iou_types = ["bbox"] 63 | if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN): 64 | iou_types.append("segm") 65 | if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN): 66 | iou_types.append("keypoints") 67 | return iou_types 68 | 69 | 70 | @torch.no_grad() 71 | def evaluate(model, data_loader, device): 72 | n_threads = torch.get_num_threads() 73 | # FIXME remove this and make paste_masks_in_image run on the GPU 74 | torch.set_num_threads(1) 75 | cpu_device = torch.device("cpu") 76 | model.eval() 77 | metric_logger = utils.MetricLogger(delimiter=" ") 78 | header = 'Test:' 79 | 80 | coco = get_coco_api_from_dataset(data_loader.dataset) 81 | iou_types = _get_iou_types(model) 82 | coco_evaluator = CocoEvaluator(coco, iou_types) 83 | 84 | for images, targets in metric_logger.log_every(data_loader, 100, header): 85 | images = list(img.to(device) for img in images) 86 | 87 | if torch.cuda.is_available(): 88 | torch.cuda.synchronize() 89 | model_time = time.time() 90 | outputs = model(images) 91 | 92 | outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs] 93 | model_time = time.time() - model_time 94 | 95 | res = {target["image_id"].item(): output for target, output in zip(targets, outputs)} 96 | evaluator_time = time.time() 97 | coco_evaluator.update(res) 98 | evaluator_time = time.time() - evaluator_time 99 | metric_logger.update(model_time=model_time, evaluator_time=evaluator_time) 100 | 101 | # gather the stats from all processes 102 | metric_logger.synchronize_between_processes() 103 | print("Averaged stats:", metric_logger) 104 | coco_evaluator.synchronize_between_processes() 105 | 106 | # accumulate predictions from all images 107 | coco_evaluator.accumulate() 108 | coco_evaluator.summarize() 109 | torch.set_num_threads(n_threads) 110 | return coco_evaluator 111 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/aml_src/obj_segment_step_data_process.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import shutil 5 | 6 | 
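# This pipeline step splits a PennFudan-style dataset into train and test sets: it lists the
# paired PNGImages/PedMasks files under --data-path, randomly samples --test-size pairs for
# the test split, and copies each image/mask pair into the corresponding train or test
# output folder created below.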
parser = argparse.ArgumentParser() 7 | parser.add_argument('--data-path', type=str, help='input data path') 8 | parser.add_argument('--train-split', type=str, help='training split data output path') 9 | parser.add_argument('--test-split', type=str, help='test split data output path') 10 | parser.add_argument('--test-size', type=int, help='test split data size') 11 | 12 | args = parser.parse_args() 13 | types = ["PNGImages", "PedMasks"] 14 | img_mask_list = [[os.path.join(args.data_path, type, file) for file in sorted(os.listdir(os.path.join(args.data_path, type))) ] for type in types] 15 | print("img_mask_list", img_mask_list) 16 | test_indices = random.sample(range(len(img_mask_list[0])), args.test_size) 17 | 18 | test_img_folder, test_mask_folder = [os.path.join(args.test_split, type) for type in types] 19 | train_img_folder, train_mask_folder = [os.path.join(args.train_split, type) for type in types] 20 | 21 | os.makedirs(test_img_folder, exist_ok=True) 22 | os.makedirs(test_mask_folder, exist_ok=True) 23 | os.makedirs(train_img_folder, exist_ok=True) 24 | os.makedirs(train_mask_folder, exist_ok=True) 25 | print("test_img_folder",test_img_folder) 26 | for idx, img_mask in enumerate(zip(*img_mask_list)): 27 | img, mask = img_mask 28 | if idx in test_indices: 29 | print("img path", img) 30 | print("mask path", mask) 31 | shutil.copy(img, test_img_folder) 32 | shutil.copy(mask, test_mask_folder) 33 | else: 34 | shutil.copy(img, train_img_folder) 35 | shutil.copy(mask, train_mask_folder) 36 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/aml_src/transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from torchvision.transforms import functional as F 4 | 5 | 6 | def _flip_coco_person_keypoints(kps, width): 7 | flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] 8 | flipped_data = kps[:, flip_inds] 9 | flipped_data[..., 0] = width - flipped_data[..., 0] 10 | # Maintain COCO convention that if visibility == 0, then x, y = 0 11 | inds = flipped_data[..., 2] == 0 12 | flipped_data[inds] = 0 13 | return flipped_data 14 | 15 | 16 | class Compose(object): 17 | def __init__(self, transforms): 18 | self.transforms = transforms 19 | 20 | def __call__(self, image, target): 21 | for t in self.transforms: 22 | image, target = t(image, target) 23 | return image, target 24 | 25 | 26 | class RandomHorizontalFlip(object): 27 | def __init__(self, prob): 28 | self.prob = prob 29 | 30 | def __call__(self, image, target): 31 | if random.random() < self.prob: 32 | height, width = image.shape[-2:] 33 | image = image.flip(-1) 34 | bbox = target["boxes"] 35 | bbox[:, [0, 2]] = width - bbox[:, [2, 0]] 36 | target["boxes"] = bbox 37 | if "masks" in target: 38 | target["masks"] = target["masks"].flip(-1) 39 | if "keypoints" in target: 40 | keypoints = target["keypoints"] 41 | keypoints = _flip_coco_person_keypoints(keypoints, width) 42 | target["keypoints"] = keypoints 43 | return image, target 44 | 45 | 46 | class ToTensor(object): 47 | def __call__(self, image, target): 48 | image = F.to_tensor(image) 49 | return image, target 50 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "", 3 | "resource_group": "", 4 | "workspace_name": 
"" 5 | } -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/deployment.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json 2 | type: kubernetes 3 | app_insights_enabled: true 4 | model: 5 | code_configuration: 6 | code: 7 | local_path: ./ 8 | scoring_script: score.py 9 | instance_type: 10 | instance_count: 1 11 | environment: azureml:AzureML-pytorch-1.7-ubuntu18.04-py37-cpu-inference:21 -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/endpoint.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json 2 | compute: 3 | auth_mode: aml_token -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/score.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | import torch 5 | 6 | # Called when the deployed service starts 7 | def init(): 8 | global model 9 | global device 10 | 11 | # Get the path where the deployed model can be found. 12 | model_filename = 'obj_segmentation.pkl' 13 | model_path = os.path.join(os.environ['AZUREML_MODEL_DIR'], model_filename) 14 | 15 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 16 | model = torch.load(model_path, map_location=device) 17 | 18 | # Handle requests to the service 19 | def run(data): 20 | try: 21 | start_at = time.time() 22 | inputs = json.loads(data) 23 | img_data_list = inputs["instances"] 24 | img_tensor_list = [torch.tensor(item) for item in img_data_list] 25 | model.eval() 26 | with torch.no_grad(): 27 | predictions = model([item.to(device) for item in img_tensor_list]) 28 | 29 | pred_data_list = [{ 30 | "masks": prediction['masks'][0, 0].mul(255).byte().cpu().numpy().tolist(), 31 | "boxes": prediction['boxes'].numpy().tolist(), 32 | "labels": prediction['labels'].numpy().tolist(), 33 | "scores": prediction['scores'].numpy().tolist(), 34 | 35 | } for prediction in predictions] 36 | 37 | return {"predictions": pred_data_list, 38 | "elapsed_time": time.time() - start_at} 39 | 40 | except Exception as e: 41 | error = str(e) 42 | return error 43 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline Run with AKS-HCI cluster 2 | 3 | These samples demonstrate how to run [Azure Machine Learning Pipelines](https://aka.ms/aml-pipelines) with Arc compute. 4 | 5 | ## Notebooks 6 | 7 | * [AML Pipelines with NYC-TAXI-DATA](nyc-taxi-data-regression-model-building.ipynb) (Structured Text Data Prediction) 8 | 9 | This notebook demonstrates an example of Structured Text Data Prediction, preparing / preprocessing / training data in **default datastore associated with the ML workspace**. 
The whole flow includes, 10 | * Download and upload training data to default datastore 11 | * Use AML pipelines to preprocess and train 12 | * Cleanse data in parallel 13 | * Merge cleansed data 14 | * Normalize data 15 | * Transform data 16 | * Split data 17 | * Train model 18 | * Register model 19 | * Inference with the registered model on AKS-HCI cluster 20 | * Test model 21 | 22 | * [AML Pipelines with NYC-TAXI-DATA with all data on NFS server](nyc-taxi-data-regression-model-building-nfs.ipynb) (Structured Text Data Prediction) 23 | 24 | This notebook demonstrates an example of Structured Text Data Prediction, preparing / preprocessing / training data on **on-prem NFS server**. The whole flow includes, 25 | * Download and upload training data to default datastore 26 | * Use AML pipelines to preprocess and train 27 | * Cleanse data in parallel 28 | * Merge cleansed data 29 | * Normalize data 30 | * Transform data 31 | * Split data 32 | * Train model 33 | * Register model 34 | * Inference with the registered model on AKS-HCI cluster 35 | * Test model 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "", 3 | "resource_group": "", 4 | "workspace_name": "" 5 | } -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/deployment.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json 2 | type: kubernetes 3 | app_insights_enabled: true 4 | model: 5 | code_configuration: 6 | code: 7 | local_path: ./ 8 | scoring_script: score.py 9 | instance_type: 10 | instance_count: 1 11 | environment: 12 | name: taxi-model-env 13 | version: 1 14 | conda_file: ./model/conda.yml 15 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/endpoint.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json 2 | compute: 3 | auth_mode: aml_token -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/images/pipeline-using-dataflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/pipeline/images/pipeline-using-dataflow.png -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/images/pipeline-using-stepsequence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/pipeline/images/pipeline-using-stepsequence.png -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/model/conda.yml: -------------------------------------------------------------------------------- 1 | 2 | name: model-env 3 | 4 | dependencies: 5 | - python=3.6.2 6 | - pip: 7 | - pyarrow 8 | - azureml-defaults 9 | - pandas 10 | - scikit-learn 11 | 
- numpy=1.19.5 12 | 13 | channels: 14 | - anaconda 15 | - conda-forge 16 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/score.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import joblib 5 | 6 | def init(): 7 | global model 8 | # AZUREML_MODEL_DIR is an environment variable created during deployment. 9 | # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) 10 | # For multiple models, it points to the folder containing all deployed models (./azureml-models) 11 | model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'taxi.pkl') 12 | model = joblib.load(model_path) 13 | 14 | def run(raw_data): 15 | data = np.array(json.loads(raw_data)['data']) 16 | # make prediction 17 | scores = model.predict(data) 18 | # you can return any data type as long as it is JSON-serializable 19 | return scores.tolist() -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/prepdata/cleanse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | # Licensed under the MIT license. 3 | 4 | import argparse 5 | import os 6 | from azureml.core import Run 7 | import pandas as pd 8 | 9 | 10 | def get_dict(dict_str): 11 | pairs = dict_str.strip("{}").split("\;") 12 | new_dict = {} 13 | for pair in pairs: 14 | key, value = pair.strip().split(":") 15 | new_dict[key.strip().strip("'")] = value.strip().strip("'") 16 | 17 | return new_dict 18 | 19 | 20 | print("Cleans the input data") 21 | 22 | parser = argparse.ArgumentParser("cleanse") 23 | parser.add_argument('--data-path', type=str, help='input data path') 24 | parser.add_argument("--output_cleanse", type=str, help="cleaned taxi data directory") 25 | parser.add_argument("--useful_columns", type=str, help="useful columns to keep") 26 | #parser.add_argument("--columns", type=str, help="rename column pattern") 27 | parser.add_argument("--columns_key", type=str, help="rename column pattern") 28 | parser.add_argument("--columns_value", type=str, help="rename column pattern") 29 | 30 | args = parser.parse_args() 31 | 32 | print("Argument 1(columns to keep): %s" % str(args.useful_columns.strip("[]").split("\;"))) 33 | print("Argument 2(columns renaming mapping Key): %s" % str(args.columns_key.strip("{}").split("\;"))) 34 | print("Argument 2(columns renaming mapping value): %s" % str(args.columns_value.strip("{}").split("\;"))) 35 | print("Argument 3(output cleansed taxi data path): %s" % args.output_cleanse) 36 | 37 | # These functions ensure that null data is removed from the dataset, 38 | # which will help increase machine learning model accuracy. 
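# The column list and rename mapping arrive as single "\;"-delimited strings (pipeline
# arguments are plain strings), so they are unpacked below into a Python list of columns
# to keep and a {old_name: new_name} dict that is passed to DataFrame.rename.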
39 | 40 | useful_columns = [s.strip().strip("'") for s in args.useful_columns.strip("[]").split("\;")] 41 | columns_key = [s.strip().strip("'") for s in args.columns_key.strip("[]").split("\;")] 42 | columns_value = [s.strip().strip("'") for s in args.columns_value.strip("[]").split("\;")] 43 | 44 | columns = {key: value for key, value in zip(columns_key, columns_value)} 45 | 46 | 47 | raw_df = pd.read_csv(args.data_path) 48 | new_df = (raw_df 49 | .dropna(how='all') 50 | .rename(columns=columns))[useful_columns] 51 | 52 | new_df.reset_index(inplace=True, drop=True) 53 | 54 | if not (args.output_cleanse is None): 55 | os.makedirs(args.output_cleanse, exist_ok=True) 56 | print("%s created" % args.output_cleanse) 57 | path = args.output_cleanse + "/processed.csv" 58 | write_df = new_df.to_csv(path) 59 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/prepdata/filter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | 5 | print("Filters out coordinates for locations that are outside the city border.", 6 | "Chain the column filter commands within the filter() function", 7 | "and define the minimum and maximum bounds for each field.") 8 | 9 | parser = argparse.ArgumentParser("filter") 10 | parser.add_argument('--data-path', type=str, help='input data path') 11 | parser.add_argument("--output_filter", type=str, help="filter out out of city locations") 12 | 13 | args = parser.parse_args() 14 | 15 | print("Argument (output filtered taxi data path): %s" % args.output_filter) 16 | 17 | # These functions filter out coordinates for locations that are outside the city border. 18 | 19 | # Filter out coordinates for locations that are outside the city border. 
20 | # Chain the column filter commands within the filter() function 21 | # and define the minimum and maximum bounds for each field 22 | 23 | combined_df = pd.read_csv(args.data_path + "/processed.csv") 24 | 25 | combined_df = combined_df.astype({"pickup_longitude": 'float64', "pickup_latitude": 'float64', 26 | "dropoff_longitude": 'float64', "dropoff_latitude": 'float64'}) 27 | 28 | latlong_filtered_df = combined_df[(combined_df.pickup_longitude <= -73.72) & 29 | (combined_df.pickup_longitude >= -74.09) & 30 | (combined_df.pickup_latitude <= 40.88) & 31 | (combined_df.pickup_latitude >= 40.53) & 32 | (combined_df.dropoff_longitude <= -73.72) & 33 | (combined_df.dropoff_longitude >= -74.72) & 34 | (combined_df.dropoff_latitude <= 40.88) & 35 | (combined_df.dropoff_latitude >= 40.53)] 36 | 37 | latlong_filtered_df.reset_index(inplace=True, drop=True) 38 | 39 | if not (args.output_filter is None): 40 | os.makedirs(args.output_filter, exist_ok=True) 41 | print("%s created" % args.output_filter) 42 | path = args.output_filter + "/processed.csv" 43 | write_df = latlong_filtered_df.to_csv(path) 44 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/prepdata/merge.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | 5 | print("Merge Green and Yellow taxi data") 6 | 7 | parser = argparse.ArgumentParser("merge") 8 | parser.add_argument("--output_merge", type=str, help="green and yellow taxi data merged") 9 | parser.add_argument("--green_data_path", type=str, help="green data path") 10 | parser.add_argument("--yellow_data_path", type=str, help="yellow data path") 11 | 12 | args = parser.parse_args() 13 | print("Argument (output merge taxi data path): %s" % args.output_merge) 14 | 15 | 16 | green_df = pd.read_csv(args.green_data_path + "/processed.csv") 17 | yellow_df = pd.read_csv(args.yellow_data_path + "/processed.csv") 18 | 19 | # Appending yellow data to green data 20 | combined_df = green_df.append(yellow_df, ignore_index=True) 21 | combined_df.reset_index(inplace=True, drop=True) 22 | 23 | if not (args.output_merge is None): 24 | os.makedirs(args.output_merge, exist_ok=True) 25 | print("%s created" % args.output_merge) 26 | path = args.output_merge + "/processed.csv" 27 | write_df = combined_df.to_csv(path) 28 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/prepdata/normalize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | import pandas as pd 5 | 6 | print("Replace undefined values to relavant values and rename columns to meaningful names") 7 | 8 | parser = argparse.ArgumentParser("normalize") 9 | parser.add_argument('--data-path', type=str, help='input data path') 10 | parser.add_argument("--output_normalize", type=str, help="replaced undefined values and renamed columns") 11 | 12 | args = parser.parse_args() 13 | 14 | print("Argument (output normalized taxi data path): %s" % args.output_normalize) 15 | 16 | combined_converted_df = pd.read_csv(args.data_path + "/processed.csv") 17 | 18 | # These functions replace undefined values and rename to use meaningful names. 
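# Concretely: store_forward values of "0" (and missing values) are mapped to "N", distance
# values of ".00" (and missing values) are mapped to 0 and cast to float, and the pickup/
# dropoff datetime columns are split into separate date and time columns below.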
19 | replaced_stfor_vals_df = (combined_converted_df.replace({"store_forward": "0"}, {"store_forward": "N"}) 20 | .fillna({"store_forward": "N"})) 21 | 22 | replaced_distance_vals_df = (replaced_stfor_vals_df.replace({"distance": ".00"}, {"distance": 0}) 23 | .fillna({"distance": 0})) 24 | 25 | normalized_df = replaced_distance_vals_df.astype({"distance": 'float64'}) 26 | 27 | temp = pd.DatetimeIndex(normalized_df["pickup_datetime"]) 28 | normalized_df["pickup_date"] = temp.date 29 | normalized_df["pickup_time"] = temp.time 30 | 31 | temp = pd.DatetimeIndex(normalized_df["dropoff_datetime"]) 32 | normalized_df["dropoff_date"] = temp.date 33 | normalized_df["dropoff_time"] = temp.time 34 | 35 | del normalized_df["pickup_datetime"] 36 | del normalized_df["dropoff_datetime"] 37 | 38 | normalized_df.reset_index(inplace=True, drop=True) 39 | 40 | if not (args.output_normalize is None): 41 | os.makedirs(args.output_normalize, exist_ok=True) 42 | print("%s created" % args.output_normalize) 43 | path = args.output_normalize + "/processed.csv" 44 | write_df = normalized_df.to_csv(path) 45 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/prepdata/transform.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | 5 | print("Transforms the renamed taxi data to the required format") 6 | 7 | 8 | parser = argparse.ArgumentParser("transform") 9 | parser.add_argument('--data-path', type=str, help='input data path') 10 | parser.add_argument("--output_transform", type=str, help="transformed taxi data") 11 | 12 | args = parser.parse_args() 13 | 14 | print("Argument 2(output final transformed taxi data): %s" % args.output_transform) 15 | 16 | # These functions transform the renamed data to be used finally for training. 17 | 18 | # Split the pickup and dropoff date further into the day of the week, day of the month, and month values. 19 | # To get the day of the week value, use the derive_column_by_example() function. 20 | # The function takes an array parameter of example objects that define the input data, 21 | # and the preferred output. The function automatically determines your preferred transformation. 22 | # For the pickup and dropoff time columns, split the time into the hour, minute, and second by using 23 | # the split_column_by_example() function with no example parameter. After you generate the new features, 24 | # use the drop_columns() function to delete the original fields as the newly generated features are preferred. 25 | # Rename the rest of the fields to use meaningful descriptions. 
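# Note: the functions mentioned above (derive_column_by_example, split_column_by_example,
# drop_columns) are not used in this script; the equivalent feature engineering is done
# below with plain pandas datetime accessors (.dt.dayofweek, .dt.month, .dt.day, .dt.hour,
# .dt.minute, .dt.second) followed by del on the original columns.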
26 | 27 | normalized_df = pd.read_csv(args.data_path + "/processed.csv") 28 | normalized_df = normalized_df.astype({"pickup_date": 'datetime64[ns]', "dropoff_date": 'datetime64[ns]', 29 | "pickup_time": 'datetime64[us]', "dropoff_time": 'datetime64[us]', 30 | "distance": 'float64', "cost": 'float64'}) 31 | 32 | normalized_df["pickup_weekday"] = normalized_df["pickup_date"].dt.dayofweek 33 | normalized_df["pickup_month"] = normalized_df["pickup_date"].dt.month 34 | normalized_df["pickup_monthday"] = normalized_df["pickup_date"].dt.day 35 | 36 | normalized_df["dropoff_weekday"] = normalized_df["dropoff_date"].dt.dayofweek 37 | normalized_df["dropoff_month"] = normalized_df["dropoff_date"].dt.month 38 | normalized_df["dropoff_monthday"] = normalized_df["dropoff_date"].dt.day 39 | 40 | normalized_df["pickup_hour"] = normalized_df["pickup_time"].dt.hour 41 | normalized_df["pickup_minute"] = normalized_df["pickup_time"].dt.minute 42 | normalized_df["pickup_second"] = normalized_df["pickup_time"].dt.second 43 | 44 | normalized_df["dropoff_hour"] = normalized_df["dropoff_time"].dt.hour 45 | normalized_df["dropoff_minute"] = normalized_df["dropoff_time"].dt.minute 46 | normalized_df["dropoff_second"] = normalized_df["dropoff_time"].dt.second 47 | 48 | # Drop the pickup_date, dropoff_date, pickup_time, dropoff_time columns because they're 49 | # no longer needed (granular time features like hour, 50 | # minute and second are more useful for model training). 51 | del normalized_df["pickup_date"] 52 | del normalized_df["dropoff_date"] 53 | del normalized_df["pickup_time"] 54 | del normalized_df["dropoff_time"] 55 | 56 | # Before you package the dataset, run two final filters on the dataset. 57 | # To eliminate incorrectly captured data points, 58 | # filter the dataset on records where both the cost and distance variable values are greater than zero. 59 | # This step will significantly improve machine learning model accuracy, 60 | # because data points with a zero cost or distance represent major outliers that throw off prediction accuracy. 
61 | 62 | final_df = normalized_df[(normalized_df.distance > 0) & (normalized_df.cost > 0)] 63 | final_df.reset_index(inplace=True, drop=True) 64 | 65 | # Writing the final dataframe to use for training in the following steps 66 | if not (args.output_transform is None): 67 | os.makedirs(args.output_transform, exist_ok=True) 68 | print("%s created" % args.output_transform) 69 | path = args.output_transform + "/processed.csv" 70 | write_df = final_df.to_csv(path) 71 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/trainmodel/train_step.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from sklearn.linear_model import LinearRegression 3 | import os 4 | import pandas as pd 5 | 6 | import joblib 7 | 8 | def train(): 9 | 10 | train_set = pd.read_csv(args.train_data_path + "/processed.csv") 11 | test_set = pd.read_csv(args.test_data_path + "/processed.csv") 12 | 13 | selected_columns = ['pickup_weekday', 'pickup_hour', 'distance', 'passengers', 'vendor', 'cost'] 14 | train_set = train_set[selected_columns] 15 | test_set = test_set[selected_columns] 16 | 17 | train_features = train_set.drop("cost", axis=1) 18 | train_labels = train_set["cost"].copy() 19 | lr = LinearRegression() 20 | lr.fit(train_features, train_labels) 21 | 22 | filename = os.path.join('outputs', 'taxi.pkl') 23 | 24 | joblib.dump(lr, filename) 25 | 26 | test_features = test_set.drop("cost", axis=1)[:3] 27 | test_labels = test_set["cost"].copy() 28 | preds = lr.predict(test_features) 29 | 30 | print("preds", preds) 31 | 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser("split") 35 | parser.add_argument("--train_data_path", type=str, help="train data path") 36 | parser.add_argument("--test_data_path", type=str, help="test data path") 37 | args = parser.parse_args() 38 | train() -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/trainmodel/train_test_split.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | from sklearn.model_selection import train_test_split 5 | 6 | def write_output(df, path): 7 | os.makedirs(path, exist_ok=True) 8 | print("%s created" % path) 9 | df.to_csv(path + "/processed.csv") 10 | 11 | 12 | print("Split the data into train and test") 13 | 14 | parser = argparse.ArgumentParser("split") 15 | parser.add_argument('--data-path', type=str, help='input data path') 16 | parser.add_argument("--output_split_train", type=str, help="output split train data") 17 | parser.add_argument("--output_split_test", type=str, help="output split test data") 18 | 19 | args = parser.parse_args() 20 | 21 | print("Argument 1(output training data split path): %s" % args.output_split_train) 22 | print("Argument 2(output test data split path): %s" % args.output_split_test) 23 | 24 | # These functions splits the input features and labels into test and train data 25 | # Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-auto-train-models for more detail 26 | 27 | 28 | transformed_df = pd.read_csv(args.data_path + "/processed.csv") 29 | output_split_train, output_split_test = train_test_split(transformed_df, test_size=0.2, random_state=223) 30 | output_split_train.reset_index(inplace=True, drop=True) 31 | output_split_test.reset_index(inplace=True, drop=True) 32 | 33 | if not 
(args.output_split_train is None and 34 | args.output_split_test is None): 35 | write_output(output_split_train, args.output_split_train) 36 | write_output(output_split_test, args.output_split_test) 37 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/test_set.csv: -------------------------------------------------------------------------------- 1 | ,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,cost,distance,dropoff_latitude,dropoff_longitude,passengers,pickup_latitude,pickup_longitude,store_forward,vendor,pickup_weekday,pickup_month,pickup_monthday,dropoff_weekday,dropoff_month,dropoff_monthday,pickup_hour,pickup_minute,pickup_second,dropoff_hour,dropoff_minute,dropoff_second 2 | 0,3823,3891,3891,3933,3933,10.5,1.49,40.67766953,-73.96222687,1,40.6884613,-73.98020935,N,2,1,1,26,1,1,26,15,55,16,16,12,33 3 | 1,2408,2444,2444,2467,2467,20,6.66,40.73447418,-73.99212646,1,40.79590988,-73.93545532,N,2,6,1,24,6,1,24,23,20,56,23,36,9 4 | 2,2722,2766,2766,2792,2792,7.5,1.42,40.70057297,-73.99156189,2,40.68553925,-73.99443054,N,2,5,1,9,5,1,9,17,0,20,17,8,52 5 | 3,600,609,609,613,613,6,0.89,40.80518341,-73.94141388,1,40.81178665,-73.95517731,N,2,1,1,5,1,1,5,8,24,18,8,30,25 6 | 4,2181,2216,2216,2235,2235,5.5,0.98,40.71474075,-73.9499054,1,40.71577454,-73.96444702,N,2,4,1,1,4,1,1,22,44,16,22,48,52 7 | 5,1542,1567,1567,1580,1580,5.5,1,40.66854095,-73.99355316,1,40.67189789,-73.98403168,N,2,6,1,10,6,1,10,18,20,31,18,24,45 8 | 6,1389,1413,1413,1425,1425,23.5,6.59,40.7955246,-73.97167206,6,40.7430687,-73.91899109,N,2,0,1,18,0,1,18,6,48,55,7,16,50 9 | 7,653,663,663,667,667,7,1.4,40.7974472,-73.97264862,1,40.79085159,-73.95359039,N,1,5,1,2,5,1,2,14,49,24,14,55,57 10 | 8,2406,2442,2442,2465,2465,4.5,0.66,40.80567551,-73.94690704,1,40.81209183,-73.94241333,N,2,4,1,15,4,1,15,11,6,19,11,9,34 11 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/upload-download-model/AML-model-download-upload.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Upload Downloaded AML Model Files To Azure Blobs\n", 8 | "\n", 9 | "In this notebook, you will download a model from azure machine learning workspace to your local envrionment, then upload the model files as azure storage blobs.\n", 10 | "\n", 11 | "## Prerequisites\n", 12 | "* Azure Machine Learning Workspace\n", 13 | "If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [AZML-SDK-INSTALL](https://docs.microsoft.com/en-us/python/api/overview/azure/ml/install?view=azure-ml-py) to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`.\n", 14 | "* An registered AML machine learning model. For how to train and register model, please see [pytorch-cifar10-train](https://github.com/Azure/AML-Kubernetes/tree/master/docs/AKS-HCI/notebooks/distributed-cifar10/distributed-pytorch-cifar10.ipynb)\n", 15 | "* Azure storage blob client library. For more details, please see [here](https://docs.microsoft.com/en-us/python/api/overview/azure/storage-blob-readme?view=azure-python). 
" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from azureml.core import Workspace\n", 25 | "import os" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Initialize workspace\n", 33 | "\n", 34 | "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`. \n", 35 | "\n", 36 | "If you haven't done already please go to `config.json` file and fill in your workspace information." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "ws = Workspace.from_config()\n", 46 | "print('Workspace name: ' + ws.name, \n", 47 | " 'Azure region: ' + ws.location, \n", 48 | " 'Subscription id: ' + ws.subscription_id, \n", 49 | " 'Resource group: ' + ws.resource_group, sep='\\n')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Download model from AML Workspace" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "model_path = 'cifar10model'\n", 66 | "model_name = \"cifar10torch\"\n", 67 | "\n", 68 | "ws.models[model_name].download(target_dir=model_path, exist_ok=True)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Upload model files to Azure storage blobs" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "from azure.storage.blob import BlobServiceClient, ContainerClient\n", 85 | " \n", 86 | "connection_string = \"\"\n", 87 | "blob_service_client = BlobServiceClient.from_connection_string(connection_string)\n", 88 | " \n", 89 | "container_name = \"pytorchmodel\"\n", 90 | "\n", 91 | "container_client = blob_service_client.get_container_client(container_name)\n", 92 | "\n", 93 | "try:\n", 94 | " container_properties = container_client.get_container_properties()\n", 95 | "except Exception as e:\n", 96 | " container_client.create_container()\n", 97 | "\n", 98 | "for root, dirs, files in os.walk(model_path):\n", 99 | " for file in files:\n", 100 | " source_file = os.path.join(root, file)\n", 101 | " blob_name = source_file\n", 102 | " blob_client = container_client.get_blob_client(blob_name)\n", 103 | " if blob_client.exists():\n", 104 | " blob_client.delete_blob()\n", 105 | " with open(source_file, \"rb\") as data:\n", 106 | " blob_client.upload_blob(data, blob_type=\"BlockBlob\")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Now you should see the files are uploaded to Blob Storage" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "interpreter": { 119 | "hash": "fc402497f0168b24575e2ffafe64cd34c507b9a7fab971a93b09782ae565c5c6" 120 | }, 121 | "kernelspec": { 122 | "display_name": "Python 3.8.3 64-bit", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | 
"version": "3.8.3" 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 4 140 | } 141 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/upload-download-model/README.md: -------------------------------------------------------------------------------- 1 | # Upload Downloaded AML Model Files To Azure Blobs 2 | 3 | These samples demonstrate how to download AML model files and upload to Azure Blob 4 | 5 | ## Notebooks 6 | 7 | * [Model Download and Upload](AML-model-download-upload.ipynb) 8 | 9 | This notebook demonstrates an example of downloading AML model files then upload to Azure Blob 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/upload-download-model/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "", 3 | "resource_group": "", 4 | "workspace_name": "" 5 | } -------------------------------------------------------------------------------- /docs/AKS-HCI/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting for AzureML Training on Azure Stack Hub Kubernetes Cluster and Storage 2 | 3 | ## AzureML Dataset and Datastore Issues 4 | 5 | * Known limitations: 6 | 7 | * AML Tabular Datasets are not currenly supported on ASH blob storage 8 | * AML Dataset downloading is not yet stable. PLease use mounting to access your files during training 9 | 10 | * If you can't access your datastore (mounting or uploading): Please check if your generated SAS token for your ASH storage container is still valid and not expired. If unsure you can try to repeat the instruction from step 5 of [this document](Train-AzureArc.md#create-and-configure-azure-stack-hubs-storage-account). 11 | 12 | ## ASH Kubernetes Cluster Related Issues 13 | 14 | * Attaching ASH Kubernetes Cluster to AzureML Workspace Failed 15 | 16 | * Make sure your Kubernetes cluster is connected to Azure using Azure Arc. Also, make sure the version of your Kubernetes cluster is [supported](https://docs.microsoft.com/en-us/azure/aks/supported-kubernetes-versions#kubernetes-version-support-policy). Please make sure you are using the latest Arc agent according to pre-requisites of [this doc](https://github.com/Azure/azure-arc-kubernetes-preview/blob/master/docs/k8s-extensions.md#pre-requisites). You can find most of your information from the Azure portal: 17 | 18 |

19 | 20 |

21 | 22 | * Please make sure the latest Arc extensions are installed and Arc connections are created as described in the pre-requisites of [this doc](https://github.com/Azure/azure-arc-kubernetes-preview/blob/master/docs/k8s-extensions.md#pre-requisites). 23 | 24 | 25 | You may also run the following kubectl commands against one of the master nodes of your cluster to check if your cluster is properly attached to Azure via Azure Arc: 26 | 27 |
 kubectl get ns 
28 | 29 | You should see "azure-arc" is one of the namespaces. 30 | 31 |
 kubectl get pods -n azure-arc 
32 | You should see all the pods in "Running" status. 33 | 34 | 35 | ## AzureML Run Issues 36 | 37 | * No Progress on AzureML Experiment Runs 38 | 39 | Currently, there is a limitation on the number of pods that can run simultaneously on a single node: there can't be more than 1 pending pod per node. This means that if there are more pending pods than worker nodes, all the pending pods will remain pending indefinitely. For example, if your training workloads together send 5 pods to your Kubernetes cluster and you only have 4 worker nodes, your training pods will never get scheduled by the Kubernetes agent. If you face this issue, please cancel all of your non-progressing runs in your AzureML workspace and retry accordingly. If you run distributed training, you may need to reduce the node_count value in the Run Configuration. 40 | 41 | * Out of memory issue 42 | 43 | If your training job fails without any apparent reason, it could be because your Kubernetes nodes do not have sufficient memory. Insufficient node memory can also explain cases in which training succeeds for a single epoch but fails when run for multiple epochs. Try either increasing your nodes' memory or optimizing your training code to be less memory intensive. 44 | 45 | * Bugs in your scripts 46 | 47 | 48 | These issues are relatively easy to debug. You can check the run logs in your AzureML Workspace for information about environment image creation, runtime errors, outputs generated by your scripts, and so on. This way you can pinpoint why your training workload failed. Here is a snapshot: 49 | 50 |

51 | 52 |

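For the "No Progress on AzureML Experiment Runs" case above, a quick way to compare the number of pending training pods against the number of worker nodes is sketched below. This is a hedged example only; the `default` namespace is an assumption and should be replaced with the namespace your attached compute actually uses.

```bash
# List training pods stuck in Pending (replace "default" with the namespace of your attached compute)
kubectl get pods -n default --field-selector=status.phase=Pending

# Count the worker nodes available to schedule those pods
kubectl get nodes --no-headers | wc -l
```

If the number of pending pods exceeds the number of nodes, cancel the non-progressing runs or reduce the node count as described above.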
53 | 54 | 55 | ## Other Known Limitations 56 | 57 | Please check out our other [known limitations](../limitations-and-knownIssues.md) and [troubleshooting](../troubleshooting.md) docs. 58 | -------------------------------------------------------------------------------- /docs/AKS-HCI/video/kfserving_tf_blob_structure.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/video/kfserving_tf_blob_structure.mp4 -------------------------------------------------------------------------------- /docs/application-gateway-ingress-controller.md: -------------------------------------------------------------------------------- 1 | # Tutorial 2 | 3 | These tutorials help illustrate how to integrate [Azure Application Gateway](https://azure.microsoft.com/en-us/services/application-gateway/) with AzureML extension over HTTP or HTTPS. 4 | 5 | ## Table of Contents 6 | 7 | - [Prerequisites](#prerequisites) 8 | - [Deploy AzureML extension](#deploy-azureml-extension) 9 | - [Expose services over HTTP](#expose-services-over-http) 10 | - [Expose services over HTTPS](#expose-services-over-https) 11 | 12 | ## Prerequisites 13 | 14 | - Install the latest k8s-extension and ml cli. 15 | - `az extension add -n k8s-extension --upgrade` 16 | - `az extension add -n ml --upgrade` 17 | - Setup Application Gateway. 18 | - [**Greenfield Deployment**](https://docs.microsoft.com/en-us/azure/application-gateway/tutorial-ingress-controller-add-on-new): If you are starting from scratch, refer to these instructions. 19 | - [**Brownfield Deployment**](https://docs.microsoft.com/en-us/azure/application-gateway/tutorial-ingress-controller-add-on-existing): If you have an existing AKS cluster and Application Gateway, refer to these instructions. 20 | - If you want to use HTTPS on this application, you will need a x509 certificate and its private key. 21 | 22 | ## Deploy AzureML extension 23 | 24 | [Deploy extension](https://github.com/Azure/AML-Kubernetes/blob/master/docs/deploy-extension.md#azureml-extension-deployment-scenarios) with `inferenceRouterServiceType=ClusterIP` and `allowInsecureConnections=True`, so that the Application gateway can handle TLS termination by itself instead of handing it over to azureml-fe (azureml inference router created by extension) when service is exposed over HTTPS. 25 | 26 | 27 | ## Expose services over HTTP 28 | 29 | In order to expose the azureml-fe we will using the following ingress resource: 30 | 31 | ```yaml 32 | apiVersion: networking.k8s.io/v1 33 | kind: Ingress 34 | metadata: 35 | name: azureml-fe 36 | namespace: azureml 37 | spec: 38 | ingressClassName: azure-application-gateway 39 | rules: 40 | - http: 41 | paths: 42 | - path: / 43 | backend: 44 | service: 45 | name: azureml-fe 46 | port: 47 | number: 80 48 | pathType: Prefix 49 | ``` 50 | 51 | This ingress will expose the `azureml-fe` service and the selected deployment as a default backend of the Application Gateway. 52 | 53 | Save the above ingress resource as `ing-azureml-fe.yaml`. 54 | 55 | 1. Deploy `ing-azureml-fe.yaml` by running: 56 | 57 | ```bash 58 | kubectl apply -f ing-azureml-fe.yaml 59 | ``` 60 | 61 | 2. Check the log of the ingress controller for deployment status. 62 | 63 | 3. Now the `azureml-fe` application should be available. You can check this by visiting the public address of the Application Gateway. 64 | 65 | 4. 
[Create an inference job and invoke](https://github.com/Azure/AML-Kubernetes/blob/master/docs/simple-flow.md). 66 | 67 | *NOTE:* Replace the ip in scoring_uri with public address of the Application Gateway before invoking. 68 | 69 | ## Expose services over HTTPS 70 | 71 | 1. Before deploying ingress, you need to create a kubernetes secret to host the certificate and private key. You can create a kubernetes secret by running 72 | 73 | ```bash 74 | kubectl create secret tls -n azureml --key --cert 75 | ``` 76 | 77 | 2. Define the following ingress. In the ingress, specify the name of the secret in the `secretName` section. 78 | 79 | ```yaml 80 | apiVersion: networking.k8s.io/v1 81 | kind: Ingress 82 | metadata: 83 | name: azureml-fe 84 | namespace: azureml 85 | spec: 86 | ingressClassName: azure-application-gateway 87 | tls: 88 | - hosts: 89 | - 90 | secretName: 91 | rules: 92 | - host: 93 | http: 94 | paths: 95 | - path: / 96 | backend: 97 | service: 98 | name: azureml-fe 99 | port: 100 | number: 80 101 | pathType: Prefix 102 | ``` 103 | 104 | *NOTE:* Replace `` and `` in the above Ingress Resource with the domain pointing to the Application Gateway and the name of your secret. Store the above Ingress Resource in a file name `ing-azureml-fe-tls.yaml`. 105 | 106 | 1. Deploy ing-azureml-fe-tls.yaml by running 107 | 108 | ```bash 109 | kubectl apply -f ing-azureml-fe-tls.yaml 110 | ``` 111 | 112 | 2. Check the log of the ingress controller for deployment status. 113 | 114 | 3. Now the `azureml-fe` application will be available on HTTPS. You can check this by visiting the public address of the Application Gateway. 115 | 116 | 4. [Create an inference job and invoke](https://github.com/Azure/AML-Kubernetes/blob/master/docs/simple-flow.md). 117 | 118 | *NOTE:* Replace the protocol and ip in scoring_uri with https and domain pointing to the Application Gateway before invoking. 119 | -------------------------------------------------------------------------------- /docs/azureml-aks-ta-support.md: -------------------------------------------------------------------------------- 1 | # AzureML access to AKS clusters with special configurations 2 | 3 | Built-upon [AKS Trusted Access feature](https://learn.microsoft.com/azure/aks/trusted-access-feature), AzureML now supports access to AKS clusters with following special configurations: 4 | - AKS cluster with local account disabled 5 | - AKS cluster with authorized IP range 6 | - Private AKS with public FQDN configuration 7 | 8 | 📣 This feature has been deployed in the public cloud(AzureCloud). AzureUSGovernment, AzureChinaCloud and AirGap clouds have not enabled this feature. 9 | 10 | Once the feature is deplyed to your regions, you could (re/)attach your compute to enable it; you can verify if the feature has been enabled on your AKS cluster with following steps: 11 | - Verify that ```Microsoft.MachineLearningServices/workspaces/mlworkload``` role binding is created in AKS cluster. **Note**: AzureML role binding is per workspace, if your AKS cluster is shared among multiple workspace, you should have AzureML role binding for each workspace. 12 | ```shell 13 | az aks trustedaccess rolebinding list --resource-group --cluster-name 14 | ``` 15 | > **Notes**: 16 | > 17 | > * If you have any existing compute targets created before AzureML role binding was created, those compute targets will not work with AKS cluster with above special configurations. Please detach those existing compute targets to avoid any issues. 
18 | > * This role binding does not work with legacy AksCompute (AKS inference cluster). -------------------------------------------------------------------------------- /docs/deploy-on-ocp.md: -------------------------------------------------------------------------------- 1 | # Deploy AzureML extension on OpenShift Container Platform 2 | 3 | Azure Arc enabled ML supports both Azure RedHat OpenShift Service (ARO) and OpenShift Container Platform (OCP). 4 | 5 | ## Prerequisites 6 | 7 | An ARO or OCP Kubernetes cluster is up and running. 8 | 9 | * To setup ARO Kubernetes cluster on Azure, please follow instruction [here](https://docs.microsoft.com/azure/openshift/tutorial-create-cluster) 10 | * to setup OCP Kubernetes clsuter, please follow instructure on [RedHat website](https://docs.openshift.com/container-platform/4.6/installing/installing_platform_agnostic/installing-platform-agnostic.html). 11 | 12 | ## Disable Security Enhanced Linux (SELinux) 13 | 14 | [AzureML dataset](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-with-datasets), usually used in ML training jobs, is not supported on machines with SELinux enabled. Therefore, to use AzureML dataset, please make sure `selinux` is disabled on workers for AzureML usage. 15 | 16 | ## Privileged setup for ARO and OCP 17 | 18 | For AzureML extension deployment on ARO or OCP cluster, grant privileged access to AzureML service accounts, run ```oc edit scc privileged``` command, and add following service accounts under "users:": 19 | 20 | * ```system:serviceaccount:azure-arc:azure-arc-kube-aad-proxy-sa``` 21 | * ```system:serviceaccount:azureml:{EXTENSION-NAME}-kube-state-metrics``` 22 | * ```system:serviceaccount:azureml:cluster-status-reporter``` 23 | * ```system:serviceaccount:azureml:prom-admission``` 24 | * ```system:serviceaccount:azureml:default``` 25 | * ```system:serviceaccount:azureml:prom-operator``` 26 | * ```system:serviceaccount:azureml:csi-blob-node-sa``` 27 | * ```system:serviceaccount:azureml:csi-blob-controller-sa``` 28 | * ```system:serviceaccount:azureml:load-amlarc-selinux-policy-sa``` 29 | * ```system:serviceaccount:azureml:azureml-fe``` 30 | * ```system:serviceaccount:azureml:prom-prometheus``` 31 | * ```system:serviceaccount:{KUBERNETES-COMPUTE-NAMESPACE}:default``` 32 | * ```system:serviceaccount:azureml:azureml-ingress-nginx``` 33 | * ```system:serviceaccount:azureml:azureml-ingress-nginx-admission``` 34 | > **Notes** 35 | >* **{EXTENSION-NAME}:** is the extension name specified with ```az k8s-extension create --name``` CLI command. 36 | >* **{KUBERNETES-COMPUTE-NAMESPACE}:** is the namespace of kubernetes compute specified with ```az ml compute attach --namespace``` CLI command. Skip configuring 'system:serviceaccount:{KUBERNETES-COMPUTE-NAMESPACE}:default' if no namespace specified with ```az ml compute attach ``` CLI command. 37 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | 2 | # Frequently Asked Questions 3 | 4 | ## Who is Azure Arc enabled Machine Learning intended for? 5 | 6 | With increasing adoption of Kubernetes for machine learning among enterprises, Azure Machine Learning provides enterprise ML infrastructure team to easily setup and enable Kubernetes for their data science teams to use. 
At the same time, data scientists can focus on building high quality models and model deployment professionals can focus on scaling models production without getting involved about Kubernetes technical details. 7 | 8 | ## Why should I use Azure Arc enabled Machine Learning? 9 | 10 | Many enterprises want to start machine learning now with where data lives today, which could be in multi-cloud or on-premises. Enterprises also want to optimize IT operation to leverage wherever workload is available. With flexibility of cloud-native development provided by Kubernetes, enterprises now can spin up Kubernetes cluster anywhere to meet their machine learning needs, at the same time to address security and privacy compliance requirements in a highly regulated environment. With Azure Arc enabled Machine Learning, enterprises now can have hybrid machine learning lifecycle such as train models in cloud and deploy models on-premises, or train models on-premises and deploy models in cloud, to leverage where compute and data available and broaden service access. 11 | 12 | ## Isn’t Azure Arc enabled Machine Learning still in public cloud? 13 | 14 | * The control plane (Azure Machine Learning Studio, Azure Machine Learning microservices, dependent Azure services) is in the cloud. The cluster and data can be on premises or in any cloud up to the infrastructure setup. The Azure Machine Learning extension deployed to the cluster is used to communicate with the control plance, and make machine learning workloads run properly in the cluster. 15 | 16 | * Azure Arc enabled Machine Learning extends AzureML anywhere to on-premises or any cloud. Both existing types of the compute in AzureML and Arc enabled cluster share the same AzureML control plane. 17 | 18 | * The hybrid archetecture (having control plane in cloud) benefits customer with the evolving experiences with Azure Machine Learning platform features. 19 | 20 | * Azure private link setup on Azure Arc and Azure Machine Learning related resources can avoid public network inbound and outbound. 21 | 22 | ## How do I use Azure Arc enabled Machine Learning? 23 | 24 | Enterprise IT operator can easily setup and enable Kubernetes for Azure Machine Learning with the following steps: 25 | 26 | * Spin up a Kubernetes cluster anywhere 27 | * Connect Kubernetes cluster to Azure cloud via Azure Arc 28 | * Deploy AzureML extension to Azure Arc enabled Kubernetes cluster 29 | * Attach Azure Arc enabled Kubernetes cluster to Azure ML workspace and create compute target for data science teams to use 30 | 31 | Once Kubernetes cluster is enabled for Azure Machine Learning, data science professionals can discover Kubernetes compute targets in AzureML workspace or through CLI command, and use those compute targets to submit training job or deploy model. 32 | 33 | ## How does model deployment with Azure Arc enabled Machine Learning compare to Azure Machine Learning Managed Online Endpoint? 34 | 35 | Both online endpoints are built on AzureML online endpoint concept, and customers use the same set of tools to create and manage both types of online endpoints. Managed online endpoint runs on powerful Azure managed compute, no compute and infrastructure management for customers and customer gets a turnkey solution with guaranteed SLA. Kubernetes online endpoint runs on customer managed Kubernetes, customer is responsible for managing Kubernetes cluster and ensuring online endpoint SLA. 
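As a rough illustration of the setup steps listed under "How do I use Azure Arc enabled Machine Learning?" above, the flow maps to a handful of CLI calls. This is only a hedged sketch: the cluster, resource group, workspace, and extension names are placeholders, and the extension configuration shown (training plus inference) may need additional settings for your scenario.

```azurecli
# 1. Connect an existing Kubernetes cluster to Azure via Azure Arc
az connectedk8s connect --name <cluster-name> --resource-group <resource-group>

# 2. Deploy the AzureML extension to the Arc-enabled cluster
az k8s-extension create --name <extension-name> --extension-type Microsoft.AzureML.Kubernetes \
    --cluster-type connectedClusters --cluster-name <cluster-name> --resource-group <resource-group> \
    --config enableTraining=True enableInference=True

# 3. Attach the cluster to an AzureML workspace as a Kubernetes compute target
az ml compute attach --type Kubernetes --name <compute-name> \
    --resource-id "/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.Kubernetes/connectedClusters/<cluster-name>" \
    --workspace-name <workspace-name> --resource-group <resource-group>
```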
36 | 37 | 38 | ## Recommended AKS cluster resources 39 | 40 | We recommend a cluster with at least 3 nodes, each node having at least 2 vCPUs and 4 GB of memory. If you want to run GPU jobs, you also need some GPU nodes. 41 | 42 | ## Why does a run occupy more nodes than the node count shown in the run list? 43 | 44 | The node count is the number of workers. Distributed training jobs, such as ps-worker or MPI/Horovod jobs, may need an extra launcher node or parameter server (ps) node, which also occupies a node. We will optimize this in a following version. 45 | 46 | ## What Azure storage does Azure Arc-enabled ML support? 47 | 48 | Azure Arc-enabled ML compute only supports Azure Blob containers. If your data is in another Azure storage service, please move it to Azure Blob storage first. We will support other Azure storage services in a following iteration. 49 | -------------------------------------------------------------------------------- /docs/gke-setup.md: -------------------------------------------------------------------------------- 1 | ## GKE setup 2 | 1. Select the Ubuntu OS image during cluster creation 3 | 2. A minimum of 3 nodes is required; you need enough resources for the Arc agent and AMLK8s agent installation 4 | 3. DO NOT select VMs smaller than 'medium' size 5 | 6 | GKE console -> +Create Cluster -> Node Pools -> Default-pool -> Nodes 7 | ![GKEClusterCreate](/docs/media/gkecreate.png) 8 | 9 | 10 | 4. Once the installation is complete, you need to SSH into each node in your cluster (nodes can be found in Compute Engine under VM instances; the SSH tool is under the Connect column). 11 | 12 | ![GKEClusterSSH](/docs/media/gke-ssh.png) 13 | 14 | 5. Execute the following commands on each node: 15 | 16 | ```bash 17 | sudo ln -s /etc/kubernetes/volumeplugins/azure~blobfuse /home/kubernetes/flexvolume/ 18 | 19 | sudo apt-get update; sudo apt-get install jq 20 | 21 | wget https://packages.microsoft.com/config/ubuntu/18.04/packages-microsoft-prod.deb; sudo dpkg -i packages-microsoft-prod.deb; sudo apt-get update; sudo apt-get install blobfuse 22 | ``` 23 | -------------------------------------------------------------------------------- /docs/how-to-debug-arc-kubernetes-training.md: -------------------------------------------------------------------------------- 1 | # Job is pending for a long time 2 | 3 | ## Check the resource capacity of the nodes 4 | 5 | ``` azure cli 6 | kubectl get nodes -o json | jq '.items[]|{name: .metadata.name, capacity: .status.capacity, allocatable: .status.allocatable}' 7 | ``` 8 | 9 | Here is a sample output: 10 | 11 | ``` azure cli 12 | { 13 | "name": "aks-nodepool1-36994511-vmss000000", 14 | "capacity": { 15 | "attachable-volumes-azure-disk": "24", 16 | "cpu": "6", 17 | "ephemeral-storage": "129900528Ki", 18 | "github.com/fuse": "1k", 19 | "hugepages-1Gi": "0", 20 | "hugepages-2Mi": "0", 21 | "memory": "57584828Ki", 22 | "nvidia.com/gpu": "1", 23 | "pods": "110" 24 | }, 25 | "allocatable": { 26 | "attachable-volumes-azure-disk": "24", 27 | "cpu": "5840m", 28 | "ephemeral-storage": "119716326407", 29 | "github.com/fuse": "1k", 30 | "hugepages-1Gi": "0", 31 | "hugepages-2Mi": "0", 32 | "memory": "51573948Ki", 33 | "nvidia.com/gpu": "1", 34 | "pods": "110" 35 | } 36 | } 37 | ``` 38 | 39 | ## Insufficient github.com/fuse 40 | 41 | Check whether the k8s-host-device-plugin-daemonset is installed properly. 42 | 43 | ``` azure cli 44 | kubectl get ds -A | grep k8s-host-device-plugin-daemonset 45 | ``` 46 | 47 | ## Insufficient nvidia.com/gpu 48 | 49 | Check whether the nvidia-device-plugin-daemonset is installed properly.
For more details, please refer to [k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin) 50 | 51 | ``` azure cli 52 | kubectl get ds -A | grep nvidia-device-plugin-daemonset 53 | ``` 54 | 55 | ## The "ps-0" pod is stuck in pending status 56 | 57 | ``` azure cli 58 | kubectl get pods | grep ps-0 59 | ``` 60 | 61 | Please try using another instance-type of lower resource requested. 62 | 63 | ## blobfuse fails to mount 64 | 65 | Make sure the [blobfuse-flexvolume-installer](https://github.com/Azure/kubernetes-volume-drivers/tree/master/flexvolume#config-kubelet-service-to-enable-flexvolume-driver) daemonset is installed properly 66 | 67 | ## no volume plugin matched 68 | 69 | ``` error message 70 | Warning FailedMount xxx kubelet xxxx: failed to get Plugin from volumeSpec for volume "f38a008f5870bd913f36e68c12dc1827-blobfuse-0" err=no volume plugin matched 71 | ``` 72 | 73 | This error message indicates that the flexvol is not installed properly. 74 | 75 | Try changing the installation path for the volume plugin 76 | 77 | ``` azure cli 78 | az k8s-extension create -g -c --cluster-type connectedClusters --extension-type Microsoft.AzureML.Kubernetes -n trainingcompute --release-train stable --config enableTraining=true --configuration-settings volumePluginDir=/usr/libexec/kubernetes/kubelet-plugins/volume/exec/ 79 | ``` 80 | 81 | # Job is in failed status 82 | 83 | ## OOM Error (Out of Memory) 84 | 85 | Please try adjusting the batch size of the training job or using an instance-type with higher memory limit. For tensorflow jobs, unlike other distributed jobs set the memory limits of ps pod hard coded with 2048Mi, the memory limit of ps pods are the same as the worker nodes'. 86 | 87 | ## Permission Denied under '/workspaceblobstore/azureml' 88 | 89 | ![image](../pics/permission_denied.png) 90 | 91 | Please upgrade the blobfuse on the kubernetes nodes to 1.3.6 or above. 92 | 93 | ``` azure cli 94 | az k8s-extension create -g -c --cluster-type connectedClusters --extension-type Microsoft.AzureML.Kubernetes -n trainingcompute --release-train stable --config enableTraining=true blobfuseSysctlInstall.enabled=true 95 | ``` 96 | 97 | ## stderr: nvidia-container-cli: initialization error: nvml error: driver/library version mismatch 98 | 99 | ![image](../pics/nvml_error.png) 100 | 101 | 1. Try restarting the problematic node. 102 | 103 | 2. Check whether [nvml driver library version mismatch](https://stackoverflow.com/questions/43022843/nvidia-nvml-driver-library-version-mismatch) 104 | 105 | 106 | 107 | ## Job failed with blobfuse using SasToken 108 | 109 | It may be due to an outdated CRD of the aml-operator, please update the CRD in the cluster. 110 | 111 | 112 | ## x509: certificate signed by unknown authority 113 | 114 | It may be due to cluster is configured with an outbound proxy with self-signed certificate but arc extension doesn't trust the certificate. Please follow the [guidance to provide and trust proxy-cert when connect cluster to Azure Arc](https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#4a-connect-using-an-outbound-proxy-server) 115 | 116 | -------------------------------------------------------------------------------- /docs/instance-type.md: -------------------------------------------------------------------------------- 1 | # Instance types 2 | 3 | ## What are instance types? 
4 | Instance types are an Azure Machine Learning concept that allows targeting certain types of 5 | compute nodes for training and inference workloads. For an Azure VM, an example for an 6 | instance type is `STANDARD_D2_V3`. 7 | 8 | In Kubernetes clusters, instance types are represented by two elements: 9 | [nodeSelector](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector) 10 | and [resources](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/). 11 | In short, a `nodeSelector` lets us specify which node a pod should run on. The node must have a 12 | corresponding label. In the `resources` section, we can set the compute resources (CPU, memory and 13 | Nvidia GPU) for the pod. 14 | 15 | ## Create instance types 16 | Instance types are represented in a custom resource definition (CRD) that is installed with the 17 | Azure Machine Learning extension. To create a new instance type, create a new custom resource 18 | for the instance type CRD. For example: 19 | ```bash 20 | kubectl apply -f my_instance_type.yaml 21 | ``` 22 | 23 | With `my_instance_type.yaml`: 24 | ```yaml 25 | apiVersion: amlarc.azureml.com/v1alpha1 26 | kind: InstanceType 27 | metadata: 28 | name: myinstancetypename 29 | spec: 30 | nodeSelector: 31 | mylabel: mylabelvalue 32 | resources: 33 | limits: 34 | cpu: "1" 35 | nvidia.com/gpu: 1 36 | memory: "2Gi" 37 | requests: 38 | cpu: "700m" 39 | memory: "1500Mi" 40 | ``` 41 | 42 | This creates an instance type with the following behavior: 43 | - Pods will be scheduled only on nodes with label `mylabel: mylabelvalue`. 44 | - Pods will be assigned resource requests of `700m` CPU and `1500Mi` memory. 45 | - Pods will be assigned resource limits of `1` CPU, `2Gi` memory and `1` Nvidia GPU. 46 | 47 | Note: 48 | - Nvidia GPU resources are only specified in the `limits` section as integer values. For more information, 49 | please refer to the Kubernetes [documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/#using-device-plugins). 50 | - CPU and memory resources are string values. 51 | - CPU can be specified in millicores, for example `100m`, or in full numbers, for example `"1"` which 52 | is equivalent to `1000m`. 53 | - Memory can be specified as a full number + suffix, for example `1024Mi` for 1024 MiB. 54 | 55 | It is also possible to create multiple instance types at once: 56 | ```bash 57 | kubectl apply -f my_instance_type_list.yaml 58 | ``` 59 | 60 | With `my_instance_type_list.yaml`: 61 | ```yaml 62 | apiVersion: amlarc.azureml.com/v1alpha1 63 | kind: InstanceTypeList 64 | items: 65 | - metadata: 66 | name: cpusmall 67 | spec: 68 | resources: 69 | requests: 70 | cpu: "100m" 71 | memory: "100Mi" 72 | limits: 73 | cpu: "1" 74 | nvidia.com/gpu: 0 75 | memory: "1Gi" 76 | 77 | - metadata: 78 | name: defaultinstancetype 79 | spec: 80 | resources: 81 | requests: 82 | cpu: "1" 83 | memory: "1Gi" 84 | limits: 85 | cpu: "1" 86 | nvidia.com/gpu: 0 87 | memory: "1Gi" 88 | ``` 89 | 90 | The above example creates two instance types: `cpusmall` and `defaultinstancetype`. The latter 91 | is examplained in more detail in the following section. 92 | 93 | ## Default instance types 94 | If a training or inference workload is submitted without an instance type, it uses the default 95 | instance type. To specify a default instance type for a Kubernetes cluster, create an instance 96 | type with name `defaultinstancetype`. It will automatically be recognized as the default. 
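Whether you rely on custom instance types or a `defaultinstancetype` you created yourself, you can confirm what was actually registered in the cluster by querying the custom resources directly. A hedged sketch, using `cpusmall` from the list example above:

```bash
# List the InstanceType custom resources currently defined in the cluster
kubectl get instancetype

# Inspect one of the instance types created from the example list above
kubectl describe instancetype cpusmall
```

Note that the built-in default described next is not backed by a custom resource, so it will not show up in this output.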
97 | 98 | If no default instance type was defined, the following default behavior applies: 99 | - No nodeSelector is applied, meaning the pod can get scheduled on any node. 100 | - The workload's pods are assigned default resources with 0.6 cpu cores, 1536Mi memory and 0 GPU: 101 | ```yaml 102 | resources: 103 | requests: 104 | cpu: "0.6" 105 | memory: "1536Mi" 106 | limits: 107 | cpu: "0.6" 108 | memory: "1536Mi" 109 | nvidia.com/gpu: null 110 | ``` 111 | - This default instance type will not appear as an InstanceType custom resource in the cluster when running the command ```kubectl get instancetype```, 112 | but it will appear in all clients (UI, CLI, SDK). 113 | 114 | **Note:** The default instance type purposefully uses little resources. To ensure all ML workloads 115 | run with appropriate resources, for example GPU resource, it is highly recommended to create custom instance types. 116 | 117 | ## Select instance type to submit training job 118 | To select an instance type for a training job using CLI (V2), specify its name as part of the 119 | `compute` section. For example: 120 | ```yaml 121 | command: python -c "print('Hello world!')" 122 | environment: 123 | image: library/python:latest 124 | compute: azureml: 125 | resources: 126 | instance_type: 127 | ``` 128 | 129 | In the above example, replace `` with the name of your Kubernetes compute 130 | target and `` with the name of the instance type you wish to select. 131 | 132 | ## Select instance type to deploy model 133 | 134 | To select an instance type for a model deployment using CLI (V2), specify its name deployment YAML. For example: 135 | 136 | ```yaml 137 | name: blue 138 | app_insights_enabled: true 139 | endpoint_name: 140 | model: 141 | path: ./model/sklearn_mnist_model.pkl 142 | code_configuration: 143 | code: ./script/ 144 | scoring_script: score.py 145 | instance_type: 146 | environment: 147 | conda_file: file:./model/conda.yml 148 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 149 | ``` 150 | -------------------------------------------------------------------------------- /docs/limitations-and-known-issues.md: -------------------------------------------------------------------------------- 1 | # Limitations and known issues 2 | 3 | ## Failed to find any PEM data in certificate for gateway and cluster-status-reporter 4 | 5 | If you see this error during AzureML extension deployment, it means the cluster lacks ```--cluster-signing-cert-file``` and ```--cluster-signing-key-file``` parameters in its controller manager setting. You can set ```enable_https``` to false and it will use http for in-cluster components communication. For morning please refer to [Kubernetes documentation](https://kubernetes.io/docs/tasks/tls/managing-tls-in-a-cluster/#a-note-to-cluster-administrators). 6 | 7 | ## Custom IP interface for MPI job 8 | 9 | For MPI job on Azure Arc-enabled on-premise Kubernetes cluster, AzureML provides a good default value if eth0 is not available. However this good default value might not be correct and MPI job will fail. To ensure that MPI job gets correct IP interface, you can st custome IP interface at AzureML extension deployment time by appending ```amloperator.custom_ip_interface_enabled=True``` and ```amloperator.custom_ip_interface=``` to ```--configuration-settings``` parameter. 10 | 11 | ## AML Dataset support 12 | 13 | Azure Arc-enabled Machine Learning job supports mounting/downloading an AML Dataset to a local path specified by the field "PathOnCompute". 
But this path cannot be any of the following: under the root folder (e.g. /), a privileged folder (e.g. /data/), or an existing folder. 14 | 15 | ## Compute Managed Identity in Azure Machine Learning Workspace with private endpoint 16 | 17 | AMLArc compute can be assigned either a system-assigned or a user-assigned managed identity to access a custom Azure Container Registry (ACR) or an AML dataset. The current limitation is that the managed identity doesn't work in an AML workspace with a private endpoint. 18 | -------------------------------------------------------------------------------- /docs/managed-identity.md: -------------------------------------------------------------------------------- 1 | # Assign Managed Identity to the compute target 2 | 3 | A common challenge for developers is the management of secrets and credentials used to secure communication between the different components making up a solution. [Managed identities](https://docs.microsoft.com/en-us/azure/active-directory/managed-identities-azure-resources/overview) eliminate the need for developers to manage credentials. 4 | 5 | To access Azure Container Registry (ACR) for Docker images, and a Storage Account for training data, attach the AMLArc compute with a system-assigned or user-assigned managed identity enabled. 6 | 7 | ## Assign Managed Identity 8 | 9 | - You can assign a Managed Identity to the compute during [compute attach](./attach-compute.md) 10 | - If the compute has already been attached, you can update the Managed Identity settings in Machine Learning Studio. 11 | - Go to Azure Machine Learning Studio - Compute - Attached compute, and select your attached compute. 12 | - Edit Managed Identity. 13 | ![Managed identity](./media/edit-identity.png) 14 | ![Managed identity](./media/update-identity2.png) 15 | 16 | ## Assign Azure roles to Managed Identity 17 | 18 | Azure offers a couple of ways to assign roles to a Managed Identity. 19 | - [Use Azure Portal to assign roles](https://docs.microsoft.com/en-us/azure/role-based-access-control/role-assignments-portal?tabs=current). 20 | - [Use CLI to assign roles](https://docs.microsoft.com/en-us/azure/role-based-access-control/role-assignments-cli) 21 | - [Use PowerShell to assign roles](https://docs.microsoft.com/en-us/azure/role-based-access-control/role-assignments-powershell) 22 | 23 | >If you use the Portal to assign roles and you have a system-assigned managed identity, select **User, group, or service principal**, click **Select members**, and search for the identity name formatted as ``<workspace name>/computes/<compute target name>`` 24 | > 25 | > If you have a user-assigned managed identity, select **Managed identity** to find the target identity. 26 | ![Managed identity](./media/assign-role.png) 27 | 28 | ### Use Managed Identity to pull images from Azure Container Registry 29 | 30 | The "AcrPull" role should be granted to the compute Managed Identity. 31 | 32 | ### Use Managed Identity to access Azure Blob 33 | 34 | - For read-only purposes, the `Storage Blob Data Reader` role should be granted to the compute Managed Identity. 35 | - For read-write purposes, the `Storage Blob Data Contributor` role should be granted to the compute Managed Identity.
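For the CLI option linked above, the role assignments described in this document could look like the following. This is a hedged sketch: the principal ID, registry, and storage account names are placeholders, and the scopes shown are examples rather than prescriptions.

```azurecli
# Allow the compute managed identity to pull images from your Azure Container Registry
az role assignment create --assignee <managed-identity-principal-id> --role "AcrPull" \
    --scope "/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.ContainerRegistry/registries/<acr-name>"

# Allow read-only blob access (use "Storage Blob Data Contributor" instead for read-write)
az role assignment create --assignee <managed-identity-principal-id> --role "Storage Blob Data Reader" \
    --scope "/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.Storage/storageAccounts/<storage-account-name>"
```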
36 | -------------------------------------------------------------------------------- /docs/media/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/media/assign-role.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/assign-role.png -------------------------------------------------------------------------------- /docs/media/attach-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/attach-1.png -------------------------------------------------------------------------------- /docs/media/attach-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/attach-4.png -------------------------------------------------------------------------------- /docs/media/attach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/attach.png -------------------------------------------------------------------------------- /docs/media/detach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/detach.png -------------------------------------------------------------------------------- /docs/media/edit-identity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/edit-identity.png -------------------------------------------------------------------------------- /docs/media/gke-ssh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/gke-ssh.png -------------------------------------------------------------------------------- /docs/media/gkecreate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/gkecreate.png -------------------------------------------------------------------------------- /docs/media/privatelink-networkflow-v3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink-networkflow-v3.png -------------------------------------------------------------------------------- /docs/media/privatelink/acr_subnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/acr_subnet.png -------------------------------------------------------------------------------- /docs/media/privatelink/acr_target.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/acr_target.png -------------------------------------------------------------------------------- /docs/media/privatelink/acr_trusted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/acr_trusted.png -------------------------------------------------------------------------------- /docs/media/privatelink/aks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/aks.png -------------------------------------------------------------------------------- /docs/media/privatelink/dns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/dns.png -------------------------------------------------------------------------------- /docs/media/privatelink/kv_target.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/kv_target.png -------------------------------------------------------------------------------- /docs/media/privatelink/kv_trusted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/kv_trusted.png -------------------------------------------------------------------------------- /docs/media/privatelink/kv_vnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/kv_vnet.png -------------------------------------------------------------------------------- /docs/media/privatelink/ml_compute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ml_compute.png -------------------------------------------------------------------------------- /docs/media/privatelink/ml_computemsi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ml_computemsi.png -------------------------------------------------------------------------------- /docs/media/privatelink/ml_disablepublicaccess.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ml_disablepublicaccess.png -------------------------------------------------------------------------------- /docs/media/privatelink/ml_privateendpoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ml_privateendpoint.png 
-------------------------------------------------------------------------------- /docs/media/privatelink/onprem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/onprem.png -------------------------------------------------------------------------------- /docs/media/privatelink/relay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/relay.png -------------------------------------------------------------------------------- /docs/media/privatelink/relay_connectstring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/relay_connectstring.png -------------------------------------------------------------------------------- /docs/media/privatelink/relay_resourceid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/relay_resourceid.png -------------------------------------------------------------------------------- /docs/media/privatelink/storageaccount.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/storageaccount.png -------------------------------------------------------------------------------- /docs/media/privatelink/ts_curl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ts_curl.png -------------------------------------------------------------------------------- /docs/media/privatelink/ts_expected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ts_expected.png -------------------------------------------------------------------------------- /docs/media/privatelink/ts_getpo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ts_getpo.png -------------------------------------------------------------------------------- /docs/media/privatelink/ts_nslookup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ts_nslookup.png -------------------------------------------------------------------------------- /docs/media/privatelink/ts_ws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ts_ws.png -------------------------------------------------------------------------------- /docs/media/profileConfig.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/profileConfig.png -------------------------------------------------------------------------------- /docs/media/update-identity2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/update-identity2.png -------------------------------------------------------------------------------- /docs/media/ws-msi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/ws-msi.png -------------------------------------------------------------------------------- /docs/network-requirements.md: -------------------------------------------------------------------------------- 1 | ## Meet network requirements 2 | Clusters running behind an outbound proxy server or firewall need additional network configurations. 3 | - For Azure Arc enabled Kubernetes, fulfill [Azure Arc network requirements](https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirement) needed by Azure Arc agents. If the cluster has an outbound proxy, make sure that `127.0.0.1` and `localhost` are added to `--proxy-skip-range` when connecting with Azure Arc. 4 | - For AKS customer without Azure Arc connection, fullfill [AKS cluster extension network requirements](https://docs.microsoft.com/en-us/azure/aks/limit-egress-traffic#cluster-extensions). 5 | 6 | Besides, the following outbound URLs are required for Azure Machine Learning, 7 | 8 | | Outbound Endpoint| Port | Description|Training |Inference | 9 | |--|--|--|--|--| 10 | | *.kusto.windows.net,
\*.table.core.windows.net,<br>\*.queue.core.windows.net | https:443 | Required to upload system logs to Kusto. You can skip adding the table and queue FQDNs if you have data exfiltration concerns, but you will not be able to get error diagnosis support from Microsoft.|**✓**|**✓**| 11 | | \<your ACR name\>.azurecr.io<br>\<your ACR name\>.\<region\>.data.azurecr.io | https:443 | Azure container registry, required to pull docker images used for machine learning workloads.|**✓**|**✓**| 12 | | \<your storage account\>.blob.core.windows.net | https:443 | Azure blob storage, required to fetch machine learning project scripts, data or models, and upload job logs/outputs.|**✓**|**✓**| 13 | | \<workspace-id\>.workspace.\<region\>.api.azureml.ms,<br>\<region\>.experiments.azureml.net,<br>\<region\>
\.api.azureml.ms | https:443 | Azure mahince learning service API.|**✓**|**✓**| 14 | | pypi.org | https:443 | Python package index, to install pip packages used for training job environment initialization.|**✓**|N/A| 15 | 16 | > [!NOTE] 17 | > `` is the lowcase full spelling of Azure Region, for example, eastus, southeastasia. 18 | > 19 | > `` can be found in Azure portal - your Machine Learning resource page - Properties - Workspace ID. 20 | -------------------------------------------------------------------------------- /docs/nginx-ingress-controller.md: -------------------------------------------------------------------------------- 1 | # Tutorial 2 | 3 | These tutorials help illustrate how to integrate [Nginx Ingress Controller](https://github.com/kubernetes/ingress-nginx) with AzureML extension over HTTP or HTTPS. 4 | 5 | ## Table of Contents 6 | 7 | - [Prerequisites](#prerequisites) 8 | - [Deploy AzureML extension](#deploy-azureml-extension) 9 | - [Expose services over HTTP](#expose-services-over-http) 10 | - [Expose services over HTTPS](#expose-services-over-https) 11 | 12 | ## Prerequisites 13 | 14 | - Install the latest k8s-extension and ml cli. 15 | - `az extension add -n k8s-extension --upgrade` 16 | - `az extension add -n ml --upgrade` 17 | - Setup Nginx Ingress Conroller. 18 | - [**Create a basic controller**](https://docs.microsoft.com/en-us/azure/aks/ingress-basic): If you are starting from scratch, refer to these instructions. 19 | - If you want to use HTTPS on this application, you will need a x509 certificate and its private key. 20 | 21 | ## Deploy AzureML extension 22 | 23 | [Deploy extension](https://github.com/Azure/AML-Kubernetes/blob/master/docs/deploy-extension.md#azureml-extension-deployment-scenarios) with `inferenceRouterServiceType=ClusterIP` and `allowInsecureConnections=True`, so that the Nginx Ingress Conroller can handle TLS termination by itself instead of handing it over to azureml-fe (azureml inference router created by extension) when service is exposed over HTTPS. 24 | 25 | 26 | ## Expose services over HTTP 27 | 28 | In order to expose the azureml-fe we will using the following ingress resource: 29 | 30 | ```yaml 31 | apiVersion: networking.k8s.io/v1 32 | kind: Ingress 33 | metadata: 34 | name: azureml-fe 35 | namespace: azureml 36 | spec: 37 | ingressClassName: nginx 38 | rules: 39 | - http: 40 | paths: 41 | - path: / 42 | backend: 43 | service: 44 | name: azureml-fe 45 | port: 46 | number: 80 47 | pathType: Prefix 48 | ``` 49 | 50 | This ingress will expose the `azureml-fe` service and the selected deployment as a default backend of the Nginx Ingress Controller. 51 | 52 | Save the above ingress resource as `ing-azureml-fe.yaml`. 53 | 54 | 1. Deploy `ing-azureml-fe.yaml` by running: 55 | 56 | ```bash 57 | kubectl apply -f ing-azureml-fe.yaml 58 | ``` 59 | 60 | 2. Check the log of the ingress controller for deployment status. 61 | 62 | 3. Now the `azureml-fe` application should be available. You can check this by visiting the public LoadBalancer address of the Nginx Ingress Controller. 63 | 64 | 4. [Create an inference job and invoke](https://github.com/Azure/AML-Kubernetes/blob/master/docs/simple-flow.md). 65 | 66 | *NOTE:* Replace the ip in scoring_uri with public LoadBalancer address of the Nginx Ingress Controller before invoking. 67 | 68 | ## Expose services over HTTPS 69 | 70 | 1. Before deploying ingress, you need to create a kubernetes secret to host the certificate and private key. 
You can create a kubernetes secret by running 71 | 72 | ```bash 73 | kubectl create secret tls -n azureml --key --cert 74 | ``` 75 | 76 | 2. Define the following ingress. In the ingress, specify the name of the secret in the `secretName` section. 77 | 78 | ```yaml 79 | apiVersion: networking.k8s.io/v1 80 | kind: Ingress 81 | metadata: 82 | name: azureml-fe 83 | namespace: azureml 84 | spec: 85 | ingressClassName: nginx 86 | tls: 87 | - hosts: 88 | - 89 | secretName: 90 | rules: 91 | - host: 92 | http: 93 | paths: 94 | - path: / 95 | backend: 96 | service: 97 | name: azureml-fe 98 | port: 99 | number: 80 100 | pathType: Prefix 101 | ``` 102 | 103 | *NOTE:* Replace `` and `` in the above Ingress Resource with the domain pointing to LoadBalancer of the Nginx ingress controller and name of your secret. Store the above Ingress Resource in a file name `ing-azureml-fe-tls.yaml`. 104 | 105 | 1. Deploy ing-azureml-fe-tls.yaml by running 106 | 107 | ```bash 108 | kubectl apply -f ing-azureml-fe-tls.yaml 109 | ``` 110 | 111 | 2. Check the log of the ingress controller for deployment status. 112 | 113 | 3. Now the `azureml-fe` application will be available on HTTPS. You can check this by visiting the public LoadBalancer address of the Nginx Ingress Controller. 114 | 115 | 4. [Create an inference job and invoke](https://github.com/Azure/AML-Kubernetes/blob/master/docs/simple-flow.md). 116 | 117 | *NOTE:* Replace the protocol and ip in scoring_uri with https and domain pointing to LoadBalancer of the Nginx Ingress Controller before invoking. 118 | -------------------------------------------------------------------------------- /docs/pvc.md: -------------------------------------------------------------------------------- 1 | ### PV/PVC support in AMLArc training job 2 | 3 | Now you can leverage Kubernetes native way to mount various data storage via [Persistent Volume (PV) and Persistent Volume Claim (PVC)](https://kubernetes.io/docs/concepts/storage/persistent-volumes/). 4 | 5 | 1. Create PV, take NFS as example, 6 | 7 | ``` 8 | apiVersion: v1 9 | kind: PersistentVolume 10 | metadata: 11 | name: nfs-pv 12 | spec: 13 | capacity: 14 | storage: 1Gi 15 | accessModes: 16 | - ReadWriteMany 17 | persistentVolumeReclaimPolicy: Retain 18 | storageClassName: "" 19 | nfs: 20 | path: /share/nfs 21 | server: 20.98.110.84 22 | readOnly: false 23 | ``` 24 | 2. Create PVC. In `metadata`, you **must** add label `ml.azure.com/pvc: "true"` to indicate the PVC can be mounted to the upcoming training job, and add annotation `ml.azure.com/mountpath: ` to specify the mount path. 25 | 26 | ``` 27 | apiVersion: v1 28 | kind: PersistentVolumeClaim 29 | metadata: 30 | name: nfs-pvc 31 | namespace: default 32 | labels: 33 | ml.azure.com/pvc: "true" 34 | annotations: 35 | ml.azure.com/mountpath: "/mnt/nfs" 36 | spec: 37 | storageClassName: "" 38 | accessModes: 39 | - ReadWriteMany 40 | resources: 41 | requests: 42 | storage: 1Gi 43 | ``` 44 | 51 | 52 | 53 | 54 | ### How AML will use the PVC 55 | 56 | The training job in the same `namespace` with the PVC will be mounted the volume automatically. Then data scientist can access the mount path in the training job. 57 | 58 | By default, the job will be created in `default` namespace. IT operator can decide the namespace in attached compute attach. 
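To confirm that a claim like the one above will actually be picked up, you can check that it carries the required label and is bound. A hedged sketch using the example names from this page:

```bash
# PVCs must carry the ml.azure.com/pvc=true label to be mounted into training jobs
kubectl get pvc -n default -l ml.azure.com/pvc=true

# Verify the claim is Bound and review the ml.azure.com/mountpath annotation
kubectl describe pvc nfs-pvc -n default
```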
59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /docs/release-notes.md: -------------------------------------------------------------------------------- 1 | New features are released at a biweekly cadence. 2 | 3 | **Dec 27, 2022 Release** 4 | 5 | Version 1.1.17 6 | * Move Fluent-bit from a DaemonSet to sidecars 7 | * Add MDC support 8 | * Refine error messages 9 | * Support cluster mode (Windows, Linux) jobs 10 | * Bugfixes 11 | 12 | **Aug 29, 2022 Release** 13 | 14 | Version 1.1.9 15 | * Improved health check logic 16 | * Bugfixes 17 | 18 | **Jun 23, 2022 Release** 19 | 20 | Version 1.1.6 21 | * Bugfixes 22 | 23 | **Jun 15, 2022 Release** 24 | 25 | Version 1.1.5 26 | * Updated training to use the new common runtime to run jobs 27 | * Removed Azure Relay usage for the AKS extension 28 | * Removed Service Bus usage from the extension 29 | * Updated security context usage 30 | * Updated the inference score-fe to v2 31 | * Updated to use Volcano as the training job scheduler 32 | * Bugfixes 33 | 34 | **Oct 14, 2021 Release** 35 | 36 | * [PV/PVC volume mount support in AMLArc training jobs](./pvc.md). 37 | 38 | **Sept 16, 2021 Release** 39 | 40 | * New regions available: WestUS, CentralUS, NorthCentralUS, KoreaCentral. 41 | * Job queue explainability. See job queue details in AML Workspace Studio. 42 | * Auto-killing policy. Support for `max_run_duration_seconds` in ``ScriptRunConfig``. The system will attempt to automatically cancel the run if it takes longer than the setting value. 43 | * Performance improvements in cluster autoscale support. 44 | * [Arc agent and ML extension deployment from an on-prem container registry](https://github.com/Azure/azure-arc-kubernetes-preview/blob/master/docs/custom-registry/connect-cluster.md) 45 | 46 | **August 24, 2021 Release** 47 | 48 | * [Compute instance type is supported in job YAML](./docs/simple-train-cli.md). 49 | * [Assign Managed Identity to AMLArc compute](./docs/managed-identity.md) 50 | 51 | **August 10, 2021 Release** 52 | 53 | * New Kubernetes distribution support: K3S - Lightweight Kubernetes. 54 | * [Deploy AzureML extension to your AKS cluster without connecting via Azure Arc](./docs/deploy-ml-extension-on-AKS-without-arc.md). 55 | * [Automated Machine Learning (AutoML) via Python SDK](https://docs.microsoft.com/en-us/azure/machine-learning/concept-automated-ml) 56 | * [Use 2.0 CLI to attach the Kubernetes cluster to the AML Workspace](./docs/attach-compute.md#Create-compute-target-via-Azure-ML-2.0-CLI) 57 | * Optimized AzureML extension components' CPU/memory resource utilization. 58 | 59 | **July 2, 2021 Release** 60 | 61 | * New Kubernetes distribution support: OpenShift Kubernetes and GKE (Google Kubernetes Engine). 62 | * Autoscale support. If the user-managed Kubernetes cluster enables autoscaling, the cluster will be automatically scaled out or scaled in according to the volume of active runs and deployments. 63 | * Performance improvements in the job launcher, which significantly shorten job execution time. 64 | -------------------------------------------------------------------------------- /docs/setup-ephemeral-nfs-volume.md: -------------------------------------------------------------------------------- 1 | ### Set up NFS server 2 | 3 | Set up an NFS server on Ubuntu ([guide](https://help.ubuntu.com/community/SettingUpNFSHowTo)) and make sure to grant NFS share access to your Kubernetes cluster (see the example export below).
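As referenced above, one common way to grant that access on an Ubuntu NFS server is to export the shared folder to the network your Kubernetes nodes live in. This is a hedged sketch; the folder path and CIDR range are placeholders for your environment.

```bash
# Export the shared folder to the subnet used by your Kubernetes nodes
echo "/path/to/shared-folder 10.0.0.0/16(rw,sync,no_subtree_check)" | sudo tee -a /etc/exports

# Reload the export table and confirm the share is exported
sudo exportfs -ra
sudo exportfs -v
```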
4 | 5 | 6 | ### Create a Configmap with nfs server properties 7 | 8 | ```mount-config.yaml 9 | kind: ConfigMap 10 | apiVersion: v1 11 | metadata: 12 | name: mount-config 13 | namespace: azureml 14 | data: 15 | mounts.yaml: | 16 | mountPoints: 17 | - mountPath: /nfs_share 18 | mountType: nfs 19 | name: nfs-name 20 | path: /path/to/shared-folder 21 | server: nfs-server.domain-name.com 22 | ``` 23 | 24 | ### Apply the Configmap 25 | 26 | `kubectl apply -f mount-config.yaml` 27 | 28 | ### Documentation on Specific Fields 29 | * `mountPath`: defines the path that the NFS volume will be mounted into inside your job 30 | * `mountType`: must be `nfs` 31 | * `name`: arbitrary symbolic name for your mount. If you define multiple mounts then this must be unique per mount 32 | * `path`: path (on the server) to the folder you want to mount 33 | * `server`: NFS server address 34 | 35 | Multiple NFS mounts may be defined under `mountPoints` 36 | 37 | The rest of the `mount-config.yaml` file must be exactly as above 38 | 39 | ### How AML will use the mount for jobs 40 | 41 | All jobs will look for the 'mount-config' ConfigMap. If this ConfigMap is missing or malformed then no mounts will be applied. 42 | 43 | 44 | -------------------------------------------------------------------------------- /docs/simple-flow.md: -------------------------------------------------------------------------------- 1 | 2 | # Deploy an image classification model - create an endpoint with blue deployment 3 | 4 | ## Azure CLI for ML installation and project setup 5 | 6 | 1. Remove any previous Azure ML CLI extension installations 7 | 8 | ```azurecli 9 | az extension remove -n ml 10 | az extension remove -n azure-cli-ml 11 | ``` 12 | 13 | 1. Install the latest Azure CLI for ML, which is in public preview, and then verify installation 14 | 15 | ```azurecli 16 | az extension add -n ml 17 | az ml -h 18 | ``` 19 | 20 | 1. Let's set some defaults for all subsequent "az ml" CLI commands 21 | 22 | ```azurecli 23 | az account set --subscription 24 | az configure --defaults workspace= group= 25 | ``` 26 | 27 | 1. For this simple deployment flow, we have following project directory structure: 28 | 29 | ``` code 30 | simple-flow 31 | |-- model 32 | | |-- conda.yml 33 | | |-- sklearn_mnist_model.pkl 34 | |-- script 35 | | |-- score.py 36 | |-- blue-deployment.yml 37 | |-- endpoint.yml 38 | |-- sample_request.json 39 | ``` 40 | 41 | As you can see from above, "model" directory contains model and Conda environment definition, "score.py" is under "script" directory. At top level directory, we have endpoint, blue deployment YAML definition and sample request JSON file. In general, this is very typical project setup for Azure Arc enabled ML model deployment. 42 | 43 | ## Simple deployment flow 44 | 45 | Now let's see simple deployment flow in action! 46 | 47 | 1. Git clone preview Github repo and switch to simple-flow directory 48 | 49 | ```console 50 | git clone https://github.com/Azure/AML-Kubernetes.git 51 | cd AML-Kubernetes/examples/inference/simple-flow 52 | ``` 53 | 54 | 1. Modify endpoint YAML file to replace "\" with your own compute target name, and replace "\" to the instance type defined in your compute configuration. Create an endpoint with blue deployment with following CLI command, endpoint creation and deployment might take a few minutes. 55 | 56 | > Note that the resource requirements (CPU, memory, GPU) defined in the endpoint yaml should be no more than the resource limit of the specified instance type. 57 | 58 | 59 | 1. 
59 | 1. Create the endpoint 60 | ```azurecli 61 | az ml online-endpoint create --name sklearn-mnist -f endpoint.yml 62 | ``` 63 | 1. Check the status of the endpoint 64 | 65 | ```azurecli 66 | az ml online-endpoint show -n sklearn-mnist 67 | ``` 68 | 69 | 1. Create the blue deployment 70 | ```azurecli 71 | az ml online-deployment create --name blue --endpoint sklearn-mnist -f blue-deployment.yml --all-traffic 72 | ``` 73 | 74 | 1. Check the status of the blue deployment 75 | 76 | ```azurecli 77 | az ml online-deployment show --name blue --endpoint sklearn-mnist 78 | ``` 79 | 80 | 1. Test the endpoint with a scoring request 81 | 82 | ```azurecli 83 | az ml online-endpoint invoke -n sklearn-mnist -r sample-request.json 84 | ``` 85 | 86 | You can also send a scoring request using cURL. 87 | 88 | * Obtain a token/keys for the scoring endpoint 89 | 90 | ```azurecli 91 | az ml online-endpoint get-credentials -n sklearn-mnist 92 | ``` 93 | 94 | * Obtain the `scoring_uri` of the endpoint 95 | 96 | ```azurecli 97 | az ml online-endpoint show -n sklearn-mnist 98 | ``` 99 | 100 | * Score using the token/key obtained above 101 | 102 | ```bash 103 | curl -v -i -X POST -H "Content-Type:application/json" -H "Authorization: Bearer <token>" -d '<sample request JSON>' <scoring_uri> 104 | ``` 105 | 106 | That is it! You have successfully deployed an image classification model and scored the model with a request. 107 | 108 | 1. Get logs 109 | 110 | ```azurecli 111 | az ml online-deployment get-logs --name blue --endpoint sklearn-mnist 112 | ``` 113 | 114 | 1. Delete the endpoint 115 | 116 | ```azurecli 117 | az ml online-endpoint delete -n sklearn-mnist 118 | ``` 119 | 120 | ## Additional resources 121 | 122 | * [Deploy a model using a custom container with a built-in model or entry script](inference-byoc.md). In this case, the model and the entry script are not stored in the cloud, but kept locally. 123 | * To learn more about Azure ML endpoint and deployment concepts, please check [Managed Online Endpoints](https://docs.microsoft.com/azure/machine-learning/how-to-deploy-managed-online-endpoints). 124 | * [Additional Examples](https://github.com/Azure/azureml-examples/tree/main/cli/endpoints/online) 125 | -------------------------------------------------------------------------------- /docs/simple-train-cli.md: -------------------------------------------------------------------------------- 1 | 2 | # Train an image classification model with AML 2.0 CLI 3 | 4 | 1. Remove any previous AML CLI extension installations 5 | 6 | ```azurecli 7 | az extension remove -n ml 8 | az extension remove -n azure-cli-ml 9 | ``` 10 | 11 | 1. Install the latest AML 2.0 CLI, which is in public preview, and then verify the installation 12 | 13 | ```azurecli 14 | az extension add -n ml 15 | az ml -h 16 | ``` 17 | 18 | 1. Let's set some defaults for all subsequent "az ml" CLI commands 19 | 20 | ```azurecli 21 | az account set --subscription <subscription name or ID> 22 | az configure --defaults workspace=<workspace name> group=<resource group> 23 | ``` 24 | 25 | 1. For this simple training job with AML 2.0 CLI, we have the following project directory structure: 26 | 27 | ``` code 28 | simple-train-cli 29 | |-- src 30 | | |-- train.py 31 | | |-- utils.py 32 | |-- job.yml 33 | ``` 34 | 35 | As you can see from above, the project simply contains a job YAML file and some Python training scripts. In general, this is a very typical project setup for Azure Arc-enabled ML training. Let's take a look at the job YAML file: 36 | 37 | ```yaml 38 | experiment_name: Tutorial-sklearn-mnist 39 | code: ./src 40 | command: python train.py --data-folder ./mnist-data --regularization 0.5 41 | environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:7 42 | compute: azureml:<compute target name> 43 | resources: 44 | instance_type: <instance type name> 45 | ``` 46 | 47 | **Note**: **Instance type** is an optional parameter. If it's not given, as in the YAML file below, the compute target's default instance type will be used. 48 | 49 | ```yaml 50 | experiment_name: Tutorial-sklearn-mnist 51 | code: ./src 52 | command: python train.py --data-folder ./mnist-data --regularization 0.5 53 | environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:7 54 | compute: azureml:<compute target name> 55 | ``` 56 | 57 | Refer to [here](./instance-type.md) to learn how to create different instance types. 58 |
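As a concrete illustration, a filled-in `job.yml` could be written out as below. The compute target name `my-k8s-compute` is a placeholder, and `defaultinstancetype` is assumed to be the built-in instance type available when no custom instance types have been created on the cluster:

```bash
# Hypothetical filled-in job.yml; replace "my-k8s-compute" with your own attached compute target name
cat > job.yml <<'EOF'
experiment_name: Tutorial-sklearn-mnist
code: ./src
command: python train.py --data-folder ./mnist-data --regularization 0.5
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:7
compute: azureml:my-k8s-compute
resources:
  instance_type: defaultinstancetype
EOF
```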
59 | 1. Git clone the preview GitHub repo and switch to the simple-train-cli directory 60 | 61 | ```console 62 | git clone https://github.com/Azure/AML-Kubernetes.git 63 | cd AML-Kubernetes/examples/training/simple-train-cli 64 | ``` 65 | 66 | 1. Modify the job YAML file to specify your own compute target name 67 | 68 | 1. Run the image classification training job 69 | 70 | ```azurecli 71 | az ml job create -f job.yml --web 72 | ``` 73 | 74 | Creating this job uploads any specified local assets, like the source code directory, validates the YAML file, and submits the run. If needed, the environment is built, then the compute is scaled up and configured for running the job. 75 | 76 | 1. Once the job is complete, you can download the outputs: 77 | 78 | ```azurecli 79 | az ml job download -n $run_id --outputs 80 | ``` 81 | 82 | That is it! You have successfully trained an image classification model and downloaded the outputs to a local directory.
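As a follow-up note, the `$run_id` used in the download step is simply the name of the submitted job. One way to wire the commands together, a sketch relying on the standard Azure CLI `--query`/`-o tsv` output options, is:

```bash
# Capture the job name at submission time so it can be reused for streaming logs and downloading outputs
run_id=$(az ml job create -f job.yml --query name -o tsv)
az ml job stream -n $run_id          # optionally stream logs until the job finishes
az ml job download -n $run_id --outputs
```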
83 | 84 | ## Additional resources 85 | 86 | * [Train models (create jobs) with the 2.0 CLI (preview)](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli) 87 | * [Additional examples](https://github.com/Azure/azureml-examples/tree/main/cli/jobs) 88 | -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/blue-deployment.yml: -------------------------------------------------------------------------------- 1 | name: blue 2 | type: kubernetes 3 | endpoint_name: tf-mnist 4 | app_insights_enabled: true 5 | model: 6 | path: ./model/ 7 | code_configuration: 8 | code: ./script/ 9 | scoring_script: score.py 10 | instance_type: myinstancetypename 11 | environment: 12 | image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 13 | conda_file: ./model/conda.yml 14 | request_settings: 15 | request_timeout_ms: 3000 16 | max_concurrent_requests_per_instance: 1 17 | max_queue_wait_ms: 3000 18 | resources: 19 | requests: 20 | cpu: "0.1" 21 | memory: "500Mi" 22 | limits: 23 | cpu: "0.2" 24 | memory: "1Gi" 25 | nvidia.com/gpu: "1" 26 | liveness_probe: 27 | initial_delay: 10 28 | period: 10 29 | timeout: 10 30 | success_threshold: 1 31 | failure_threshold: 1 32 | readiness_probe: 33 | initial_delay: 10 34 | period: 10 35 | timeout: 10 36 | success_threshold: 1 37 | failure_threshold: 1 38 | scale_settings: 39 | type: default -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/endpoint.yml: -------------------------------------------------------------------------------- 1 | name: tf-mnist 2 | compute: azureml: 3 | auth_mode: key 4 | -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "mnist-tf.model" 2 | all_model_checkpoint_paths: "mnist-tf.model" 3 | -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/model/conda.yml: -------------------------------------------------------------------------------- 1 | name: model-env 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.7 6 | - tensorflow-gpu==1.14.0 7 | - pip: 8 | - azureml-defaults 9 | - numpy==1.16.4 10 | -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/model/mnist-tf.model.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/inference/gpu-inferencing/model/mnist-tf.model.data-00000-of-00001 -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/model/mnist-tf.model.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/inference/gpu-inferencing/model/mnist-tf.model.index -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/model/mnist-tf.model.meta: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/inference/gpu-inferencing/model/mnist-tf.model.meta -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/sample-request.json: -------------------------------------------------------------------------------- 1 | {"data": [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07450980392156863, 0.4588235294117647, 0.9450980392156862, 0.7529411764705882, 0.5529411764705883, 0.17254901960784313, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.6666666666666666, 0.8823529411764706, 0.9137254901960784, 0.7686274509803922, 0.7725490196078432, 0.9176470588235294, 0.8784313725490196, 0.4666666666666667, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.40784313725490196, 0.9647058823529412, 0.9568627450980393, 0.5568627450980392, 0.1450980392156863, 0.0, 0.0, 0.4666666666666667, 0.9882352941176471, 0.9882352941176471, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.396078431372549, 0.9647058823529412, 0.9882352941176471, 0.09803921568627451, 0.0, 0.0, 0.0, 0.10196078431372549, 0.9529411764705882, 0.9882352941176471, 0.4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11372549019607843, 0.8470588235294118, 0.807843137254902, 0.24705882352941178, 0.0, 0.0, 0.0, 0.0, 0.1607843137254902, 0.9921568627450981, 0.6588235294117647, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8470588235294118, 0.9882352941176471, 0.36470588235294116, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8470588235294118, 0.9882352941176471, 0.5568627450980392, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9921568627450981, 0.9882352941176471, 0.2196078431372549, 0.0, 0.0, 0.0, 0.0, 0.2, 0.9921568627450981, 0.9137254901960784, 0.1450980392156863, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9921568627450981, 0.9882352941176471, 0.9058823529411765, 0.49019607843137253, 0.6901960784313725, 0.8823529411764706, 0.19607843137254902, 0.2, 0.9921568627450981, 0.5686274509803921, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.050980392156862744, 0.7372549019607844, 0.8823529411764706, 0.8823529411764706, 0.4470588235294118, 0.14901960784313725, 0.0392156862745098, 0.8980392156862745, 1.0, 0.32941176470588235, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.3333333333333333, 0.9882352941176471, 0.9176470588235294, 0.10980392156862745, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4823529411764706, 0.9882352941176471, 0.5372549019607843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.10196078431372549, 0.8705882352941177, 0.9882352941176471, 0.050980392156862744, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.47058823529411764, 0.9921568627450981, 0.9921568627450981, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.027450980392156862, 0.7333333333333333, 0.9882352941176471, 0.592156862745098, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25882352941176473, 0.9882352941176471, 0.8784313725490196, 0.07450980392156863, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5529411764705883, 0.9882352941176471, 0.803921568627451, 0.050980392156862744, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5529411764705883, 0.9921568627450981, 0.807843137254902, 0.050980392156862744, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6509803921568628, 0.9882352941176471, 0.5568627450980392, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9921568627450981, 0.9137254901960784, 0.1450980392156863, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.796078431372549, 0.7686274509803922, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]} -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/script/score.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | import numpy as np 4 | import os 5 | import tensorflow as tf 6 | 7 | from azureml.core.model import Model 8 | 9 | def init(): 10 | global X, output, sess 11 | tf.reset_default_graph() 12 | model_root = os.getenv('AZUREML_MODEL_DIR') 13 | # the name of the folder in which to look for tensorflow model files 14 | tf_model_folder = 'model' 15 | saver = tf.train.import_meta_graph( 16 | os.path.join(model_root, tf_model_folder, 'mnist-tf.model.meta')) 17 | X = tf.get_default_graph().get_tensor_by_name("network/X:0") 18 | output = tf.get_default_graph().get_tensor_by_name("network/output/MatMul:0") 19 | 20 | sess = tf.Session() 21 | saver.restore(sess, os.path.join(model_root, tf_model_folder, 'mnist-tf.model')) 22 | 23 | 24 | def run(raw_data): 25 | data = np.array(json.loads(raw_data)['data']) 26 | # make prediction 27 | out = output.eval(session=sess, feed_dict={X: data}) 28 | y_hat = np.argmax(out, axis=1) 29 | return y_hat.tolist() 30 | 
-------------------------------------------------------------------------------- /examples/inference/simple-flow/blue-deployment.yml: -------------------------------------------------------------------------------- 1 | name: blue 2 | type: kubernetes 3 | endpoint_name: sklearn-mnist 4 | app_insights_enabled: true 5 | model: 6 | path: ./model/sklearn_mnist_model.pkl 7 | code_configuration: 8 | code: ./script/ 9 | scoring_script: score.py 10 | instance_type: defaultinstancetype 11 | environment: 12 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest 13 | conda_file: ./model/conda.yml 14 | request_settings: 15 | request_timeout_ms: 3000 16 | max_queue_wait_ms: 3000 17 | resources: 18 | requests: 19 | cpu: "0.1" 20 | memory: "0.1Gi" 21 | limits: 22 | cpu: "0.2" 23 | memory: "0.2Gi" 24 | scale_settings: 25 | type: target_utilization 26 | min_instances: 1 27 | max_instances: 3 28 | polling_interval: 10 29 | target_utilization_percentage: 70 30 | -------------------------------------------------------------------------------- /examples/inference/simple-flow/endpoint.yml: -------------------------------------------------------------------------------- 1 | name: sklearn-mnist 2 | compute: azureml:tailwind-k8s 3 | auth_mode: key 4 | -------------------------------------------------------------------------------- /examples/inference/simple-flow/model/conda.yml: -------------------------------------------------------------------------------- 1 | name: model-env 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.7 6 | - numpy=1.21.2 7 | - pip=21.2.4 8 | - scikit-learn=0.24.2 9 | - scipy=1.7.1 10 | - pip: 11 | - azureml-defaults==1.38.0 12 | - joblib==1.0.1 13 | -------------------------------------------------------------------------------- /examples/inference/simple-flow/model/sklearn_mnist_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/inference/simple-flow/model/sklearn_mnist_model.pkl -------------------------------------------------------------------------------- /examples/inference/simple-flow/script/score.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import pickle 5 | import joblib 6 | 7 | def init(): 8 | global model 9 | # AZUREML_MODEL_DIR is an environment variable created during deployment. 
10 | # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) 11 | # For multiple models, it points to the folder containing all deployed models (./azureml-models) 12 | model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'sklearn_mnist_model.pkl') 13 | model = joblib.load(model_path) 14 | 15 | def run(raw_data): 16 | data = np.array(json.loads(raw_data)['data']) 17 | # make prediction 18 | y_hat = model.predict(data) 19 | # you can return any data type as long as it is JSON-serializable 20 | return y_hat.tolist() -------------------------------------------------------------------------------- /examples/inference/simple-flow/sklearn-model.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/model.schema.json 2 | name: sklearn-model 3 | version: 1 4 | datastore: azureml:workspaceartifactstore 5 | path: "ExperimentRun/dcid.9c143e8b-a6ed-4c6f-a907-34b8ec21127c/outputs/sklearn_mnist_model.pkl" 6 | description: Model asset from run output folder. -------------------------------------------------------------------------------- /examples/training/additional-sdk-examples/001-Tensorflow/tf_mnist_with_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | import argparse 6 | import os 7 | import re 8 | import tensorflow as tf 9 | import glob 10 | 11 | from azureml.core import Run 12 | from utils import load_data 13 | 14 | print("TensorFlow version:", tf.__version__) 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point') 18 | 19 | parser.add_argument('--resume-from', type=str, default=None, 20 | help='location of the model or checkpoint files from where to resume the training') 21 | args = parser.parse_args() 22 | 23 | 24 | previous_model_location = args.resume_from 25 | # You can also use environment variable to get the model/checkpoint files location 26 | # previous_model_location = os.path.expandvars(os.getenv("AZUREML_DATAREFERENCE_MODEL_LOCATION", None)) 27 | 28 | data_folder = args.data_folder 29 | print('Data folder:', data_folder) 30 | 31 | # load train and test set into numpy arrays 32 | # note we scale the pixel intensity values to 0-1 (by dividing it with 255.0) so the model can converge faster. 
33 | 34 | X_train = load_data(glob.glob(os.path.join(data_folder, '**/train-images-idx3-ubyte.gz'), 35 | recursive=True)[0], False) / 255.0 36 | X_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-images-idx3-ubyte.gz'), 37 | recursive=True)[0], False) / 255.0 38 | y_train = load_data(glob.glob(os.path.join(data_folder, '**/train-labels-idx1-ubyte.gz'), 39 | recursive=True)[0], True).reshape(-1) 40 | y_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-labels-idx1-ubyte.gz'), 41 | recursive=True)[0], True).reshape(-1) 42 | 43 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n') 44 | 45 | training_set_size = X_train.shape[0] 46 | 47 | n_inputs = 28 * 28 48 | n_h1 = 100 49 | n_h2 = 100 50 | n_outputs = 10 51 | learning_rate = 0.01 52 | n_epochs = 1000000000000000000000000000 53 | batch_size = 50 54 | 55 | with tf.name_scope('network'): 56 | # construct the DNN 57 | X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X') 58 | y = tf.placeholder(tf.int64, shape=(None), name='y') 59 | h1 = tf.layers.dense(X, n_h1, activation=tf.nn.relu, name='h1') 60 | h2 = tf.layers.dense(h1, n_h2, activation=tf.nn.relu, name='h2') 61 | output = tf.layers.dense(h2, n_outputs, name='output') 62 | 63 | with tf.name_scope('train'): 64 | cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=output) 65 | loss = tf.reduce_mean(cross_entropy, name='loss') 66 | optimizer = tf.train.GradientDescentOptimizer(learning_rate) 67 | train_op = optimizer.minimize(loss) 68 | 69 | with tf.name_scope('eval'): 70 | correct = tf.nn.in_top_k(output, y, 1) 71 | acc_op = tf.reduce_mean(tf.cast(correct, tf.float32)) 72 | 73 | init = tf.global_variables_initializer() 74 | saver = tf.train.Saver() 75 | 76 | # start an Azure ML run 77 | run = Run.get_context() 78 | 79 | with tf.Session() as sess: 80 | start_epoch = 0 81 | if previous_model_location: 82 | checkpoint_file_path = tf.train.latest_checkpoint(previous_model_location) 83 | saver.restore(sess, checkpoint_file_path) 84 | checkpoint_filename = os.path.basename(checkpoint_file_path) 85 | num_found = re.search(r'\d+', checkpoint_filename) 86 | if num_found: 87 | start_epoch = int(num_found.group(0)) 88 | print("Resuming from epoch {}".format(str(start_epoch))) 89 | else: 90 | init.run() 91 | 92 | for epoch in range(start_epoch, n_epochs): 93 | 94 | # randomly shuffle training set 95 | indices = np.random.permutation(training_set_size) 96 | X_train = X_train[indices] 97 | y_train = y_train[indices] 98 | 99 | # batch index 100 | b_start = 0 101 | b_end = b_start + batch_size 102 | for _ in range(training_set_size // batch_size): 103 | # get a batch 104 | X_batch, y_batch = X_train[b_start: b_end], y_train[b_start: b_end] 105 | 106 | # update batch index for the next batch 107 | b_start = b_start + batch_size 108 | b_end = min(b_start + batch_size, training_set_size) 109 | 110 | # train 111 | sess.run(train_op, feed_dict={X: X_batch, y: y_batch}) 112 | # evaluate training set 113 | acc_train = acc_op.eval(feed_dict={X: X_batch, y: y_batch}) 114 | # evaluate validation set 115 | acc_val = acc_op.eval(feed_dict={X: X_test, y: y_test}) 116 | 117 | # log accuracies 118 | run.log('training_acc', np.float(acc_train)) 119 | run.log('validation_acc', np.float(acc_val)) 120 | print(epoch, '-- Training accuracy:', acc_train, '\b Validation accuracy:', acc_val) 121 | y_hat = np.argmax(output.eval(feed_dict={X: X_test}), axis=1) 122 | 123 | if epoch % 5 == 0: 124 | saver.save(sess, './outputs/', global_step=epoch) 125 
| 126 | # saving only half of the model and resuming again from same epoch 127 | if not previous_model_location and epoch == 10: 128 | break 129 | 130 | run.log('final_acc', np.float(acc_val)) 131 | -------------------------------------------------------------------------------- /examples/training/additional-sdk-examples/001-Tensorflow/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import gzip 5 | import numpy as np 6 | import struct 7 | 8 | 9 | # load compressed MNIST gz files and return numpy arrays 10 | def load_data(filename, label=False): 11 | with gzip.open(filename) as gz: 12 | struct.unpack('I', gz.read(4)) 13 | n_items = struct.unpack('>I', gz.read(4)) 14 | if not label: 15 | n_rows = struct.unpack('>I', gz.read(4))[0] 16 | n_cols = struct.unpack('>I', gz.read(4))[0] 17 | res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8) 18 | res = res.reshape(n_items[0], n_rows * n_cols) 19 | else: 20 | res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8) 21 | res = res.reshape(n_items[0], 1) 22 | return res 23 | 24 | 25 | # one-hot encode a 1-D array 26 | def one_hot_encode(array, num_of_classes): 27 | return np.eye(num_of_classes)[array.reshape(-1)] 28 | -------------------------------------------------------------------------------- /examples/training/additional-sdk-examples/002-SciKitLearn/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import gzip 5 | import numpy as np 6 | import struct 7 | 8 | 9 | # load compressed MNIST gz files and return numpy arrays 10 | def load_data(filename, label=False): 11 | with gzip.open(filename) as gz: 12 | struct.unpack('I', gz.read(4)) 13 | n_items = struct.unpack('>I', gz.read(4)) 14 | if not label: 15 | n_rows = struct.unpack('>I', gz.read(4))[0] 16 | n_cols = struct.unpack('>I', gz.read(4))[0] 17 | res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8) 18 | res = res.reshape(n_items[0], n_rows * n_cols) 19 | else: 20 | res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8) 21 | res = res.reshape(n_items[0], 1) 22 | return res 23 | 24 | 25 | # one-hot encode a 1-D array 26 | def one_hot_encode(array, num_of_classes): 27 | return np.eye(num_of_classes)[array.reshape(-1)] 28 | -------------------------------------------------------------------------------- /examples/training/simple-train-cli/job.yml: -------------------------------------------------------------------------------- 1 | experiment_name: Tutorial-sklearn-mnist 2 | code: ./src 3 | command: python train.py --data-folder ./mnist-data --regularization 0.5 4 | environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest 5 | compute: azureml:tailwind-k8s 6 | resources: 7 | instance_type: 8 | 9 | -------------------------------------------------------------------------------- /examples/training/simple-train-cli/src/mnist-data/t10k-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/training/simple-train-cli/src/mnist-data/t10k-images-idx3-ubyte.gz -------------------------------------------------------------------------------- 
/examples/training/simple-train-cli/src/mnist-data/t10k-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/training/simple-train-cli/src/mnist-data/t10k-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /examples/training/simple-train-cli/src/mnist-data/train-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/training/simple-train-cli/src/mnist-data/train-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /examples/training/simple-train-cli/src/mnist-data/train-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/training/simple-train-cli/src/mnist-data/train-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /examples/training/simple-train-cli/src/train.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import numpy as np 5 | import glob 6 | 7 | from sklearn.linear_model import LogisticRegression 8 | import joblib 9 | 10 | from azureml.core import Run 11 | from utils import load_data 12 | 13 | # let user feed in 2 parameters, the dataset to mount or download, and the regularization rate of the logistic regression model 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point') 16 | parser.add_argument('--regularization', type=float, dest='reg', default=0.01, help='regularization rate') 17 | args = parser.parse_args() 18 | 19 | data_folder = args.data_folder 20 | print('Data folder:', data_folder) 21 | 22 | # load train and test set into numpy arrays 23 | # note we scale the pixel intensity values to 0-1 (by dividing it with 255.0) so the model can converge faster. 
24 | X_train = load_data(glob.glob(os.path.join(data_folder, '**/train-images-idx3-ubyte.gz'), recursive=True)[0], False) / 255.0 25 | X_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-images-idx3-ubyte.gz'), recursive=True)[0], False) / 255.0 26 | y_train = load_data(glob.glob(os.path.join(data_folder, '**/train-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) 27 | y_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) 28 | 29 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep = '\n') 30 | 31 | # get hold of the current run 32 | run = Run.get_context() 33 | 34 | print('Train a logistic regression model with regularization rate of', args.reg) 35 | clf = LogisticRegression(C=1.0/args.reg, solver="liblinear", multi_class="auto", random_state=42) 36 | clf.fit(X_train, y_train) 37 | 38 | print('Predict the test set') 39 | y_hat = clf.predict(X_test) 40 | 41 | # calculate accuracy on the prediction 42 | acc = np.average(y_hat == y_test) 43 | print('Accuracy is', acc) 44 | 45 | run.log('regularization rate', np.float(args.reg)) 46 | run.log('accuracy', np.float(acc)) 47 | 48 | os.makedirs('outputs', exist_ok=True) 49 | # note file saved in the outputs folder is automatically uploaded into experiment record 50 | joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl') 51 | -------------------------------------------------------------------------------- /examples/training/simple-train-cli/src/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import gzip 5 | import numpy as np 6 | import struct 7 | 8 | 9 | # load compressed MNIST gz files and return numpy arrays 10 | def load_data(filename, label=False): 11 | with gzip.open(filename) as gz: 12 | struct.unpack('I', gz.read(4)) 13 | n_items = struct.unpack('>I', gz.read(4)) 14 | if not label: 15 | n_rows = struct.unpack('>I', gz.read(4))[0] 16 | n_cols = struct.unpack('>I', gz.read(4))[0] 17 | res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8) 18 | res = res.reshape(n_items[0], n_rows * n_cols) 19 | else: 20 | res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8) 21 | res = res.reshape(n_items[0], 1) 22 | return res 23 | 24 | 25 | # one-hot encode a 1-D array 26 | def one_hot_encode(array, num_of_classes): 27 | return np.eye(num_of_classes)[array.reshape(-1)] 28 | -------------------------------------------------------------------------------- /examples/training/simple-train-sdk/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
3 | 4 | import gzip 5 | import numpy as np 6 | import struct 7 | 8 | 9 | # load compressed MNIST gz files and return numpy arrays 10 | def load_data(filename, label=False): 11 | with gzip.open(filename) as gz: 12 | struct.unpack('I', gz.read(4)) 13 | n_items = struct.unpack('>I', gz.read(4)) 14 | if not label: 15 | n_rows = struct.unpack('>I', gz.read(4))[0] 16 | n_cols = struct.unpack('>I', gz.read(4))[0] 17 | res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8) 18 | res = res.reshape(n_items[0], n_rows * n_cols) 19 | else: 20 | res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8) 21 | res = res.reshape(n_items[0], 1) 22 | return res 23 | 24 | 25 | # one-hot encode a 1-D array 26 | def one_hot_encode(array, num_of_classes): 27 | return np.eye(num_of_classes)[array.reshape(-1)] 28 | -------------------------------------------------------------------------------- /examples/training/train-using-nfs/amlarc-nfs-setup/README.md: -------------------------------------------------------------------------------- 1 | # Setting up an NFS Server on AML Arc 2 | 3 | Before you can run any of the examples in this section you will need to setup an NFS mount on your 4 | Arc-enabled Kubernetes cluster. 5 | 6 | The included mount-config.yaml file can be used as a template to do this. You will need to replace `` with the 7 | actual address of your server. Then run the following: 8 | 9 | ``` 10 | kubectl apply -f mount-config.yaml 11 | ``` 12 | 13 | More detailed documentation on ephemeral NFS volume usage in Arc-enabled Machine Learning 14 | can be found [here](../../../docs/setup-ephemeral-nfs-volume.md) 15 | -------------------------------------------------------------------------------- /examples/training/train-using-nfs/amlarc-nfs-setup/mount-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | mounts.yaml: | 4 | mountPoints: 5 | - mountPath: /nfs_share 6 | mountType: nfs 7 | name: amlarc-nfs-share-0 8 | path: /disks/4TB/code/nfs_share 9 | server: 10 | kind: ConfigMap 11 | metadata: 12 | name: mount-config 13 | namespace: azureml 14 | -------------------------------------------------------------------------------- /examples/training/train-using-nfs/pytorch-on-amlarc-with-nfs/scripts/train.py: -------------------------------------------------------------------------------- 1 | from azureml.core.run import Run 2 | 3 | import argparse 4 | import os 5 | import torch 6 | import torch.distributed as dist 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | from math import ceil 12 | from random import Random 13 | from torch.multiprocessing import Process 14 | from torch.autograd import Variable 15 | from torchvision import datasets, transforms 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--data-dir', required=True, help='path to input data directory') 19 | parser.add_argument('--backend', required=True, help='Pytorch backend (gloo or nccl)') 20 | args = parser.parse_args() 21 | 22 | 23 | 24 | class Partition(object): 25 | def __init__(self, data, index): 26 | self.data = data 27 | self.index = index 28 | 29 | def __len__(self): 30 | return len(self.index) 31 | 32 | def __getitem__(self, index): 33 | data_idx = self.index[index] 34 | return self.data[data_idx] 35 | 36 | 37 | class DataPartitioner(object): 38 | def __init__(self, data, sizes=[0.7, 0.2, 0.1], seed=1234): 39 | self.data = data 40 | self.partitions = [] 41 | rng = Random() 42 
| rng.seed(seed) 43 | data_len = len(data) 44 | indexes = [x for x in range(0, data_len)] 45 | rng.shuffle(indexes) 46 | 47 | for frac in sizes: 48 | part_len = int(frac * data_len) 49 | self.partitions.append(indexes[0:part_len]) 50 | indexes = indexes[part_len:] 51 | 52 | def use(self, partition): 53 | return Partition(self.data, self.partitions[partition]) 54 | 55 | 56 | class Net(nn.Module): 57 | def __init__(self): 58 | super(Net, self).__init__() 59 | self.conv1 = nn.Conv2d(1, 10, kernel_size=5) 60 | self.conv2 = nn.Conv2d(10, 20, kernel_size=5) 61 | self.conv2_drop = nn.Dropout2d() 62 | self.fc1 = nn.Linear(320, 50) 63 | self.fc2 = nn.Linear(50, 10) 64 | 65 | def forward(self, x): 66 | x = F.relu(F.max_pool2d(self.conv1(x), 2)) 67 | x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) 68 | x = x.view(-1, 320) 69 | x = F.relu(self.fc1(x)) 70 | x = F.dropout(x, training=self.training) 71 | x = self.fc2(x) 72 | return F.log_softmax(x) 73 | 74 | 75 | def partition_dataset(): 76 | dataset = datasets.MNIST( 77 | args.data_dir, 78 | train=True, 79 | download=False, 80 | transform=transforms.Compose([ 81 | transforms.ToTensor(), 82 | transforms.Normalize((0.1307, ), (0.3081, )) 83 | ])) 84 | 85 | size = dist.get_world_size() 86 | bsz = 128 // size 87 | partition_sizes = [1.0 / size for _ in range(size)] 88 | partition = DataPartitioner(dataset, partition_sizes) 89 | partition = partition.use(dist.get_rank()) 90 | train_set = torch.utils.data.DataLoader( 91 | partition, batch_size=bsz, shuffle=True) 92 | return train_set, bsz 93 | 94 | 95 | def average_gradients(model): 96 | size = float(dist.get_world_size()) 97 | for param in model.parameters(): 98 | dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM) 99 | param.grad.data /= size 100 | 101 | 102 | def run(rank, size): 103 | torch.manual_seed(1234) 104 | train_set, bsz = partition_dataset() 105 | model = Net() 106 | optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5) 107 | 108 | num_batches = ceil(len(train_set.dataset) / float(bsz)) 109 | for epoch in range(5): 110 | epoch_loss = 0.0 111 | for data, target in train_set: 112 | data, target = Variable(data), Variable(target) 113 | optimizer.zero_grad() 114 | output = model(data) 115 | loss = F.nll_loss(output, target) 116 | epoch_loss += loss.data 117 | loss.backward() 118 | average_gradients(model) 119 | optimizer.step() 120 | 121 | if dist.get_rank() == 0: 122 | run_object.log('Training loss', epoch_loss / num_batches) 123 | 124 | 125 | if __name__ == "__main__": 126 | run_object = Run.get_context() 127 | dist.init_process_group(args.backend) 128 | run(dist.get_rank(), dist.get_world_size()) 129 | -------------------------------------------------------------------------------- /examples/training/train-using-nfs/scikit-learn-on-amlarc-with-nfs/iris.csv: -------------------------------------------------------------------------------- 1 | sepal_length,sepal_width,petal_length,petal_width,species 2 | 5.1,3.5,1.4,0.2,Iris-setosa 3 | 4.9,3,1.4,0.2,Iris-setosa 4 | 4.7,3.2,1.3,0.2,Iris-setosa 5 | 4.6,3.1,1.5,0.2,Iris-setosa 6 | 5,3.6,1.4,0.2,Iris-setosa 7 | 5.4,3.9,1.7,0.4,Iris-setosa 8 | 4.6,3.4,1.4,0.3,Iris-setosa 9 | 5,3.4,1.5,0.2,Iris-setosa 10 | 4.4,2.9,1.4,0.2,Iris-setosa 11 | 4.9,3.1,1.5,0.1,Iris-setosa 12 | 5.4,3.7,1.5,0.2,Iris-setosa 13 | 4.8,3.4,1.6,0.2,Iris-setosa 14 | 4.8,3,1.4,0.1,Iris-setosa 15 | 4.3,3,1.1,0.1,Iris-setosa 16 | 5.8,4,1.2,0.2,Iris-setosa 17 | 5.7,4.4,1.5,0.4,Iris-setosa 18 | 5.4,3.9,1.3,0.4,Iris-setosa 19 | 5.1,3.5,1.4,0.3,Iris-setosa 20 | 
5.7,3.8,1.7,0.3,Iris-setosa 21 | 5.1,3.8,1.5,0.3,Iris-setosa 22 | 5.4,3.4,1.7,0.2,Iris-setosa 23 | 5.1,3.7,1.5,0.4,Iris-setosa 24 | 4.6,3.6,1,0.2,Iris-setosa 25 | 5.1,3.3,1.7,0.5,Iris-setosa 26 | 4.8,3.4,1.9,0.2,Iris-setosa 27 | 5,3,1.6,0.2,Iris-setosa 28 | 5,3.4,1.6,0.4,Iris-setosa 29 | 5.2,3.5,1.5,0.2,Iris-setosa 30 | 5.2,3.4,1.4,0.2,Iris-setosa 31 | 4.7,3.2,1.6,0.2,Iris-setosa 32 | 4.8,3.1,1.6,0.2,Iris-setosa 33 | 5.4,3.4,1.5,0.4,Iris-setosa 34 | 5.2,4.1,1.5,0.1,Iris-setosa 35 | 5.5,4.2,1.4,0.2,Iris-setosa 36 | 4.9,3.1,1.5,0.1,Iris-setosa 37 | 5,3.2,1.2,0.2,Iris-setosa 38 | 5.5,3.5,1.3,0.2,Iris-setosa 39 | 4.9,3.1,1.5,0.1,Iris-setosa 40 | 4.4,3,1.3,0.2,Iris-setosa 41 | 5.1,3.4,1.5,0.2,Iris-setosa 42 | 5,3.5,1.3,0.3,Iris-setosa 43 | 4.5,2.3,1.3,0.3,Iris-setosa 44 | 4.4,3.2,1.3,0.2,Iris-setosa 45 | 5,3.5,1.6,0.6,Iris-setosa 46 | 5.1,3.8,1.9,0.4,Iris-setosa 47 | 4.8,3,1.4,0.3,Iris-setosa 48 | 5.1,3.8,1.6,0.2,Iris-setosa 49 | 4.6,3.2,1.4,0.2,Iris-setosa 50 | 5.3,3.7,1.5,0.2,Iris-setosa 51 | 5,3.3,1.4,0.2,Iris-setosa 52 | 7,3.2,4.7,1.4,Iris-versicolor 53 | 6.4,3.2,4.5,1.5,Iris-versicolor 54 | 6.9,3.1,4.9,1.5,Iris-versicolor 55 | 5.5,2.3,4,1.3,Iris-versicolor 56 | 6.5,2.8,4.6,1.5,Iris-versicolor 57 | 5.7,2.8,4.5,1.3,Iris-versicolor 58 | 6.3,3.3,4.7,1.6,Iris-versicolor 59 | 4.9,2.4,3.3,1,Iris-versicolor 60 | 6.6,2.9,4.6,1.3,Iris-versicolor 61 | 5.2,2.7,3.9,1.4,Iris-versicolor 62 | 5,2,3.5,1,Iris-versicolor 63 | 5.9,3,4.2,1.5,Iris-versicolor 64 | 6,2.2,4,1,Iris-versicolor 65 | 6.1,2.9,4.7,1.4,Iris-versicolor 66 | 5.6,2.9,3.6,1.3,Iris-versicolor 67 | 6.7,3.1,4.4,1.4,Iris-versicolor 68 | 5.6,3,4.5,1.5,Iris-versicolor 69 | 5.8,2.7,4.1,1,Iris-versicolor 70 | 6.2,2.2,4.5,1.5,Iris-versicolor 71 | 5.6,2.5,3.9,1.1,Iris-versicolor 72 | 5.9,3.2,4.8,1.8,Iris-versicolor 73 | 6.1,2.8,4,1.3,Iris-versicolor 74 | 6.3,2.5,4.9,1.5,Iris-versicolor 75 | 6.1,2.8,4.7,1.2,Iris-versicolor 76 | 6.4,2.9,4.3,1.3,Iris-versicolor 77 | 6.6,3,4.4,1.4,Iris-versicolor 78 | 6.8,2.8,4.8,1.4,Iris-versicolor 79 | 6.7,3,5,1.7,Iris-versicolor 80 | 6,2.9,4.5,1.5,Iris-versicolor 81 | 5.7,2.6,3.5,1,Iris-versicolor 82 | 5.5,2.4,3.8,1.1,Iris-versicolor 83 | 5.5,2.4,3.7,1,Iris-versicolor 84 | 5.8,2.7,3.9,1.2,Iris-versicolor 85 | 6,2.7,5.1,1.6,Iris-versicolor 86 | 5.4,3,4.5,1.5,Iris-versicolor 87 | 6,3.4,4.5,1.6,Iris-versicolor 88 | 6.7,3.1,4.7,1.5,Iris-versicolor 89 | 6.3,2.3,4.4,1.3,Iris-versicolor 90 | 5.6,3,4.1,1.3,Iris-versicolor 91 | 5.5,2.5,4,1.3,Iris-versicolor 92 | 5.5,2.6,4.4,1.2,Iris-versicolor 93 | 6.1,3,4.6,1.4,Iris-versicolor 94 | 5.8,2.6,4,1.2,Iris-versicolor 95 | 5,2.3,3.3,1,Iris-versicolor 96 | 5.6,2.7,4.2,1.3,Iris-versicolor 97 | 5.7,3,4.2,1.2,Iris-versicolor 98 | 5.7,2.9,4.2,1.3,Iris-versicolor 99 | 6.2,2.9,4.3,1.3,Iris-versicolor 100 | 5.1,2.5,3,1.1,Iris-versicolor 101 | 5.7,2.8,4.1,1.3,Iris-versicolor 102 | 6.3,3.3,6,2.5,Iris-virginica 103 | 5.8,2.7,5.1,1.9,Iris-virginica 104 | 7.1,3,5.9,2.1,Iris-virginica 105 | 6.3,2.9,5.6,1.8,Iris-virginica 106 | 6.5,3,5.8,2.2,Iris-virginica 107 | 7.6,3,6.6,2.1,Iris-virginica 108 | 4.9,2.5,4.5,1.7,Iris-virginica 109 | 7.3,2.9,6.3,1.8,Iris-virginica 110 | 6.7,2.5,5.8,1.8,Iris-virginica 111 | 7.2,3.6,6.1,2.5,Iris-virginica 112 | 6.5,3.2,5.1,2,Iris-virginica 113 | 6.4,2.7,5.3,1.9,Iris-virginica 114 | 6.8,3,5.5,2.1,Iris-virginica 115 | 5.7,2.5,5,2,Iris-virginica 116 | 5.8,2.8,5.1,2.4,Iris-virginica 117 | 6.4,3.2,5.3,2.3,Iris-virginica 118 | 6.5,3,5.5,1.8,Iris-virginica 119 | 7.7,3.8,6.7,2.2,Iris-virginica 120 | 7.7,2.6,6.9,2.3,Iris-virginica 121 | 6,2.2,5,1.5,Iris-virginica 122 | 
6.9,3.2,5.7,2.3,Iris-virginica 123 | 5.6,2.8,4.9,2,Iris-virginica 124 | 7.7,2.8,6.7,2,Iris-virginica 125 | 6.3,2.7,4.9,1.8,Iris-virginica 126 | 6.7,3.3,5.7,2.1,Iris-virginica 127 | 7.2,3.2,6,1.8,Iris-virginica 128 | 6.2,2.8,4.8,1.8,Iris-virginica 129 | 6.1,3,4.9,1.8,Iris-virginica 130 | 6.4,2.8,5.6,2.1,Iris-virginica 131 | 7.2,3,5.8,1.6,Iris-virginica 132 | 7.4,2.8,6.1,1.9,Iris-virginica 133 | 7.9,3.8,6.4,2,Iris-virginica 134 | 6.4,2.8,5.6,2.2,Iris-virginica 135 | 6.3,2.8,5.1,1.5,Iris-virginica 136 | 6.1,2.6,5.6,1.4,Iris-virginica 137 | 7.7,3,6.1,2.3,Iris-virginica 138 | 6.3,3.4,5.6,2.4,Iris-virginica 139 | 6.4,3.1,5.5,1.8,Iris-virginica 140 | 6,3,4.8,1.8,Iris-virginica 141 | 6.9,3.1,5.4,2.1,Iris-virginica 142 | 6.7,3.1,5.6,2.4,Iris-virginica 143 | 6.9,3.1,5.1,2.3,Iris-virginica 144 | 5.8,2.7,5.1,1.9,Iris-virginica 145 | 6.8,3.2,5.9,2.3,Iris-virginica 146 | 6.7,3.3,5.7,2.5,Iris-virginica 147 | 6.7,3,5.2,2.3,Iris-virginica 148 | 6.3,2.5,5,1.9,Iris-virginica 149 | 6.5,3,5.2,2,Iris-virginica 150 | 6.2,3.4,5.4,2.3,Iris-virginica 151 | 5.9,3,5.1,1.8,Iris-virginica 152 | -------------------------------------------------------------------------------- /examples/training/train-using-nfs/scikit-learn-on-amlarc-with-nfs/scripts/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import argparse 6 | import pandas 7 | 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.tree import DecisionTreeClassifier 10 | # sklearn.externals.joblib is removed in 0.23 11 | from sklearn import __version__ as sklearnver 12 | from packaging.version import Version 13 | if Version(sklearnver) < Version("0.23.0"): 14 | from sklearn.externals import joblib 15 | else: 16 | import joblib 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--data-dir', required=True, help='path to input data directory') 20 | args = parser.parse_args() 21 | 22 | # get input data 23 | data_file = os.path.join(args.data_dir, 'iris.csv') 24 | df = pandas.read_csv(data_file) 25 | 26 | x_col = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] 27 | y_col = ['species'] 28 | x_df = df.loc[:, x_col] 29 | y_df = df.loc[:, y_col] 30 | 31 | #dividing X,y into train and test data 32 | x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=223) 33 | 34 | data = {'train': {'X': x_train, 'y': y_train}, 35 | 36 | 'test': {'X': x_test, 'y': y_test}} 37 | 38 | clf = DecisionTreeClassifier().fit(data['train']['X'], data['train']['y']) 39 | model_file_name = 'decision_tree.pkl' 40 | 41 | print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(clf.score(x_train, y_train))) 42 | print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf.score(x_test, y_test))) 43 | 44 | os.makedirs('./outputs', exist_ok=True) 45 | with open(model_file_name, 'wb') as file: 46 | joblib.dump(value=clf, filename='outputs/' + model_file_name) 47 | -------------------------------------------------------------------------------- /files/deploy-amlarc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | subscription_id="" 6 | resource_group="" 7 | cluster_name="" 8 | 9 | arcml_extension_name="arcml-extension" 10 | 11 | ssl_cname="" 12 | ssl_cert_pem_file="" 13 | ssl_key_pem_file="" 14 | 15 | # STEP1 Register feature providers 16 | echo 'Register 
features...' 17 | az feature register --namespace Microsoft.ContainerService -n AKS-ExtensionManager --subscription "$subscription_id" 18 | echo 'Waiting for feature register...' 19 | while [ "$(az feature list --query "[?contains(name, 'Microsoft.ContainerService/AKS-ExtensionManager')].[properties.state]" -o json |jq '.[0][0]')" == 'Registered' ] 20 | do 21 | sleep 5 22 | done 23 | az provider register -n Microsoft.ContainerService 1 24 | 25 | 26 | # STEP2 Deploy AmlArc extension 27 | # OPTION A) AKS service has public https endpoint 28 | az k8s-extension create --cluster-name $cluster_name --cluster-type managedClusters -n $arcml_extension_name \ 29 | --extension-type Microsoft.AzureML.Kubernetes --scope cluster --configuration-settings enableInference=True \ 30 | sslCname=$ssl_cname --config-protected sslCertPemFile=$ssl_cert_pem_file sslKeyPemFile=$ssl_key_pem_file \ 31 | --subscription $subscription_id -g $resource_group --auto-upgrade-minor-version False 32 | 33 | # OPTION B) AKS service has public http endpoint 34 | #az k8s-extension create --cluster-name $cluster_name --cluster-type managedClusters -n $arcml_extension_name \ 35 | #--extension-type Microsoft.AzureML.Kubernetes --scope cluster --configuration-settings enableInference=True allowInsecureConnections=true \ 36 | #--subscription $subscription_id -g $resource_group --auto-upgrade-minor-version False 37 | 38 | # OPTION C) AKS service has private http endpoint 39 | #az k8s-extension create --cluster-name $cluster_name --cluster-type managedClusters -n $arcml_extension_name \ 40 | #--extension-type Microsoft.AzureML.Kubernetes --scope cluster --configuration-settings enableInference=True allowInsecureConnections=true \ 41 | #privateEndpointILB=True --subscription $subscription_id -g $resource_group --auto-upgrade-minor-version False 42 | 43 | 44 | extension_install_state=$(az k8s-extension show --name $arcml_extension_name --cluster-type managedClusters --cluster-name "$cluster_name" --resource-group "$resource_group" --subscription "$subscription_id" | jq -r '.provisioningState') 45 | echo "$extension_install_state" 46 | if [[ $extension_install_state == "Succeeded" ]] 47 | then 48 | echo "AzureML extention created successfully" 49 | else 50 | echo "AzureML extention creation failed" 51 | exit 1 52 | fi 53 | -------------------------------------------------------------------------------- /files/deployextension.parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "extensionName": { 6 | "value": "" 7 | }, 8 | "autoUpgradeMinorVersion": { 9 | "value": false 10 | }, 11 | "enableTraining": { 12 | "value": true 13 | }, 14 | "enableInference": { 15 | "value": true 16 | }, 17 | "allowInsecureConnections": { 18 | "value": false 19 | }, 20 | "aksResourceId": { 21 | "value": "/subscriptions/00000000-0000-0000-0000-000000000000/resourcegroups/foo/providers/Microsoft.ContainerService/managedClusters/bar" 22 | }, 23 | "aksLocation": { 24 | "value": "eastus" 25 | }, 26 | "inferenceRouterServiceType": { 27 | "value": "LoadBalancer" 28 | }, 29 | "internalLoadBalancerProvider": { 30 | "value": "azure" 31 | }, 32 | "inferenceRouterHA": { 33 | "value": true 34 | }, 35 | "installNvidiaDevicePlugin": { 36 | "value": false 37 | }, 38 | "installPromOp": { 39 | "value": true 40 | }, 41 | "installVolcano": { 42 | "value": true 43 | }, 44 | 
"installDcgmExporter": { 45 | "value": false 46 | }, 47 | "nodeSelector": { 48 | "value": { 49 | "nodeSelector.": "", 50 | "nodeSelector.": "" 51 | }, 52 | "metadata": { 53 | "description": "This field is optional" 54 | } 55 | }, 56 | "sslCname": { 57 | "value": "foo.bar.com" 58 | }, 59 | "sslSecret": { 60 | "value": "" 61 | }, 62 | "sslCertificate": { 63 | "value": "" 64 | }, 65 | "sslKey": { 66 | "value": "" 67 | } 68 | } 69 | } -------------------------------------------------------------------------------- /files/entry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | AZ_K8S_EXTENSION_VERSION='1.2.2' 4 | AZ_CONNECTED_K8S_VERSION='1.2.8' 5 | 6 | az extension add --name connectedk8s --version $AZ_CONNECTED_K8S_VERSION 7 | az extension add --name k8s-extension --version $AZ_K8S_EXTENSION_VERSION 8 | 9 | python deploy.py -------------------------------------------------------------------------------- /files/quota setting tool/get_quotaoverrides_cr.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | import argparse 3 | from utils import config_logging, read_yaml, write_yaml_file, hash_string 4 | from microsoft_graph import get_group_transitive_members_by_group_id, get_user_oid_by_mail, get_token 5 | 6 | logger = getLogger(__name__) 7 | userIdentifiers = [] 8 | crTemplateFilePath = "./quotaoverridesCRTemplate.yaml" 9 | 10 | 11 | def add_unique_identifier(identifier): 12 | if identifier not in userIdentifiers: 13 | userIdentifiers.append(identifier) 14 | 15 | 16 | def process_identities(identities): 17 | users, groups = identities.get('users', []), identities.get('groups', []) 18 | 19 | token = get_token() 20 | 21 | for user in users: 22 | logger.info(f"processing user {user}") 23 | user_oid = get_user_oid_by_mail(user, token) 24 | add_unique_identifier(hash_string(user_oid)) 25 | 26 | for group in groups: 27 | logger.info(f"processing group {group}") 28 | for member in get_group_transitive_members_by_group_id(group, token): 29 | add_unique_identifier(hash_string(member)) 30 | 31 | 32 | def get_quotaoverride_cr(args): 33 | config_file, output_file, name = args.config, args.output, args.name 34 | 35 | config, output = read_yaml(config_file), read_yaml(crTemplateFilePath) 36 | 37 | output['metadata']['name'] = name 38 | output['metadata']['labels']['app.kubernetes.io/instance'] = name 39 | output['spec']['tierOverrides'] = config['tierOverrides'] 40 | process_identities(config['userIdentifiers']) 41 | output['spec']['userIdentifiers'] = userIdentifiers 42 | 43 | write_yaml_file(output_file, output) 44 | 45 | logger.info(f"generated quotaoverrides custom resource file, file path : {output_file} ") 46 | 47 | 48 | def main(): 49 | config_logging() 50 | 51 | parser = argparse.ArgumentParser(description='give a config yaml, generate quotaoverrides custom resource yaml, suggests to run [az login] first before using the command') 52 | 53 | parser.add_argument('--config', required=True, help="yaml file path of user's quota override config file") 54 | parser.add_argument('--output', required=True, help="yaml file path of generated k8s quotaoverrides custom resource file") 55 | parser.add_argument('--name', required=True, help="name of quotaoverrides custom resource") 56 | 57 | parser.set_defaults(func=get_quotaoverride_cr) 58 | 59 | args = parser.parse_args() 60 | args.func(args) 61 | 62 | 63 | if __name__ == '__main__': 64 | main() 65 | 
-------------------------------------------------------------------------------- /files/quota setting tool/microsoft_graph.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | import subprocess, json, requests 3 | 4 | logger = getLogger(__name__) 5 | 6 | def get_token(): 7 | exitcode, data = subprocess.getstatusoutput('az account get-access-token --resource-type ms-graph') 8 | if exitcode != 0: 9 | logger.exception(data) 10 | raise Exception('Exception in get-access-token') 11 | 12 | token = json.loads(data)['accessToken'] 13 | 14 | logger.info('get ms-graph access token : {}'.format(token)) 15 | return token 16 | 17 | 18 | def _send_request(url, token): 19 | try: 20 | logger.info('sending url : {}'.format(url)) 21 | 22 | headers = { 23 | 'Authorization': 'Bearer {}'.format(token), 24 | 'Host': 'graph.microsoft.com' 25 | } 26 | response = requests.get(url=url, headers=headers) 27 | response.raise_for_status() 28 | response_json = response.json() 29 | except Exception as err: 30 | raise SystemExit(err) 31 | else: 32 | return response_json 33 | 34 | 35 | def _iter_objects(url, token): 36 | while url is not None: 37 | response_json = _send_request(url, token) 38 | 39 | objects = response_json.get('value') 40 | logger.info('Fetched {} objects from {}'.format(len(objects), url)) 41 | 42 | yield from objects 43 | url = response_json.get('@odata.nextLink') 44 | 45 | 46 | def get_group_transitive_members_by_group_id(group_id, token): 47 | logger.info('get ms-graph group transitive members, group id : {}'.format(group_id)) 48 | 49 | url = 'https://graph.microsoft.com/v1.0/groups/{}/transitiveMembers'.format(group_id) 50 | 51 | member_oids = [] 52 | for member in _iter_objects(url, token): 53 | if member['@odata.type'] == '#microsoft.graph.user': 54 | member_oids.append(member['id']) 55 | 56 | return member_oids 57 | 58 | 59 | def get_user_oid_by_mail(mail, token): 60 | logger.info('get ms-graph user oids, user mail : {}'.format(mail)) 61 | 62 | url = 'https://graph.microsoft.com/v1.0/users/{}'.format(mail) 63 | 64 | response_json = _send_request(url, token) 65 | return response_json['id'] 66 | -------------------------------------------------------------------------------- /files/quota setting tool/quotaoverridesCRTemplate.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: amlarc.azureml.com/v1 2 | kind: QuotaOverride 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: quotaoverride 6 | app.kubernetes.io/instance: 7 | name: 8 | spec: 9 | tierOverrides: 10 | 11 | userIdentifiers: 12 | 13 | -------------------------------------------------------------------------------- /files/quota setting tool/readme.md: -------------------------------------------------------------------------------- 1 | CLI tool to generate k8s quotaoverrides custom resource file accourding to user's config file 2 | ``` 3 | usage: get_quotaoverrides_cr.py [-h] --config CONFIG --output OUTPUT --name NAME 4 | 5 | give a config yaml, generate quotaoverrides custom resource yaml, suggests to run [az login] first before using the command 6 | 7 | optional arguments: 8 | -h, --help show this help message and exit 9 | --config CONFIG yaml file path of user's quota override config file 10 | --output OUTPUT yaml file path of generated k8s quotaoverrides custom resource file 11 | --name NAME name of quotaoverrides custom resource 12 | ``` 13 | the user config file should be like this: 14 | ```yaml 15 | tierOverrides: 16 
--------------------------------------------------------------------------------
/files/quota setting tool/quotaoverridesCRTemplate.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: amlarc.azureml.com/v1
2 | kind: QuotaOverride
3 | metadata:
4 |   labels:
5 |     app.kubernetes.io/name: quotaoverride
6 |     app.kubernetes.io/instance:
7 |   name:
8 | spec:
9 |   tierOverrides:
10 | 
11 |   userIdentifiers:
12 | 
13 | 
--------------------------------------------------------------------------------
/files/quota setting tool/readme.md:
--------------------------------------------------------------------------------
1 | CLI tool to generate a k8s quotaoverrides custom resource file according to the user's config file
2 | ```
3 | usage: get_quotaoverrides_cr.py [-h] --config CONFIG --output OUTPUT --name NAME
4 | 
5 | give a config yaml, generate quotaoverrides custom resource yaml, suggests to run [az login] first before using the command
6 | 
7 | optional arguments:
8 |   -h, --help       show this help message and exit
9 |   --config CONFIG  yaml file path of user's quota override config file
10 |   --output OUTPUT  yaml file path of generated k8s quotaoverrides custom resource file
11 |   --name NAME      name of quotaoverrides custom resource
12 | ```
13 | The user config file should look like this:
14 | ```yaml
15 | tierOverrides:
16 |   <tier name>:
17 |     <quota name>: <quota value>
18 | userIdentifiers:
19 |   users:
20 |     - <user mail>
21 |   groups:
22 |     - <group id>
23 | ```
24 | 
25 | ## Example
26 | ### Edit the config file in the current path and name it config.yaml
27 | ```yaml
28 | tierOverrides:
29 |   my_tier1:
30 |     myquota1: myquota1
31 |     myquota2: myquota2
32 |   my_tier2:
33 |     myquota1: myquota1
34 |     myquota2: myquota2
35 | userIdentifiers:
36 |   users:
37 |     - my-first-user
38 |     - my-second-user
39 |   groups:
40 |     - my-first-group
41 |     - my-second-group
42 | ```
43 | ### Run the CLI command, setting the output file in the current path
44 | ```
45 | get_quotaoverrides_cr.py --config ./config.yaml --output ./output.yaml --name example
46 | ```
47 | ### Check output.yaml in the current path
48 | ```yaml
49 | apiVersion: amlarc.azureml.com/v1
50 | kind: QuotaOverride
51 | metadata:
52 |   labels:
53 |     app.kubernetes.io/instance: example
54 |     app.kubernetes.io/name: quotaoverride
55 |   name: example
56 | spec:
57 |   tierOverrides:
58 |     my_tier1:
59 |       myquota1: myquota1
60 |       myquota2: myquota2
61 |     my_tier2:
62 |       myquota1: myquota1
63 |       myquota2: myquota2
64 |   userIdentifiers:
65 |   - userIdentifiers
66 | ```
--------------------------------------------------------------------------------
/files/quota setting tool/utils.py:
--------------------------------------------------------------------------------
1 | from sys import stdout
2 | from logging import getLogger, StreamHandler
3 | import yaml, hashlib
4 | import logging
5 | 
6 | logger = getLogger(__name__)
7 | 
8 | def config_logging():
9 |     logging.basicConfig(
10 |         level=logging.INFO,
11 |         format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
12 |         datefmt='%Y-%m-%d %H:%M:%S'
13 |     )
14 | 
15 | 
16 | def read_yaml(path):
17 |     with open(path, 'r') as f:
18 |         data = yaml.safe_load(f)
19 |     return data
20 | 
21 | 
22 | def write_yaml_file(path, data):
23 |     with open(path, 'w') as f:
24 |         yaml.dump(data, f)
25 | 
26 | 
27 | def hash_string(string):
28 |     myhash = hashlib.sha1(string.encode('utf-8'))
29 |     return myhash.hexdigest().upper()
30 | 
--------------------------------------------------------------------------------
/files/sslsecret.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | data:
3 |   cert.pem:
4 |   key.pem:
5 | kind: Secret
6 | metadata:
7 |   name:
8 |   namespace: azureml
9 | type: Opaque
10 | 
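The `data` values of a Kubernetes Secret must be base64-encoded, so the empty `cert.pem` and `key.pem` fields in the `sslsecret.yaml` template above need encoded PEM content before the file is applied with `kubectl apply -f`. A minimal sketch of filling the template; the local file paths and the secret name are assumptions, not values from this repository:

```python
import base64
import yaml

def b64_file(path):
    # Kubernetes Secret `data` values must be base64-encoded file contents.
    with open(path, 'rb') as f:
        return base64.b64encode(f.read()).decode('ascii')

# Load the template (assumed to be in the current working directory).
with open('sslsecret.yaml') as f:
    secret = yaml.safe_load(f)

secret['metadata']['name'] = 'my-ssl-secret'        # hypothetical secret name
secret['data']['cert.pem'] = b64_file('cert.pem')   # local certificate file
secret['data']['key.pem'] = b64_file('key.pem')     # local private key file

with open('sslsecret-filled.yaml', 'w') as f:
    yaml.dump(secret, f)

# Then: kubectl apply -f sslsecret-filled.yaml
```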
--------------------------------------------------------------------------------
/files/terraform-template.tf:
--------------------------------------------------------------------------------
1 | 
2 | terraform {
3 |   required_version = ">=0.12"
4 | 
5 |   required_providers {
6 |     azurerm = {
7 |       source  = "hashicorp/azurerm"
8 |       version = "~>2.0"
9 |     }
10 |     azapi = {
11 |       source = "Azure/azapi"
12 |     }
13 |   }
14 | }
15 | 
16 | provider "azurerm" {
17 |   features {}
18 | }
19 | 
20 | provider "azapi" {
21 |   # More information on the authentication methods supported by
22 |   # the AzureRM Provider can be found here:
23 |   # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs
24 | 
25 |   # subscription_id = "..."
26 |   # client_id = "..."
27 |   # client_secret = "..."
28 |   # tenant_id = "..."
29 | }
30 | 
31 | resource "azapi_resource" "mlextension" {
32 |   type      = "Microsoft.KubernetesConfiguration/extensions@2022-03-01"
33 |   name      = "{extension-name}"
34 |   parent_id = "/subscriptions/{subscription}/resourcegroups/{resource-group}/providers/Microsoft.ContainerService/managedClusters/{cluster-name}"
35 |   identity {
36 |     type = "SystemAssigned"
37 |   }
38 |   body = jsonencode({
39 |     "properties" = {
40 |       "extensionType" = "microsoft.azureml.kubernetes"
41 |       "releaseTrain" = "stable"
42 |       "scope" = {
43 |         "cluster" = {
44 |           "releaseNamespace" = "azureml"
45 |         }
46 |       }
47 |       "configurationSettings" = {
48 |         "enableTraining" = "True"
49 |         "enableInference" = "True"
50 |         "allowInsecureConnections" = "True"
51 |         "inferenceRouterServiceType" = "loadBalancer"
52 |         "cluster_name" = "/subscriptions/{subscription}/resourcegroups/{resource-group}/providers/Microsoft.ContainerService/managedClusters/{cluster-name}"
53 |         "domain" = "{region}.cloudapp.azure.com"
54 |         "location" = "{region}"
55 |         "jobSchedulerLocation" = "eastus"
56 |         "cluster_name_friendly" = "{cluster-name}"
57 |         "servicebus.enabled" = "false"
58 |         "relayserver.enabled" = "false"
59 |         "nginxIngress.enabled" = "true"
60 |         "clusterId" = "/subscriptions/{subscription}/resourcegroups/{resource-group}/providers/Microsoft.ContainerService/managedClusters/{cluster-name}"
61 |         "prometheus.prometheusSpec.externalLabels.cluster_name" = "/subscriptions/{subscription}/resourcegroups/{resource-group}/providers/Microsoft.ContainerService/managedClusters/{cluster-name}"
62 |       },
63 |       "configurationProtectedSettings" = {}
64 |     }
65 |   })
66 | }
67 | 
--------------------------------------------------------------------------------
/pics/check_scoringfe_v2_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/pics/check_scoringfe_v2_output.png
--------------------------------------------------------------------------------
/pics/nvml_error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/pics/nvml_error.png
--------------------------------------------------------------------------------
/pics/permission_denied.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/pics/permission_denied.png
--------------------------------------------------------------------------------
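The `{subscription}`, `{resource-group}`, `{cluster-name}`, `{region}`, and `{extension-name}` tokens in `files/terraform-template.tf` above are placeholders that must be substituted before running `terraform init` and `terraform apply`. A minimal sketch of producing a concrete `main.tf`; every value below is a hypothetical example, not a value taken from this repository:

```python
# Substitute the placeholders in terraform-template.tf and write main.tf.
placeholders = {
    '{subscription}': '00000000-0000-0000-0000-000000000000',  # hypothetical
    '{resource-group}': 'my-resource-group',                   # hypothetical
    '{cluster-name}': 'my-aks-cluster',                        # hypothetical
    '{region}': 'eastus',                                      # hypothetical
    '{extension-name}': 'amlarc-extension',                    # hypothetical
}

with open('terraform-template.tf') as f:
    template = f.read()

for token, value in placeholders.items():
    template = template.replace(token, value)

with open('main.tf', 'w') as f:
    f.write(template)

# Then run: terraform init && terraform apply
```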