├── .github ├── amlarc-tool.sh ├── convert.py ├── run_pipeline.py └── workflows │ ├── kubernetes-compute-simple-examples.yml │ └── portal-provision-pr-gate.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── docs ├── 4. View metrics in Compute level and runs level.markdown ├── AKS-HCI │ ├── AML-ARC-Compute.md │ ├── README.md │ ├── Train-AzureArc.md │ ├── Yaml │ │ ├── kfserving_v1_4_1.yaml │ │ └── serving-default-domain-knative-1-4-0.yaml │ ├── cli │ │ ├── README.md │ │ └── mnist │ │ │ ├── README.md │ │ │ ├── deployment.yml │ │ │ ├── endpoint.yml │ │ │ ├── mnist_script │ │ │ ├── train.py │ │ │ └── utils.py │ │ │ ├── model │ │ │ └── conda.yml │ │ │ ├── sample-request.json │ │ │ ├── score.py │ │ │ ├── train_env │ │ │ └── conda.yml │ │ │ └── training.yml │ ├── imgs │ │ ├── Cstorage.png │ │ ├── Inner-compute.png │ │ ├── Inner-workspace.png │ │ ├── azureml_log.png │ │ ├── container.png │ │ ├── cors.png │ │ ├── datastore-set.png │ │ ├── datastore.png │ │ ├── kubernetes_arc.png │ │ ├── network.png │ │ ├── sas-token.png │ │ ├── sas.png │ │ ├── structure.png │ │ ├── studio-arc.png │ │ ├── studio-s.png │ │ ├── studio.png │ │ ├── url.png │ │ └── vid-img.png │ ├── nfs │ │ ├── README.md │ │ ├── Verify_NFS_Setup_in_AMLArc.ipynb │ │ ├── config.json │ │ ├── images │ │ │ ├── configure-public-ip.png │ │ │ ├── create-ubuntu-vm.png │ │ │ ├── reset-network.png │ │ │ ├── ssh-status.png │ │ │ ├── ubuntu-vm-created.png │ │ │ ├── ufw-nfs.png │ │ │ ├── ufw-ssh.png │ │ │ ├── verify-nfs-training.png │ │ │ └── verify-nfs-vm.png │ │ ├── mount-config.yaml │ │ ├── nfs-server-setup.sh │ │ └── nfs_script │ │ │ └── test.py │ ├── notebooks │ │ ├── README.md │ │ ├── distributed-cifar10 │ │ │ ├── README.md │ │ │ ├── config.json │ │ │ ├── data.json │ │ │ ├── distributed-pytorch-cifar10.ipynb │ │ │ ├── helpers.py │ │ │ ├── pt_deployment.yml │ │ │ ├── pt_endpoint.yml │ │ │ ├── pytorch-script │ │ │ │ ├── cifar_dist_main.py │ │ │ │ └── conda_dependencies.yml │ │ │ ├── score_pytorch.py │ │ │ ├── score_tf.py │ │ │ ├── test_imgs │ │ │ │ ├── test_img_0_cat.jpg │ │ │ │ ├── test_img_1_ship.jpg │ │ │ │ ├── test_img_2_ship.jpg │ │ │ │ ├── test_img_3_plane.jpg │ │ │ │ └── test_img_4_frog.jpg │ │ │ └── test_request_pytorch │ │ │ │ ├── cifar_test_input_img_0_cat_pytorch.json │ │ │ │ ├── cifar_test_input_img_1_ship_pytorch.json │ │ │ │ ├── cifar_test_input_img_2_ship_pytorch.json │ │ │ │ ├── cifar_test_input_img_3_airplane_pytorch.json │ │ │ │ ├── cifar_test_input_img_4_frog_pytorch.json │ │ │ │ └── cifar_test_input_img_first_5_pytorch.json │ │ ├── mnist │ │ │ ├── MNIST_Training_with_AKS-HCI_Cluster_and_NFS.ipynb │ │ │ ├── README.md │ │ │ ├── config.json │ │ │ ├── confusion.png │ │ │ ├── deployment.yml │ │ │ ├── digit_7.jpg │ │ │ ├── endpoint.yml │ │ │ ├── helpers.py │ │ │ ├── mnist_script │ │ │ │ ├── train.py │ │ │ │ └── utils.py │ │ │ ├── model │ │ │ │ └── conda.yml │ │ │ └── score.py │ │ ├── object-segmentation-on-azure-stack │ │ │ ├── FudanPed00001.png │ │ │ ├── README.md │ │ │ ├── aml_src │ │ │ │ ├── Dockerfile.gpu │ │ │ │ ├── coco_eval.py │ │ │ │ ├── coco_utils.py │ │ │ │ ├── conda-env.yaml │ │ │ │ ├── engine.py │ │ │ │ ├── obj_segment_step_data_process.py │ │ │ │ ├── obj_segment_step_training.py │ │ │ │ ├── transforms.py │ │ │ │ └── utils.py │ │ │ ├── config.json │ │ │ ├── deployment.yml │ │ │ ├── endpoint.yml │ │ │ ├── helpers.py │ │ │ ├── object_segmentation-akshci-nfs.ipynb │ │ │ ├── object_segmentation-akshci.ipynb │ │ │ └── score.py │ │ ├── pipeline │ │ │ ├── README.md │ │ │ ├── config.json │ │ │ 
├── deployment.yml │ │ │ ├── endpoint.yml │ │ │ ├── helpers.py │ │ │ ├── images │ │ │ │ ├── pipeline-using-dataflow.png │ │ │ │ └── pipeline-using-stepsequence.png │ │ │ ├── model │ │ │ │ └── conda.yml │ │ │ ├── nyc-taxi-data-regression-model-building-nfs.ipynb │ │ │ ├── nyc-taxi-data-regression-model-building.ipynb │ │ │ ├── score.py │ │ │ ├── scripts │ │ │ │ ├── prepdata │ │ │ │ │ ├── cleanse.py │ │ │ │ │ ├── filter.py │ │ │ │ │ ├── merge.py │ │ │ │ │ ├── normalize.py │ │ │ │ │ └── transform.py │ │ │ │ └── trainmodel │ │ │ │ │ ├── train_step.py │ │ │ │ │ └── train_test_split.py │ │ │ └── test_set.csv │ │ └── upload-download-model │ │ │ ├── AML-model-download-upload.ipynb │ │ │ ├── README.md │ │ │ └── config.json │ ├── test-data │ │ ├── cifar10_test_input.json │ │ └── flower_sample_test_input.json │ ├── troubleshooting.md │ └── video │ │ └── kfserving_tf_blob_structure.mp4 ├── application-gateway-ingress-controller.md ├── attach-compute-on-aks-v1-cluster.md ├── attach-compute.md ├── azureml-aks-ta-support.md ├── deploy-extension.md ├── deploy-on-ocp.md ├── faq.md ├── gke-setup.md ├── happy-path.md ├── how-to-debug-arc-kubernetes-training.md ├── inference-byoc.md ├── instance-type.md ├── limitations-and-known-issues.md ├── managed-identity.md ├── media │ ├── README.md │ ├── assign-role.png │ ├── attach-1.png │ ├── attach-4.png │ ├── attach.png │ ├── detach.png │ ├── edit-identity.png │ ├── gke-ssh.png │ ├── gkecreate.png │ ├── privatelink-networkflow-v3.png │ ├── privatelink │ │ ├── acr_subnet.png │ │ ├── acr_target.png │ │ ├── acr_trusted.png │ │ ├── aks.png │ │ ├── dns.png │ │ ├── kv_target.png │ │ ├── kv_trusted.png │ │ ├── kv_vnet.png │ │ ├── ml_compute.png │ │ ├── ml_computemsi.png │ │ ├── ml_disablepublicaccess.png │ │ ├── ml_privateendpoint.png │ │ ├── onprem.png │ │ ├── relay.png │ │ ├── relay_connectstring.png │ │ ├── relay_resourceid.png │ │ ├── storageaccount.png │ │ ├── ts_curl.png │ │ ├── ts_expected.png │ │ ├── ts_getpo.png │ │ ├── ts_nslookup.png │ │ └── ts_ws.png │ ├── profileConfig.png │ ├── update-identity2.png │ └── ws-msi.png ├── network-requirements.md ├── nginx-ingress-controller.md ├── private-link.md ├── pvc.md ├── release-notes.md ├── setup-ephemeral-nfs-volume.md ├── simple-flow.md ├── simple-train-cli.md ├── troubleshooting.md └── workflows.md ├── examples ├── inference │ ├── gpu-inferencing │ │ ├── blue-deployment.yml │ │ ├── endpoint.yml │ │ ├── model │ │ │ ├── checkpoint │ │ │ ├── conda.yml │ │ │ ├── mnist-tf.model.data-00000-of-00001 │ │ │ ├── mnist-tf.model.index │ │ │ └── mnist-tf.model.meta │ │ ├── sample-request.json │ │ └── script │ │ │ └── score.py │ └── simple-flow │ │ ├── blue-deployment.yml │ │ ├── endpoint.yml │ │ ├── model │ │ ├── conda.yml │ │ └── sklearn_mnist_model.pkl │ │ ├── sample-request.json │ │ ├── script │ │ └── score.py │ │ └── sklearn-model.yml └── training │ ├── additional-sdk-examples │ ├── 001-Tensorflow │ │ ├── tf_mnist_with_checkpoint.py │ │ ├── train-tensorflow-resume-training.ipynb │ │ └── utils.py │ ├── 002-SciKitLearn │ │ ├── img-classification-part1-training.ipynb │ │ └── utils.py │ └── 003-Distributed TensorFlow with parameter server │ │ ├── distributed-tensorflow-with-parameter-server.ipynb │ │ └── tf_mnist_replica.py │ ├── simple-train-cli │ ├── job.yml │ └── src │ │ ├── mnist-data │ │ ├── t10k-images-idx3-ubyte.gz │ │ ├── t10k-labels-idx1-ubyte.gz │ │ ├── train-images-idx3-ubyte.gz │ │ └── train-labels-idx1-ubyte.gz │ │ ├── train.py │ │ └── utils.py │ ├── simple-train-sdk │ ├── img-classification-training.ipynb │ └── 
utils.py │ └── train-using-nfs │ ├── amlarc-nfs-setup │ ├── README.md │ └── mount-config.yaml │ ├── pytorch-on-amlarc-with-nfs │ ├── pytorch-on-amlarc-with-nfs.ipynb │ └── scripts │ │ └── train.py │ └── scikit-learn-on-amlarc-with-nfs │ ├── iris.csv │ ├── scikit-learn-on-amlarc-with-nfs.ipynb │ └── scripts │ └── train.py ├── files ├── deploy-amlarc.sh ├── deploy.py ├── deployextension.json ├── deployextension.parameters.json ├── entry.sh ├── quota setting tool │ ├── get_quotaoverrides_cr.py │ ├── microsoft_graph.py │ ├── quotaoverridesCRTemplate.yaml │ ├── readme.md │ └── utils.py ├── sslsecret.yaml └── terraform-template.tf └── pics ├── check_scoringfe_v2_output.png ├── nvml_error.png └── permission_denied.png /.github/convert.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import os 4 | 5 | 6 | def convert(input_file, compute_target, instance_type, common_runtime, output_file): 7 | def _convert(input_file, data, job_schema): 8 | # check job type 9 | is_pipeline_job = False 10 | is_sweep_job = False 11 | if "pipelineJob" in job_schema or "jobs" in data: 12 | is_pipeline_job = True 13 | if "sweepJob" in job_schema or data.get("type") == "sweep": 14 | is_sweep_job = True 15 | 16 | print("Job type: pipelineJob", is_pipeline_job, "sweepJob:", is_sweep_job) 17 | 18 | # change compute target 19 | if compute_target: 20 | data["compute"] = "azureml:%s" % compute_target 21 | if is_pipeline_job: 22 | settings = data.get("settings", {}) 23 | settings["default_compute"] = "azureml:%s" % compute_target 24 | data["settings"] = settings 25 | 26 | # set instance type 27 | if not is_pipeline_job and instance_type: 28 | resources = data.get("resources", {}) 29 | resources["instance_type"] = instance_type 30 | data["resources"] = resources 31 | 32 | for field in ["trial", "component"]: 33 | if field not in data: 34 | continue 35 | 36 | file_field = data[field] 37 | if not isinstance(file_field, str): 38 | continue 39 | 40 | if file_field.startswith("file:"): 41 | file_field = file_field.split(":", 1)[1] 42 | 43 | print("Found sub job spec:", file_field) 44 | dirname = os.path.dirname(input_file) 45 | convert( 46 | os.path.join(dirname, file_field), 47 | compute_target, 48 | instance_type, 49 | common_runtime, 50 | "", 51 | ) 52 | 53 | if is_pipeline_job: 54 | jobs = data.get("jobs", {}) 55 | for step in jobs: 56 | print("Found step:", step) 57 | _convert(input_file, jobs[step], "") 58 | return 59 | 60 | print("Processing file:", input_file) 61 | if not os.path.exists(input_file): 62 | print("Warning: File doesn't exist: ", input_file) 63 | return 64 | with open(input_file, "r") as f: 65 | data = yaml.load(f, Loader=yaml.FullLoader) 66 | job_schema = data.get("$schema", "") 67 | _convert(input_file, data, job_schema) 68 | 69 | # write to output file if output file is specified, otherwise change inplace. 70 | if output_file: 71 | with open(output_file, "w") as f: 72 | yaml.dump(data, f) 73 | else: 74 | with open(input_file, "w") as f: 75 | yaml.dump(data, f) 76 | 77 | 78 | if __name__ == "__main__": 79 | # Parse command line arguments 80 | parser = argparse.ArgumentParser( 81 | description="Convert test case to AMLARC-compatible files." 
82 | ) 83 | parser.add_argument("-i", "--input", required=True, help="Input test case file") 84 | parser.add_argument( 85 | "-o", 86 | "--output", 87 | required=False, 88 | help="Output AMLARC-compatible file; if not provided, " "replace the file in place", 89 | ) 90 | parser.add_argument("-c", "--compute-target", required=False, help="Compute target") 91 | parser.add_argument("-it", "--instance-type", required=False, help="Instance type") 92 | parser.add_argument( 93 | "-cr", 94 | "--common-runtime", 95 | required=False, 96 | default=False, 97 | action="store_true", 98 | help='Enable common runtime explicitly, default is "false"', 99 | ) 100 | args = parser.parse_args() 101 | convert( 102 | args.input, 103 | args.compute_target, 104 | args.instance_type, 105 | args.common_runtime, 106 | args.output, 107 | ) -------------------------------------------------------------------------------- /.github/run_pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from msrest.authentication import BasicAuthentication 4 | from azure.devops.connection import Connection 5 | from azure.devops.v6_0.pipelines.models import RunPipelineParameters, RunResourcesParameters, RepositoryResourceParameters, Run, Variable 6 | import time 7 | 8 | def init_parser(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument( 11 | '-d', 12 | '--definition-id', 13 | type=int, 14 | required=False, 15 | help='pipeline definition ID' 16 | ) 17 | parser.add_argument( 18 | '--variables', 19 | '-v', 20 | type=str, 21 | nargs='+', 22 | help='variables to set on the pipeline run' 23 | ) 24 | 25 | return parser 26 | 27 | def init_clients(): 28 | token = os.environ["PAT_TOKEN"] 29 | credentials = BasicAuthentication('', token) 30 | 31 | organization_url = 'https://dev.azure.com/msdata' 32 | 33 | connection = Connection(base_url=organization_url, creds=credentials) 34 | clients = connection.clients_v6_0 35 | 36 | return clients 37 | 38 | def trigger_build(clients, branch, def_id, variables = {}) -> Run: 39 | prj = 'Vienna' 40 | branch = f'refs/heads/{branch}' 41 | repo = RepositoryResourceParameters(branch) 42 | res = RunResourcesParameters(repositories={'self': repo}) 43 | params = RunPipelineParameters(resources=res) 44 | if variables: 45 | params.variables = variables 46 | 47 | pipeline = clients.get_pipelines_client() 48 | run = pipeline.run_pipeline(params, prj, def_id) 49 | return run 50 | 51 | def wait_run_complete(clients, def_id, run_id, timeout_in_sec=3600) -> bool: 52 | pipeline = clients.get_pipelines_client() 53 | run = pipeline.get_run('Vienna', def_id, run_id) 54 | current = time.time() 55 | while run.state != 'completed' and time.time() - current < timeout_in_sec: 56 | time.sleep(30) 57 | for _ in range(3): 58 | run = pipeline.get_run('Vienna', def_id, run_id) 59 | if run: 60 | break 61 | if not run: 62 | print("failed to get pipeline status") 63 | return False 64 | if run.state != 'completed': 65 | return False 66 | if run.result == 'failed': 67 | return False 68 | return True 69 | 70 | if __name__ == '__main__': 71 | parser = init_parser() 72 | args = parser.parse_args() 73 | variables = {} 74 | if args.variables: 75 | for kv in args.variables: 76 | key, value = kv.split('=', 1) 77 | variables[key] = Variable(False, value) 78 | print(f'variables: {variables}') 79 | clients = init_clients() 80 | 81 | run = trigger_build(clients, 'master', args.definition_id, variables) 82 | if not run: 83 | exit(1) 84 | res = wait_run_complete(clients, args.definition_id, 
run.id) 85 | if not res: 86 | exit(1) 87 | -------------------------------------------------------------------------------- /.github/workflows/portal-provision-pr-gate.yaml: -------------------------------------------------------------------------------- 1 | name: Portal Provision PR Gate 2 | on: 3 | # Triggers the workflow on push or pull request events but only for the master branch 4 | push: 5 | branches: [ master ] 6 | # TODO: move these files into a folder 7 | paths: 8 | - files/deploy.py 9 | - files/entry.sh 10 | pull_request: 11 | branches: [ master ] 12 | # TODO: move these files into a folder 13 | paths: 14 | - files/deploy.py 15 | - files/entry.sh 16 | jobs: 17 | build: 18 | name: Call Azure Pipeline 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: check out repo 22 | uses: actions/checkout@v3 23 | - name: Extract branch name 24 | shell: bash 25 | run: | 26 | if [ $EVENT_NAME = "push" ] 27 | then 28 | echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" 29 | else 30 | echo "##[set-output name=branch;]$(echo $BASE_BRANCH)" 31 | fi 32 | env: 33 | BASE_BRANCH: ${{ github.head_ref }} 34 | EVENT_NAME: ${{ github.event_name }} 35 | id: extract_branch 36 | 37 | - uses: actions/setup-python@v3 38 | 39 | - name: Install dependencies 40 | shell: bash 41 | run: | 42 | pip install msrest 43 | pip install azure-devops 44 | id: install_dependencies 45 | 46 | - name: Run pipeline 47 | shell: bash 48 | env: 49 | PAT_TOKEN: ${{ secrets.AMLARC_PORTAL_PROVISION_PIPELINE_PAT }} 50 | BRANCH: ${{ steps.extract_branch.outputs.branch }} 51 | run: python .github/run_pipeline.py --definition-id 21230 --variables GITHUB_BRANCH=$BRANCH 52 | id: run_pipeline 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/settings.json 2 | .ipynb_checkpoints/ 3 | PennFudanPed 4 | mnist_data 5 | cifar10-data 6 | .idea/ 7 | data 8 | __pycache__ -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /docs/4. View metrics in Compute level and runs level.markdown: -------------------------------------------------------------------------------- 1 | 2 | For data scientists, job (run) level monitoring and reporting during training is important; it can help them to: 3 | 4 | - Understand the performance of their training script (does the job fully use the capacity of the GPUs and CPUs, what is the throughput of the model, etc.) 5 | - Find the bottleneck of the training job (run): GPU memory, network, etc. 6 | - Better understand how different training parameters (batch size, epochs, learning rate, etc.) impact their job 7 | 8 | For admins, compute level monitoring and reporting are important; they can help them to: 9 | - Optimize the quota (resource) allocation strategy 10 | - Understand resource utilization 11 | 12 | ## Enable custom metrics 13 | 14 | CMAKS uses the Application Insights instance of the AML workspace to show compute level and run level metrics. To enable this function, you need to enable custom metrics in ```appinsights > config > usage & estimated cost > custom metrics```. 15 | 16 | ![custom metrics](/pics/2.6custommetrics.png) 17 | 18 | ![enable custom metrics](/pics/2.7ennablecustommetrics.png) 19 | 20 | 21 | After the AML agent is successfully installed, you can [attach CMAKS compute](https://github.com/Azure/CMK8s-Samples/blob/master/docs/2.%20Attach%20CMAKS%20compute.markdown). 22 | 23 | ### Using flight 24 | Because this function is in preview, you need to use `flight=computeMetrics` to enable it manually. 25 | 26 | ### View compute level metrics 27 | 28 | To view the CMAKS compute level metrics, go to `attached compute > compute detail > Monitoring`. Note that only CMAKS compute targets support this page. 29 | 30 | ![compute level metrics](/pics/5.2computemetrics.png) 31 | 32 | ### View run level metrics 33 | 34 | To view the CMAKS run level metrics, you can use either of two workflows: 35 | 1. `compute detail > runs > Monitoring` 36 | 2. `experiment > runs > Monitoring` 37 | 38 | ![run level metrics](/pics/5.3runmetrics.png) 39 | 40 | ## Useful chart toolkit 41 | We provide some useful functions in the charts: 42 | - Zoom: zoom the time range by dragging on the charts 43 | - Pan: change the time range by panning along the x axis 44 | - Reset Axes: reset the axes to their default values by double-clicking on the chart 45 | - Switch chart type: switch the chart type between bar chart and line chart 46 | - Select/unselect legends: clicking on a legend selects or unselects it 47 | - Only select one legend: double-clicking on a selected legend shows only that legend's data; this is very helpful if you have many legends on the chart. 48 | 49 | 50 | -------------------------------------------------------------------------------- /docs/AKS-HCI/Yaml/serving-default-domain-knative-1-4-0.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The Knative Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: batch/v1 16 | kind: Job 17 | metadata: 18 | name: default-domain 19 | namespace: knative-serving 20 | labels: 21 | app: "default-domain" 22 | serving.knative.dev/release: "v0.14.0" 23 | spec: 24 | template: 25 | metadata: 26 | labels: 27 | app: "default-domain" 28 | serving.knative.dev/release: "v0.14.0" 29 | spec: 30 | serviceAccountName: controller 31 | containers: 32 | - name: default-domain 33 | # This is the Go import path for the binary that is containerized 34 | # and substituted here. 35 | image: gcr.io/knative-releases/knative.dev/serving/cmd/default-domain@sha256:3f9f0baebbb2ace4aaa6f38537f2a76aa9f02669d43b1a9d8386bf6497559257 36 | args: ["-magic-dns=xip.io"] 37 | ports: 38 | - name: http 39 | containerPort: 8080 40 | readinessProbe: &probe 41 | httpGet: 42 | port: 8080 43 | livenessProbe: *probe 44 | env: 45 | - name: POD_NAME 46 | valueFrom: 47 | fieldRef: 48 | fieldPath: metadata.name 49 | - name: SYSTEM_NAMESPACE 50 | valueFrom: 51 | fieldRef: 52 | fieldPath: metadata.namespace 53 | restartPolicy: OnFailure 54 | backoffLimit: 10 55 | --- 56 | apiVersion: v1 57 | kind: Service 58 | metadata: 59 | name: default-domain-service 60 | namespace: knative-serving 61 | labels: 62 | app: default-domain 63 | serving.knative.dev/release: "v0.14.0" 64 | spec: 65 | selector: 66 | app: default-domain 67 | ports: 68 | - name: http 69 | port: 80 70 | targetPort: 8080 71 | type: ClusterIP 72 | 73 | --- 74 | -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/README.md: -------------------------------------------------------------------------------- 1 | # Sample Azure Machine Learning CLI v2 examples 2 | 3 | After following the setup documents, you can go through the CLI examples linked below to get a better understanding of how the process works and the possibilities it can unlock: 4 | 5 | ## CLI v2 Examples 6 | 7 | ### Prerequisites 8 | 9 | Follow this [doc](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli?view=azure-devops#prerequisites) to setup the prerequisites of using Azure Machine CLI v2. 
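Before moving on to the example, here is a rough sketch of how the YAML specs in the mnist folder are typically submitted with the CLI v2 once the prerequisites are in place. This is only an illustration, assuming the `ml` CLI extension is installed and that `<resource-group>`, `<workspace>`, and `<endpoint-name>` are placeholders for your own values; exact flags can vary slightly across extension versions:

```bash
# One-time setup: install the Azure ML CLI v2 extension
az extension add -n ml -y

# Submit the training job defined in training.yml
az ml job create --file training.yml \
  --resource-group <resource-group> --workspace-name <workspace>

# Create the online endpoint and the Kubernetes deployment behind it
az ml online-endpoint create --file endpoint.yml --name <endpoint-name> \
  --resource-group <resource-group> --workspace-name <workspace>
az ml online-deployment create --file deployment.yml --endpoint-name <endpoint-name> --all-traffic \
  --resource-group <resource-group> --workspace-name <workspace>

# Smoke-test the deployed model with the sample request shipped with the example
az ml online-endpoint invoke --name <endpoint-name> --request-file sample-request.json \
  --resource-group <resource-group> --workspace-name <workspace>
```

The example README linked below walks through the same flow in detail; the commands above only show where each YAML file fits.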
10 | 11 | ### Examples 12 | 13 | * [Image Classification Using Scikit-learn](mnist/README.md) (Image Classification) 14 | 15 | This example serves as a "hello world" for training and inference with an AKS-HCI cluster, an on-premise NFS Server, and an Azure Machine Learning workspace, including 16 | * Training with AKS-HCI cluster and on-premise NFS Server 17 | * Register model 18 | * Inference with registered model on AKS-HCI cluster 19 | * Test model 20 | 21 | -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/deployment.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json 2 | type: kubernetes 3 | app_insights_enabled: true 4 | model: azureml:: 5 | code_configuration: 6 | code: 7 | local_path: ./ 8 | scoring_script: score.py 9 | instance_type: 10 | environment: 11 | name: sklearn-mnist-env 12 | version: 1 13 | conda_file: ./model/conda.yml 14 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 15 | instance_count: 1 16 | request_settings: 17 | request_timeout_ms: 1000 18 | max_concurrent_requests_per_instance: 1 19 | max_queue_wait_ms: 1000 20 | resources: 21 | requests: 22 | cpu: "1" 23 | memory: "1Gi" 24 | liveness_probe: 25 | initial_delay: 10 26 | period: 10 27 | timeout: 10 28 | success_threshold: 1 29 | failure_threshold: 1 30 | 31 | -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/endpoint.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json 2 | auth_mode: aml_token 3 | compute: -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/mnist_script/train.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import numpy as np 5 | import glob 6 | 7 | from sklearn.linear_model import LogisticRegression 8 | import joblib 9 | 10 | from azureml.core import Run 11 | from utils import load_data 12 | 13 | # let user feed in 2 parameters, the dataset to mount or download, and the regularization rate of the logistic regression model 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point') 16 | parser.add_argument('--regularization', type=float, dest='reg', default=0.01, help='regularization rate') 17 | args = parser.parse_args() 18 | 19 | data_folder = args.data_folder 20 | print('Data folder:', data_folder) 21 | 22 | # load train and test set into numpy arrays 23 | # note we scale the pixel intensity values to 0-1 (by dividing it with 255.0) so the model can converge faster.
24 | X_train = load_data(glob.glob(os.path.join(data_folder, '**/train-images-idx3-ubyte.gz'), recursive=True)[0], False) / 255.0 25 | X_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-images-idx3-ubyte.gz'), recursive=True)[0], False) / 255.0 26 | y_train = load_data(glob.glob(os.path.join(data_folder, '**/train-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) 27 | y_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) 28 | 29 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep = '\n') 30 | 31 | # get hold of the current run 32 | run = Run.get_context() 33 | 34 | print('Train a logistic regression model with regularization rate of', args.reg) 35 | clf = LogisticRegression(C=1.0/args.reg, solver="liblinear", multi_class="auto", random_state=42) 36 | clf.fit(X_train, y_train) 37 | 38 | print('Predict the test set') 39 | y_hat = clf.predict(X_test) 40 | 41 | # calculate accuracy on the prediction 42 | acc = np.average(y_hat == y_test) 43 | print('Accuracy is', acc) 44 | 45 | run.log('regularization rate', np.float(args.reg)) 46 | run.log('accuracy', np.float(acc)) 47 | 48 | os.makedirs('outputs', exist_ok=True) 49 | # note file saved in the outputs folder is automatically uploaded into experiment record 50 | joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl') 51 | -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/mnist_script/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import gzip 5 | import numpy as np 6 | import struct 7 | 8 | 9 | # load compressed MNIST gz files and return numpy arrays 10 | def load_data(filename, label=False): 11 | with gzip.open(filename) as gz: 12 | struct.unpack('I', gz.read(4)) 13 | n_items = struct.unpack('>I', gz.read(4)) 14 | if not label: 15 | n_rows = struct.unpack('>I', gz.read(4))[0] 16 | n_cols = struct.unpack('>I', gz.read(4))[0] 17 | res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8) 18 | res = res.reshape(n_items[0], n_rows * n_cols) 19 | else: 20 | res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8) 21 | res = res.reshape(n_items[0], 1) 22 | return res 23 | 24 | 25 | # one-hot encode a 1-D array 26 | def one_hot_encode(array, num_of_classes): 27 | return np.eye(num_of_classes)[array.reshape(-1)] 28 | -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/model/conda.yml: -------------------------------------------------------------------------------- 1 | name: mnist-demo-env 2 | 3 | dependencies: 4 | - python=3.6.2 5 | 6 | - pip: 7 | - azureml-dataset-runtime[pandas,fuse]~=1.24.0.0 8 | - azureml-defaults~=1.24.0.0 9 | - scikit-learn==0.22.1 10 | 11 | channels: 12 | - anaconda 13 | - conda-forge 14 | -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/score.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import pickle 5 | import joblib 6 | 7 | def init(): 8 | global model 9 | # AZUREML_MODEL_DIR is an environment variable created during deployment. 
10 | # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) 11 | # For multiple models, it points to the folder containing all deployed models (./azureml-models) 12 | model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'sklearn_mnist_model.pkl') 13 | model = joblib.load(model_path) 14 | 15 | def run(raw_data): 16 | data = np.array(json.loads(raw_data)['data']) 17 | # make prediction 18 | y_hat = model.predict(data) 19 | # you can return any data type as long as it is JSON-serializable 20 | return y_hat.tolist() -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/train_env/conda.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - anaconda 3 | - conda-forge 4 | dependencies: 5 | - python=3.6.2 6 | - pip: 7 | - 'azureml-dataset-runtime[pandas,fuse]~=1.34.0' 8 | - azureml-defaults~=1.34.0 9 | - scikit-learn==0.22.1 -------------------------------------------------------------------------------- /docs/AKS-HCI/cli/mnist/training.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: 3 | local_path: mnist_script 4 | command: >- 5 | python train.py 6 | --data-folder /mnist 7 | --regularization 0.5 8 | environment: 9 | name: tutorial-env 10 | version: 1 11 | path: . 12 | conda_file: file:./train_env/conda.yml 13 | docker: 14 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210806.v1 15 | compute: 16 | target: azureml: 17 | instance_type: 18 | experiment_name: mnist-demo 19 | description: Image Classification Using Scikit-learn -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/Cstorage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/Cstorage.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/Inner-compute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/Inner-compute.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/Inner-workspace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/Inner-workspace.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/azureml_log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/azureml_log.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/container.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/container.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/cors.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/cors.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/datastore-set.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/datastore-set.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/datastore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/datastore.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/kubernetes_arc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/kubernetes_arc.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/network.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/sas-token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/sas-token.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/sas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/sas.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/structure.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/studio-arc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/studio-arc.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/studio-s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/studio-s.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/studio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/studio.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/url.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/url.png -------------------------------------------------------------------------------- /docs/AKS-HCI/imgs/vid-img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/imgs/vid-img.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/README.md: -------------------------------------------------------------------------------- 1 | # Verify the NFS Setup in AMLArc 2 | 3 | After configuring the NFS Setup in AMLArc, using this notebook to list the contents of the NFS Server on training pods. This could help you to verify, 4 | * The network access between AKS-HCI cluster to the NFS Server 5 | * The config map of NFS Setup used in AMLArc 6 | * Learn how to access the NFS data in training pods 7 | 8 | ## Notebooks 9 | 10 | * [Verify the NFS Setup in AMLArc](Verify_NFS_Setup_in_AMLArc.ipynb) 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "", 3 | "resource_group": "", 4 | "workspace_name": "" 5 | } -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/configure-public-ip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/configure-public-ip.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/create-ubuntu-vm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/create-ubuntu-vm.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/reset-network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/reset-network.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/ssh-status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/ssh-status.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/ubuntu-vm-created.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/ubuntu-vm-created.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/ufw-nfs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/ufw-nfs.png 
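To make the NFS setup described above concrete, here is a minimal sketch of the flow, assuming an Ubuntu VM that the AKS-HCI cluster can reach, that the `nfs-server-setup.sh` script shown further below is run on that VM, and that the placeholders in `mount-config.yaml` (mount path, name, exported path, and server address) have already been filled in with your own values:

```bash
# On the NFS server VM: install the NFS server and export the data directory
sudo bash nfs-server-setup.sh /data <aks-subnet-cidr>

# Still on the VM: confirm the directory is actually exported
showmount -e localhost

# From a machine with kubectl access to the AKS-HCI cluster:
# apply the mount ConfigMap so AMLArc training pods mount the NFS share
kubectl apply -f mount-config.yaml
kubectl get configmap mount-config -n azureml -o yaml
```

Once the ConfigMap is in place, the Verify_NFS_Setup_in_AMLArc.ipynb notebook above can be used to confirm that training pods can list the contents of the share.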
-------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/ufw-ssh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/ufw-ssh.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/verify-nfs-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/verify-nfs-training.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/images/verify-nfs-vm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/nfs/images/verify-nfs-vm.png -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/mount-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | mounts.yaml: | 4 | mountPoints: 5 | - mountPath: 6 | mountType: nfs 7 | name: 8 | path: 9 | server: 10 | kind: ConfigMap 11 | metadata: 12 | name: mount-config 13 | namespace: azureml -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/nfs-server-setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script should be executed on Linux Ubuntu Virtual Machine 4 | 5 | DATA_DIRECTORY=${1:-/data} 6 | AKS_SUBNET=${2:-*} 7 | 8 | EXPORT_DIRECTORY="/export/$(basename "$DATA_DIRECTORY")" 9 | 10 | echo "Updating packages" 11 | apt-get -y update 12 | 13 | echo "Installing NFS kernel server" 14 | 15 | apt-get -y install nfs-kernel-server 16 | 17 | echo "Making data directory ${DATA_DIRECTORY}" 18 | mkdir -p ${DATA_DIRECTORY} 19 | 20 | echo "Making new directory to be exported and linked to data directory: ${EXPORT_DIRECTORY}" 21 | mkdir -p ${EXPORT_DIRECTORY} 22 | 23 | echo "Mount binding ${DATA_DIRECTORY} to ${EXPORT_DIRECTORY}" 24 | mount --bind ${DATA_DIRECTORY} ${EXPORT_DIRECTORY} 25 | 26 | echo "Giving 777 permissions to ${EXPORT_DIRECTORY} directory" 27 | chmod 777 ${EXPORT_DIRECTORY} 28 | 29 | parentdir="$(dirname "$EXPORT_DIRECTORY")" 30 | echo "Giving 777 permissions to parent: ${parentdir} directory" 31 | chmod 777 $parentdir 32 | 33 | echo "Appending bound directories into fstab" 34 | echo "${DATA_DIRECTORY} ${EXPORT_DIRECTORY} none bind 0 0" >> /etc/fstab 35 | 36 | echo "Appending localhost and Kubernetes subnet address ${AKS_SUBNET} to exports configuration file" 37 | echo "/export ${AKS_SUBNET}(rw,async,insecure,fsid=0,crossmnt,no_subtree_check)" >> /etc/exports 38 | echo "/export localhost(rw,async,insecure,fsid=0,crossmnt,no_subtree_check)" >> /etc/exports 39 | 40 | nohup service nfs-kernel-server restart -------------------------------------------------------------------------------- /docs/AKS-HCI/nfs/nfs_script/test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import glob 4 | 5 | from pathlib import Path 6 | 7 | 8 | # let user feed in 2 parameters, the dataset to mount or download, and the regularization rate of the logistic regression model 9 | parser = 
argparse.ArgumentParser() 10 | parser.add_argument('--nfs-folder', type=str, dest='nfs_folder', help='NFS Server mounting point') 11 | args = parser.parse_args() 12 | 13 | nfs_folder = args.nfs_folder 14 | print('NFS folder:', nfs_folder) 15 | 16 | if (Path(nfs_folder).exists() != True): 17 | raise Exception(f"{nfs_folder} doesn't exist") 18 | if (Path(nfs_folder).is_dir() != True): 19 | raise Exception(f"{nfs_folder} is not a directory") 20 | 21 | from itertools import islice 22 | 23 | space = ' ' 24 | branch = '│ ' 25 | tee = '├── ' 26 | last = '└── ' 27 | 28 | def tree(dir_path: Path, level: int=-1, limit_to_directories: bool=False, 29 | length_limit: int=1000): 30 | """Given a directory Path object print a visual tree structure""" 31 | dir_path = Path(dir_path) # accept string coerceable to Path 32 | files = 0 33 | directories = 0 34 | def inner(dir_path: Path, prefix: str='', level=-1): 35 | nonlocal files, directories 36 | if not level: 37 | return # 0, stop iterating 38 | if limit_to_directories: 39 | contents = [d for d in dir_path.iterdir() if d.is_dir()] 40 | else: 41 | contents = list(dir_path.iterdir()) 42 | pointers = [tee] * (len(contents) - 1) + [last] 43 | for pointer, path in zip(pointers, contents): 44 | if path.is_dir(): 45 | yield prefix + pointer + path.name 46 | directories += 1 47 | extension = branch if pointer == tee else space 48 | yield from inner(path, prefix=prefix+extension, level=level-1) 49 | elif not limit_to_directories: 50 | yield prefix + pointer + path.name 51 | files += 1 52 | print(dir_path.name) 53 | iterator = inner(dir_path, level=level) 54 | for line in islice(iterator, length_limit): 55 | print(line) 56 | if next(iterator, None): 57 | print(f'... length_limit, {length_limit}, reached, counted:') 58 | print(f'\n{directories} directories' + (f', {files} files' if files else '')) 59 | 60 | tree(nfs_folder) -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Sample Notebooks 2 | 3 | After following the setup documents, you can go through the sample notebooks linked below to get a better understanding of how the process works and the possibilities it can unlock: 4 | 5 | * [Image Classification Using Scikit-learn](mnist/MNIST_Training_with_AKS-HCI_Cluster_and_NFS.ipynb) (Image Classification) 6 | 7 | This notebook serves as "hello world" of using for training and inference with AKS-HCI Cluster, on-premise NFS Server, and Azure Machine Learning, including 8 | * Training with AKS-HCI cluster and on-premise NFS Server 9 | * Register model 10 | * Inference with the registered model on AKS-HCI cluster 11 | * Test model 12 | 13 | * [Distributed PyTorch Training with DistributedDataParallel](distributed-cifar10/distributed-pytorch-cifar10.ipynb) (Image Classification) 14 | 15 | This notebook demonstrates an example of Image classification with PyTorch, including, 16 | * Distributed training using PyTorch with 2 worker nodes on AKS-HCI cluster and the training data is stored in on-premise NFS Server 17 | * Register model 18 | * Inference with the registered model on AKS-HCI cluster 19 | * Test model 20 | 21 | * [Object Segmentation with Transfer Learning](object-segmentation-on-azure-stack/object_segmentation-akshci.ipynb) (Object Segmentation) 22 | 23 | Object segmentation using pre-trained Mask R-CNN model on PyTorch. AML pipeline steps are used for data preprocessing. 
**Training data are stored in on-premise NFS server, and the intermediate data are stored in default datastore associated with the ML workspace.** The whole flow includes, 24 | * Use AML pipelines to read training data from on-premise NFS server, do data preprocessing and generate intermediate data to default datastore 25 | * Use AML pipelines to trigger train step on AKS-HCI cluster 26 | * Register model 27 | * Inference with the registered model on AKS-HCI cluster 28 | * Test model 29 | 30 | * [Object Segmentation with Transfer Learning with all data on NFS server](object-segmentation-on-azure-stack/object_segmentation-akshci-nfs.ipynb) (Object Segmentation) 31 | 32 | Object segmentation using pre-trained Mask R-CNN model on PyTorch. AML pipeline steps are used for data preprocessing. **Both the training and intermediate data are stored in on-prem NFS server.** The whole flow includes, 33 | * Use AML pipelines to read training data from on-premise NFS server, do data preprocessing and generate intermediate data to NFS server. 34 | * Use AML pipelines to trigger train step on AKS-HCI cluster 35 | * Register model 36 | * Inference with the registered model on AKS-HCI cluster 37 | * Test model 38 | 39 | * [AML Pipelines with NYC-TAXI-DATA](pipeline/nyc-taxi-data-regression-model-building.ipynb) (Structured Text Data Prediction) 40 | 41 | This notebook demonstrates an example of Structured Text Data Prediction, preparing / preprocessing / training data in **default datastore associated with the ML workspace**. The whole flow includes, 42 | * Download and upload training data to default datastore 43 | * Use AML pipelines to preprocess and train 44 | * Cleanse data in parallel 45 | * Merge cleansed data 46 | * Normalize data 47 | * Transform data 48 | * Split data 49 | * Train model 50 | * Register model 51 | * Inference with the registered model on AKS-HCI cluster 52 | * Test model 53 | 54 | * [AML Pipelines with NYC-TAXI-DATA with all data on NFS server](pipeline/nyc-taxi-data-regression-model-building-nfs.ipynb) (Structured Text Data Prediction) 55 | 56 | This notebook demonstrates an example of Structured Text Data Prediction, preparing / preprocessing / training data on **on-prem NFS server**. The whole flow includes, 57 | * Download and upload training data to default datastore 58 | * Use AML pipelines to preprocess and train 59 | * Cleanse data in parallel 60 | * Merge cleansed data 61 | * Normalize data 62 | * Transform data 63 | * Split data 64 | * Train model 65 | * Register model 66 | * Inference with the registered model on AKS-HCI cluster 67 | * Test model 68 | 69 | * [Model Download and Upload](upload-download-model/AML-model-download-upload.ipynb) -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/README.md: -------------------------------------------------------------------------------- 1 | # Distributed Training on AKS-HCI and on-premise NFS Server 2 | 3 | These sample notebooks guide you through a distributed training workload that trains an ML model on [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset hosted on on-premise NFS Server. We offer two notebooks taking advantage of the most popular deep learning frameworks PyTorch. 
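Before running the notebook, the CIFAR-10 data needs to be present on the NFS share. Below is a minimal sketch of one way to stage it, assuming the NFS export root is `/data` (the default used by `nfs-server-setup.sh`) and that the training script reads the standard `cifar-10-batches-py` layout produced by the official archive; adjust the paths to match your own mount configuration:

```bash
# On the NFS server VM: download the official CIFAR-10 (python version) archive
wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz -P /tmp

# Extract it into the exported data directory; this creates /data/cifar10/cifar-10-batches-py
mkdir -p /data/cifar10
tar -xzf /tmp/cifar-10-python.tar.gz -C /data/cifar10

# Sanity-check that the batch files are visible through the export
ls /data/cifar10/cifar-10-batches-py
```

The mount path configured for the cluster (see the NFS setup docs) is what the notebook passes to the training script as the data folder.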
4 | 5 | ## Notebooks 6 | 7 | * [Distributed PyTorch Training with DistributedDataParallel](distributed-pytorch-cifar10.ipynb) (Image Classification) 8 | 9 | This notebook demonstrates an example of Image classification with PyTorch, including, 10 | * Distributed training using PyTorch with 2 worker nodes on AKS-HCI cluster and the training data is stored in on-premise NFS Server 11 | * Register model 12 | * Inference with the registered model on AKS-HCI cluster 13 | * Test model -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "", 3 | "resource_group": "", 4 | "workspace_name": "" 5 | } -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/pt_deployment.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json 2 | type: kubernetes 3 | app_insights_enabled: true 4 | model: 5 | code_configuration: 6 | code: 7 | local_path: ./ 8 | scoring_script: score_pytorch.py 9 | instance_type: 10 | environment: 11 | name: pytorch-cifar 12 | version: 1 13 | conda_file: ./pytorch-script/conda_dependencies.yml 14 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 15 | instance_count: 1 16 | request_settings: 17 | request_timeout_ms: 1000 18 | max_concurrent_requests_per_instance: 1 19 | max_queue_wait_ms: 1000 20 | resources: 21 | requests: 22 | cpu: "1" 23 | memory: "1Gi" 24 | liveness_probe: 25 | initial_delay: 10 26 | period: 10 27 | timeout: 10 28 | success_threshold: 1 29 | failure_threshold: 1 -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/pt_endpoint.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json 2 | compute: 3 | auth_mode: aml_token -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/pytorch-script/conda_dependencies.yml: -------------------------------------------------------------------------------- 1 | 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.6.2 6 | - pip: 7 | - azureml-defaults 8 | - torch==1.6.0 9 | - torchvision==0.7.0 10 | - future==0.17.1 11 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/score_tf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import tensorflow as tf 4 | import json 5 | 6 | # Called when the deployed service starts 7 | def init(): 8 | global model 9 | 10 | # Get the path where the deployed model can be found. 11 | model_file_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), '001') 12 | # model_file_path = model_path + '/obj_segmentation.pkl' 13 | #model_file_path = "C:\\Users\\v-songshanli\projects\\ashexplore\BIG_FILES\\001" 14 | 15 | model = tf.saved_model.load(model_file_path) 16 | 17 | # Handle requests to the service 18 | def run(data): 19 | try: 20 | # Pick out the text property of the JSON request. 
21 | # This expects a request in the form of {"instances": [...], "signature_name": "..."} 22 | 23 | start_at = time.time() 24 | inputs = json.loads(data) 25 | img_data_list = inputs["instances"] 26 | 27 | signature_name = inputs["signature_name"] 28 | infer = model.signatures[signature_name] 29 | 30 | inputs_tensor = tf.constant(img_data_list, dtype=tf.float32) 31 | 32 | res = infer(tf.constant(inputs_tensor)) 33 | return {"predictions": res["dense_1"].numpy().tolist(), 34 | "elapsed_time": time.time() - start_at} 35 | except Exception as e: 36 | error = str(e) 37 | print(error) 38 | raise e 39 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_0_cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_0_cat.jpg -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_1_ship.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_1_ship.jpg -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_2_ship.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_2_ship.jpg -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_3_plane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_3_plane.jpg -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_4_frog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/distributed-cifar10/test_imgs/test_img_4_frog.jpg -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/README.md: -------------------------------------------------------------------------------- 1 | # Image Classification Using Scikit-learn 2 | 3 | Using MNIST as an example, this sample notebook demonstrates how to train a machine learning model using AKS-HCI Arc compute and an on-premise NFS Server. Training data is stored on the on-premise NFS Server. The trained model is then registered and deployed on the AKS-HCI Arc compute for inference. 
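For orientation, the core of the notebook boils down to a training submission followed by a model registration, along the lines of the sketch below (AzureML SDK v1). The compute target name, datastore name, and environment packages are assumptions; only the `--data-folder`/`--regularization` arguments and the `outputs/sklearn_mnist_model.pkl` output path come from `mnist_script/train.py`.

```python
# A minimal sketch (AzureML SDK v1); compute, datastore, and environment details are assumed.
from azureml.core import Workspace, Experiment, Environment, ScriptRunConfig, Dataset
from azureml.core.conda_dependencies import CondaDependencies

ws = Workspace.from_config()
compute_target = ws.compute_targets["akshci-compute"]   # assumed Arc-attached compute name

# MNIST .gz files sitting on the NFS share, exposed through a registered datastore (assumed name)
datastore = ws.datastores["nfsdatastore"]
mnist_data = Dataset.File.from_files(path=(datastore, "mnist/**"))

env = Environment("sklearn-mnist-train")
env.python.conda_dependencies = CondaDependencies.create(
    pip_packages=["azureml-defaults", "scikit-learn==0.22.1", "joblib"]
)

src = ScriptRunConfig(
    source_directory="mnist_script",
    script="train.py",
    arguments=["--data-folder", mnist_data.as_mount(), "--regularization", 0.5],
    compute_target=compute_target,
    environment=env,
)

run = Experiment(ws, "mnist-akshci-nfs").submit(src)
run.wait_for_completion(show_output=True)

# train.py saves the model under outputs/, which is uploaded with the run automatically
model = run.register_model(
    model_name="sklearn_mnist", model_path="outputs/sklearn_mnist_model.pkl"
)
```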
4 | 5 | ## Notebooks 6 | 7 | * [Image Classification Using Scikit-learn](MNIST_Training_with_AKS-HCI_Cluster_and_NFS.ipynb) 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "", 3 | "resource_group": "", 4 | "workspace_name": "" 5 | } -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/confusion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/mnist/confusion.png -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/deployment.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json 2 | type: kubernetes 3 | app_insights_enabled: true 4 | model: 5 | code_configuration: 6 | code: 7 | local_path: ./ 8 | scoring_script: score.py 9 | instance_type: 10 | environment: 11 | name: sklearn-mnist-env 12 | version: 1 13 | conda_file: ./model/conda.yml 14 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 15 | instance_count: 1 16 | request_settings: 17 | request_timeout_ms: 1000 18 | max_concurrent_requests_per_instance: 1 19 | max_queue_wait_ms: 1000 20 | resources: 21 | requests: 22 | cpu: "1" 23 | memory: "1Gi" 24 | liveness_probe: 25 | initial_delay: 10 26 | period: 10 27 | timeout: 10 28 | success_threshold: 1 29 | failure_threshold: 1 30 | 31 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/digit_7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/mnist/digit_7.jpg -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/endpoint.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json 2 | auth_mode: aml_token 3 | compute: -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/mnist_script/train.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import numpy as np 5 | import glob 6 | 7 | from sklearn.linear_model import LogisticRegression 8 | import joblib 9 | 10 | from azureml.core import Run 11 | from utils import load_data 12 | 13 | # let user feed in 2 parameters, the dataset to mount or download, and the regularization rate of the logistic regression model 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point') 16 | parser.add_argument('--regularization', type=float, dest='reg', default=0.01, help='regularization rate') 17 | args = parser.parse_args() 18 | 19 | data_folder = args.data_folder 20 | print('Data folder:', data_folder) 21 | 22 | # load train and test set into numpy arrays 23 | # note we scale the pixel intensity values to 0-1 (by dividing it 
with 255.0) so the model can converge faster. 24 | X_train = load_data(glob.glob(os.path.join(data_folder, '**/train-images-idx3-ubyte.gz'), recursive=True)[0], False) / 255.0 25 | X_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-images-idx3-ubyte.gz'), recursive=True)[0], False) / 255.0 26 | y_train = load_data(glob.glob(os.path.join(data_folder, '**/train-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) 27 | y_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) 28 | 29 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep = '\n') 30 | 31 | # get hold of the current run 32 | run = Run.get_context() 33 | 34 | print('Train a logistic regression model with regularization rate of', args.reg) 35 | clf = LogisticRegression(C=1.0/args.reg, solver="liblinear", multi_class="auto", random_state=42) 36 | clf.fit(X_train, y_train) 37 | 38 | print('Predict the test set') 39 | y_hat = clf.predict(X_test) 40 | 41 | # calculate accuracy on the prediction 42 | acc = np.average(y_hat == y_test) 43 | print('Accuracy is', acc) 44 | 45 | run.log('regularization rate', np.float(args.reg)) 46 | run.log('accuracy', np.float(acc)) 47 | 48 | os.makedirs('outputs', exist_ok=True) 49 | # note file saved in the outputs folder is automatically uploaded into experiment record 50 | joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl') 51 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/mnist_script/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import gzip 5 | import numpy as np 6 | import struct 7 | 8 | 9 | # load compressed MNIST gz files and return numpy arrays 10 | def load_data(filename, label=False): 11 | with gzip.open(filename) as gz: 12 | struct.unpack('I', gz.read(4)) 13 | n_items = struct.unpack('>I', gz.read(4)) 14 | if not label: 15 | n_rows = struct.unpack('>I', gz.read(4))[0] 16 | n_cols = struct.unpack('>I', gz.read(4))[0] 17 | res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8) 18 | res = res.reshape(n_items[0], n_rows * n_cols) 19 | else: 20 | res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8) 21 | res = res.reshape(n_items[0], 1) 22 | return res 23 | 24 | 25 | # one-hot encode a 1-D array 26 | def one_hot_encode(array, num_of_classes): 27 | return np.eye(num_of_classes)[array.reshape(-1)] 28 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/model/conda.yml: -------------------------------------------------------------------------------- 1 | 2 | name: model-env 3 | 4 | dependencies: 5 | - python=3.6.2 6 | 7 | - pip: 8 | - azureml-dataset-runtime[pandas,fuse]~=1.24.0.0 9 | - azureml-defaults~=1.24.0.0 10 | - scikit-learn==0.22.1 11 | 12 | channels: 13 | - anaconda 14 | - conda-forge 15 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/mnist/score.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import pickle 5 | import joblib 6 | 7 | def init(): 8 | global model 9 | # AZUREML_MODEL_DIR is an environment variable created during deployment. 
10 | # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) 11 | # For multiple models, it points to the folder containing all deployed models (./azureml-models) 12 | model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'sklearn_mnist_model.pkl') 13 | model = joblib.load(model_path) 14 | 15 | def run(raw_data): 16 | data = np.array(json.loads(raw_data)['data']) 17 | # make prediction 18 | y_hat = model.predict(data) 19 | # you can return any data type as long as it is JSON-serializable 20 | return y_hat.tolist() -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/FudanPed00001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/FudanPed00001.png -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/README.md: -------------------------------------------------------------------------------- 1 | # Object Segmentation: Pipeline Training Run on AKS-HCI cluster and on-premise NFS server 2 | 3 | Using Object Segmentation as an example, this sample notebook demonstrates how to run [Azure Machine Learning Pipelines](https://aka.ms/aml-pipelines) using AKS-HCI cluster and on-premise NFS server 4 | 5 | ## Notebooks 6 | 7 | * [Object Segmentation with Transfer Learning](object_segmentation-akshci.ipynb) (Object Segmentation) 8 | 9 | Object segmentation using pre-trained Mask R-CNN model on PyTorch. AML pipeline steps are used for data preprocessing. **Training data are stored in on-premise NFS server, and the intermediate data are stored in default datastore associated with the ML workspace.** The whole flow includes, 10 | * Use AML pipelines to read training data from on-premise NFS server, do data preprocessing and generate intermediate data to default datastore 11 | * Use AML pipelines to trigger train step on AKS-HCI cluster 12 | * Register model 13 | * Inference with the registered model on AKS-HCI cluster 14 | * Test model 15 | 16 | 17 | * [Object Segmentation with Transfer Learning with all data on NFS server](object_segmentation-akshci-nfs.ipynb) (Object Segmentation) 18 | 19 | Object segmentation using pre-trained Mask R-CNN model on PyTorch. AML pipeline steps are used for data preprocessing. **Both the training and intermediate data are stored in on-prem NFS server.** The whole flow includes, 20 | * Use AML pipelines to read training data from on-premise NFS server, do data preprocessing and generate intermediate data to NFS server. 
21 | * Use AML pipelines to trigger train step on AKS-HCI cluster 22 | * Register model 23 | * Inference with the registered model on AKS-HCI cluster 24 | * Test model 25 | 26 | 27 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/aml_src/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.4.0rc3-gpu 2 | 3 | ARG CONDA_VERSION=4.7.12 4 | ARG PYTHON_VERSION=3.7 5 | ARG AZUREML_SDK_VERSION=1.13.0 6 | ARG INFERENCE_SCHEMA_VERSION=1.1.0 7 | 8 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 9 | ENV PATH /opt/miniconda/bin:$PATH 10 | ENV DEBIAN_FRONTEND=noninteractive 11 | 12 | RUN apt-get update --fix-missing && \ 13 | apt-get install -y wget bzip2 && \ 14 | apt-get install -y fuse && \ 15 | apt-get clean -y && \ 16 | rm -rf /var/lib/apt/lists/* 17 | 18 | RUN useradd --create-home dockeruser 19 | WORKDIR /home/dockeruser 20 | USER dockeruser 21 | 22 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh -O ~/miniconda.sh && \ 23 | /bin/bash ~/miniconda.sh -b -p ~/miniconda && \ 24 | rm ~/miniconda.sh && \ 25 | ~/miniconda/bin/conda clean -tipsy 26 | ENV PATH="/home/dockeruser/miniconda/bin/:${PATH}" 27 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/aml_src/conda-env.yaml: -------------------------------------------------------------------------------- 1 | name: pytorch 2 | dependencies: 3 | - python=3.7 4 | - python-graphviz 5 | - pip: 6 | - azureml-defaults 7 | - azure-storage-blob 8 | - Cython 9 | - torch==1.7.1 10 | - torchvision==0.8.2 11 | - pycocotools 12 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/aml_src/engine.py: -------------------------------------------------------------------------------- 1 | import math 2 | import sys 3 | import time 4 | import torch 5 | 6 | import torchvision.models.detection.mask_rcnn 7 | 8 | from coco_utils import get_coco_api_from_dataset 9 | from coco_eval import CocoEvaluator 10 | import utils 11 | 12 | 13 | def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq): 14 | model.train() 15 | metric_logger = utils.MetricLogger(delimiter=" ") 16 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) 17 | header = 'Epoch: [{}]'.format(epoch) 18 | 19 | lr_scheduler = None 20 | if epoch == 0: 21 | warmup_factor = 1. 
/ 1000 22 | warmup_iters = min(1000, len(data_loader) - 1) 23 | 24 | lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor) 25 | 26 | for images, targets in metric_logger.log_every(data_loader, print_freq, header): 27 | images = list(image.to(device) for image in images) 28 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 29 | 30 | loss_dict = model(images, targets) 31 | 32 | losses = sum(loss for loss in loss_dict.values()) 33 | 34 | # reduce losses over all GPUs for logging purposes 35 | loss_dict_reduced = utils.reduce_dict(loss_dict) 36 | losses_reduced = sum(loss for loss in loss_dict_reduced.values()) 37 | 38 | loss_value = losses_reduced.item() 39 | 40 | if not math.isfinite(loss_value): 41 | print("Loss is {}, stopping training".format(loss_value)) 42 | print(loss_dict_reduced) 43 | sys.exit(1) 44 | 45 | optimizer.zero_grad() 46 | losses.backward() 47 | optimizer.step() 48 | 49 | if lr_scheduler is not None: 50 | lr_scheduler.step() 51 | 52 | metric_logger.update(loss=losses_reduced, **loss_dict_reduced) 53 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 54 | 55 | return metric_logger 56 | 57 | 58 | def _get_iou_types(model): 59 | model_without_ddp = model 60 | if isinstance(model, torch.nn.parallel.DistributedDataParallel): 61 | model_without_ddp = model.module 62 | iou_types = ["bbox"] 63 | if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN): 64 | iou_types.append("segm") 65 | if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN): 66 | iou_types.append("keypoints") 67 | return iou_types 68 | 69 | 70 | @torch.no_grad() 71 | def evaluate(model, data_loader, device): 72 | n_threads = torch.get_num_threads() 73 | # FIXME remove this and make paste_masks_in_image run on the GPU 74 | torch.set_num_threads(1) 75 | cpu_device = torch.device("cpu") 76 | model.eval() 77 | metric_logger = utils.MetricLogger(delimiter=" ") 78 | header = 'Test:' 79 | 80 | coco = get_coco_api_from_dataset(data_loader.dataset) 81 | iou_types = _get_iou_types(model) 82 | coco_evaluator = CocoEvaluator(coco, iou_types) 83 | 84 | for images, targets in metric_logger.log_every(data_loader, 100, header): 85 | images = list(img.to(device) for img in images) 86 | 87 | if torch.cuda.is_available(): 88 | torch.cuda.synchronize() 89 | model_time = time.time() 90 | outputs = model(images) 91 | 92 | outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs] 93 | model_time = time.time() - model_time 94 | 95 | res = {target["image_id"].item(): output for target, output in zip(targets, outputs)} 96 | evaluator_time = time.time() 97 | coco_evaluator.update(res) 98 | evaluator_time = time.time() - evaluator_time 99 | metric_logger.update(model_time=model_time, evaluator_time=evaluator_time) 100 | 101 | # gather the stats from all processes 102 | metric_logger.synchronize_between_processes() 103 | print("Averaged stats:", metric_logger) 104 | coco_evaluator.synchronize_between_processes() 105 | 106 | # accumulate predictions from all images 107 | coco_evaluator.accumulate() 108 | coco_evaluator.summarize() 109 | torch.set_num_threads(n_threads) 110 | return coco_evaluator 111 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/aml_src/obj_segment_step_data_process.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import shutil 5 | 6 | 
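# This pipeline step splits a PennFudan-style dataset into train and test sets: it lists the
# paired PNGImages/PedMasks files under --data-path, randomly samples --test-size pairs for
# the test split, and copies each image/mask pair into the corresponding train or test
# output folder created below.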
parser = argparse.ArgumentParser() 7 | parser.add_argument('--data-path', type=str, help='input data path') 8 | parser.add_argument('--train-split', type=str, help='training split data output path') 9 | parser.add_argument('--test-split', type=str, help='test split data output path') 10 | parser.add_argument('--test-size', type=int, help='test split data size') 11 | 12 | args = parser.parse_args() 13 | types = ["PNGImages", "PedMasks"] 14 | img_mask_list = [[os.path.join(args.data_path, type, file) for file in sorted(os.listdir(os.path.join(args.data_path, type))) ] for type in types] 15 | print("img_mask_list", img_mask_list) 16 | test_indices = random.sample(range(len(img_mask_list[0])), args.test_size) 17 | 18 | test_img_folder, test_mask_folder = [os.path.join(args.test_split, type) for type in types] 19 | train_img_folder, train_mask_folder = [os.path.join(args.train_split, type) for type in types] 20 | 21 | os.makedirs(test_img_folder, exist_ok=True) 22 | os.makedirs(test_mask_folder, exist_ok=True) 23 | os.makedirs(train_img_folder, exist_ok=True) 24 | os.makedirs(train_mask_folder, exist_ok=True) 25 | print("test_img_folder",test_img_folder) 26 | for idx, img_mask in enumerate(zip(*img_mask_list)): 27 | img, mask = img_mask 28 | if idx in test_indices: 29 | print("img path", img) 30 | print("mask path", mask) 31 | shutil.copy(img, test_img_folder) 32 | shutil.copy(mask, test_mask_folder) 33 | else: 34 | shutil.copy(img, train_img_folder) 35 | shutil.copy(mask, train_mask_folder) 36 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/aml_src/transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from torchvision.transforms import functional as F 4 | 5 | 6 | def _flip_coco_person_keypoints(kps, width): 7 | flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] 8 | flipped_data = kps[:, flip_inds] 9 | flipped_data[..., 0] = width - flipped_data[..., 0] 10 | # Maintain COCO convention that if visibility == 0, then x, y = 0 11 | inds = flipped_data[..., 2] == 0 12 | flipped_data[inds] = 0 13 | return flipped_data 14 | 15 | 16 | class Compose(object): 17 | def __init__(self, transforms): 18 | self.transforms = transforms 19 | 20 | def __call__(self, image, target): 21 | for t in self.transforms: 22 | image, target = t(image, target) 23 | return image, target 24 | 25 | 26 | class RandomHorizontalFlip(object): 27 | def __init__(self, prob): 28 | self.prob = prob 29 | 30 | def __call__(self, image, target): 31 | if random.random() < self.prob: 32 | height, width = image.shape[-2:] 33 | image = image.flip(-1) 34 | bbox = target["boxes"] 35 | bbox[:, [0, 2]] = width - bbox[:, [2, 0]] 36 | target["boxes"] = bbox 37 | if "masks" in target: 38 | target["masks"] = target["masks"].flip(-1) 39 | if "keypoints" in target: 40 | keypoints = target["keypoints"] 41 | keypoints = _flip_coco_person_keypoints(keypoints, width) 42 | target["keypoints"] = keypoints 43 | return image, target 44 | 45 | 46 | class ToTensor(object): 47 | def __call__(self, image, target): 48 | image = F.to_tensor(image) 49 | return image, target 50 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "", 3 | "resource_group": "", 4 | "workspace_name": 
"" 5 | } -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/deployment.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json 2 | type: kubernetes 3 | app_insights_enabled: true 4 | model: 5 | code_configuration: 6 | code: 7 | local_path: ./ 8 | scoring_script: score.py 9 | instance_type: 10 | instance_count: 1 11 | environment: azureml:AzureML-pytorch-1.7-ubuntu18.04-py37-cpu-inference:21 -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/endpoint.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json 2 | compute: 3 | auth_mode: aml_token -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/object-segmentation-on-azure-stack/score.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | import torch 5 | 6 | # Called when the deployed service starts 7 | def init(): 8 | global model 9 | global device 10 | 11 | # Get the path where the deployed model can be found. 12 | model_filename = 'obj_segmentation.pkl' 13 | model_path = os.path.join(os.environ['AZUREML_MODEL_DIR'], model_filename) 14 | 15 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 16 | model = torch.load(model_path, map_location=device) 17 | 18 | # Handle requests to the service 19 | def run(data): 20 | try: 21 | start_at = time.time() 22 | inputs = json.loads(data) 23 | img_data_list = inputs["instances"] 24 | img_tensor_list = [torch.tensor(item) for item in img_data_list] 25 | model.eval() 26 | with torch.no_grad(): 27 | predictions = model([item.to(device) for item in img_tensor_list]) 28 | 29 | pred_data_list = [{ 30 | "masks": prediction['masks'][0, 0].mul(255).byte().cpu().numpy().tolist(), 31 | "boxes": prediction['boxes'].numpy().tolist(), 32 | "labels": prediction['labels'].numpy().tolist(), 33 | "scores": prediction['scores'].numpy().tolist(), 34 | 35 | } for prediction in predictions] 36 | 37 | return {"predictions": pred_data_list, 38 | "elapsed_time": time.time() - start_at} 39 | 40 | except Exception as e: 41 | error = str(e) 42 | return error 43 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline Run with AKS-HCI cluster 2 | 3 | These samples demonstrate how to run [Azure Machine Learning Pipelines](https://aka.ms/aml-pipelines) with Arc compute. 4 | 5 | ## Notebooks 6 | 7 | * [AML Pipelines with NYC-TAXI-DATA](nyc-taxi-data-regression-model-building.ipynb) (Structured Text Data Prediction) 8 | 9 | This notebook demonstrates an example of Structured Text Data Prediction, preparing / preprocessing / training data in **default datastore associated with the ML workspace**. 
The whole flow includes, 10 | * Download and upload training data to default datastore 11 | * Use AML pipelines to preprocess and train 12 | * Cleanse data in parallel 13 | * Merge cleansed data 14 | * Normalize data 15 | * Transform data 16 | * Split data 17 | * Train model 18 | * Register model 19 | * Inference with the registered model on AKS-HCI cluster 20 | * Test model 21 | 22 | * [AML Pipelines with NYC-TAXI-DATA with all data on NFS server](nyc-taxi-data-regression-model-building-nfs.ipynb) (Structured Text Data Prediction) 23 | 24 | This notebook demonstrates an example of Structured Text Data Prediction, preparing / preprocessing / training data on **on-prem NFS server**. The whole flow includes, 25 | * Download and upload training data to default datastore 26 | * Use AML pipelines to preprocess and train 27 | * Cleanse data in parallel 28 | * Merge cleansed data 29 | * Normalize data 30 | * Transform data 31 | * Split data 32 | * Train model 33 | * Register model 34 | * Inference with the registered model on AKS-HCI cluster 35 | * Test model 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "", 3 | "resource_group": "", 4 | "workspace_name": "" 5 | } -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/deployment.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json 2 | type: kubernetes 3 | app_insights_enabled: true 4 | model: 5 | code_configuration: 6 | code: 7 | local_path: ./ 8 | scoring_script: score.py 9 | instance_type: 10 | instance_count: 1 11 | environment: 12 | name: taxi-model-env 13 | version: 1 14 | conda_file: ./model/conda.yml 15 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/endpoint.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json 2 | compute: 3 | auth_mode: aml_token -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/images/pipeline-using-dataflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/pipeline/images/pipeline-using-dataflow.png -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/images/pipeline-using-stepsequence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/notebooks/pipeline/images/pipeline-using-stepsequence.png -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/model/conda.yml: -------------------------------------------------------------------------------- 1 | 2 | name: model-env 3 | 4 | dependencies: 5 | - python=3.6.2 6 | - pip: 7 | - pyarrow 8 | - azureml-defaults 9 | - pandas 10 | - scikit-learn 11 | 
- numpy=1.19.5 12 | 13 | channels: 14 | - anaconda 15 | - conda-forge 16 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/score.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import joblib 5 | 6 | def init(): 7 | global model 8 | # AZUREML_MODEL_DIR is an environment variable created during deployment. 9 | # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) 10 | # For multiple models, it points to the folder containing all deployed models (./azureml-models) 11 | model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'taxi.pkl') 12 | model = joblib.load(model_path) 13 | 14 | def run(raw_data): 15 | data = np.array(json.loads(raw_data)['data']) 16 | # make prediction 17 | scores = model.predict(data) 18 | # you can return any data type as long as it is JSON-serializable 19 | return scores.tolist() -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/prepdata/cleanse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | # Licensed under the MIT license. 3 | 4 | import argparse 5 | import os 6 | from azureml.core import Run 7 | import pandas as pd 8 | 9 | 10 | def get_dict(dict_str): 11 | pairs = dict_str.strip("{}").split("\;") 12 | new_dict = {} 13 | for pair in pairs: 14 | key, value = pair.strip().split(":") 15 | new_dict[key.strip().strip("'")] = value.strip().strip("'") 16 | 17 | return new_dict 18 | 19 | 20 | print("Cleans the input data") 21 | 22 | parser = argparse.ArgumentParser("cleanse") 23 | parser.add_argument('--data-path', type=str, help='input data path') 24 | parser.add_argument("--output_cleanse", type=str, help="cleaned taxi data directory") 25 | parser.add_argument("--useful_columns", type=str, help="useful columns to keep") 26 | #parser.add_argument("--columns", type=str, help="rename column pattern") 27 | parser.add_argument("--columns_key", type=str, help="rename column pattern") 28 | parser.add_argument("--columns_value", type=str, help="rename column pattern") 29 | 30 | args = parser.parse_args() 31 | 32 | print("Argument 1(columns to keep): %s" % str(args.useful_columns.strip("[]").split("\;"))) 33 | print("Argument 2(columns renaming mapping Key): %s" % str(args.columns_key.strip("{}").split("\;"))) 34 | print("Argument 2(columns renaming mapping value): %s" % str(args.columns_value.strip("{}").split("\;"))) 35 | print("Argument 3(output cleansed taxi data path): %s" % args.output_cleanse) 36 | 37 | # These functions ensure that null data is removed from the dataset, 38 | # which will help increase machine learning model accuracy. 
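# The column list and rename mapping arrive as single "\;"-delimited strings (pipeline
# arguments are plain strings), so they are unpacked below into a Python list of columns
# to keep and a {old_name: new_name} dict that is passed to DataFrame.rename.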
39 | 40 | useful_columns = [s.strip().strip("'") for s in args.useful_columns.strip("[]").split("\;")] 41 | columns_key = [s.strip().strip("'") for s in args.columns_key.strip("[]").split("\;")] 42 | columns_value = [s.strip().strip("'") for s in args.columns_value.strip("[]").split("\;")] 43 | 44 | columns = {key: value for key, value in zip(columns_key, columns_value)} 45 | 46 | 47 | raw_df = pd.read_csv(args.data_path) 48 | new_df = (raw_df 49 | .dropna(how='all') 50 | .rename(columns=columns))[useful_columns] 51 | 52 | new_df.reset_index(inplace=True, drop=True) 53 | 54 | if not (args.output_cleanse is None): 55 | os.makedirs(args.output_cleanse, exist_ok=True) 56 | print("%s created" % args.output_cleanse) 57 | path = args.output_cleanse + "/processed.csv" 58 | write_df = new_df.to_csv(path) 59 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/prepdata/filter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | 5 | print("Filters out coordinates for locations that are outside the city border.", 6 | "Chain the column filter commands within the filter() function", 7 | "and define the minimum and maximum bounds for each field.") 8 | 9 | parser = argparse.ArgumentParser("filter") 10 | parser.add_argument('--data-path', type=str, help='input data path') 11 | parser.add_argument("--output_filter", type=str, help="filter out out of city locations") 12 | 13 | args = parser.parse_args() 14 | 15 | print("Argument (output filtered taxi data path): %s" % args.output_filter) 16 | 17 | # These functions filter out coordinates for locations that are outside the city border. 18 | 19 | # Filter out coordinates for locations that are outside the city border. 
20 | # Chain the column filter commands within the filter() function 21 | # and define the minimum and maximum bounds for each field 22 | 23 | combined_df = pd.read_csv(args.data_path + "/processed.csv") 24 | 25 | combined_df = combined_df.astype({"pickup_longitude": 'float64', "pickup_latitude": 'float64', 26 | "dropoff_longitude": 'float64', "dropoff_latitude": 'float64'}) 27 | 28 | latlong_filtered_df = combined_df[(combined_df.pickup_longitude <= -73.72) & 29 | (combined_df.pickup_longitude >= -74.09) & 30 | (combined_df.pickup_latitude <= 40.88) & 31 | (combined_df.pickup_latitude >= 40.53) & 32 | (combined_df.dropoff_longitude <= -73.72) & 33 | (combined_df.dropoff_longitude >= -74.72) & 34 | (combined_df.dropoff_latitude <= 40.88) & 35 | (combined_df.dropoff_latitude >= 40.53)] 36 | 37 | latlong_filtered_df.reset_index(inplace=True, drop=True) 38 | 39 | if not (args.output_filter is None): 40 | os.makedirs(args.output_filter, exist_ok=True) 41 | print("%s created" % args.output_filter) 42 | path = args.output_filter + "/processed.csv" 43 | write_df = latlong_filtered_df.to_csv(path) 44 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/prepdata/merge.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | 5 | print("Merge Green and Yellow taxi data") 6 | 7 | parser = argparse.ArgumentParser("merge") 8 | parser.add_argument("--output_merge", type=str, help="green and yellow taxi data merged") 9 | parser.add_argument("--green_data_path", type=str, help="green data path") 10 | parser.add_argument("--yellow_data_path", type=str, help="yellow data path") 11 | 12 | args = parser.parse_args() 13 | print("Argument (output merge taxi data path): %s" % args.output_merge) 14 | 15 | 16 | green_df = pd.read_csv(args.green_data_path + "/processed.csv") 17 | yellow_df = pd.read_csv(args.yellow_data_path + "/processed.csv") 18 | 19 | # Appending yellow data to green data 20 | combined_df = green_df.append(yellow_df, ignore_index=True) 21 | combined_df.reset_index(inplace=True, drop=True) 22 | 23 | if not (args.output_merge is None): 24 | os.makedirs(args.output_merge, exist_ok=True) 25 | print("%s created" % args.output_merge) 26 | path = args.output_merge + "/processed.csv" 27 | write_df = combined_df.to_csv(path) 28 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/prepdata/normalize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | import pandas as pd 5 | 6 | print("Replace undefined values to relavant values and rename columns to meaningful names") 7 | 8 | parser = argparse.ArgumentParser("normalize") 9 | parser.add_argument('--data-path', type=str, help='input data path') 10 | parser.add_argument("--output_normalize", type=str, help="replaced undefined values and renamed columns") 11 | 12 | args = parser.parse_args() 13 | 14 | print("Argument (output normalized taxi data path): %s" % args.output_normalize) 15 | 16 | combined_converted_df = pd.read_csv(args.data_path + "/processed.csv") 17 | 18 | # These functions replace undefined values and rename to use meaningful names. 
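# Concretely: store_forward values of "0" (and missing values) are mapped to "N", distance
# values of ".00" (and missing values) are mapped to 0 and cast to float, and the pickup/
# dropoff datetime columns are split into separate date and time columns below.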
19 | replaced_stfor_vals_df = (combined_converted_df.replace({"store_forward": "0"}, {"store_forward": "N"}) 20 | .fillna({"store_forward": "N"})) 21 | 22 | replaced_distance_vals_df = (replaced_stfor_vals_df.replace({"distance": ".00"}, {"distance": 0}) 23 | .fillna({"distance": 0})) 24 | 25 | normalized_df = replaced_distance_vals_df.astype({"distance": 'float64'}) 26 | 27 | temp = pd.DatetimeIndex(normalized_df["pickup_datetime"]) 28 | normalized_df["pickup_date"] = temp.date 29 | normalized_df["pickup_time"] = temp.time 30 | 31 | temp = pd.DatetimeIndex(normalized_df["dropoff_datetime"]) 32 | normalized_df["dropoff_date"] = temp.date 33 | normalized_df["dropoff_time"] = temp.time 34 | 35 | del normalized_df["pickup_datetime"] 36 | del normalized_df["dropoff_datetime"] 37 | 38 | normalized_df.reset_index(inplace=True, drop=True) 39 | 40 | if not (args.output_normalize is None): 41 | os.makedirs(args.output_normalize, exist_ok=True) 42 | print("%s created" % args.output_normalize) 43 | path = args.output_normalize + "/processed.csv" 44 | write_df = normalized_df.to_csv(path) 45 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/prepdata/transform.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | 5 | print("Transforms the renamed taxi data to the required format") 6 | 7 | 8 | parser = argparse.ArgumentParser("transform") 9 | parser.add_argument('--data-path', type=str, help='input data path') 10 | parser.add_argument("--output_transform", type=str, help="transformed taxi data") 11 | 12 | args = parser.parse_args() 13 | 14 | print("Argument 2(output final transformed taxi data): %s" % args.output_transform) 15 | 16 | # These functions transform the renamed data to be used finally for training. 17 | 18 | # Split the pickup and dropoff date further into the day of the week, day of the month, and month values. 19 | # To get the day of the week value, use the derive_column_by_example() function. 20 | # The function takes an array parameter of example objects that define the input data, 21 | # and the preferred output. The function automatically determines your preferred transformation. 22 | # For the pickup and dropoff time columns, split the time into the hour, minute, and second by using 23 | # the split_column_by_example() function with no example parameter. After you generate the new features, 24 | # use the drop_columns() function to delete the original fields as the newly generated features are preferred. 25 | # Rename the rest of the fields to use meaningful descriptions. 
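# Note: the functions mentioned above (derive_column_by_example, split_column_by_example,
# drop_columns) are not used in this script; the equivalent feature engineering is done
# below with plain pandas datetime accessors (.dt.dayofweek, .dt.month, .dt.day, .dt.hour,
# .dt.minute, .dt.second) followed by del on the original columns.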
26 | 27 | normalized_df = pd.read_csv(args.data_path + "/processed.csv") 28 | normalized_df = normalized_df.astype({"pickup_date": 'datetime64[ns]', "dropoff_date": 'datetime64[ns]', 29 | "pickup_time": 'datetime64[us]', "dropoff_time": 'datetime64[us]', 30 | "distance": 'float64', "cost": 'float64'}) 31 | 32 | normalized_df["pickup_weekday"] = normalized_df["pickup_date"].dt.dayofweek 33 | normalized_df["pickup_month"] = normalized_df["pickup_date"].dt.month 34 | normalized_df["pickup_monthday"] = normalized_df["pickup_date"].dt.day 35 | 36 | normalized_df["dropoff_weekday"] = normalized_df["dropoff_date"].dt.dayofweek 37 | normalized_df["dropoff_month"] = normalized_df["dropoff_date"].dt.month 38 | normalized_df["dropoff_monthday"] = normalized_df["dropoff_date"].dt.day 39 | 40 | normalized_df["pickup_hour"] = normalized_df["pickup_time"].dt.hour 41 | normalized_df["pickup_minute"] = normalized_df["pickup_time"].dt.minute 42 | normalized_df["pickup_second"] = normalized_df["pickup_time"].dt.second 43 | 44 | normalized_df["dropoff_hour"] = normalized_df["dropoff_time"].dt.hour 45 | normalized_df["dropoff_minute"] = normalized_df["dropoff_time"].dt.minute 46 | normalized_df["dropoff_second"] = normalized_df["dropoff_time"].dt.second 47 | 48 | # Drop the pickup_date, dropoff_date, pickup_time, dropoff_time columns because they're 49 | # no longer needed (granular time features like hour, 50 | # minute and second are more useful for model training). 51 | del normalized_df["pickup_date"] 52 | del normalized_df["dropoff_date"] 53 | del normalized_df["pickup_time"] 54 | del normalized_df["dropoff_time"] 55 | 56 | # Before you package the dataset, run two final filters on the dataset. 57 | # To eliminate incorrectly captured data points, 58 | # filter the dataset on records where both the cost and distance variable values are greater than zero. 59 | # This step will significantly improve machine learning model accuracy, 60 | # because data points with a zero cost or distance represent major outliers that throw off prediction accuracy. 
61 | 62 | final_df = normalized_df[(normalized_df.distance > 0) & (normalized_df.cost > 0)] 63 | final_df.reset_index(inplace=True, drop=True) 64 | 65 | # Writing the final dataframe to use for training in the following steps 66 | if not (args.output_transform is None): 67 | os.makedirs(args.output_transform, exist_ok=True) 68 | print("%s created" % args.output_transform) 69 | path = args.output_transform + "/processed.csv" 70 | write_df = final_df.to_csv(path) 71 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/trainmodel/train_step.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from sklearn.linear_model import LinearRegression 3 | import os 4 | import pandas as pd 5 | 6 | import joblib 7 | 8 | def train(): 9 | 10 | train_set = pd.read_csv(args.train_data_path + "/processed.csv") 11 | test_set = pd.read_csv(args.test_data_path + "/processed.csv") 12 | 13 | selected_columns = ['pickup_weekday', 'pickup_hour', 'distance', 'passengers', 'vendor', 'cost'] 14 | train_set = train_set[selected_columns] 15 | test_set = test_set[selected_columns] 16 | 17 | train_features = train_set.drop("cost", axis=1) 18 | train_labels = train_set["cost"].copy() 19 | lr = LinearRegression() 20 | lr.fit(train_features, train_labels) 21 | 22 | filename = os.path.join('outputs', 'taxi.pkl') 23 | 24 | joblib.dump(lr, filename) 25 | 26 | test_features = test_set.drop("cost", axis=1)[:3] 27 | test_labels = test_set["cost"].copy() 28 | preds = lr.predict(test_features) 29 | 30 | print("preds", preds) 31 | 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser("split") 35 | parser.add_argument("--train_data_path", type=str, help="train data path") 36 | parser.add_argument("--test_data_path", type=str, help="test data path") 37 | args = parser.parse_args() 38 | train() -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/scripts/trainmodel/train_test_split.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | from sklearn.model_selection import train_test_split 5 | 6 | def write_output(df, path): 7 | os.makedirs(path, exist_ok=True) 8 | print("%s created" % path) 9 | df.to_csv(path + "/processed.csv") 10 | 11 | 12 | print("Split the data into train and test") 13 | 14 | parser = argparse.ArgumentParser("split") 15 | parser.add_argument('--data-path', type=str, help='input data path') 16 | parser.add_argument("--output_split_train", type=str, help="output split train data") 17 | parser.add_argument("--output_split_test", type=str, help="output split test data") 18 | 19 | args = parser.parse_args() 20 | 21 | print("Argument 1(output training data split path): %s" % args.output_split_train) 22 | print("Argument 2(output test data split path): %s" % args.output_split_test) 23 | 24 | # These functions splits the input features and labels into test and train data 25 | # Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-auto-train-models for more detail 26 | 27 | 28 | transformed_df = pd.read_csv(args.data_path + "/processed.csv") 29 | output_split_train, output_split_test = train_test_split(transformed_df, test_size=0.2, random_state=223) 30 | output_split_train.reset_index(inplace=True, drop=True) 31 | output_split_test.reset_index(inplace=True, drop=True) 32 | 33 | if not 
(args.output_split_train is None and 34 | args.output_split_test is None): 35 | write_output(output_split_train, args.output_split_train) 36 | write_output(output_split_test, args.output_split_test) 37 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/pipeline/test_set.csv: -------------------------------------------------------------------------------- 1 | ,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,cost,distance,dropoff_latitude,dropoff_longitude,passengers,pickup_latitude,pickup_longitude,store_forward,vendor,pickup_weekday,pickup_month,pickup_monthday,dropoff_weekday,dropoff_month,dropoff_monthday,pickup_hour,pickup_minute,pickup_second,dropoff_hour,dropoff_minute,dropoff_second 2 | 0,3823,3891,3891,3933,3933,10.5,1.49,40.67766953,-73.96222687,1,40.6884613,-73.98020935,N,2,1,1,26,1,1,26,15,55,16,16,12,33 3 | 1,2408,2444,2444,2467,2467,20,6.66,40.73447418,-73.99212646,1,40.79590988,-73.93545532,N,2,6,1,24,6,1,24,23,20,56,23,36,9 4 | 2,2722,2766,2766,2792,2792,7.5,1.42,40.70057297,-73.99156189,2,40.68553925,-73.99443054,N,2,5,1,9,5,1,9,17,0,20,17,8,52 5 | 3,600,609,609,613,613,6,0.89,40.80518341,-73.94141388,1,40.81178665,-73.95517731,N,2,1,1,5,1,1,5,8,24,18,8,30,25 6 | 4,2181,2216,2216,2235,2235,5.5,0.98,40.71474075,-73.9499054,1,40.71577454,-73.96444702,N,2,4,1,1,4,1,1,22,44,16,22,48,52 7 | 5,1542,1567,1567,1580,1580,5.5,1,40.66854095,-73.99355316,1,40.67189789,-73.98403168,N,2,6,1,10,6,1,10,18,20,31,18,24,45 8 | 6,1389,1413,1413,1425,1425,23.5,6.59,40.7955246,-73.97167206,6,40.7430687,-73.91899109,N,2,0,1,18,0,1,18,6,48,55,7,16,50 9 | 7,653,663,663,667,667,7,1.4,40.7974472,-73.97264862,1,40.79085159,-73.95359039,N,1,5,1,2,5,1,2,14,49,24,14,55,57 10 | 8,2406,2442,2442,2465,2465,4.5,0.66,40.80567551,-73.94690704,1,40.81209183,-73.94241333,N,2,4,1,15,4,1,15,11,6,19,11,9,34 11 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/upload-download-model/AML-model-download-upload.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Upload Downloaded AML Model Files To Azure Blobs\n", 8 | "\n", 9 | "In this notebook, you will download a model from azure machine learning workspace to your local envrionment, then upload the model files as azure storage blobs.\n", 10 | "\n", 11 | "## Prerequisites\n", 12 | "* Azure Machine Learning Workspace\n", 13 | "If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [AZML-SDK-INSTALL](https://docs.microsoft.com/en-us/python/api/overview/azure/ml/install?view=azure-ml-py) to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`.\n", 14 | "* An registered AML machine learning model. For how to train and register model, please see [pytorch-cifar10-train](https://github.com/Azure/AML-Kubernetes/tree/master/docs/AKS-HCI/notebooks/distributed-cifar10/distributed-pytorch-cifar10.ipynb)\n", 15 | "* Azure storage blob client library. For more details, please see [here](https://docs.microsoft.com/en-us/python/api/overview/azure/storage-blob-readme?view=azure-python). 
" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from azureml.core import Workspace\n", 25 | "import os" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Initialize workspace\n", 33 | "\n", 34 | "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`. \n", 35 | "\n", 36 | "If you haven't done already please go to `config.json` file and fill in your workspace information." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "ws = Workspace.from_config()\n", 46 | "print('Workspace name: ' + ws.name, \n", 47 | " 'Azure region: ' + ws.location, \n", 48 | " 'Subscription id: ' + ws.subscription_id, \n", 49 | " 'Resource group: ' + ws.resource_group, sep='\\n')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Download model from AML Workspace" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "model_path = 'cifar10model'\n", 66 | "model_name = \"cifar10torch\"\n", 67 | "\n", 68 | "ws.models[model_name].download(target_dir=model_path, exist_ok=True)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Upload model files to Azure storage blobs" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "from azure.storage.blob import BlobServiceClient, ContainerClient\n", 85 | " \n", 86 | "connection_string = \"\"\n", 87 | "blob_service_client = BlobServiceClient.from_connection_string(connection_string)\n", 88 | " \n", 89 | "container_name = \"pytorchmodel\"\n", 90 | "\n", 91 | "container_client = blob_service_client.get_container_client(container_name)\n", 92 | "\n", 93 | "try:\n", 94 | " container_properties = container_client.get_container_properties()\n", 95 | "except Exception as e:\n", 96 | " container_client.create_container()\n", 97 | "\n", 98 | "for root, dirs, files in os.walk(model_path):\n", 99 | " for file in files:\n", 100 | " source_file = os.path.join(root, file)\n", 101 | " blob_name = source_file\n", 102 | " blob_client = container_client.get_blob_client(blob_name)\n", 103 | " if blob_client.exists():\n", 104 | " blob_client.delete_blob()\n", 105 | " with open(source_file, \"rb\") as data:\n", 106 | " blob_client.upload_blob(data, blob_type=\"BlockBlob\")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Now you should see the files are uploaded to Blob Storage" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "interpreter": { 119 | "hash": "fc402497f0168b24575e2ffafe64cd34c507b9a7fab971a93b09782ae565c5c6" 120 | }, 121 | "kernelspec": { 122 | "display_name": "Python 3.8.3 64-bit", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | 
"version": "3.8.3" 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 4 140 | } 141 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/upload-download-model/README.md: -------------------------------------------------------------------------------- 1 | # Upload Downloaded AML Model Files To Azure Blobs 2 | 3 | These samples demonstrate how to download AML model files and upload to Azure Blob 4 | 5 | ## Notebooks 6 | 7 | * [Model Download and Upload](AML-model-download-upload.ipynb) 8 | 9 | This notebook demonstrates an example of downloading AML model files then upload to Azure Blob 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /docs/AKS-HCI/notebooks/upload-download-model/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "", 3 | "resource_group": "", 4 | "workspace_name": "" 5 | } -------------------------------------------------------------------------------- /docs/AKS-HCI/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting for AzureML Training on Azure Stack Hub Kubernetes Cluster and Storage 2 | 3 | ## AzureML Dataset and Datastore Issues 4 | 5 | * Known limitations: 6 | 7 | * AML Tabular Datasets are not currenly supported on ASH blob storage 8 | * AML Dataset downloading is not yet stable. PLease use mounting to access your files during training 9 | 10 | * If you can't access your datastore (mounting or uploading): Please check if your generated SAS token for your ASH storage container is still valid and not expired. If unsure you can try to repeat the instruction from step 5 of [this document](Train-AzureArc.md#create-and-configure-azure-stack-hubs-storage-account). 11 | 12 | ## ASH Kubernetes Cluster Related Issues 13 | 14 | * Attaching ASH Kubernetes Cluster to AzureML Workspace Failed 15 | 16 | * Make sure your Kubernetes cluster is connected to Azure using Azure Arc. Also, make sure the version of your Kubernetes cluster is [supported](https://docs.microsoft.com/en-us/azure/aks/supported-kubernetes-versions#kubernetes-version-support-policy). Please make sure you are using the latest Arc agent according to pre-requisites of [this doc](https://github.com/Azure/azure-arc-kubernetes-preview/blob/master/docs/k8s-extensions.md#pre-requisites). You can find most of your information from the Azure portal: 17 | 18 |

19 | 20 |

21 | 22 | * Please make sure the latest Arc extensions are installed and Arc connections are created as described in the pre-requisites of [this doc](https://github.com/Azure/azure-arc-kubernetes-preview/blob/master/docs/k8s-extensions.md#pre-requisites). 23 | 24 | 25 | You may also run the following kubectl commands against one of the master nodes of your cluster to check if your cluster is properly attached to Azure via Azure Arc: 26 | 27 |
 kubectl get ns 
28 | 29 | You should see "azure-arc" is one of the namespaces. 30 | 31 |
 kubectl get pods -n azure-arc 
32 | You should see all the pods in "Running" status. 33 | 34 | 35 | ## AzureML Run Issues 36 | 37 | * No Progress on AzureML Experiment Runs 38 | 39 | Currently, there is a limitation on the number of pods that can run simultaneously on a single node: there can't be more than 1 pending pod per node. This means that if there are more pending pods than worker nodes, all the pending pods will remain pending indefinitely. For example, if your training workloads together send 5 pods to your Kubernetes cluster and you only have 4 worker nodes, your training pods will never get scheduled by the Kubernetes agent. If you face this issue, please cancel all of your non-progressing runs in your AzureML workspace and retry accordingly. If you run distributed training, you may need to reduce the node_count value in the Run Configuration. 40 | 41 | * Out of memory issue 42 | 43 | If your training job fails without any apparent reason, it could be because your Kubernetes nodes do not have sufficient memory. Insufficient node memory can also explain cases in which training succeeds for a single epoch but fails when run for multiple epochs. Try either increasing your nodes' memory or optimizing your training code to be less memory intensive. 44 | 45 | * Bugs in your scripts 46 | 47 | 48 | These issues are relatively easy to debug. You can check the run logs in your AzureML Workspace for information about environment image creation, runtime errors, outputs generated by your scripts, and so on. This way you can pinpoint why your training workload failed. Here is a snapshot: 49 | 50 |

51 | 52 |

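For the "No Progress on AzureML Experiment Runs" case above, a quick way to compare the number of pending training pods against the number of worker nodes is sketched below. This is a hedged example only; the `default` namespace is an assumption and should be replaced with the namespace your attached compute actually uses.

```bash
# List training pods stuck in Pending (replace "default" with the namespace of your attached compute)
kubectl get pods -n default --field-selector=status.phase=Pending

# Count the worker nodes available to schedule those pods
kubectl get nodes --no-headers | wc -l
```

If the number of pending pods exceeds the number of nodes, cancel the non-progressing runs or reduce the node count as described above.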
53 | 54 | 55 | ## Other Known Limitations 56 | 57 | Please check out our other [known limitations](../limitations-and-knownIssues.md) and [troubleshooting](../troubleshooting.md) docs. 58 | -------------------------------------------------------------------------------- /docs/AKS-HCI/video/kfserving_tf_blob_structure.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/AKS-HCI/video/kfserving_tf_blob_structure.mp4 -------------------------------------------------------------------------------- /docs/application-gateway-ingress-controller.md: -------------------------------------------------------------------------------- 1 | # Tutorial 2 | 3 | These tutorials help illustrate how to integrate [Azure Application Gateway](https://azure.microsoft.com/en-us/services/application-gateway/) with AzureML extension over HTTP or HTTPS. 4 | 5 | ## Table of Contents 6 | 7 | - [Prerequisites](#prerequisites) 8 | - [Deploy AzureML extension](#deploy-azureml-extension) 9 | - [Expose services over HTTP](#expose-services-over-http) 10 | - [Expose services over HTTPS](#expose-services-over-https) 11 | 12 | ## Prerequisites 13 | 14 | - Install the latest k8s-extension and ml cli. 15 | - `az extension add -n k8s-extension --upgrade` 16 | - `az extension add -n ml --upgrade` 17 | - Setup Application Gateway. 18 | - [**Greenfield Deployment**](https://docs.microsoft.com/en-us/azure/application-gateway/tutorial-ingress-controller-add-on-new): If you are starting from scratch, refer to these instructions. 19 | - [**Brownfield Deployment**](https://docs.microsoft.com/en-us/azure/application-gateway/tutorial-ingress-controller-add-on-existing): If you have an existing AKS cluster and Application Gateway, refer to these instructions. 20 | - If you want to use HTTPS on this application, you will need a x509 certificate and its private key. 21 | 22 | ## Deploy AzureML extension 23 | 24 | [Deploy extension](https://github.com/Azure/AML-Kubernetes/blob/master/docs/deploy-extension.md#azureml-extension-deployment-scenarios) with `inferenceRouterServiceType=ClusterIP` and `allowInsecureConnections=True`, so that the Application gateway can handle TLS termination by itself instead of handing it over to azureml-fe (azureml inference router created by extension) when service is exposed over HTTPS. 25 | 26 | 27 | ## Expose services over HTTP 28 | 29 | In order to expose the azureml-fe we will using the following ingress resource: 30 | 31 | ```yaml 32 | apiVersion: networking.k8s.io/v1 33 | kind: Ingress 34 | metadata: 35 | name: azureml-fe 36 | namespace: azureml 37 | spec: 38 | ingressClassName: azure-application-gateway 39 | rules: 40 | - http: 41 | paths: 42 | - path: / 43 | backend: 44 | service: 45 | name: azureml-fe 46 | port: 47 | number: 80 48 | pathType: Prefix 49 | ``` 50 | 51 | This ingress will expose the `azureml-fe` service and the selected deployment as a default backend of the Application Gateway. 52 | 53 | Save the above ingress resource as `ing-azureml-fe.yaml`. 54 | 55 | 1. Deploy `ing-azureml-fe.yaml` by running: 56 | 57 | ```bash 58 | kubectl apply -f ing-azureml-fe.yaml 59 | ``` 60 | 61 | 2. Check the log of the ingress controller for deployment status. 62 | 63 | 3. Now the `azureml-fe` application should be available. You can check this by visiting the public address of the Application Gateway. 64 | 65 | 4. 
[Create an inference job and invoke](https://github.com/Azure/AML-Kubernetes/blob/master/docs/simple-flow.md). 66 | 67 | *NOTE:* Replace the ip in scoring_uri with public address of the Application Gateway before invoking. 68 | 69 | ## Expose services over HTTPS 70 | 71 | 1. Before deploying ingress, you need to create a kubernetes secret to host the certificate and private key. You can create a kubernetes secret by running 72 | 73 | ```bash 74 | kubectl create secret tls -n azureml --key --cert 75 | ``` 76 | 77 | 2. Define the following ingress. In the ingress, specify the name of the secret in the `secretName` section. 78 | 79 | ```yaml 80 | apiVersion: networking.k8s.io/v1 81 | kind: Ingress 82 | metadata: 83 | name: azureml-fe 84 | namespace: azureml 85 | spec: 86 | ingressClassName: azure-application-gateway 87 | tls: 88 | - hosts: 89 | - 90 | secretName: 91 | rules: 92 | - host: 93 | http: 94 | paths: 95 | - path: / 96 | backend: 97 | service: 98 | name: azureml-fe 99 | port: 100 | number: 80 101 | pathType: Prefix 102 | ``` 103 | 104 | *NOTE:* Replace `` and `` in the above Ingress Resource with the domain pointing to the Application Gateway and the name of your secret. Store the above Ingress Resource in a file name `ing-azureml-fe-tls.yaml`. 105 | 106 | 1. Deploy ing-azureml-fe-tls.yaml by running 107 | 108 | ```bash 109 | kubectl apply -f ing-azureml-fe-tls.yaml 110 | ``` 111 | 112 | 2. Check the log of the ingress controller for deployment status. 113 | 114 | 3. Now the `azureml-fe` application will be available on HTTPS. You can check this by visiting the public address of the Application Gateway. 115 | 116 | 4. [Create an inference job and invoke](https://github.com/Azure/AML-Kubernetes/blob/master/docs/simple-flow.md). 117 | 118 | *NOTE:* Replace the protocol and ip in scoring_uri with https and domain pointing to the Application Gateway before invoking. 119 | -------------------------------------------------------------------------------- /docs/azureml-aks-ta-support.md: -------------------------------------------------------------------------------- 1 | # AzureML access to AKS clusters with special configurations 2 | 3 | Built-upon [AKS Trusted Access feature](https://learn.microsoft.com/azure/aks/trusted-access-feature), AzureML now supports access to AKS clusters with following special configurations: 4 | - AKS cluster with local account disabled 5 | - AKS cluster with authorized IP range 6 | - Private AKS with public FQDN configuration 7 | 8 | 📣 This feature has been deployed in the public cloud(AzureCloud). AzureUSGovernment, AzureChinaCloud and AirGap clouds have not enabled this feature. 9 | 10 | Once the feature is deplyed to your regions, you could (re/)attach your compute to enable it; you can verify if the feature has been enabled on your AKS cluster with following steps: 11 | - Verify that ```Microsoft.MachineLearningServices/workspaces/mlworkload``` role binding is created in AKS cluster. **Note**: AzureML role binding is per workspace, if your AKS cluster is shared among multiple workspace, you should have AzureML role binding for each workspace. 12 | ```shell 13 | az aks trustedaccess rolebinding list --resource-group --cluster-name 14 | ``` 15 | > **Notes**: 16 | > 17 | > * If you have any existing compute targets created before AzureML role binding was created, those compute targets will not work with AKS cluster with above special configurations. Please detach those existing compute targets to avoid any issues. 
18 | > * This role binding does not work with legacy AksCompute (AKS inference cluster). -------------------------------------------------------------------------------- /docs/deploy-on-ocp.md: -------------------------------------------------------------------------------- 1 | # Deploy AzureML extension on OpenShift Container Platform 2 | 3 | Azure Arc enabled ML supports both Azure RedHat OpenShift Service (ARO) and OpenShift Container Platform (OCP). 4 | 5 | ## Prerequisites 6 | 7 | An ARO or OCP Kubernetes cluster is up and running. 8 | 9 | * To setup ARO Kubernetes cluster on Azure, please follow instruction [here](https://docs.microsoft.com/azure/openshift/tutorial-create-cluster) 10 | * to setup OCP Kubernetes clsuter, please follow instructure on [RedHat website](https://docs.openshift.com/container-platform/4.6/installing/installing_platform_agnostic/installing-platform-agnostic.html). 11 | 12 | ## Disable Security Enhanced Linux (SELinux) 13 | 14 | [AzureML dataset](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-with-datasets), usually used in ML training jobs, is not supported on machines with SELinux enabled. Therefore, to use AzureML dataset, please make sure `selinux` is disabled on workers for AzureML usage. 15 | 16 | ## Privileged setup for ARO and OCP 17 | 18 | For AzureML extension deployment on ARO or OCP cluster, grant privileged access to AzureML service accounts, run ```oc edit scc privileged``` command, and add following service accounts under "users:": 19 | 20 | * ```system:serviceaccount:azure-arc:azure-arc-kube-aad-proxy-sa``` 21 | * ```system:serviceaccount:azureml:{EXTENSION-NAME}-kube-state-metrics``` 22 | * ```system:serviceaccount:azureml:cluster-status-reporter``` 23 | * ```system:serviceaccount:azureml:prom-admission``` 24 | * ```system:serviceaccount:azureml:default``` 25 | * ```system:serviceaccount:azureml:prom-operator``` 26 | * ```system:serviceaccount:azureml:csi-blob-node-sa``` 27 | * ```system:serviceaccount:azureml:csi-blob-controller-sa``` 28 | * ```system:serviceaccount:azureml:load-amlarc-selinux-policy-sa``` 29 | * ```system:serviceaccount:azureml:azureml-fe``` 30 | * ```system:serviceaccount:azureml:prom-prometheus``` 31 | * ```system:serviceaccount:{KUBERNETES-COMPUTE-NAMESPACE}:default``` 32 | * ```system:serviceaccount:azureml:azureml-ingress-nginx``` 33 | * ```system:serviceaccount:azureml:azureml-ingress-nginx-admission``` 34 | > **Notes** 35 | >* **{EXTENSION-NAME}:** is the extension name specified with ```az k8s-extension create --name``` CLI command. 36 | >* **{KUBERNETES-COMPUTE-NAMESPACE}:** is the namespace of kubernetes compute specified with ```az ml compute attach --namespace``` CLI command. Skip configuring 'system:serviceaccount:{KUBERNETES-COMPUTE-NAMESPACE}:default' if no namespace specified with ```az ml compute attach ``` CLI command. 37 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | 2 | # Frequently Asked Questions 3 | 4 | ## Who is Azure Arc enabled Machine Learning intended for? 5 | 6 | With increasing adoption of Kubernetes for machine learning among enterprises, Azure Machine Learning provides enterprise ML infrastructure team to easily setup and enable Kubernetes for their data science teams to use. 
At the same time, data scientists can focus on building high quality models and model deployment professionals can focus on scaling models production without getting involved about Kubernetes technical details. 7 | 8 | ## Why should I use Azure Arc enabled Machine Learning? 9 | 10 | Many enterprises want to start machine learning now with where data lives today, which could be in multi-cloud or on-premises. Enterprises also want to optimize IT operation to leverage wherever workload is available. With flexibility of cloud-native development provided by Kubernetes, enterprises now can spin up Kubernetes cluster anywhere to meet their machine learning needs, at the same time to address security and privacy compliance requirements in a highly regulated environment. With Azure Arc enabled Machine Learning, enterprises now can have hybrid machine learning lifecycle such as train models in cloud and deploy models on-premises, or train models on-premises and deploy models in cloud, to leverage where compute and data available and broaden service access. 11 | 12 | ## Isn’t Azure Arc enabled Machine Learning still in public cloud? 13 | 14 | * The control plane (Azure Machine Learning Studio, Azure Machine Learning microservices, dependent Azure services) is in the cloud. The cluster and data can be on premises or in any cloud up to the infrastructure setup. The Azure Machine Learning extension deployed to the cluster is used to communicate with the control plance, and make machine learning workloads run properly in the cluster. 15 | 16 | * Azure Arc enabled Machine Learning extends AzureML anywhere to on-premises or any cloud. Both existing types of the compute in AzureML and Arc enabled cluster share the same AzureML control plane. 17 | 18 | * The hybrid archetecture (having control plane in cloud) benefits customer with the evolving experiences with Azure Machine Learning platform features. 19 | 20 | * Azure private link setup on Azure Arc and Azure Machine Learning related resources can avoid public network inbound and outbound. 21 | 22 | ## How do I use Azure Arc enabled Machine Learning? 23 | 24 | Enterprise IT operator can easily setup and enable Kubernetes for Azure Machine Learning with the following steps: 25 | 26 | * Spin up a Kubernetes cluster anywhere 27 | * Connect Kubernetes cluster to Azure cloud via Azure Arc 28 | * Deploy AzureML extension to Azure Arc enabled Kubernetes cluster 29 | * Attach Azure Arc enabled Kubernetes cluster to Azure ML workspace and create compute target for data science teams to use 30 | 31 | Once Kubernetes cluster is enabled for Azure Machine Learning, data science professionals can discover Kubernetes compute targets in AzureML workspace or through CLI command, and use those compute targets to submit training job or deploy model. 32 | 33 | ## How does model deployment with Azure Arc enabled Machine Learning compare to Azure Machine Learning Managed Online Endpoint? 34 | 35 | Both online endpoints are built on AzureML online endpoint concept, and customers use the same set of tools to create and manage both types of online endpoints. Managed online endpoint runs on powerful Azure managed compute, no compute and infrastructure management for customers and customer gets a turnkey solution with guaranteed SLA. Kubernetes online endpoint runs on customer managed Kubernetes, customer is responsible for managing Kubernetes cluster and ensuring online endpoint SLA. 
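As a rough illustration of the setup steps listed under "How do I use Azure Arc enabled Machine Learning?" above, the flow maps to a handful of CLI calls. This is only a hedged sketch: the cluster, resource group, workspace, and extension names are placeholders, and the extension configuration shown (training plus inference) may need additional settings for your scenario.

```azurecli
# 1. Connect an existing Kubernetes cluster to Azure via Azure Arc
az connectedk8s connect --name <cluster-name> --resource-group <resource-group>

# 2. Deploy the AzureML extension to the Arc-enabled cluster
az k8s-extension create --name <extension-name> --extension-type Microsoft.AzureML.Kubernetes \
    --cluster-type connectedClusters --cluster-name <cluster-name> --resource-group <resource-group> \
    --config enableTraining=True enableInference=True

# 3. Attach the cluster to an AzureML workspace as a Kubernetes compute target
az ml compute attach --type Kubernetes --name <compute-name> \
    --resource-id "/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.Kubernetes/connectedClusters/<cluster-name>" \
    --workspace-name <workspace-name> --resource-group <resource-group>
```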
36 | 37 | 38 | ## Recommended AKS cluster resources 39 | 40 | We recommend a cluster with at least 3 nodes, each node having at least 2 vCPUs and 4 GB of memory. If you want to run GPU jobs, you also need some GPU nodes. 41 | 42 | ## Why does a run occupy more nodes than the node count shown in the run list? 43 | 44 | The node count is the number of workers. Distributed training jobs, such as ps-worker or MPI/Horovod jobs, may need an extra launcher node or parameter server (ps) node, which also occupies a node. We will optimize this in a following version. 45 | 46 | ## What Azure storage does Azure Arc-enabled ML support? 47 | 48 | Azure Arc-enabled ML compute only supports Azure Blob containers. If your data is in another Azure storage service, please move it to Azure Blob storage first. We will support other Azure storage services in a following iteration. 49 | -------------------------------------------------------------------------------- /docs/gke-setup.md: -------------------------------------------------------------------------------- 1 | ## GKE setup 2 | 1. Select the Ubuntu OS image during cluster creation 3 | 2. A minimum of 3 nodes is required; you need enough resources for the Arc agent and AMLK8s agent installation 4 | 3. DO NOT select VMs smaller than 'medium' size 5 | 6 | GKE console -> +Create Cluster -> Node Pools -> Default-pool -> Nodes 7 | ![GKEClusterCreate](/docs/media/gkecreate.png) 8 | 9 | 10 | 4. Once the installation is complete, you need to SSH into each node in your cluster (nodes can be found in Compute Engine under VM instances; the SSH tool is under the Connect column). 11 | 12 | ![GKEClusterSSH](/docs/media/gke-ssh.png) 13 | 14 | 5. Execute the following commands on each node: 15 | 16 | ```bash 17 | sudo ln -s /etc/kubernetes/volumeplugins/azure~blobfuse /home/kubernetes/flexvolume/ 18 | 19 | sudo apt-get update; sudo apt-get install jq 20 | 21 | wget https://packages.microsoft.com/config/ubuntu/18.04/packages-microsoft-prod.deb; sudo dpkg -i packages-microsoft-prod.deb; sudo apt-get update; sudo apt-get install blobfuse 22 | ``` 23 | -------------------------------------------------------------------------------- /docs/how-to-debug-arc-kubernetes-training.md: -------------------------------------------------------------------------------- 1 | # Job is pending for a long time 2 | 3 | ## Check the resource capacity of the nodes 4 | 5 | ``` azure cli 6 | kubectl get nodes -o json | jq '.items[]|{name: .metadata.name, capacity: .status.capacity, allocatable: .status.allocatable}' 7 | ``` 8 | 9 | Here is a sample output: 10 | 11 | ``` azure cli 12 | { 13 | "name": "aks-nodepool1-36994511-vmss000000", 14 | "capacity": { 15 | "attachable-volumes-azure-disk": "24", 16 | "cpu": "6", 17 | "ephemeral-storage": "129900528Ki", 18 | "github.com/fuse": "1k", 19 | "hugepages-1Gi": "0", 20 | "hugepages-2Mi": "0", 21 | "memory": "57584828Ki", 22 | "nvidia.com/gpu": "1", 23 | "pods": "110" 24 | }, 25 | "allocatable": { 26 | "attachable-volumes-azure-disk": "24", 27 | "cpu": "5840m", 28 | "ephemeral-storage": "119716326407", 29 | "github.com/fuse": "1k", 30 | "hugepages-1Gi": "0", 31 | "hugepages-2Mi": "0", 32 | "memory": "51573948Ki", 33 | "nvidia.com/gpu": "1", 34 | "pods": "110" 35 | } 36 | } 37 | ``` 38 | 39 | ## Insufficient github.com/fuse 40 | 41 | Check whether the k8s-host-device-plugin-daemonset is installed properly. 42 | 43 | ``` azure cli 44 | kubectl get ds -A | grep k8s-host-device-plugin-daemonset 45 | ``` 46 | 47 | ## Insufficient nvidia.com/gpu 48 | 49 | Check whether the nvidia-device-plugin-daemonset is installed properly.
For more details, please refer to [k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin) 50 | 51 | ``` azure cli 52 | kubectl get ds -A | grep nvidia-device-plugin-daemonset 53 | ``` 54 | 55 | ## The "ps-0" pod is stuck in pending status 56 | 57 | ``` azure cli 58 | kubectl get pods | grep ps-0 59 | ``` 60 | 61 | Please try using another instance-type of lower resource requested. 62 | 63 | ## blobfuse fails to mount 64 | 65 | Make sure the [blobfuse-flexvolume-installer](https://github.com/Azure/kubernetes-volume-drivers/tree/master/flexvolume#config-kubelet-service-to-enable-flexvolume-driver) daemonset is installed properly 66 | 67 | ## no volume plugin matched 68 | 69 | ``` error message 70 | Warning FailedMount xxx kubelet xxxx: failed to get Plugin from volumeSpec for volume "f38a008f5870bd913f36e68c12dc1827-blobfuse-0" err=no volume plugin matched 71 | ``` 72 | 73 | This error message indicates that the flexvol is not installed properly. 74 | 75 | Try changing the installation path for the volume plugin 76 | 77 | ``` azure cli 78 | az k8s-extension create -g -c --cluster-type connectedClusters --extension-type Microsoft.AzureML.Kubernetes -n trainingcompute --release-train stable --config enableTraining=true --configuration-settings volumePluginDir=/usr/libexec/kubernetes/kubelet-plugins/volume/exec/ 79 | ``` 80 | 81 | # Job is in failed status 82 | 83 | ## OOM Error (Out of Memory) 84 | 85 | Please try adjusting the batch size of the training job or using an instance-type with higher memory limit. For tensorflow jobs, unlike other distributed jobs set the memory limits of ps pod hard coded with 2048Mi, the memory limit of ps pods are the same as the worker nodes'. 86 | 87 | ## Permission Denied under '/workspaceblobstore/azureml' 88 | 89 | ![image](../pics/permission_denied.png) 90 | 91 | Please upgrade the blobfuse on the kubernetes nodes to 1.3.6 or above. 92 | 93 | ``` azure cli 94 | az k8s-extension create -g -c --cluster-type connectedClusters --extension-type Microsoft.AzureML.Kubernetes -n trainingcompute --release-train stable --config enableTraining=true blobfuseSysctlInstall.enabled=true 95 | ``` 96 | 97 | ## stderr: nvidia-container-cli: initialization error: nvml error: driver/library version mismatch 98 | 99 | ![image](../pics/nvml_error.png) 100 | 101 | 1. Try restarting the problematic node. 102 | 103 | 2. Check whether [nvml driver library version mismatch](https://stackoverflow.com/questions/43022843/nvidia-nvml-driver-library-version-mismatch) 104 | 105 | 106 | 107 | ## Job failed with blobfuse using SasToken 108 | 109 | It may be due to an outdated CRD of the aml-operator, please update the CRD in the cluster. 110 | 111 | 112 | ## x509: certificate signed by unknown authority 113 | 114 | It may be due to cluster is configured with an outbound proxy with self-signed certificate but arc extension doesn't trust the certificate. Please follow the [guidance to provide and trust proxy-cert when connect cluster to Azure Arc](https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#4a-connect-using-an-outbound-proxy-server) 115 | 116 | -------------------------------------------------------------------------------- /docs/instance-type.md: -------------------------------------------------------------------------------- 1 | # Instance types 2 | 3 | ## What are instance types? 
4 | Instance types are an Azure Machine Learning concept that allows targeting certain types of 5 | compute nodes for training and inference workloads. For an Azure VM, an example for an 6 | instance type is `STANDARD_D2_V3`. 7 | 8 | In Kubernetes clusters, instance types are represented by two elements: 9 | [nodeSelector](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector) 10 | and [resources](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/). 11 | In short, a `nodeSelector` lets us specify which node a pod should run on. The node must have a 12 | corresponding label. In the `resources` section, we can set the compute resources (CPU, memory and 13 | Nvidia GPU) for the pod. 14 | 15 | ## Create instance types 16 | Instance types are represented in a custom resource definition (CRD) that is installed with the 17 | Azure Machine Learning extension. To create a new instance type, create a new custom resource 18 | for the instance type CRD. For example: 19 | ```bash 20 | kubectl apply -f my_instance_type.yaml 21 | ``` 22 | 23 | With `my_instance_type.yaml`: 24 | ```yaml 25 | apiVersion: amlarc.azureml.com/v1alpha1 26 | kind: InstanceType 27 | metadata: 28 | name: myinstancetypename 29 | spec: 30 | nodeSelector: 31 | mylabel: mylabelvalue 32 | resources: 33 | limits: 34 | cpu: "1" 35 | nvidia.com/gpu: 1 36 | memory: "2Gi" 37 | requests: 38 | cpu: "700m" 39 | memory: "1500Mi" 40 | ``` 41 | 42 | This creates an instance type with the following behavior: 43 | - Pods will be scheduled only on nodes with label `mylabel: mylabelvalue`. 44 | - Pods will be assigned resource requests of `700m` CPU and `1500Mi` memory. 45 | - Pods will be assigned resource limits of `1` CPU, `2Gi` memory and `1` Nvidia GPU. 46 | 47 | Note: 48 | - Nvidia GPU resources are only specified in the `limits` section as integer values. For more information, 49 | please refer to the Kubernetes [documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/#using-device-plugins). 50 | - CPU and memory resources are string values. 51 | - CPU can be specified in millicores, for example `100m`, or in full numbers, for example `"1"` which 52 | is equivalent to `1000m`. 53 | - Memory can be specified as a full number + suffix, for example `1024Mi` for 1024 MiB. 54 | 55 | It is also possible to create multiple instance types at once: 56 | ```bash 57 | kubectl apply -f my_instance_type_list.yaml 58 | ``` 59 | 60 | With `my_instance_type_list.yaml`: 61 | ```yaml 62 | apiVersion: amlarc.azureml.com/v1alpha1 63 | kind: InstanceTypeList 64 | items: 65 | - metadata: 66 | name: cpusmall 67 | spec: 68 | resources: 69 | requests: 70 | cpu: "100m" 71 | memory: "100Mi" 72 | limits: 73 | cpu: "1" 74 | nvidia.com/gpu: 0 75 | memory: "1Gi" 76 | 77 | - metadata: 78 | name: defaultinstancetype 79 | spec: 80 | resources: 81 | requests: 82 | cpu: "1" 83 | memory: "1Gi" 84 | limits: 85 | cpu: "1" 86 | nvidia.com/gpu: 0 87 | memory: "1Gi" 88 | ``` 89 | 90 | The above example creates two instance types: `cpusmall` and `defaultinstancetype`. The latter 91 | is examplained in more detail in the following section. 92 | 93 | ## Default instance types 94 | If a training or inference workload is submitted without an instance type, it uses the default 95 | instance type. To specify a default instance type for a Kubernetes cluster, create an instance 96 | type with name `defaultinstancetype`. It will automatically be recognized as the default. 
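Whether you rely on custom instance types or a `defaultinstancetype` you created yourself, you can confirm what was actually registered in the cluster by querying the custom resources directly. A hedged sketch, using `cpusmall` from the list example above:

```bash
# List the InstanceType custom resources currently defined in the cluster
kubectl get instancetype

# Inspect one of the instance types created from the example list above
kubectl describe instancetype cpusmall
```

Note that the built-in default described next is not backed by a custom resource, so it will not show up in this output.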
97 | 98 | If no default instance type was defined, the following default behavior applies: 99 | - No nodeSelector is applied, meaning the pod can get scheduled on any node. 100 | - The workload's pods are assigned default resources with 0.6 cpu cores, 1536Mi memory and 0 GPU: 101 | ```yaml 102 | resources: 103 | requests: 104 | cpu: "0.6" 105 | memory: "1536Mi" 106 | limits: 107 | cpu: "0.6" 108 | memory: "1536Mi" 109 | nvidia.com/gpu: null 110 | ``` 111 | - This default instance type will not appear as an InstanceType custom resource in the cluster when running the command ```kubectl get instancetype```, 112 | but it will appear in all clients (UI, CLI, SDK). 113 | 114 | **Note:** The default instance type purposefully uses little resources. To ensure all ML workloads 115 | run with appropriate resources, for example GPU resource, it is highly recommended to create custom instance types. 116 | 117 | ## Select instance type to submit training job 118 | To select an instance type for a training job using CLI (V2), specify its name as part of the 119 | `compute` section. For example: 120 | ```yaml 121 | command: python -c "print('Hello world!')" 122 | environment: 123 | image: library/python:latest 124 | compute: azureml: 125 | resources: 126 | instance_type: 127 | ``` 128 | 129 | In the above example, replace `` with the name of your Kubernetes compute 130 | target and `` with the name of the instance type you wish to select. 131 | 132 | ## Select instance type to deploy model 133 | 134 | To select an instance type for a model deployment using CLI (V2), specify its name deployment YAML. For example: 135 | 136 | ```yaml 137 | name: blue 138 | app_insights_enabled: true 139 | endpoint_name: 140 | model: 141 | path: ./model/sklearn_mnist_model.pkl 142 | code_configuration: 143 | code: ./script/ 144 | scoring_script: score.py 145 | instance_type: 146 | environment: 147 | conda_file: file:./model/conda.yml 148 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 149 | ``` 150 | -------------------------------------------------------------------------------- /docs/limitations-and-known-issues.md: -------------------------------------------------------------------------------- 1 | # Limitations and known issues 2 | 3 | ## Failed to find any PEM data in certificate for gateway and cluster-status-reporter 4 | 5 | If you see this error during AzureML extension deployment, it means the cluster lacks ```--cluster-signing-cert-file``` and ```--cluster-signing-key-file``` parameters in its controller manager setting. You can set ```enable_https``` to false and it will use http for in-cluster components communication. For morning please refer to [Kubernetes documentation](https://kubernetes.io/docs/tasks/tls/managing-tls-in-a-cluster/#a-note-to-cluster-administrators). 6 | 7 | ## Custom IP interface for MPI job 8 | 9 | For MPI job on Azure Arc-enabled on-premise Kubernetes cluster, AzureML provides a good default value if eth0 is not available. However this good default value might not be correct and MPI job will fail. To ensure that MPI job gets correct IP interface, you can st custome IP interface at AzureML extension deployment time by appending ```amloperator.custom_ip_interface_enabled=True``` and ```amloperator.custom_ip_interface=``` to ```--configuration-settings``` parameter. 10 | 11 | ## AML Dataset support 12 | 13 | Azure Arc-enabled Machine Learning job supports mounting/downloading an AML Dataset to a local path specified by the field "PathOnCompute". 
But this path cannot be any of the following: under the root folder (e.g. /), a privileged folder (e.g. /data/), or an existing folder. 14 | 15 | ## Compute Managed Identity in Azure Machine Learning Workspace with private endpoint 16 | 17 | AMLArc compute can be assigned either a system-assigned or a user-assigned managed identity to access a custom Azure Container Registry (ACR) or an AML dataset. The current limitation is that the managed identity doesn't work in an AML workspace with a private endpoint. 18 | -------------------------------------------------------------------------------- /docs/managed-identity.md: -------------------------------------------------------------------------------- 1 | # Assign Managed Identity to the compute target 2 | 3 | A common challenge for developers is the management of secrets and credentials used to secure communication between the different components making up a solution. [Managed identities](https://docs.microsoft.com/en-us/azure/active-directory/managed-identities-azure-resources/overview) eliminate the need for developers to manage credentials. 4 | 5 | To access Azure Container Registry (ACR) for Docker images, and a Storage Account for training data, attach the AMLArc compute with a system-assigned or user-assigned managed identity enabled. 6 | 7 | ## Assign Managed Identity 8 | 9 | - You can assign a Managed Identity to the compute during [compute attach](./attach-compute.md) 10 | - If the compute has already been attached, you can update the Managed Identity settings in Machine Learning Studio. 11 | - Go to Azure Machine Learning Studio - Compute - Attached compute, and select your attached compute. 12 | - Edit Managed Identity. 13 | ![Managed identity](./media/edit-identity.png) 14 | ![Managed identity](./media/update-identity2.png) 15 | 16 | ## Assign Azure roles to Managed Identity 17 | 18 | Azure offers a couple of ways to assign roles to a Managed Identity. 19 | - [Use Azure Portal to assign roles](https://docs.microsoft.com/en-us/azure/role-based-access-control/role-assignments-portal?tabs=current). 20 | - [Use CLI to assign roles](https://docs.microsoft.com/en-us/azure/role-based-access-control/role-assignments-cli) 21 | - [Use PowerShell to assign roles](https://docs.microsoft.com/en-us/azure/role-based-access-control/role-assignments-powershell) 22 | 23 | >If you use the Portal to assign roles and you have a system-assigned managed identity, select **User, group, or service principal**, click **Select members**, and search for the identity name formatted as ``<workspace name>/computes/<compute target name>`` 24 | > 25 | > If you have a user-assigned managed identity, select **Managed identity** to find the target identity. 26 | ![Managed identity](./media/assign-role.png) 27 | 28 | ### Use Managed Identity to pull images from Azure Container Registry 29 | 30 | The "AcrPull" role should be granted to the compute Managed Identity. 31 | 32 | ### Use Managed Identity to access Azure Blob 33 | 34 | - For read-only purposes, the `Storage Blob Data Reader` role should be granted to the compute Managed Identity. 35 | - For read-write purposes, the `Storage Blob Data Contributor` role should be granted to the compute Managed Identity.
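For the CLI option linked above, the role assignments described in this document could look like the following. This is a hedged sketch: the principal ID, registry, and storage account names are placeholders, and the scopes shown are examples rather than prescriptions.

```azurecli
# Allow the compute managed identity to pull images from your Azure Container Registry
az role assignment create --assignee <managed-identity-principal-id> --role "AcrPull" \
    --scope "/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.ContainerRegistry/registries/<acr-name>"

# Allow read-only blob access (use "Storage Blob Data Contributor" instead for read-write)
az role assignment create --assignee <managed-identity-principal-id> --role "Storage Blob Data Reader" \
    --scope "/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.Storage/storageAccounts/<storage-account-name>"
```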
36 | -------------------------------------------------------------------------------- /docs/media/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/media/assign-role.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/assign-role.png -------------------------------------------------------------------------------- /docs/media/attach-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/attach-1.png -------------------------------------------------------------------------------- /docs/media/attach-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/attach-4.png -------------------------------------------------------------------------------- /docs/media/attach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/attach.png -------------------------------------------------------------------------------- /docs/media/detach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/detach.png -------------------------------------------------------------------------------- /docs/media/edit-identity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/edit-identity.png -------------------------------------------------------------------------------- /docs/media/gke-ssh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/gke-ssh.png -------------------------------------------------------------------------------- /docs/media/gkecreate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/gkecreate.png -------------------------------------------------------------------------------- /docs/media/privatelink-networkflow-v3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink-networkflow-v3.png -------------------------------------------------------------------------------- /docs/media/privatelink/acr_subnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/acr_subnet.png -------------------------------------------------------------------------------- /docs/media/privatelink/acr_target.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/acr_target.png -------------------------------------------------------------------------------- /docs/media/privatelink/acr_trusted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/acr_trusted.png -------------------------------------------------------------------------------- /docs/media/privatelink/aks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/aks.png -------------------------------------------------------------------------------- /docs/media/privatelink/dns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/dns.png -------------------------------------------------------------------------------- /docs/media/privatelink/kv_target.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/kv_target.png -------------------------------------------------------------------------------- /docs/media/privatelink/kv_trusted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/kv_trusted.png -------------------------------------------------------------------------------- /docs/media/privatelink/kv_vnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/kv_vnet.png -------------------------------------------------------------------------------- /docs/media/privatelink/ml_compute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ml_compute.png -------------------------------------------------------------------------------- /docs/media/privatelink/ml_computemsi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ml_computemsi.png -------------------------------------------------------------------------------- /docs/media/privatelink/ml_disablepublicaccess.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ml_disablepublicaccess.png -------------------------------------------------------------------------------- /docs/media/privatelink/ml_privateendpoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ml_privateendpoint.png 
-------------------------------------------------------------------------------- /docs/media/privatelink/onprem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/onprem.png -------------------------------------------------------------------------------- /docs/media/privatelink/relay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/relay.png -------------------------------------------------------------------------------- /docs/media/privatelink/relay_connectstring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/relay_connectstring.png -------------------------------------------------------------------------------- /docs/media/privatelink/relay_resourceid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/relay_resourceid.png -------------------------------------------------------------------------------- /docs/media/privatelink/storageaccount.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/storageaccount.png -------------------------------------------------------------------------------- /docs/media/privatelink/ts_curl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ts_curl.png -------------------------------------------------------------------------------- /docs/media/privatelink/ts_expected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ts_expected.png -------------------------------------------------------------------------------- /docs/media/privatelink/ts_getpo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ts_getpo.png -------------------------------------------------------------------------------- /docs/media/privatelink/ts_nslookup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ts_nslookup.png -------------------------------------------------------------------------------- /docs/media/privatelink/ts_ws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/privatelink/ts_ws.png -------------------------------------------------------------------------------- /docs/media/profileConfig.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/profileConfig.png -------------------------------------------------------------------------------- /docs/media/update-identity2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/update-identity2.png -------------------------------------------------------------------------------- /docs/media/ws-msi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/docs/media/ws-msi.png -------------------------------------------------------------------------------- /docs/network-requirements.md: -------------------------------------------------------------------------------- 1 | ## Meet network requirements 2 | Clusters running behind an outbound proxy server or firewall need additional network configurations. 3 | - For Azure Arc enabled Kubernetes, fulfill [Azure Arc network requirements](https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirement) needed by Azure Arc agents. If the cluster has an outbound proxy, make sure that `127.0.0.1` and `localhost` are added to `--proxy-skip-range` when connecting with Azure Arc. 4 | - For AKS customer without Azure Arc connection, fullfill [AKS cluster extension network requirements](https://docs.microsoft.com/en-us/azure/aks/limit-egress-traffic#cluster-extensions). 5 | 6 | Besides, the following outbound URLs are required for Azure Machine Learning, 7 | 8 | | Outbound Endpoint| Port | Description|Training |Inference | 9 | |--|--|--|--|--| 10 | | *.kusto.windows.net,
\*.table.core.windows.net,<br>\*.queue.core.windows.net | https:443 | Required to upload system logs to Kusto. You can skip adding the table and queue FQDNs if you have data exfiltration concerns, but you will not be able to get error diagnosis support from Microsoft.|**✓**|**✓**| 11 | | \<your ACR name\>.azurecr.io<br>\<your ACR name\>.\<region\>.data.azurecr.io | https:443 | Azure container registry, required to pull docker images used for machine learning workloads.|**✓**|**✓**| 12 | | \<your storage account\>.blob.core.windows.net | https:443 | Azure blob storage, required to fetch machine learning project scripts, data or models, and upload job logs/outputs.|**✓**|**✓**| 13 | | \<workspace-id\>.workspace.\<region\>.api.azureml.ms,<br>\<region\>.experiments.azureml.net,<br>\<region\>
\.api.azureml.ms | https:443 | Azure mahince learning service API.|**✓**|**✓**| 14 | | pypi.org | https:443 | Python package index, to install pip packages used for training job environment initialization.|**✓**|N/A| 15 | 16 | > [!NOTE] 17 | > `` is the lowcase full spelling of Azure Region, for example, eastus, southeastasia. 18 | > 19 | > `` can be found in Azure portal - your Machine Learning resource page - Properties - Workspace ID. 20 | -------------------------------------------------------------------------------- /docs/nginx-ingress-controller.md: -------------------------------------------------------------------------------- 1 | # Tutorial 2 | 3 | These tutorials help illustrate how to integrate [Nginx Ingress Controller](https://github.com/kubernetes/ingress-nginx) with AzureML extension over HTTP or HTTPS. 4 | 5 | ## Table of Contents 6 | 7 | - [Prerequisites](#prerequisites) 8 | - [Deploy AzureML extension](#deploy-azureml-extension) 9 | - [Expose services over HTTP](#expose-services-over-http) 10 | - [Expose services over HTTPS](#expose-services-over-https) 11 | 12 | ## Prerequisites 13 | 14 | - Install the latest k8s-extension and ml cli. 15 | - `az extension add -n k8s-extension --upgrade` 16 | - `az extension add -n ml --upgrade` 17 | - Setup Nginx Ingress Conroller. 18 | - [**Create a basic controller**](https://docs.microsoft.com/en-us/azure/aks/ingress-basic): If you are starting from scratch, refer to these instructions. 19 | - If you want to use HTTPS on this application, you will need a x509 certificate and its private key. 20 | 21 | ## Deploy AzureML extension 22 | 23 | [Deploy extension](https://github.com/Azure/AML-Kubernetes/blob/master/docs/deploy-extension.md#azureml-extension-deployment-scenarios) with `inferenceRouterServiceType=ClusterIP` and `allowInsecureConnections=True`, so that the Nginx Ingress Conroller can handle TLS termination by itself instead of handing it over to azureml-fe (azureml inference router created by extension) when service is exposed over HTTPS. 24 | 25 | 26 | ## Expose services over HTTP 27 | 28 | In order to expose the azureml-fe we will using the following ingress resource: 29 | 30 | ```yaml 31 | apiVersion: networking.k8s.io/v1 32 | kind: Ingress 33 | metadata: 34 | name: azureml-fe 35 | namespace: azureml 36 | spec: 37 | ingressClassName: nginx 38 | rules: 39 | - http: 40 | paths: 41 | - path: / 42 | backend: 43 | service: 44 | name: azureml-fe 45 | port: 46 | number: 80 47 | pathType: Prefix 48 | ``` 49 | 50 | This ingress will expose the `azureml-fe` service and the selected deployment as a default backend of the Nginx Ingress Controller. 51 | 52 | Save the above ingress resource as `ing-azureml-fe.yaml`. 53 | 54 | 1. Deploy `ing-azureml-fe.yaml` by running: 55 | 56 | ```bash 57 | kubectl apply -f ing-azureml-fe.yaml 58 | ``` 59 | 60 | 2. Check the log of the ingress controller for deployment status. 61 | 62 | 3. Now the `azureml-fe` application should be available. You can check this by visiting the public LoadBalancer address of the Nginx Ingress Controller. 63 | 64 | 4. [Create an inference job and invoke](https://github.com/Azure/AML-Kubernetes/blob/master/docs/simple-flow.md). 65 | 66 | *NOTE:* Replace the ip in scoring_uri with public LoadBalancer address of the Nginx Ingress Controller before invoking. 67 | 68 | ## Expose services over HTTPS 69 | 70 | 1. Before deploying ingress, you need to create a kubernetes secret to host the certificate and private key. 
You can create a kubernetes secret by running 71 | 72 | ```bash 73 | kubectl create secret tls -n azureml --key --cert 74 | ``` 75 | 76 | 2. Define the following ingress. In the ingress, specify the name of the secret in the `secretName` section. 77 | 78 | ```yaml 79 | apiVersion: networking.k8s.io/v1 80 | kind: Ingress 81 | metadata: 82 | name: azureml-fe 83 | namespace: azureml 84 | spec: 85 | ingressClassName: nginx 86 | tls: 87 | - hosts: 88 | - 89 | secretName: 90 | rules: 91 | - host: 92 | http: 93 | paths: 94 | - path: / 95 | backend: 96 | service: 97 | name: azureml-fe 98 | port: 99 | number: 80 100 | pathType: Prefix 101 | ``` 102 | 103 | *NOTE:* Replace `` and `` in the above Ingress Resource with the domain pointing to LoadBalancer of the Nginx ingress controller and name of your secret. Store the above Ingress Resource in a file name `ing-azureml-fe-tls.yaml`. 104 | 105 | 1. Deploy ing-azureml-fe-tls.yaml by running 106 | 107 | ```bash 108 | kubectl apply -f ing-azureml-fe-tls.yaml 109 | ``` 110 | 111 | 2. Check the log of the ingress controller for deployment status. 112 | 113 | 3. Now the `azureml-fe` application will be available on HTTPS. You can check this by visiting the public LoadBalancer address of the Nginx Ingress Controller. 114 | 115 | 4. [Create an inference job and invoke](https://github.com/Azure/AML-Kubernetes/blob/master/docs/simple-flow.md). 116 | 117 | *NOTE:* Replace the protocol and ip in scoring_uri with https and domain pointing to LoadBalancer of the Nginx Ingress Controller before invoking. 118 | -------------------------------------------------------------------------------- /docs/pvc.md: -------------------------------------------------------------------------------- 1 | ### PV/PVC support in AMLArc training job 2 | 3 | Now you can leverage Kubernetes native way to mount various data storage via [Persistent Volume (PV) and Persistent Volume Claim (PVC)](https://kubernetes.io/docs/concepts/storage/persistent-volumes/). 4 | 5 | 1. Create PV, take NFS as example, 6 | 7 | ``` 8 | apiVersion: v1 9 | kind: PersistentVolume 10 | metadata: 11 | name: nfs-pv 12 | spec: 13 | capacity: 14 | storage: 1Gi 15 | accessModes: 16 | - ReadWriteMany 17 | persistentVolumeReclaimPolicy: Retain 18 | storageClassName: "" 19 | nfs: 20 | path: /share/nfs 21 | server: 20.98.110.84 22 | readOnly: false 23 | ``` 24 | 2. Create PVC. In `metadata`, you **must** add label `ml.azure.com/pvc: "true"` to indicate the PVC can be mounted to the upcoming training job, and add annotation `ml.azure.com/mountpath: ` to specify the mount path. 25 | 26 | ``` 27 | apiVersion: v1 28 | kind: PersistentVolumeClaim 29 | metadata: 30 | name: nfs-pvc 31 | namespace: default 32 | labels: 33 | ml.azure.com/pvc: "true" 34 | annotations: 35 | ml.azure.com/mountpath: "/mnt/nfs" 36 | spec: 37 | storageClassName: "" 38 | accessModes: 39 | - ReadWriteMany 40 | resources: 41 | requests: 42 | storage: 1Gi 43 | ``` 44 | 51 | 52 | 53 | 54 | ### How AML will use the PVC 55 | 56 | The training job in the same `namespace` with the PVC will be mounted the volume automatically. Then data scientist can access the mount path in the training job. 57 | 58 | By default, the job will be created in `default` namespace. IT operator can decide the namespace in attached compute attach. 
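To confirm that a claim like the one above will actually be picked up, you can check that it carries the required label and is bound. A hedged sketch using the example names from this page:

```bash
# PVCs must carry the ml.azure.com/pvc=true label to be mounted into training jobs
kubectl get pvc -n default -l ml.azure.com/pvc=true

# Verify the claim is Bound and review the ml.azure.com/mountpath annotation
kubectl describe pvc nfs-pvc -n default
```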
59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /docs/release-notes.md: -------------------------------------------------------------------------------- 1 | New features are released at a biweekly cadence. 2 | 3 | **Dec 27, 2022 Release** 4 | 5 | Version 1.1.17 6 | * Move Fluent-bit from a DaemonSet to sidecars 7 | * Add MDC support 8 | * Refine error messages 9 | * Support cluster mode (Windows, Linux) jobs 10 | * Bugfixes 11 | 12 | **Aug 29, 2022 Release** 13 | 14 | Version 1.1.9 15 | * Improved health check logic 16 | * Bugfixes 17 | 18 | **Jun 23, 2022 Release** 19 | 20 | Version 1.1.6 21 | * Bugfixes 22 | 23 | **Jun 15, 2022 Release** 24 | 25 | Version 1.1.5 26 | * Updated training to use the new common runtime to run jobs 27 | * Removed Azure Relay usage for the AKS extension 28 | * Removed Service Bus usage from the extension 29 | * Updated security context usage 30 | * Updated the inference score-fe to v2 31 | * Updated to use Volcano as the training job scheduler 32 | * Bugfixes 33 | 34 | **Oct 14, 2021 Release** 35 | 36 | * [PV/PVC volume mount support in AMLArc training jobs](./pvc.md). 37 | 38 | **Sept 16, 2021 Release** 39 | 40 | * New regions available: WestUS, CentralUS, NorthCentralUS, KoreaCentral. 41 | * Job queue explainability. See job queue details in AML Workspace Studio. 42 | * Auto-killing policy. Support for `max_run_duration_seconds` in ``ScriptRunConfig``. The system will attempt to automatically cancel the run if it takes longer than the setting value. 43 | * Performance improvements in cluster autoscale support. 44 | * [Arc agent and ML extension deployment from an on-prem container registry](https://github.com/Azure/azure-arc-kubernetes-preview/blob/master/docs/custom-registry/connect-cluster.md) 45 | 46 | **August 24, 2021 Release** 47 | 48 | * [Compute instance type is supported in job YAML](./docs/simple-train-cli.md). 49 | * [Assign Managed Identity to AMLArc compute](./docs/managed-identity.md) 50 | 51 | **August 10, 2021 Release** 52 | 53 | * New Kubernetes distribution support: K3S - Lightweight Kubernetes. 54 | * [Deploy AzureML extension to your AKS cluster without connecting via Azure Arc](./docs/deploy-ml-extension-on-AKS-without-arc.md). 55 | * [Automated Machine Learning (AutoML) via Python SDK](https://docs.microsoft.com/en-us/azure/machine-learning/concept-automated-ml) 56 | * [Use 2.0 CLI to attach the Kubernetes cluster to the AML Workspace](./docs/attach-compute.md#Create-compute-target-via-Azure-ML-2.0-CLI) 57 | * Optimized AzureML extension components' CPU/memory resource utilization. 58 | 59 | **July 2, 2021 Release** 60 | 61 | * New Kubernetes distribution support: OpenShift Kubernetes and GKE (Google Kubernetes Engine). 62 | * Autoscale support. If the user-managed Kubernetes cluster enables autoscaling, the cluster will be automatically scaled out or scaled in according to the volume of active runs and deployments. 63 | * Performance improvements in the job launcher, which significantly shorten job execution time. 64 | -------------------------------------------------------------------------------- /docs/setup-ephemeral-nfs-volume.md: -------------------------------------------------------------------------------- 1 | ### Set up NFS server 2 | 3 | Set up an NFS server on Ubuntu ([guide](https://help.ubuntu.com/community/SettingUpNFSHowTo)) and make sure to grant NFS share access to your Kubernetes cluster (see the example export below).
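As referenced above, one common way to grant that access on an Ubuntu NFS server is to export the shared folder to the network your Kubernetes nodes live in. This is a hedged sketch; the folder path and CIDR range are placeholders for your environment.

```bash
# Export the shared folder to the subnet used by your Kubernetes nodes
echo "/path/to/shared-folder 10.0.0.0/16(rw,sync,no_subtree_check)" | sudo tee -a /etc/exports

# Reload the export table and confirm the share is exported
sudo exportfs -ra
sudo exportfs -v
```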
4 | 5 | 6 | ### Create a Configmap with nfs server properties 7 | 8 | ```mount-config.yaml 9 | kind: ConfigMap 10 | apiVersion: v1 11 | metadata: 12 | name: mount-config 13 | namespace: azureml 14 | data: 15 | mounts.yaml: | 16 | mountPoints: 17 | - mountPath: /nfs_share 18 | mountType: nfs 19 | name: nfs-name 20 | path: /path/to/shared-folder 21 | server: nfs-server.domain-name.com 22 | ``` 23 | 24 | ### Apply the Configmap 25 | 26 | `kubectl apply -f mount-config.yaml` 27 | 28 | ### Documentation on Specific Fields 29 | * `mountPath`: defines the path that the NFS volume will be mounted into inside your job 30 | * `mountType`: must be `nfs` 31 | * `name`: arbitrary symbolic name for your mount. If you define multiple mounts then this must be unique per mount 32 | * `path`: path (on the server) to the folder you want to mount 33 | * `server`: NFS server address 34 | 35 | Multiple NFS mounts may be defined under `mountPoints` 36 | 37 | The rest of the `mount-config.yaml` file must be exactly as above 38 | 39 | ### How AML will use the mount for jobs 40 | 41 | All jobs will look for the 'mount-config' ConfigMap. If this ConfigMap is missing or malformed then no mounts will be applied. 42 | 43 | 44 | -------------------------------------------------------------------------------- /docs/simple-flow.md: -------------------------------------------------------------------------------- 1 | 2 | # Deploy an image classification model - create an endpoint with blue deployment 3 | 4 | ## Azure CLI for ML installation and project setup 5 | 6 | 1. Remove any previous Azure ML CLI extension installations 7 | 8 | ```azurecli 9 | az extension remove -n ml 10 | az extension remove -n azure-cli-ml 11 | ``` 12 | 13 | 1. Install the latest Azure CLI for ML, which is in public preview, and then verify installation 14 | 15 | ```azurecli 16 | az extension add -n ml 17 | az ml -h 18 | ``` 19 | 20 | 1. Let's set some defaults for all subsequent "az ml" CLI commands 21 | 22 | ```azurecli 23 | az account set --subscription 24 | az configure --defaults workspace= group= 25 | ``` 26 | 27 | 1. For this simple deployment flow, we have following project directory structure: 28 | 29 | ``` code 30 | simple-flow 31 | |-- model 32 | | |-- conda.yml 33 | | |-- sklearn_mnist_model.pkl 34 | |-- script 35 | | |-- score.py 36 | |-- blue-deployment.yml 37 | |-- endpoint.yml 38 | |-- sample_request.json 39 | ``` 40 | 41 | As you can see from above, "model" directory contains model and Conda environment definition, "score.py" is under "script" directory. At top level directory, we have endpoint, blue deployment YAML definition and sample request JSON file. In general, this is very typical project setup for Azure Arc enabled ML model deployment. 42 | 43 | ## Simple deployment flow 44 | 45 | Now let's see simple deployment flow in action! 46 | 47 | 1. Git clone preview Github repo and switch to simple-flow directory 48 | 49 | ```console 50 | git clone https://github.com/Azure/AML-Kubernetes.git 51 | cd AML-Kubernetes/examples/inference/simple-flow 52 | ``` 53 | 54 | 1. Modify endpoint YAML file to replace "\" with your own compute target name, and replace "\" to the instance type defined in your compute configuration. Create an endpoint with blue deployment with following CLI command, endpoint creation and deployment might take a few minutes. 55 | 56 | > Note that the resource requirements (CPU, memory, GPU) defined in the endpoint yaml should be no more than the resource limit of the specified instance type. 57 | 58 | 59 | 1. 
59 | 1. Create the endpoint 60 | ```azurecli 61 | az ml online-endpoint create --name sklearn-mnist -f endpoint.yml 62 | ``` 63 | 1. Check the status of the endpoint 64 | 65 | ```azurecli 66 | az ml online-endpoint show -n sklearn-mnist 67 | ``` 68 | 69 | 1. Create the blue deployment 70 | ```azurecli 71 | az ml online-deployment create --name blue --endpoint sklearn-mnist -f blue-deployment.yml --all-traffic 72 | ``` 73 | 74 | 1. Check the status of the blue deployment 75 | 76 | ```azurecli 77 | az ml online-deployment show --name blue --endpoint sklearn-mnist 78 | ``` 79 | 80 | 1. Test the endpoint with a scoring request 81 | 82 | ```azurecli 83 | az ml online-endpoint invoke -n sklearn-mnist -r sample-request.json 84 | ``` 85 | 86 | You can also send a scoring request using cURL. 87 | 88 | * Obtain a token/keys for the scoring endpoint 89 | 90 | ```azurecli 91 | az ml online-endpoint get-credentials -n sklearn-mnist 92 | ``` 93 | 94 | * Obtain the `scoring_uri` of the endpoint 95 | 96 | ```azurecli 97 | az ml online-endpoint show -n sklearn-mnist 98 | ``` 99 | 100 | * Score using the token/key obtained above 101 | 102 | ```bash 103 | curl -v -i -X POST -H "Content-Type:application/json" -H "Authorization: Bearer <token>" -d '<sample request JSON>' <scoring_uri> 104 | ``` 105 | 106 | That is it! You have successfully deployed an image classification model and scored the model with a request. 107 | 108 | 1. Get logs 109 | 110 | ```azurecli 111 | az ml online-deployment get-logs --name blue --endpoint sklearn-mnist 112 | ``` 113 | 114 | 1. Delete the endpoint 115 | 116 | ```azurecli 117 | az ml online-endpoint delete -n sklearn-mnist 118 | ``` 119 | 120 | ## Additional resources 121 | 122 | * [Deploy a model using a custom container with a built-in model or entry script](inference-byoc.md). In this case, the model and the entry script are not stored in the cloud, but kept locally. 123 | * To learn more about Azure ML endpoint and deployment concepts, please check [Managed Online Endpoints](https://docs.microsoft.com/azure/machine-learning/how-to-deploy-managed-online-endpoints). 124 | * [Additional Examples](https://github.com/Azure/azureml-examples/tree/main/cli/endpoints/online) 125 | -------------------------------------------------------------------------------- /docs/simple-train-cli.md: -------------------------------------------------------------------------------- 1 | 2 | # Train an image classification model with AML 2.0 CLI 3 | 4 | 1. Remove any previous AML CLI extension installations 5 | 6 | ```azurecli 7 | az extension remove -n ml 8 | az extension remove -n azure-cli-ml 9 | ``` 10 | 11 | 1. Install the latest AML 2.0 CLI, which is in public preview, and then verify the installation 12 | 13 | ```azurecli 14 | az extension add -n ml 15 | az ml -h 16 | ``` 17 | 18 | 1. Let's set some defaults for all subsequent "az ml" CLI commands 19 | 20 | ```azurecli 21 | az account set --subscription <subscription name or ID> 22 | az configure --defaults workspace=<workspace name> group=<resource group> 23 | ``` 24 | 25 | 1. For this simple training job with AML 2.0 CLI, we have the following project directory structure: 26 | 27 | ``` code 28 | simple-train-cli 29 | |-- src 30 | | |-- train.py 31 | | |-- utils.py 32 | |-- job.yml 33 | ``` 34 | 35 | As you can see from above, the project simply contains a job YAML file and some Python training scripts. In general, this is a very typical project setup for Azure Arc-enabled ML training. Let's take a look at the job YAML file: 36 | 37 | ```yaml 38 | experiment_name: Tutorial-sklearn-mnist 39 | code: ./src 40 | command: python train.py --data-folder ./mnist-data --regularization 0.5 41 | environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:7 42 | compute: azureml:<compute target name> 43 | resources: 44 | instance_type: <instance type name> 45 | ``` 46 | 47 | **Note**: **Instance type** is an optional parameter. If it's not given, as in the YAML file below, the compute target's default instance type will be used. 48 | 49 | ```yaml 50 | experiment_name: Tutorial-sklearn-mnist 51 | code: ./src 52 | command: python train.py --data-folder ./mnist-data --regularization 0.5 53 | environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:7 54 | compute: azureml:<compute target name> 55 | ``` 56 | 57 | Refer to [here](./instance-type.md) to learn how to create different instance types. 58 |
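As a concrete illustration, a filled-in `job.yml` could be written out as below. The compute target name `my-k8s-compute` is a placeholder, and `defaultinstancetype` is assumed to be the built-in instance type available when no custom instance types have been created on the cluster:

```bash
# Hypothetical filled-in job.yml; replace "my-k8s-compute" with your own attached compute target name
cat > job.yml <<'EOF'
experiment_name: Tutorial-sklearn-mnist
code: ./src
command: python train.py --data-folder ./mnist-data --regularization 0.5
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:7
compute: azureml:my-k8s-compute
resources:
  instance_type: defaultinstancetype
EOF
```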
59 | 1. Git clone the preview GitHub repo and switch to the simple-train-cli directory 60 | 61 | ```console 62 | git clone https://github.com/Azure/AML-Kubernetes.git 63 | cd AML-Kubernetes/examples/training/simple-train-cli 64 | ``` 65 | 66 | 1. Modify the job YAML file to specify your own compute target name 67 | 68 | 1. Run the image classification training job 69 | 70 | ```azurecli 71 | az ml job create -f job.yml --web 72 | ``` 73 | 74 | Creating this job uploads any specified local assets, like the source code directory, validates the YAML file, and submits the run. If needed, the environment is built, then the compute is scaled up and configured for running the job. 75 | 76 | 1. Once the job is complete, you can download the outputs: 77 | 78 | ```azurecli 79 | az ml job download -n $run_id --outputs 80 | ``` 81 | 82 | That is it! You have successfully trained an image classification model and downloaded the outputs to a local directory.
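As a follow-up note, the `$run_id` used in the download step is simply the name of the submitted job. One way to wire the commands together, a sketch relying on the standard Azure CLI `--query`/`-o tsv` output options, is:

```bash
# Capture the job name at submission time so it can be reused for streaming logs and downloading outputs
run_id=$(az ml job create -f job.yml --query name -o tsv)
az ml job stream -n $run_id          # optionally stream logs until the job finishes
az ml job download -n $run_id --outputs
```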
83 | 84 | ## Additional resources 85 | 86 | * [Train models (create jobs) with the 2.0 CLI (preview)](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli) 87 | * [Additional examples](https://github.com/Azure/azureml-examples/tree/main/cli/jobs) 88 | -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/blue-deployment.yml: -------------------------------------------------------------------------------- 1 | name: blue 2 | type: kubernetes 3 | endpoint_name: tf-mnist 4 | app_insights_enabled: true 5 | model: 6 | path: ./model/ 7 | code_configuration: 8 | code: ./script/ 9 | scoring_script: score.py 10 | instance_type: myinstancetypename 11 | environment: 12 | image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 13 | conda_file: ./model/conda.yml 14 | request_settings: 15 | request_timeout_ms: 3000 16 | max_concurrent_requests_per_instance: 1 17 | max_queue_wait_ms: 3000 18 | resources: 19 | requests: 20 | cpu: "0.1" 21 | memory: "500Mi" 22 | limits: 23 | cpu: "0.2" 24 | memory: "1Gi" 25 | nvidia.com/gpu: "1" 26 | liveness_probe: 27 | initial_delay: 10 28 | period: 10 29 | timeout: 10 30 | success_threshold: 1 31 | failure_threshold: 1 32 | readiness_probe: 33 | initial_delay: 10 34 | period: 10 35 | timeout: 10 36 | success_threshold: 1 37 | failure_threshold: 1 38 | scale_settings: 39 | type: default -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/endpoint.yml: -------------------------------------------------------------------------------- 1 | name: tf-mnist 2 | compute: azureml: 3 | auth_mode: key 4 | -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "mnist-tf.model" 2 | all_model_checkpoint_paths: "mnist-tf.model" 3 | -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/model/conda.yml: -------------------------------------------------------------------------------- 1 | name: model-env 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.7 6 | - tensorflow-gpu==1.14.0 7 | - pip: 8 | - azureml-defaults 9 | - numpy==1.16.4 10 | -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/model/mnist-tf.model.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/inference/gpu-inferencing/model/mnist-tf.model.data-00000-of-00001 -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/model/mnist-tf.model.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/inference/gpu-inferencing/model/mnist-tf.model.index -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/model/mnist-tf.model.meta: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/inference/gpu-inferencing/model/mnist-tf.model.meta -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/sample-request.json: -------------------------------------------------------------------------------- 1 | {"data": [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07450980392156863, 0.4588235294117647, 0.9450980392156862, 0.7529411764705882, 0.5529411764705883, 0.17254901960784313, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.6666666666666666, 0.8823529411764706, 0.9137254901960784, 0.7686274509803922, 0.7725490196078432, 0.9176470588235294, 0.8784313725490196, 0.4666666666666667, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.40784313725490196, 0.9647058823529412, 0.9568627450980393, 0.5568627450980392, 0.1450980392156863, 0.0, 0.0, 0.4666666666666667, 0.9882352941176471, 0.9882352941176471, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.396078431372549, 0.9647058823529412, 0.9882352941176471, 0.09803921568627451, 0.0, 0.0, 0.0, 0.10196078431372549, 0.9529411764705882, 0.9882352941176471, 0.4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11372549019607843, 0.8470588235294118, 0.807843137254902, 0.24705882352941178, 0.0, 0.0, 0.0, 0.0, 0.1607843137254902, 0.9921568627450981, 0.6588235294117647, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8470588235294118, 0.9882352941176471, 0.36470588235294116, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8470588235294118, 0.9882352941176471, 0.5568627450980392, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9921568627450981, 0.9882352941176471, 0.2196078431372549, 0.0, 0.0, 0.0, 0.0, 0.2, 0.9921568627450981, 0.9137254901960784, 0.1450980392156863, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9921568627450981, 0.9882352941176471, 0.9058823529411765, 0.49019607843137253, 0.6901960784313725, 0.8823529411764706, 0.19607843137254902, 0.2, 0.9921568627450981, 0.5686274509803921, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.050980392156862744, 0.7372549019607844, 0.8823529411764706, 0.8823529411764706, 0.4470588235294118, 0.14901960784313725, 0.0392156862745098, 0.8980392156862745, 1.0, 0.32941176470588235, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.3333333333333333, 0.9882352941176471, 0.9176470588235294, 0.10980392156862745, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4823529411764706, 0.9882352941176471, 0.5372549019607843, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.10196078431372549, 0.8705882352941177, 0.9882352941176471, 0.050980392156862744, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.47058823529411764, 0.9921568627450981, 0.9921568627450981, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.027450980392156862, 0.7333333333333333, 0.9882352941176471, 0.592156862745098, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25882352941176473, 0.9882352941176471, 0.8784313725490196, 0.07450980392156863, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5529411764705883, 0.9882352941176471, 0.803921568627451, 0.050980392156862744, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5529411764705883, 0.9921568627450981, 0.807843137254902, 0.050980392156862744, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6509803921568628, 0.9882352941176471, 0.5568627450980392, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9921568627450981, 0.9137254901960784, 0.1450980392156863, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.796078431372549, 0.7686274509803922, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]} -------------------------------------------------------------------------------- /examples/inference/gpu-inferencing/script/score.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | import numpy as np 4 | import os 5 | import tensorflow as tf 6 | 7 | from azureml.core.model import Model 8 | 9 | def init(): 10 | global X, output, sess 11 | tf.reset_default_graph() 12 | model_root = os.getenv('AZUREML_MODEL_DIR') 13 | # the name of the folder in which to look for tensorflow model files 14 | tf_model_folder = 'model' 15 | saver = tf.train.import_meta_graph( 16 | os.path.join(model_root, tf_model_folder, 'mnist-tf.model.meta')) 17 | X = tf.get_default_graph().get_tensor_by_name("network/X:0") 18 | output = tf.get_default_graph().get_tensor_by_name("network/output/MatMul:0") 19 | 20 | sess = tf.Session() 21 | saver.restore(sess, os.path.join(model_root, tf_model_folder, 'mnist-tf.model')) 22 | 23 | 24 | def run(raw_data): 25 | data = np.array(json.loads(raw_data)['data']) 26 | # make prediction 27 | out = output.eval(session=sess, feed_dict={X: data}) 28 | y_hat = np.argmax(out, axis=1) 29 | return y_hat.tolist() 30 | 
-------------------------------------------------------------------------------- /examples/inference/simple-flow/blue-deployment.yml: -------------------------------------------------------------------------------- 1 | name: blue 2 | type: kubernetes 3 | endpoint_name: sklearn-mnist 4 | app_insights_enabled: true 5 | model: 6 | path: ./model/sklearn_mnist_model.pkl 7 | code_configuration: 8 | code: ./script/ 9 | scoring_script: score.py 10 | instance_type: defaultinstancetype 11 | environment: 12 | image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest 13 | conda_file: ./model/conda.yml 14 | request_settings: 15 | request_timeout_ms: 3000 16 | max_queue_wait_ms: 3000 17 | resources: 18 | requests: 19 | cpu: "0.1" 20 | memory: "0.1Gi" 21 | limits: 22 | cpu: "0.2" 23 | memory: "0.2Gi" 24 | scale_settings: 25 | type: target_utilization 26 | min_instances: 1 27 | max_instances: 3 28 | polling_interval: 10 29 | target_utilization_percentage: 70 30 | -------------------------------------------------------------------------------- /examples/inference/simple-flow/endpoint.yml: -------------------------------------------------------------------------------- 1 | name: sklearn-mnist 2 | compute: azureml:tailwind-k8s 3 | auth_mode: key 4 | -------------------------------------------------------------------------------- /examples/inference/simple-flow/model/conda.yml: -------------------------------------------------------------------------------- 1 | name: model-env 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.7 6 | - numpy=1.21.2 7 | - pip=21.2.4 8 | - scikit-learn=0.24.2 9 | - scipy=1.7.1 10 | - pip: 11 | - azureml-defaults==1.38.0 12 | - joblib==1.0.1 13 | -------------------------------------------------------------------------------- /examples/inference/simple-flow/model/sklearn_mnist_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/inference/simple-flow/model/sklearn_mnist_model.pkl -------------------------------------------------------------------------------- /examples/inference/simple-flow/script/score.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import pickle 5 | import joblib 6 | 7 | def init(): 8 | global model 9 | # AZUREML_MODEL_DIR is an environment variable created during deployment. 
10 | # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) 11 | # For multiple models, it points to the folder containing all deployed models (./azureml-models) 12 | model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'sklearn_mnist_model.pkl') 13 | model = joblib.load(model_path) 14 | 15 | def run(raw_data): 16 | data = np.array(json.loads(raw_data)['data']) 17 | # make prediction 18 | y_hat = model.predict(data) 19 | # you can return any data type as long as it is JSON-serializable 20 | return y_hat.tolist() -------------------------------------------------------------------------------- /examples/inference/simple-flow/sklearn-model.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/model.schema.json 2 | name: sklearn-model 3 | version: 1 4 | datastore: azureml:workspaceartifactstore 5 | path: "ExperimentRun/dcid.9c143e8b-a6ed-4c6f-a907-34b8ec21127c/outputs/sklearn_mnist_model.pkl" 6 | description: Model asset from run output folder. -------------------------------------------------------------------------------- /examples/training/additional-sdk-examples/001-Tensorflow/tf_mnist_with_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | import argparse 6 | import os 7 | import re 8 | import tensorflow as tf 9 | import glob 10 | 11 | from azureml.core import Run 12 | from utils import load_data 13 | 14 | print("TensorFlow version:", tf.__version__) 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point') 18 | 19 | parser.add_argument('--resume-from', type=str, default=None, 20 | help='location of the model or checkpoint files from where to resume the training') 21 | args = parser.parse_args() 22 | 23 | 24 | previous_model_location = args.resume_from 25 | # You can also use environment variable to get the model/checkpoint files location 26 | # previous_model_location = os.path.expandvars(os.getenv("AZUREML_DATAREFERENCE_MODEL_LOCATION", None)) 27 | 28 | data_folder = args.data_folder 29 | print('Data folder:', data_folder) 30 | 31 | # load train and test set into numpy arrays 32 | # note we scale the pixel intensity values to 0-1 (by dividing it with 255.0) so the model can converge faster. 
33 | 34 | X_train = load_data(glob.glob(os.path.join(data_folder, '**/train-images-idx3-ubyte.gz'), 35 | recursive=True)[0], False) / 255.0 36 | X_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-images-idx3-ubyte.gz'), 37 | recursive=True)[0], False) / 255.0 38 | y_train = load_data(glob.glob(os.path.join(data_folder, '**/train-labels-idx1-ubyte.gz'), 39 | recursive=True)[0], True).reshape(-1) 40 | y_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-labels-idx1-ubyte.gz'), 41 | recursive=True)[0], True).reshape(-1) 42 | 43 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n') 44 | 45 | training_set_size = X_train.shape[0] 46 | 47 | n_inputs = 28 * 28 48 | n_h1 = 100 49 | n_h2 = 100 50 | n_outputs = 10 51 | learning_rate = 0.01 52 | n_epochs = 1000000000000000000000000000 53 | batch_size = 50 54 | 55 | with tf.name_scope('network'): 56 | # construct the DNN 57 | X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X') 58 | y = tf.placeholder(tf.int64, shape=(None), name='y') 59 | h1 = tf.layers.dense(X, n_h1, activation=tf.nn.relu, name='h1') 60 | h2 = tf.layers.dense(h1, n_h2, activation=tf.nn.relu, name='h2') 61 | output = tf.layers.dense(h2, n_outputs, name='output') 62 | 63 | with tf.name_scope('train'): 64 | cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=output) 65 | loss = tf.reduce_mean(cross_entropy, name='loss') 66 | optimizer = tf.train.GradientDescentOptimizer(learning_rate) 67 | train_op = optimizer.minimize(loss) 68 | 69 | with tf.name_scope('eval'): 70 | correct = tf.nn.in_top_k(output, y, 1) 71 | acc_op = tf.reduce_mean(tf.cast(correct, tf.float32)) 72 | 73 | init = tf.global_variables_initializer() 74 | saver = tf.train.Saver() 75 | 76 | # start an Azure ML run 77 | run = Run.get_context() 78 | 79 | with tf.Session() as sess: 80 | start_epoch = 0 81 | if previous_model_location: 82 | checkpoint_file_path = tf.train.latest_checkpoint(previous_model_location) 83 | saver.restore(sess, checkpoint_file_path) 84 | checkpoint_filename = os.path.basename(checkpoint_file_path) 85 | num_found = re.search(r'\d+', checkpoint_filename) 86 | if num_found: 87 | start_epoch = int(num_found.group(0)) 88 | print("Resuming from epoch {}".format(str(start_epoch))) 89 | else: 90 | init.run() 91 | 92 | for epoch in range(start_epoch, n_epochs): 93 | 94 | # randomly shuffle training set 95 | indices = np.random.permutation(training_set_size) 96 | X_train = X_train[indices] 97 | y_train = y_train[indices] 98 | 99 | # batch index 100 | b_start = 0 101 | b_end = b_start + batch_size 102 | for _ in range(training_set_size // batch_size): 103 | # get a batch 104 | X_batch, y_batch = X_train[b_start: b_end], y_train[b_start: b_end] 105 | 106 | # update batch index for the next batch 107 | b_start = b_start + batch_size 108 | b_end = min(b_start + batch_size, training_set_size) 109 | 110 | # train 111 | sess.run(train_op, feed_dict={X: X_batch, y: y_batch}) 112 | # evaluate training set 113 | acc_train = acc_op.eval(feed_dict={X: X_batch, y: y_batch}) 114 | # evaluate validation set 115 | acc_val = acc_op.eval(feed_dict={X: X_test, y: y_test}) 116 | 117 | # log accuracies 118 | run.log('training_acc', np.float(acc_train)) 119 | run.log('validation_acc', np.float(acc_val)) 120 | print(epoch, '-- Training accuracy:', acc_train, '\b Validation accuracy:', acc_val) 121 | y_hat = np.argmax(output.eval(feed_dict={X: X_test}), axis=1) 122 | 123 | if epoch % 5 == 0: 124 | saver.save(sess, './outputs/', global_step=epoch) 125 
| 126 | # saving only half of the model and resuming again from same epoch 127 | if not previous_model_location and epoch == 10: 128 | break 129 | 130 | run.log('final_acc', np.float(acc_val)) 131 | -------------------------------------------------------------------------------- /examples/training/additional-sdk-examples/001-Tensorflow/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import gzip 5 | import numpy as np 6 | import struct 7 | 8 | 9 | # load compressed MNIST gz files and return numpy arrays 10 | def load_data(filename, label=False): 11 | with gzip.open(filename) as gz: 12 | struct.unpack('I', gz.read(4)) 13 | n_items = struct.unpack('>I', gz.read(4)) 14 | if not label: 15 | n_rows = struct.unpack('>I', gz.read(4))[0] 16 | n_cols = struct.unpack('>I', gz.read(4))[0] 17 | res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8) 18 | res = res.reshape(n_items[0], n_rows * n_cols) 19 | else: 20 | res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8) 21 | res = res.reshape(n_items[0], 1) 22 | return res 23 | 24 | 25 | # one-hot encode a 1-D array 26 | def one_hot_encode(array, num_of_classes): 27 | return np.eye(num_of_classes)[array.reshape(-1)] 28 | -------------------------------------------------------------------------------- /examples/training/additional-sdk-examples/002-SciKitLearn/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import gzip 5 | import numpy as np 6 | import struct 7 | 8 | 9 | # load compressed MNIST gz files and return numpy arrays 10 | def load_data(filename, label=False): 11 | with gzip.open(filename) as gz: 12 | struct.unpack('I', gz.read(4)) 13 | n_items = struct.unpack('>I', gz.read(4)) 14 | if not label: 15 | n_rows = struct.unpack('>I', gz.read(4))[0] 16 | n_cols = struct.unpack('>I', gz.read(4))[0] 17 | res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8) 18 | res = res.reshape(n_items[0], n_rows * n_cols) 19 | else: 20 | res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8) 21 | res = res.reshape(n_items[0], 1) 22 | return res 23 | 24 | 25 | # one-hot encode a 1-D array 26 | def one_hot_encode(array, num_of_classes): 27 | return np.eye(num_of_classes)[array.reshape(-1)] 28 | -------------------------------------------------------------------------------- /examples/training/simple-train-cli/job.yml: -------------------------------------------------------------------------------- 1 | experiment_name: Tutorial-sklearn-mnist 2 | code: ./src 3 | command: python train.py --data-folder ./mnist-data --regularization 0.5 4 | environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest 5 | compute: azureml:tailwind-k8s 6 | resources: 7 | instance_type: 8 | 9 | -------------------------------------------------------------------------------- /examples/training/simple-train-cli/src/mnist-data/t10k-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/training/simple-train-cli/src/mnist-data/t10k-images-idx3-ubyte.gz -------------------------------------------------------------------------------- 
/examples/training/simple-train-cli/src/mnist-data/t10k-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/training/simple-train-cli/src/mnist-data/t10k-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /examples/training/simple-train-cli/src/mnist-data/train-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/training/simple-train-cli/src/mnist-data/train-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /examples/training/simple-train-cli/src/mnist-data/train-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/examples/training/simple-train-cli/src/mnist-data/train-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /examples/training/simple-train-cli/src/train.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import numpy as np 5 | import glob 6 | 7 | from sklearn.linear_model import LogisticRegression 8 | import joblib 9 | 10 | from azureml.core import Run 11 | from utils import load_data 12 | 13 | # let user feed in 2 parameters, the dataset to mount or download, and the regularization rate of the logistic regression model 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point') 16 | parser.add_argument('--regularization', type=float, dest='reg', default=0.01, help='regularization rate') 17 | args = parser.parse_args() 18 | 19 | data_folder = args.data_folder 20 | print('Data folder:', data_folder) 21 | 22 | # load train and test set into numpy arrays 23 | # note we scale the pixel intensity values to 0-1 (by dividing it with 255.0) so the model can converge faster. 
24 | X_train = load_data(glob.glob(os.path.join(data_folder, '**/train-images-idx3-ubyte.gz'), recursive=True)[0], False) / 255.0 25 | X_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-images-idx3-ubyte.gz'), recursive=True)[0], False) / 255.0 26 | y_train = load_data(glob.glob(os.path.join(data_folder, '**/train-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) 27 | y_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-labels-idx1-ubyte.gz'), recursive=True)[0], True).reshape(-1) 28 | 29 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep = '\n') 30 | 31 | # get hold of the current run 32 | run = Run.get_context() 33 | 34 | print('Train a logistic regression model with regularization rate of', args.reg) 35 | clf = LogisticRegression(C=1.0/args.reg, solver="liblinear", multi_class="auto", random_state=42) 36 | clf.fit(X_train, y_train) 37 | 38 | print('Predict the test set') 39 | y_hat = clf.predict(X_test) 40 | 41 | # calculate accuracy on the prediction 42 | acc = np.average(y_hat == y_test) 43 | print('Accuracy is', acc) 44 | 45 | run.log('regularization rate', np.float(args.reg)) 46 | run.log('accuracy', np.float(acc)) 47 | 48 | os.makedirs('outputs', exist_ok=True) 49 | # note file saved in the outputs folder is automatically uploaded into experiment record 50 | joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl') 51 | -------------------------------------------------------------------------------- /examples/training/simple-train-cli/src/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import gzip 5 | import numpy as np 6 | import struct 7 | 8 | 9 | # load compressed MNIST gz files and return numpy arrays 10 | def load_data(filename, label=False): 11 | with gzip.open(filename) as gz: 12 | struct.unpack('I', gz.read(4)) 13 | n_items = struct.unpack('>I', gz.read(4)) 14 | if not label: 15 | n_rows = struct.unpack('>I', gz.read(4))[0] 16 | n_cols = struct.unpack('>I', gz.read(4))[0] 17 | res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8) 18 | res = res.reshape(n_items[0], n_rows * n_cols) 19 | else: 20 | res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8) 21 | res = res.reshape(n_items[0], 1) 22 | return res 23 | 24 | 25 | # one-hot encode a 1-D array 26 | def one_hot_encode(array, num_of_classes): 27 | return np.eye(num_of_classes)[array.reshape(-1)] 28 | -------------------------------------------------------------------------------- /examples/training/simple-train-sdk/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
3 | 4 | import gzip 5 | import numpy as np 6 | import struct 7 | 8 | 9 | # load compressed MNIST gz files and return numpy arrays 10 | def load_data(filename, label=False): 11 | with gzip.open(filename) as gz: 12 | struct.unpack('I', gz.read(4)) 13 | n_items = struct.unpack('>I', gz.read(4)) 14 | if not label: 15 | n_rows = struct.unpack('>I', gz.read(4))[0] 16 | n_cols = struct.unpack('>I', gz.read(4))[0] 17 | res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8) 18 | res = res.reshape(n_items[0], n_rows * n_cols) 19 | else: 20 | res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8) 21 | res = res.reshape(n_items[0], 1) 22 | return res 23 | 24 | 25 | # one-hot encode a 1-D array 26 | def one_hot_encode(array, num_of_classes): 27 | return np.eye(num_of_classes)[array.reshape(-1)] 28 | -------------------------------------------------------------------------------- /examples/training/train-using-nfs/amlarc-nfs-setup/README.md: -------------------------------------------------------------------------------- 1 | # Setting up an NFS Server on AML Arc 2 | 3 | Before you can run any of the examples in this section you will need to setup an NFS mount on your 4 | Arc-enabled Kubernetes cluster. 5 | 6 | The included mount-config.yaml file can be used as a template to do this. You will need to replace `` with the 7 | actual address of your server. Then run the following: 8 | 9 | ``` 10 | kubectl apply -f mount-config.yaml 11 | ``` 12 | 13 | More detailed documentation on ephemeral NFS volume usage in Arc-enabled Machine Learning 14 | can be found [here](../../../docs/setup-ephemeral-nfs-volume.md) 15 | -------------------------------------------------------------------------------- /examples/training/train-using-nfs/amlarc-nfs-setup/mount-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | mounts.yaml: | 4 | mountPoints: 5 | - mountPath: /nfs_share 6 | mountType: nfs 7 | name: amlarc-nfs-share-0 8 | path: /disks/4TB/code/nfs_share 9 | server: 10 | kind: ConfigMap 11 | metadata: 12 | name: mount-config 13 | namespace: azureml 14 | -------------------------------------------------------------------------------- /examples/training/train-using-nfs/pytorch-on-amlarc-with-nfs/scripts/train.py: -------------------------------------------------------------------------------- 1 | from azureml.core.run import Run 2 | 3 | import argparse 4 | import os 5 | import torch 6 | import torch.distributed as dist 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | from math import ceil 12 | from random import Random 13 | from torch.multiprocessing import Process 14 | from torch.autograd import Variable 15 | from torchvision import datasets, transforms 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--data-dir', required=True, help='path to input data directory') 19 | parser.add_argument('--backend', required=True, help='Pytorch backend (gloo or nccl)') 20 | args = parser.parse_args() 21 | 22 | 23 | 24 | class Partition(object): 25 | def __init__(self, data, index): 26 | self.data = data 27 | self.index = index 28 | 29 | def __len__(self): 30 | return len(self.index) 31 | 32 | def __getitem__(self, index): 33 | data_idx = self.index[index] 34 | return self.data[data_idx] 35 | 36 | 37 | class DataPartitioner(object): 38 | def __init__(self, data, sizes=[0.7, 0.2, 0.1], seed=1234): 39 | self.data = data 40 | self.partitions = [] 41 | rng = Random() 42 
| rng.seed(seed) 43 | data_len = len(data) 44 | indexes = [x for x in range(0, data_len)] 45 | rng.shuffle(indexes) 46 | 47 | for frac in sizes: 48 | part_len = int(frac * data_len) 49 | self.partitions.append(indexes[0:part_len]) 50 | indexes = indexes[part_len:] 51 | 52 | def use(self, partition): 53 | return Partition(self.data, self.partitions[partition]) 54 | 55 | 56 | class Net(nn.Module): 57 | def __init__(self): 58 | super(Net, self).__init__() 59 | self.conv1 = nn.Conv2d(1, 10, kernel_size=5) 60 | self.conv2 = nn.Conv2d(10, 20, kernel_size=5) 61 | self.conv2_drop = nn.Dropout2d() 62 | self.fc1 = nn.Linear(320, 50) 63 | self.fc2 = nn.Linear(50, 10) 64 | 65 | def forward(self, x): 66 | x = F.relu(F.max_pool2d(self.conv1(x), 2)) 67 | x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) 68 | x = x.view(-1, 320) 69 | x = F.relu(self.fc1(x)) 70 | x = F.dropout(x, training=self.training) 71 | x = self.fc2(x) 72 | return F.log_softmax(x) 73 | 74 | 75 | def partition_dataset(): 76 | dataset = datasets.MNIST( 77 | args.data_dir, 78 | train=True, 79 | download=False, 80 | transform=transforms.Compose([ 81 | transforms.ToTensor(), 82 | transforms.Normalize((0.1307, ), (0.3081, )) 83 | ])) 84 | 85 | size = dist.get_world_size() 86 | bsz = 128 // size 87 | partition_sizes = [1.0 / size for _ in range(size)] 88 | partition = DataPartitioner(dataset, partition_sizes) 89 | partition = partition.use(dist.get_rank()) 90 | train_set = torch.utils.data.DataLoader( 91 | partition, batch_size=bsz, shuffle=True) 92 | return train_set, bsz 93 | 94 | 95 | def average_gradients(model): 96 | size = float(dist.get_world_size()) 97 | for param in model.parameters(): 98 | dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM) 99 | param.grad.data /= size 100 | 101 | 102 | def run(rank, size): 103 | torch.manual_seed(1234) 104 | train_set, bsz = partition_dataset() 105 | model = Net() 106 | optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5) 107 | 108 | num_batches = ceil(len(train_set.dataset) / float(bsz)) 109 | for epoch in range(5): 110 | epoch_loss = 0.0 111 | for data, target in train_set: 112 | data, target = Variable(data), Variable(target) 113 | optimizer.zero_grad() 114 | output = model(data) 115 | loss = F.nll_loss(output, target) 116 | epoch_loss += loss.data 117 | loss.backward() 118 | average_gradients(model) 119 | optimizer.step() 120 | 121 | if dist.get_rank() == 0: 122 | run_object.log('Training loss', epoch_loss / num_batches) 123 | 124 | 125 | if __name__ == "__main__": 126 | run_object = Run.get_context() 127 | dist.init_process_group(args.backend) 128 | run(dist.get_rank(), dist.get_world_size()) 129 | -------------------------------------------------------------------------------- /examples/training/train-using-nfs/scikit-learn-on-amlarc-with-nfs/iris.csv: -------------------------------------------------------------------------------- 1 | sepal_length,sepal_width,petal_length,petal_width,species 2 | 5.1,3.5,1.4,0.2,Iris-setosa 3 | 4.9,3,1.4,0.2,Iris-setosa 4 | 4.7,3.2,1.3,0.2,Iris-setosa 5 | 4.6,3.1,1.5,0.2,Iris-setosa 6 | 5,3.6,1.4,0.2,Iris-setosa 7 | 5.4,3.9,1.7,0.4,Iris-setosa 8 | 4.6,3.4,1.4,0.3,Iris-setosa 9 | 5,3.4,1.5,0.2,Iris-setosa 10 | 4.4,2.9,1.4,0.2,Iris-setosa 11 | 4.9,3.1,1.5,0.1,Iris-setosa 12 | 5.4,3.7,1.5,0.2,Iris-setosa 13 | 4.8,3.4,1.6,0.2,Iris-setosa 14 | 4.8,3,1.4,0.1,Iris-setosa 15 | 4.3,3,1.1,0.1,Iris-setosa 16 | 5.8,4,1.2,0.2,Iris-setosa 17 | 5.7,4.4,1.5,0.4,Iris-setosa 18 | 5.4,3.9,1.3,0.4,Iris-setosa 19 | 5.1,3.5,1.4,0.3,Iris-setosa 20 | 
5.7,3.8,1.7,0.3,Iris-setosa 21 | 5.1,3.8,1.5,0.3,Iris-setosa 22 | 5.4,3.4,1.7,0.2,Iris-setosa 23 | 5.1,3.7,1.5,0.4,Iris-setosa 24 | 4.6,3.6,1,0.2,Iris-setosa 25 | 5.1,3.3,1.7,0.5,Iris-setosa 26 | 4.8,3.4,1.9,0.2,Iris-setosa 27 | 5,3,1.6,0.2,Iris-setosa 28 | 5,3.4,1.6,0.4,Iris-setosa 29 | 5.2,3.5,1.5,0.2,Iris-setosa 30 | 5.2,3.4,1.4,0.2,Iris-setosa 31 | 4.7,3.2,1.6,0.2,Iris-setosa 32 | 4.8,3.1,1.6,0.2,Iris-setosa 33 | 5.4,3.4,1.5,0.4,Iris-setosa 34 | 5.2,4.1,1.5,0.1,Iris-setosa 35 | 5.5,4.2,1.4,0.2,Iris-setosa 36 | 4.9,3.1,1.5,0.1,Iris-setosa 37 | 5,3.2,1.2,0.2,Iris-setosa 38 | 5.5,3.5,1.3,0.2,Iris-setosa 39 | 4.9,3.1,1.5,0.1,Iris-setosa 40 | 4.4,3,1.3,0.2,Iris-setosa 41 | 5.1,3.4,1.5,0.2,Iris-setosa 42 | 5,3.5,1.3,0.3,Iris-setosa 43 | 4.5,2.3,1.3,0.3,Iris-setosa 44 | 4.4,3.2,1.3,0.2,Iris-setosa 45 | 5,3.5,1.6,0.6,Iris-setosa 46 | 5.1,3.8,1.9,0.4,Iris-setosa 47 | 4.8,3,1.4,0.3,Iris-setosa 48 | 5.1,3.8,1.6,0.2,Iris-setosa 49 | 4.6,3.2,1.4,0.2,Iris-setosa 50 | 5.3,3.7,1.5,0.2,Iris-setosa 51 | 5,3.3,1.4,0.2,Iris-setosa 52 | 7,3.2,4.7,1.4,Iris-versicolor 53 | 6.4,3.2,4.5,1.5,Iris-versicolor 54 | 6.9,3.1,4.9,1.5,Iris-versicolor 55 | 5.5,2.3,4,1.3,Iris-versicolor 56 | 6.5,2.8,4.6,1.5,Iris-versicolor 57 | 5.7,2.8,4.5,1.3,Iris-versicolor 58 | 6.3,3.3,4.7,1.6,Iris-versicolor 59 | 4.9,2.4,3.3,1,Iris-versicolor 60 | 6.6,2.9,4.6,1.3,Iris-versicolor 61 | 5.2,2.7,3.9,1.4,Iris-versicolor 62 | 5,2,3.5,1,Iris-versicolor 63 | 5.9,3,4.2,1.5,Iris-versicolor 64 | 6,2.2,4,1,Iris-versicolor 65 | 6.1,2.9,4.7,1.4,Iris-versicolor 66 | 5.6,2.9,3.6,1.3,Iris-versicolor 67 | 6.7,3.1,4.4,1.4,Iris-versicolor 68 | 5.6,3,4.5,1.5,Iris-versicolor 69 | 5.8,2.7,4.1,1,Iris-versicolor 70 | 6.2,2.2,4.5,1.5,Iris-versicolor 71 | 5.6,2.5,3.9,1.1,Iris-versicolor 72 | 5.9,3.2,4.8,1.8,Iris-versicolor 73 | 6.1,2.8,4,1.3,Iris-versicolor 74 | 6.3,2.5,4.9,1.5,Iris-versicolor 75 | 6.1,2.8,4.7,1.2,Iris-versicolor 76 | 6.4,2.9,4.3,1.3,Iris-versicolor 77 | 6.6,3,4.4,1.4,Iris-versicolor 78 | 6.8,2.8,4.8,1.4,Iris-versicolor 79 | 6.7,3,5,1.7,Iris-versicolor 80 | 6,2.9,4.5,1.5,Iris-versicolor 81 | 5.7,2.6,3.5,1,Iris-versicolor 82 | 5.5,2.4,3.8,1.1,Iris-versicolor 83 | 5.5,2.4,3.7,1,Iris-versicolor 84 | 5.8,2.7,3.9,1.2,Iris-versicolor 85 | 6,2.7,5.1,1.6,Iris-versicolor 86 | 5.4,3,4.5,1.5,Iris-versicolor 87 | 6,3.4,4.5,1.6,Iris-versicolor 88 | 6.7,3.1,4.7,1.5,Iris-versicolor 89 | 6.3,2.3,4.4,1.3,Iris-versicolor 90 | 5.6,3,4.1,1.3,Iris-versicolor 91 | 5.5,2.5,4,1.3,Iris-versicolor 92 | 5.5,2.6,4.4,1.2,Iris-versicolor 93 | 6.1,3,4.6,1.4,Iris-versicolor 94 | 5.8,2.6,4,1.2,Iris-versicolor 95 | 5,2.3,3.3,1,Iris-versicolor 96 | 5.6,2.7,4.2,1.3,Iris-versicolor 97 | 5.7,3,4.2,1.2,Iris-versicolor 98 | 5.7,2.9,4.2,1.3,Iris-versicolor 99 | 6.2,2.9,4.3,1.3,Iris-versicolor 100 | 5.1,2.5,3,1.1,Iris-versicolor 101 | 5.7,2.8,4.1,1.3,Iris-versicolor 102 | 6.3,3.3,6,2.5,Iris-virginica 103 | 5.8,2.7,5.1,1.9,Iris-virginica 104 | 7.1,3,5.9,2.1,Iris-virginica 105 | 6.3,2.9,5.6,1.8,Iris-virginica 106 | 6.5,3,5.8,2.2,Iris-virginica 107 | 7.6,3,6.6,2.1,Iris-virginica 108 | 4.9,2.5,4.5,1.7,Iris-virginica 109 | 7.3,2.9,6.3,1.8,Iris-virginica 110 | 6.7,2.5,5.8,1.8,Iris-virginica 111 | 7.2,3.6,6.1,2.5,Iris-virginica 112 | 6.5,3.2,5.1,2,Iris-virginica 113 | 6.4,2.7,5.3,1.9,Iris-virginica 114 | 6.8,3,5.5,2.1,Iris-virginica 115 | 5.7,2.5,5,2,Iris-virginica 116 | 5.8,2.8,5.1,2.4,Iris-virginica 117 | 6.4,3.2,5.3,2.3,Iris-virginica 118 | 6.5,3,5.5,1.8,Iris-virginica 119 | 7.7,3.8,6.7,2.2,Iris-virginica 120 | 7.7,2.6,6.9,2.3,Iris-virginica 121 | 6,2.2,5,1.5,Iris-virginica 122 | 
6.9,3.2,5.7,2.3,Iris-virginica 123 | 5.6,2.8,4.9,2,Iris-virginica 124 | 7.7,2.8,6.7,2,Iris-virginica 125 | 6.3,2.7,4.9,1.8,Iris-virginica 126 | 6.7,3.3,5.7,2.1,Iris-virginica 127 | 7.2,3.2,6,1.8,Iris-virginica 128 | 6.2,2.8,4.8,1.8,Iris-virginica 129 | 6.1,3,4.9,1.8,Iris-virginica 130 | 6.4,2.8,5.6,2.1,Iris-virginica 131 | 7.2,3,5.8,1.6,Iris-virginica 132 | 7.4,2.8,6.1,1.9,Iris-virginica 133 | 7.9,3.8,6.4,2,Iris-virginica 134 | 6.4,2.8,5.6,2.2,Iris-virginica 135 | 6.3,2.8,5.1,1.5,Iris-virginica 136 | 6.1,2.6,5.6,1.4,Iris-virginica 137 | 7.7,3,6.1,2.3,Iris-virginica 138 | 6.3,3.4,5.6,2.4,Iris-virginica 139 | 6.4,3.1,5.5,1.8,Iris-virginica 140 | 6,3,4.8,1.8,Iris-virginica 141 | 6.9,3.1,5.4,2.1,Iris-virginica 142 | 6.7,3.1,5.6,2.4,Iris-virginica 143 | 6.9,3.1,5.1,2.3,Iris-virginica 144 | 5.8,2.7,5.1,1.9,Iris-virginica 145 | 6.8,3.2,5.9,2.3,Iris-virginica 146 | 6.7,3.3,5.7,2.5,Iris-virginica 147 | 6.7,3,5.2,2.3,Iris-virginica 148 | 6.3,2.5,5,1.9,Iris-virginica 149 | 6.5,3,5.2,2,Iris-virginica 150 | 6.2,3.4,5.4,2.3,Iris-virginica 151 | 5.9,3,5.1,1.8,Iris-virginica 152 | -------------------------------------------------------------------------------- /examples/training/train-using-nfs/scikit-learn-on-amlarc-with-nfs/scripts/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import argparse 6 | import pandas 7 | 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.tree import DecisionTreeClassifier 10 | # sklearn.externals.joblib is removed in 0.23 11 | from sklearn import __version__ as sklearnver 12 | from packaging.version import Version 13 | if Version(sklearnver) < Version("0.23.0"): 14 | from sklearn.externals import joblib 15 | else: 16 | import joblib 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--data-dir', required=True, help='path to input data directory') 20 | args = parser.parse_args() 21 | 22 | # get input data 23 | data_file = os.path.join(args.data_dir, 'iris.csv') 24 | df = pandas.read_csv(data_file) 25 | 26 | x_col = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] 27 | y_col = ['species'] 28 | x_df = df.loc[:, x_col] 29 | y_df = df.loc[:, y_col] 30 | 31 | #dividing X,y into train and test data 32 | x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=223) 33 | 34 | data = {'train': {'X': x_train, 'y': y_train}, 35 | 36 | 'test': {'X': x_test, 'y': y_test}} 37 | 38 | clf = DecisionTreeClassifier().fit(data['train']['X'], data['train']['y']) 39 | model_file_name = 'decision_tree.pkl' 40 | 41 | print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(clf.score(x_train, y_train))) 42 | print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf.score(x_test, y_test))) 43 | 44 | os.makedirs('./outputs', exist_ok=True) 45 | with open(model_file_name, 'wb') as file: 46 | joblib.dump(value=clf, filename='outputs/' + model_file_name) 47 | -------------------------------------------------------------------------------- /files/deploy-amlarc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | subscription_id="" 6 | resource_group="" 7 | cluster_name="" 8 | 9 | arcml_extension_name="arcml-extension" 10 | 11 | ssl_cname="" 12 | ssl_cert_pem_file="" 13 | ssl_key_pem_file="" 14 | 15 | # STEP1 Register feature providers 16 | echo 'Register 
features...' 17 | az feature register --namespace Microsoft.ContainerService -n AKS-ExtensionManager --subscription "$subscription_id" 18 | echo 'Waiting for feature register...' 19 | while [ "$(az feature list --query "[?contains(name, 'Microsoft.ContainerService/AKS-ExtensionManager')].[properties.state]" -o json |jq '.[0][0]')" == 'Registered' ] 20 | do 21 | sleep 5 22 | done 23 | az provider register -n Microsoft.ContainerService 1 24 | 25 | 26 | # STEP2 Deploy AmlArc extension 27 | # OPTION A) AKS service has public https endpoint 28 | az k8s-extension create --cluster-name $cluster_name --cluster-type managedClusters -n $arcml_extension_name \ 29 | --extension-type Microsoft.AzureML.Kubernetes --scope cluster --configuration-settings enableInference=True \ 30 | sslCname=$ssl_cname --config-protected sslCertPemFile=$ssl_cert_pem_file sslKeyPemFile=$ssl_key_pem_file \ 31 | --subscription $subscription_id -g $resource_group --auto-upgrade-minor-version False 32 | 33 | # OPTION B) AKS service has public http endpoint 34 | #az k8s-extension create --cluster-name $cluster_name --cluster-type managedClusters -n $arcml_extension_name \ 35 | #--extension-type Microsoft.AzureML.Kubernetes --scope cluster --configuration-settings enableInference=True allowInsecureConnections=true \ 36 | #--subscription $subscription_id -g $resource_group --auto-upgrade-minor-version False 37 | 38 | # OPTION C) AKS service has private http endpoint 39 | #az k8s-extension create --cluster-name $cluster_name --cluster-type managedClusters -n $arcml_extension_name \ 40 | #--extension-type Microsoft.AzureML.Kubernetes --scope cluster --configuration-settings enableInference=True allowInsecureConnections=true \ 41 | #privateEndpointILB=True --subscription $subscription_id -g $resource_group --auto-upgrade-minor-version False 42 | 43 | 44 | extension_install_state=$(az k8s-extension show --name $arcml_extension_name --cluster-type managedClusters --cluster-name "$cluster_name" --resource-group "$resource_group" --subscription "$subscription_id" | jq -r '.provisioningState') 45 | echo "$extension_install_state" 46 | if [[ $extension_install_state == "Succeeded" ]] 47 | then 48 | echo "AzureML extention created successfully" 49 | else 50 | echo "AzureML extention creation failed" 51 | exit 1 52 | fi 53 | -------------------------------------------------------------------------------- /files/deployextension.parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "extensionName": { 6 | "value": "" 7 | }, 8 | "autoUpgradeMinorVersion": { 9 | "value": false 10 | }, 11 | "enableTraining": { 12 | "value": true 13 | }, 14 | "enableInference": { 15 | "value": true 16 | }, 17 | "allowInsecureConnections": { 18 | "value": false 19 | }, 20 | "aksResourceId": { 21 | "value": "/subscriptions/00000000-0000-0000-0000-000000000000/resourcegroups/foo/providers/Microsoft.ContainerService/managedClusters/bar" 22 | }, 23 | "aksLocation": { 24 | "value": "eastus" 25 | }, 26 | "inferenceRouterServiceType": { 27 | "value": "LoadBalancer" 28 | }, 29 | "internalLoadBalancerProvider": { 30 | "value": "azure" 31 | }, 32 | "inferenceRouterHA": { 33 | "value": true 34 | }, 35 | "installNvidiaDevicePlugin": { 36 | "value": false 37 | }, 38 | "installPromOp": { 39 | "value": true 40 | }, 41 | "installVolcano": { 42 | "value": true 43 | }, 44 | 
"installDcgmExporter": { 45 | "value": false 46 | }, 47 | "nodeSelector": { 48 | "value": { 49 | "nodeSelector.": "", 50 | "nodeSelector.": "" 51 | }, 52 | "metadata": { 53 | "description": "This field is optional" 54 | } 55 | }, 56 | "sslCname": { 57 | "value": "foo.bar.com" 58 | }, 59 | "sslSecret": { 60 | "value": "" 61 | }, 62 | "sslCertificate": { 63 | "value": "" 64 | }, 65 | "sslKey": { 66 | "value": "" 67 | } 68 | } 69 | } -------------------------------------------------------------------------------- /files/entry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | AZ_K8S_EXTENSION_VERSION='1.2.2' 4 | AZ_CONNECTED_K8S_VERSION='1.2.8' 5 | 6 | az extension add --name connectedk8s --version $AZ_CONNECTED_K8S_VERSION 7 | az extension add --name k8s-extension --version $AZ_K8S_EXTENSION_VERSION 8 | 9 | python deploy.py -------------------------------------------------------------------------------- /files/quota setting tool/get_quotaoverrides_cr.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | import argparse 3 | from utils import config_logging, read_yaml, write_yaml_file, hash_string 4 | from microsoft_graph import get_group_transitive_members_by_group_id, get_user_oid_by_mail, get_token 5 | 6 | logger = getLogger(__name__) 7 | userIdentifiers = [] 8 | crTemplateFilePath = "./quotaoverridesCRTemplate.yaml" 9 | 10 | 11 | def add_unique_identifier(identifier): 12 | if identifier not in userIdentifiers: 13 | userIdentifiers.append(identifier) 14 | 15 | 16 | def process_identities(identities): 17 | users, groups = identities.get('users', []), identities.get('groups', []) 18 | 19 | token = get_token() 20 | 21 | for user in users: 22 | logger.info(f"processing user {user}") 23 | user_oid = get_user_oid_by_mail(user, token) 24 | add_unique_identifier(hash_string(user_oid)) 25 | 26 | for group in groups: 27 | logger.info(f"processing group {group}") 28 | for member in get_group_transitive_members_by_group_id(group, token): 29 | add_unique_identifier(hash_string(member)) 30 | 31 | 32 | def get_quotaoverride_cr(args): 33 | config_file, output_file, name = args.config, args.output, args.name 34 | 35 | config, output = read_yaml(config_file), read_yaml(crTemplateFilePath) 36 | 37 | output['metadata']['name'] = name 38 | output['metadata']['labels']['app.kubernetes.io/instance'] = name 39 | output['spec']['tierOverrides'] = config['tierOverrides'] 40 | process_identities(config['userIdentifiers']) 41 | output['spec']['userIdentifiers'] = userIdentifiers 42 | 43 | write_yaml_file(output_file, output) 44 | 45 | logger.info(f"generated quotaoverrides custom resource file, file path : {output_file} ") 46 | 47 | 48 | def main(): 49 | config_logging() 50 | 51 | parser = argparse.ArgumentParser(description='give a config yaml, generate quotaoverrides custom resource yaml, suggests to run [az login] first before using the command') 52 | 53 | parser.add_argument('--config', required=True, help="yaml file path of user's quota override config file") 54 | parser.add_argument('--output', required=True, help="yaml file path of generated k8s quotaoverrides custom resource file") 55 | parser.add_argument('--name', required=True, help="name of quotaoverrides custom resource") 56 | 57 | parser.set_defaults(func=get_quotaoverride_cr) 58 | 59 | args = parser.parse_args() 60 | args.func(args) 61 | 62 | 63 | if __name__ == '__main__': 64 | main() 65 | 
-------------------------------------------------------------------------------- /files/quota setting tool/microsoft_graph.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | import subprocess, json, requests 3 | 4 | logger = getLogger(__name__) 5 | 6 | def get_token(): 7 | exitcode, data = subprocess.getstatusoutput('az account get-access-token --resource-type ms-graph') 8 | if exitcode != 0: 9 | logger.exception(data) 10 | raise Exception('Exception in get-access-token') 11 | 12 | token = json.loads(data)['accessToken'] 13 | 14 | logger.info('get ms-graph access token : {}'.format(token)) 15 | return token 16 | 17 | 18 | def _send_request(url, token): 19 | try: 20 | logger.info('sending url : {}'.format(url)) 21 | 22 | headers = { 23 | 'Authorization': 'Bearer {}'.format(token), 24 | 'Host': 'graph.microsoft.com' 25 | } 26 | response = requests.get(url=url, headers=headers) 27 | response.raise_for_status() 28 | response_json = response.json() 29 | except Exception as err: 30 | raise SystemExit(err) 31 | else: 32 | return response_json 33 | 34 | 35 | def _iter_objects(url, token): 36 | while url is not None: 37 | response_json = _send_request(url, token) 38 | 39 | objects = response_json.get('value') 40 | logger.info('Fetched {} objects from {}'.format(len(objects), url)) 41 | 42 | yield from objects 43 | url = response_json.get('@odata.nextLink') 44 | 45 | 46 | def get_group_transitive_members_by_group_id(group_id, token): 47 | logger.info('get ms-graph group transitive members, group id : {}'.format(group_id)) 48 | 49 | url = 'https://graph.microsoft.com/v1.0/groups/{}/transitiveMembers'.format(group_id) 50 | 51 | member_oids = [] 52 | for member in _iter_objects(url, token): 53 | if member['@odata.type'] == '#microsoft.graph.user': 54 | member_oids.append(member['id']) 55 | 56 | return member_oids 57 | 58 | 59 | def get_user_oid_by_mail(mail, token): 60 | logger.info('get ms-graph user oids, user mail : {}'.format(mail)) 61 | 62 | url = 'https://graph.microsoft.com/v1.0/users/{}'.format(mail) 63 | 64 | response_json = _send_request(url, token) 65 | return response_json['id'] 66 | -------------------------------------------------------------------------------- /files/quota setting tool/quotaoverridesCRTemplate.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: amlarc.azureml.com/v1 2 | kind: QuotaOverride 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: quotaoverride 6 | app.kubernetes.io/instance: 7 | name: 8 | spec: 9 | tierOverrides: 10 | 11 | userIdentifiers: 12 | 13 | -------------------------------------------------------------------------------- /files/quota setting tool/readme.md: -------------------------------------------------------------------------------- 1 | CLI tool to generate k8s quotaoverrides custom resource file accourding to user's config file 2 | ``` 3 | usage: get_quotaoverrides_cr.py [-h] --config CONFIG --output OUTPUT --name NAME 4 | 5 | give a config yaml, generate quotaoverrides custom resource yaml, suggests to run [az login] first before using the command 6 | 7 | optional arguments: 8 | -h, --help show this help message and exit 9 | --config CONFIG yaml file path of user's quota override config file 10 | --output OUTPUT yaml file path of generated k8s quotaoverrides custom resource file 11 | --name NAME name of quotaoverrides custom resource 12 | ``` 13 | the user config file should be like this: 14 | ```yaml 15 | tierOverrides: 16 
--------------------------------------------------------------------------------
/files/quota setting tool/quotaoverridesCRTemplate.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: amlarc.azureml.com/v1
2 | kind: QuotaOverride
3 | metadata:
4 |   labels:
5 |     app.kubernetes.io/name: quotaoverride
6 |     app.kubernetes.io/instance:
7 |   name:
8 | spec:
9 |   tierOverrides:
10 | 
11 |   userIdentifiers:
12 | 
13 | 
--------------------------------------------------------------------------------
/files/quota setting tool/readme.md:
--------------------------------------------------------------------------------
1 | CLI tool to generate a k8s quotaoverrides custom resource file according to the user's config file
2 | ```
3 | usage: get_quotaoverrides_cr.py [-h] --config CONFIG --output OUTPUT --name NAME
4 | 
5 | give a config yaml, generate quotaoverrides custom resource yaml, suggests to run [az login] first before using the command
6 | 
7 | optional arguments:
8 |   -h, --help       show this help message and exit
9 |   --config CONFIG  yaml file path of user's quota override config file
10 |   --output OUTPUT  yaml file path of generated k8s quotaoverrides custom resource file
11 |   --name NAME      name of quotaoverrides custom resource
12 | ```
13 | The user config file should look like this:
14 | ```yaml
15 | tierOverrides:
16 |   <tier name>:
17 |     <quota name>: <quota value>
18 | userIdentifiers:
19 |   users:
20 |     - <user mail>
21 |   groups:
22 |     - <group id>
23 | ```
24 | 
25 | ## Example
26 | ### Edit the config file in the current path and name it config.yaml
27 | ```yaml
28 | tierOverrides:
29 |   my_tier1:
30 |     myquota1: myquota1
31 |     myquota2: myquota2
32 |   my_tier2:
33 |     myquota1: myquota1
34 |     myquota2: myquota2
35 | userIdentifiers:
36 |   users:
37 |     - my-first-user
38 |     - my-second-user
39 |   groups:
40 |     - my-first-group
41 |     - my-second-group
42 | ```
43 | ### Run the CLI command, setting the output file in the current path
44 | ```
45 | get_quotaoverrides_cr.py --config ./config.yaml --output ./output.yaml --name example
46 | ```
47 | ### Check output.yaml in the current path
48 | ```yaml
49 | apiVersion: amlarc.azureml.com/v1
50 | kind: QuotaOverride
51 | metadata:
52 |   labels:
53 |     app.kubernetes.io/instance: example
54 |     app.kubernetes.io/name: quotaoverride
55 |   name: example
56 | spec:
57 |   tierOverrides:
58 |     my_tier1:
59 |       myquota1: myquota1
60 |       myquota2: myquota2
61 |     my_tier2:
62 |       myquota1: myquota1
63 |       myquota2: myquota2
64 |   userIdentifiers:
65 |   - userIdentifiers
66 | ```
--------------------------------------------------------------------------------
/files/quota setting tool/utils.py:
--------------------------------------------------------------------------------
1 | from sys import stdout
2 | from logging import getLogger, StreamHandler
3 | import yaml, hashlib
4 | import logging
5 | 
6 | logger = getLogger(__name__)
7 | 
8 | def config_logging():
9 |     logging.basicConfig(
10 |         level=logging.INFO,
11 |         format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
12 |         datefmt='%Y-%m-%d %H:%M:%S'
13 |     )
14 | 
15 | 
16 | def read_yaml(path):
17 |     with open(path, 'r') as f:
18 |         data = yaml.safe_load(f)
19 |     return data
20 | 
21 | 
22 | def write_yaml_file(path, data):
23 |     with open(path, 'w') as f:
24 |         yaml.dump(data, f)
25 | 
26 | 
27 | def hash_string(string):
28 |     myhash = hashlib.sha1(string.encode('utf-8'))
29 |     return myhash.hexdigest().upper()
30 | 
--------------------------------------------------------------------------------
/files/sslsecret.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | data:
3 |   cert.pem:
4 |   key.pem:
5 | kind: Secret
6 | metadata:
7 |   name:
8 |   namespace: azureml
9 | type: Opaque
10 | 
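The `data` values of a Kubernetes Secret must be base64-encoded, so the empty `cert.pem` and `key.pem` fields in the `sslsecret.yaml` template above need encoded PEM content before the file is applied with `kubectl apply -f`. A minimal sketch of filling the template; the local file paths and the secret name are assumptions, not values from this repository:

```python
import base64
import yaml

def b64_file(path):
    # Kubernetes Secret `data` values must be base64-encoded file contents.
    with open(path, 'rb') as f:
        return base64.b64encode(f.read()).decode('ascii')

# Load the template (assumed to be in the current working directory).
with open('sslsecret.yaml') as f:
    secret = yaml.safe_load(f)

secret['metadata']['name'] = 'my-ssl-secret'        # hypothetical secret name
secret['data']['cert.pem'] = b64_file('cert.pem')   # local certificate file
secret['data']['key.pem'] = b64_file('key.pem')     # local private key file

with open('sslsecret-filled.yaml', 'w') as f:
    yaml.dump(secret, f)

# Then: kubectl apply -f sslsecret-filled.yaml
```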
--------------------------------------------------------------------------------
/files/terraform-template.tf:
--------------------------------------------------------------------------------
1 | 
2 | terraform {
3 |   required_version = ">=0.12"
4 | 
5 |   required_providers {
6 |     azurerm = {
7 |       source  = "hashicorp/azurerm"
8 |       version = "~>2.0"
9 |     }
10 |     azapi = {
11 |       source = "Azure/azapi"
12 |     }
13 |   }
14 | }
15 | 
16 | provider "azurerm" {
17 |   features {}
18 | }
19 | 
20 | provider "azapi" {
21 |   # More information on the authentication methods supported by
22 |   # the AzureRM Provider can be found here:
23 |   # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs
24 | 
25 |   # subscription_id = "..."
26 |   # client_id = "..."
27 |   # client_secret = "..."
28 |   # tenant_id = "..."
29 | }
30 | 
31 | resource "azapi_resource" "mlextension" {
32 |   type      = "Microsoft.KubernetesConfiguration/extensions@2022-03-01"
33 |   name      = "{extension-name}"
34 |   parent_id = "/subscriptions/{subscription}/resourcegroups/{resource-group}/providers/Microsoft.ContainerService/managedClusters/{cluster-name}"
35 |   identity {
36 |     type = "SystemAssigned"
37 |   }
38 |   body = jsonencode({
39 |     "properties" = {
40 |       "extensionType" = "microsoft.azureml.kubernetes"
41 |       "releaseTrain" = "stable"
42 |       "scope" = {
43 |         "cluster" = {
44 |           "releaseNamespace" = "azureml"
45 |         }
46 |       }
47 |       "configurationSettings" = {
48 |         "enableTraining" = "True"
49 |         "enableInference" = "True"
50 |         "allowInsecureConnections" = "True"
51 |         "inferenceRouterServiceType" = "loadBalancer"
52 |         "cluster_name" = "/subscriptions/{subscription}/resourcegroups/{resource-group}/providers/Microsoft.ContainerService/managedClusters/{cluster-name}"
53 |         "domain" = "{region}.cloudapp.azure.com"
54 |         "location" = "{region}"
55 |         "jobSchedulerLocation" = "eastus"
56 |         "cluster_name_friendly" = "{cluster-name}"
57 |         "servicebus.enabled" = "false"
58 |         "relayserver.enabled" = "false"
59 |         "nginxIngress.enabled" = "true"
60 |         "clusterId" = "/subscriptions/{subscription}/resourcegroups/{resource-group}/providers/Microsoft.ContainerService/managedClusters/{cluster-name}"
61 |         "prometheus.prometheusSpec.externalLabels.cluster_name" = "/subscriptions/{subscription}/resourcegroups/{resource-group}/providers/Microsoft.ContainerService/managedClusters/{cluster-name}"
62 |       },
63 |       "configurationProtectedSettings" = {}
64 |     }
65 |   })
66 | }
67 | 
--------------------------------------------------------------------------------
/pics/check_scoringfe_v2_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/pics/check_scoringfe_v2_output.png
--------------------------------------------------------------------------------
/pics/nvml_error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/pics/nvml_error.png
--------------------------------------------------------------------------------
/pics/permission_denied.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/AML-Kubernetes/b533ee43ba7c8641c61c8b03ae4c674b0f711451/pics/permission_denied.png
--------------------------------------------------------------------------------
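The `{subscription}`, `{resource-group}`, `{cluster-name}`, `{region}`, and `{extension-name}` tokens in `files/terraform-template.tf` above are placeholders that must be substituted before running `terraform init` and `terraform apply`. A minimal sketch of producing a concrete `main.tf`; every value below is a hypothetical example, not a value taken from this repository:

```python
# Substitute the placeholders in terraform-template.tf and write main.tf.
placeholders = {
    '{subscription}': '00000000-0000-0000-0000-000000000000',  # hypothetical
    '{resource-group}': 'my-resource-group',                   # hypothetical
    '{cluster-name}': 'my-aks-cluster',                        # hypothetical
    '{region}': 'eastus',                                      # hypothetical
    '{extension-name}': 'amlarc-extension',                    # hypothetical
}

with open('terraform-template.tf') as f:
    template = f.read()

for token, value in placeholders.items():
    template = template.replace(token, value)

with open('main.tf', 'w') as f:
    f.write(template)

# Then run: terraform init && terraform apply
```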