├── .coveragerc ├── .github ├── CODE_OF_CONDUCT.md ├── ISSUE_TEMPLATE.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── actions │ ├── bicep-to-arm-template-diff │ │ └── action.yaml │ ├── open-provision-resources │ │ └── action.yaml │ ├── run-shared-unit-tests │ │ └── action.yaml │ ├── submit-aml-literal-pipeline │ │ └── action.yaml │ ├── submit-aml-scatter-gather-pipeline │ │ └── action.yaml │ ├── submit-example-pipeline │ │ └── action.yaml │ ├── submit-multiply-data-pipeline │ │ └── action.yaml │ ├── submit-upload-data-pipeline │ │ └── action.yaml │ └── vnet-provision-resources │ │ └── action.yaml ├── scripts │ └── delete-run-history.sh └── workflows │ ├── build-test.yaml │ ├── clear-pipeline-run-history.yaml │ ├── pipeline-e2e-test.yaml │ └── release-branch-test.yaml ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md ├── docs ├── README.md ├── concepts │ ├── benchmarking.md │ ├── confidentiality.md │ ├── glossary.md │ ├── guide.md │ ├── mlops_for_fl.md │ ├── plan-your-fl-project.md │ └── vertical-fl.md ├── frameworks │ ├── flower.md │ └── nvflare.md ├── pics │ ├── ccfraud_acc.jpg │ ├── ccfraud_ddp.jpg │ ├── ccfraud_time.jpg │ ├── combined-losses-silos.PNG │ ├── diagram.png │ ├── fl_fig.png │ ├── fldatatypes.png │ ├── industry-bank-marketing.png │ ├── industry-fraud-detection.png │ ├── industry-medical-imaging.png │ ├── industry-ner.png │ ├── metrics.PNG │ ├── ner_acc.jpg │ ├── ner_ddp.jpg │ ├── ner_time.jpg │ ├── pipeline-aml.PNG │ ├── pneumonia_acc.jpg │ ├── pneumonia_ddp.jpg │ ├── pneumonia_ddp_1tb.jpg │ ├── pneumonia_time.jpg │ ├── sandboxes_confidential.png │ ├── sandboxes_eyesoff.png │ ├── sandboxes_eyeson.png │ ├── sandboxes_private.png │ ├── vfltrainingloop.png │ └── vnet_silo_provisioning.png ├── provisioning │ ├── README.md │ ├── external-silos.md │ ├── jumpbox_cc.md │ ├── orchestrator_open.md │ ├── orchestrator_vnet.md │ ├── sandboxes.md │ ├── silo_open.md │ ├── silo_open_aks_with_cc.md │ ├── silo_vnet_existingstorage.md │ └── silo_vnet_newstorage.md ├── quickstart.md ├── real-world-examples │ ├── bank-marketing-vertical.md │ ├── ccfraud-horizontal.md │ ├── ccfraud-vertical.md │ ├── ccfraud-vetical-fedonce.md │ ├── ner-horizontal.md │ └── pneumonia-horizontal.md ├── troubleshoot.md └── tutorials │ ├── add-kaggle-credentials.md │ ├── dp-for-cross-silo-horizontal-fl.md │ ├── e2e-fl-on-cc.md │ ├── literal-scatter-gather-tutorial.md │ ├── read-local-data-in-k8s-silo.md │ └── update-local-data-to-silo-storage-account.md ├── examples ├── .gitignore ├── cli-jobs │ └── upload-local-data │ │ ├── confidential_io.py │ │ ├── job.yml │ │ └── run.py ├── components │ ├── BANK_MARKETING_VERTICAL │ │ ├── traininsilo │ │ │ ├── aml_comm.py │ │ │ ├── aml_smpc.py │ │ │ ├── conda.yaml │ │ │ ├── contributor.py │ │ │ ├── contributor_spec.yaml │ │ │ ├── datasets.py │ │ │ ├── host.py │ │ │ ├── host_spec.yaml │ │ │ ├── models.py │ │ │ └── samplers.py │ │ └── upload_data │ │ │ ├── conda.yaml │ │ │ ├── run.py │ │ │ └── spec.yaml │ ├── CCFRAUD │ │ ├── preprocessing │ │ │ ├── conda.yaml │ │ │ ├── confidential_io.py │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ ├── traininsilo │ │ │ ├── conda.yaml │ │ │ ├── confidential_io.py │ │ │ ├── datasets.py │ │ │ ├── models.py │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ └── upload_data │ │ │ ├── conda.yaml │ │ │ ├── confidential_io.py │ │ │ ├── run.py │ │ │ ├── spec.yaml │ │ │ └── us_regions.csv │ ├── CCFRAUD_VERTICAL │ │ ├── preprocessing │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ ├── psi │ │ │ ├── 
aml_comm.py │ │ │ ├── aml_smpc.py │ │ │ ├── context │ │ │ │ ├── Dockerfile │ │ │ │ ├── SymmetricPSI │ │ │ │ │ ├── CMakeLists.txt │ │ │ │ │ └── psi.cpp │ │ │ │ └── vcpkg.json │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ ├── traininsilo │ │ │ ├── aml_comm.py │ │ │ ├── aml_smpc.py │ │ │ ├── conda.yaml │ │ │ ├── contributor.py │ │ │ ├── contributor_spec.yaml │ │ │ ├── datasets.py │ │ │ ├── host.py │ │ │ ├── host_spec.yaml │ │ │ ├── models.py │ │ │ └── samplers.py │ │ └── upload_data │ │ │ ├── conda.yaml │ │ │ ├── run.py │ │ │ ├── spec.yaml │ │ │ └── us_regions.csv │ ├── CCFRAUD_VERTICAL_FEDONCE │ │ ├── preprocessing │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ ├── pretraining │ │ │ ├── conda.yaml │ │ │ ├── datasets.py │ │ │ ├── models.py │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ ├── traininsilo │ │ │ ├── conda.yaml │ │ │ ├── datasets.py │ │ │ ├── models.py │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ └── upload_data │ │ │ ├── conda.yaml │ │ │ ├── run.py │ │ │ ├── spec.yaml │ │ │ └── us_regions.csv │ ├── FLWR │ │ ├── client │ │ │ ├── pneumonia_network.py │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ ├── flower_pytorch_env │ │ │ ├── context │ │ │ │ ├── Dockerfile │ │ │ │ └── requirements.txt │ │ │ └── env.yml │ │ └── server │ │ │ ├── run.py │ │ │ └── spec.yaml │ ├── HELLOWORLD │ │ ├── aggregatemodelweights │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ ├── preprocessing │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ └── traininsilo │ │ │ ├── run.py │ │ │ └── spec.yaml │ ├── MNIST │ │ ├── preprocessing │ │ │ ├── conda.yaml │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ └── traininsilo │ │ │ ├── conda.yaml │ │ │ ├── run.py │ │ │ └── spec.yaml │ ├── MNIST_VERTICAL │ │ ├── traininsilo │ │ │ ├── aml_comm.py │ │ │ ├── aml_smpc.py │ │ │ ├── conda.yaml │ │ │ ├── contributor.py │ │ │ ├── contributor_spec.yaml │ │ │ ├── host.py │ │ │ ├── host_spec.yaml │ │ │ └── samplers.py │ │ └── upload_data │ │ │ ├── conda.yaml │ │ │ ├── run.py │ │ │ └── spec.yaml │ ├── NER │ │ ├── preprocessing │ │ │ ├── conda.yaml │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ ├── traininsilo │ │ │ ├── conda.yaml │ │ │ ├── labels.json │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ └── upload_data │ │ │ ├── conda.yaml │ │ │ ├── run.py │ │ │ └── spec.yaml │ ├── NVFLARE │ │ ├── client │ │ │ ├── environment │ │ │ │ ├── context │ │ │ │ │ └── Dockerfile │ │ │ │ └── env.yml │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ ├── provision │ │ │ ├── environment │ │ │ │ ├── context │ │ │ │ │ └── Dockerfile │ │ │ │ └── env.yml │ │ │ └── spec.yaml │ │ └── server │ │ │ ├── environment │ │ │ ├── context │ │ │ │ └── Dockerfile │ │ │ └── env.yml │ │ │ ├── run.py │ │ │ └── spec.yaml │ ├── PNEUMONIA │ │ ├── traininsilo │ │ │ ├── conda.yaml │ │ │ ├── pneumonia_network.py │ │ │ ├── run.py │ │ │ └── spec.yaml │ │ └── upload_data │ │ │ ├── conda.yaml │ │ │ ├── run.py │ │ │ └── spec.yaml │ ├── shared │ │ ├── aml_comm.py │ │ ├── aml_smpc.py │ │ ├── confidential_io.py │ │ └── samplers.py │ └── utils │ │ ├── aggregatemodelweights │ │ ├── conda.yaml │ │ ├── run.py │ │ └── spec.yaml │ │ ├── data_analysis │ │ ├── run.py │ │ └── spec.yaml │ │ └── multiply_data_files │ │ ├── conda.yaml │ │ ├── run.py │ │ └── spec.yaml └── pipelines │ ├── bank_marketing_vertical │ ├── config.yaml │ └── submit.py │ ├── ccfraud │ ├── config.yaml │ └── submit.py │ ├── ccfraud_vertical │ ├── config.yaml │ └── submit.py │ ├── ccfraud_vertical_fedonce │ ├── config.yaml │ └── submit.py │ ├── environment.yml │ ├── fl_cross_silo_literal │ ├── config.yaml │ └── submit.py │ ├── fl_cross_silo_scatter_gather │ ├── config.yaml │ ├── fl_helper.py │ └── 
submit.py │ ├── mnist_vertical │ ├── config.yaml │ └── submit.py │ ├── ner │ ├── config.yaml │ └── submit.py │ ├── pneumonia │ ├── config.yaml │ └── submit.py │ ├── pneumonia_flwr │ ├── config.yaml │ └── submit.py │ ├── pneumonia_nvflare │ ├── pneumonia_federated │ │ ├── config │ │ │ ├── config_fed_client.json │ │ │ └── config_fed_server.json │ │ └── custom │ │ │ ├── mlflow_receiver.py │ │ │ ├── pneumonia_network.py │ │ │ ├── pt_constants.py │ │ │ └── pt_learner.py │ ├── project.yaml │ └── submit.py │ ├── requirements.txt │ └── utils │ ├── multiply_data_files │ ├── config.yaml │ └── submit.py │ └── upload_data │ ├── config.yaml │ └── submit.py ├── mlops ├── arm │ ├── README.md │ ├── jumpbox_cc.json │ ├── open_aks_with_confcomp_storage_pair.json │ ├── open_compute_storage_pair.json │ ├── sandbox_fl_confidential.json │ ├── sandbox_fl_eyesoff_cpu.json │ ├── sandbox_fl_eyesoff_cpu_gpu.json │ ├── sandbox_fl_eyesoff_gpu.json │ ├── sandbox_fl_eyeson_cpu.json │ ├── sandbox_fl_eyeson_cpu_gpu.json │ ├── sandbox_fl_eyeson_gpu.json │ ├── sandbox_fl_private_cpu.json │ ├── sandbox_fl_private_cpu_gpu.json │ ├── sandbox_fl_private_gpu.json │ ├── sandbox_minimal.json │ ├── vnet_compute_existing_storage.json │ ├── vnet_compute_storage_pair.json │ ├── vnet_private_sandbox_setup.json │ ├── vnet_publicip_sandbox_aks_confcomp_setup.json │ └── vnet_publicip_sandbox_setup.json ├── bicep │ ├── modules │ │ ├── azureml │ │ │ ├── attach_aks_training_to_azureml.bicep │ │ │ ├── azureml_resources_ples.bicep │ │ │ ├── deploy_aks_azureml_extension.bicep │ │ │ ├── deploy_aks_azureml_extension_via_script.bicep │ │ │ ├── open_azureml_workspace.bicep │ │ │ └── private_azureml_workspace.bicep │ │ ├── computes │ │ │ ├── open_new_aks_with_confcomp.bicep │ │ │ ├── open_new_aml_compute.bicep │ │ │ ├── vnet_new_aks_with_confcomp.bicep │ │ │ └── vnet_new_aml_compute.bicep │ │ ├── fl_pairs │ │ │ ├── open_aks_with_confcomp_storage_pair.bicep │ │ │ ├── open_compute_storage_pair.bicep │ │ │ ├── vnet_aks_storage_pair.bicep │ │ │ ├── vnet_compute_existing_storage.bicep │ │ │ └── vnet_compute_storage_pair.bicep │ │ ├── networking │ │ │ ├── azureml_capable_nsg.bicep │ │ │ ├── private_dns_zone.bicep │ │ │ ├── private_endpoint.bicep │ │ │ ├── vnet.bicep │ │ │ └── vnet_peering.bicep │ │ ├── permissions │ │ │ └── msi_storage_rw.bicep │ │ ├── resources │ │ │ ├── confidentiality_keyvault.bicep │ │ │ ├── jumpbox_cc.bicep │ │ │ ├── private_acr.bicep │ │ │ ├── private_appinsights.bicep │ │ │ ├── private_keyvault.bicep │ │ │ └── private_storage.bicep │ │ └── storages │ │ │ ├── existing_blob_storage_datastore.bicep │ │ │ └── new_blob_storage_datastore.bicep │ ├── sandbox_fl_confidential.bicep │ ├── sandbox_fl_eyesoff_cpu.bicep │ ├── sandbox_fl_eyesoff_cpu_gpu.bicep │ ├── sandbox_fl_eyesoff_gpu.bicep │ ├── sandbox_fl_eyeson_cpu.bicep │ ├── sandbox_fl_eyeson_cpu_gpu.bicep │ ├── sandbox_fl_eyeson_gpu.bicep │ ├── sandbox_fl_private_cpu.bicep │ ├── sandbox_fl_private_cpu_gpu.bicep │ ├── sandbox_fl_private_gpu.bicep │ ├── sandbox_minimal.bicep │ ├── vnet_private_sandbox_setup.bicep │ ├── vnet_publicip_sandbox_aks_confcomp_setup.bicep │ └── vnet_publicip_sandbox_setup.bicep └── k8s_templates │ ├── README.md │ ├── deploy_pvc.yaml │ ├── instance-type.yaml │ ├── k8s_config.yaml │ ├── pv.yaml │ └── pvc.yaml └── tests ├── examples └── components │ └── shared │ ├── test_aml_comm.py │ ├── test_aml_smpc.py │ ├── test_samplers.py │ └── utils.py └── requirements.txt /.coveragerc: -------------------------------------------------------------------------------- 1 | 
[run] 2 | concurrency=multiprocessing 3 | omit = 4 | tests/* -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 4 | > Please provide us with the following information: 5 | > --------------------------------------------------------------- 6 | 7 | ### This issue is for a: (mark with an `x`) 8 | ``` 9 | - [ ] bug report -> please search issues before submitting 10 | - [ ] feature request 11 | - [ ] documentation issue or request 12 | - [ ] regression (a behavior that used to work and stopped in a new release) 13 | ``` 14 | 15 | ### Minimal steps to reproduce 16 | > 17 | 18 | ### Any log messages given by the failure 19 | > 20 | 21 | ### Expected/desired behavior 22 | > 23 | 24 | ### OS and Version? 25 | > Windows 7, 8 or 10. Linux (which distribution). macOS (Yosemite? El Capitan? Sierra?) 26 | 27 | ### Versions 28 | > 29 | 30 | ### Mention any other details that might be useful 31 | 32 | > --------------------------------------------------------------- 33 | > Thanks! We'll be in touch soon. 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 
18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Purpose 2 | 3 | * ... 4 | 5 | ## What is the expected review turnaround time? 6 | 7 | Urgency: 8 | - [ ] High (needs review today) 9 | - [ ] Medium (needs review within a few days - most common) 10 | - [ ] Low (can wait a week) 11 | 12 | ## Does this introduce a breaking change? 13 | 14 | ``` 15 | [ ] Yes 16 | [ ] No 17 | ``` 18 | 19 | ## Pull Request Type 20 | What kind of change does this Pull Request introduce? 21 | 22 | 23 | ``` 24 | [ ] Bugfix 25 | [ ] Feature 26 | [ ] Code style update (formatting, local variables) 27 | [ ] Refactoring (no functional changes, no api changes) 28 | [ ] Documentation content changes 29 | [ ] Other... Please describe: 30 | ``` 31 | 32 | ## How to Test 33 | * Get the code 34 | 35 | ``` 36 | git clone [repo-address] 37 | cd [repo-name] 38 | git checkout [branch-name] 39 | npm install 40 | ``` 41 | 42 | * Test the code 43 | 44 | ``` 45 | ``` 46 | 47 | ## What to Check 48 | Verify that the following are valid 49 | * ... 50 | 51 | ## Other Information 52 | -------------------------------------------------------------------------------- /.github/actions/bicep-to-arm-template-diff/action.yaml: -------------------------------------------------------------------------------- 1 | name: Bicep to ARM template diff 2 | description: Bicep build to ARM template diff 3 | inputs: 4 | source-file-path: 5 | description: Bicep script path 6 | required: true 7 | target-file-path: 8 | description: Arm template file path 9 | required: true 10 | 11 | runs: 12 | using: composite 13 | steps: 14 | - name: Build bicep into arm 15 | shell: bash 16 | run: | 17 | az config set bicep.use_binary_from_path=False 18 | az bicep install --version v0.14.85 19 | az bicep build --file ${{ inputs.source-file-path }} --stdout | jq -S . > source.json 20 | 21 | - name: Refactor ARM template file 22 | shell: bash 23 | run: jq -S . 
${{ inputs.target-file-path }} > target.json 24 | 25 | 26 | - name: Source and Target file diff 27 | shell: bash 28 | run: | 29 | if cmp -s ./source.json ./target.json; then 30 | printf 'Rebuilding bicep "%s" produces exact match with target file "%s"\n' ${{ inputs.source-file-path }} ${{ inputs.target-file-path }} 31 | else 32 | printf 'Diff between "%s" build and "%s":\n' ${{ inputs.source-file-path }} ${{ inputs.target-file-path }} 33 | diff source.json target.json 34 | exit 1 35 | fi 36 | -------------------------------------------------------------------------------- /.github/actions/run-shared-unit-tests/action.yaml: -------------------------------------------------------------------------------- 1 | name: Run unit tests for shared component files 2 | description: Run unit tests for shared components files and provide coverage 3 | 4 | runs: 5 | using: composite 6 | steps: 7 | - name: Install python dependencies 8 | shell: bash 9 | run: pip install -r tests/requirements.txt 10 | 11 | - name: Run unit tests 12 | shell: bash 13 | run: | 14 | coverage run --source=examples/components/shared -m unittest discover -s tests/examples/components/shared -v 15 | 16 | - name: Run coverage 17 | shell: bash 18 | run: | 19 | coverage combine 20 | coverage report -m -------------------------------------------------------------------------------- /.github/actions/submit-aml-literal-pipeline/action.yaml: -------------------------------------------------------------------------------- 1 | name: Submit example literal pipeline 2 | description: Submit example literal pipeline in AML 3 | inputs: 4 | client-id: 5 | description: Client ID of the service principal 6 | required: true 7 | tenant-id: 8 | description: Tenant ID of the service principal 9 | required: true 10 | subscription-id: 11 | description: Subscription to use for resources 12 | required: true 13 | resource-group: 14 | description: Resource group of the AML workspace 15 | required: true 16 | workspace-name: 17 | description: Workspace name 18 | required: true 19 | example: 20 | description: Example pipline to run 21 | required: true 22 | 23 | runs: 24 | using: composite 25 | steps: 26 | - name: Setup python 27 | uses: actions/setup-python@v2.2.1 28 | with: 29 | python-version: 3.8 30 | 31 | - name: Azure login 32 | uses: azure/login@v1 33 | with: 34 | client-id: ${{ inputs.client-id }} 35 | tenant-id: ${{ inputs.tenant-id }} 36 | subscription-id: ${{ inputs.subscription-id }} 37 | 38 | - name: Install azure ml latest extension 39 | shell: bash 40 | run: | 41 | az extension remove -n azure-cli-ml || echo "azure-cli-ml extension is not installed." 42 | az extension remove -n ml || echo "ml extension is not installed." 43 | az extension add -n ml -y 44 | 45 | - name: Install python dependencies 46 | shell: bash 47 | run: pip install -r examples/pipelines/requirements.txt 48 | 49 | - name: Submit fl_cross_silo_literal pipeline 50 | shell: bash 51 | run: python examples/pipelines/fl_cross_silo_literal/submit.py --subscription_id ${{ inputs.subscription-id }} --resource_group ${{ inputs.resource-group }} --workspace_name ${{ inputs.workspace-name }} --example ${{ inputs.example }} --wait || [ $? 
== 5 ] 52 | 53 | -------------------------------------------------------------------------------- /.github/actions/submit-aml-scatter-gather-pipeline/action.yaml: -------------------------------------------------------------------------------- 1 | name: Submit example scatter-gather pipeline 2 | description: Submit example scatter-gather pipeline in AML 3 | inputs: 4 | client-id: 5 | description: Client ID of the service principal 6 | required: true 7 | tenant-id: 8 | description: Tenant ID of the service principal 9 | required: true 10 | subscription-id: 11 | description: Subscription to use for resources 12 | required: true 13 | resource-group: 14 | description: Resource group of the AML workspace 15 | required: true 16 | workspace-name: 17 | description: Workspace name 18 | required: true 19 | example: 20 | description: Example pipline to run 21 | required: true 22 | 23 | runs: 24 | using: composite 25 | steps: 26 | - name: Setup python 27 | uses: actions/setup-python@v2.2.1 28 | with: 29 | python-version: 3.8 30 | 31 | - name: Azure login 32 | uses: azure/login@v1 33 | with: 34 | client-id: ${{ inputs.client-id }} 35 | tenant-id: ${{ inputs.tenant-id }} 36 | subscription-id: ${{ inputs.subscription-id }} 37 | 38 | - name: Install azure ml latest extension 39 | shell: bash 40 | run: | 41 | az extension remove -n azure-cli-ml || echo "azure-cli-ml extension is not installed." 42 | az extension remove -n ml || echo "ml extension is not installed." 43 | az extension add -n ml -y 44 | 45 | - name: Install python dependencies 46 | shell: bash 47 | run: pip install -r examples/pipelines/requirements.txt 48 | 49 | - name: Submit fl_cross_silo_scatter_gather pipeline 50 | shell: bash 51 | run: python examples/pipelines/fl_cross_silo_scatter_gather/submit.py --subscription_id ${{ inputs.subscription-id }} --resource_group ${{ inputs.resource-group }} --workspace_name ${{ inputs.workspace-name }} --example ${{ inputs.example }} --ignore_validation --wait || [ $? == 5 ] 52 | -------------------------------------------------------------------------------- /.github/actions/submit-example-pipeline/action.yaml: -------------------------------------------------------------------------------- 1 | name: Submit example pipeline 2 | description: Submit example pipeline in AML 3 | inputs: 4 | client-id: 5 | description: Client ID of the service principal 6 | required: true 7 | tenant-id: 8 | description: Tenant ID of the service principal 9 | required: true 10 | subscription-id: 11 | description: Subscription to use for resources 12 | required: true 13 | resource-group: 14 | description: Resource group of the AML workspace 15 | required: true 16 | workspace-name: 17 | description: Workspace name 18 | required: true 19 | example: 20 | description: Example pipeline to run 21 | required: true 22 | 23 | runs: 24 | using: composite 25 | steps: 26 | - name: Setup python 27 | uses: actions/setup-python@v2.2.1 28 | with: 29 | python-version: 3.8 30 | 31 | - name: Azure login 32 | uses: azure/login@v1 33 | with: 34 | client-id: ${{ inputs.client-id }} 35 | tenant-id: ${{ inputs.tenant-id }} 36 | subscription-id: ${{ inputs.subscription-id }} 37 | 38 | - name: Install azure ml latest extension 39 | shell: bash 40 | run: | 41 | az extension remove -n azure-cli-ml || echo "azure-cli-ml extension is not installed." 42 | az extension remove -n ml || echo "ml extension is not installed." 
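# the removals above are best-effort ('|| echo' keeps the step from failing when an extension is not installed) so the latest ml extension can be added cleanly below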
43 | az extension add -n ml -y 44 | 45 | - name: Install python dependencies 46 | shell: bash 47 | run: pip install -r examples/pipelines/requirements.txt 48 | 49 | - name: Submit example pipeline 50 | shell: bash 51 | run: python examples/pipelines/${{ inputs.example }}/submit.py --subscription_id ${{ inputs.subscription-id }} --resource_group ${{ inputs.resource-group }} --workspace_name ${{ inputs.workspace-name }} --wait || [ $? == 5 ] 52 | 53 | -------------------------------------------------------------------------------- /.github/actions/submit-multiply-data-pipeline/action.yaml: -------------------------------------------------------------------------------- 1 | name: Submit multiply data pipeline 2 | description: Submit multiply data pipeline in AML 3 | inputs: 4 | client-id: 5 | description: Client ID of the service principal 6 | required: true 7 | tenant-id: 8 | description: Tenant ID of the service principal 9 | required: true 10 | subscription-id: 11 | description: Subscription to use for resources 12 | required: true 13 | resource-group: 14 | description: Resource group of the AML workspace 15 | required: true 16 | workspace-name: 17 | description: Workspace name 18 | required: true 19 | 20 | runs: 21 | using: composite 22 | steps: 23 | - name: Setup python 24 | uses: actions/setup-python@v2.2.1 25 | with: 26 | python-version: 3.8 27 | 28 | - name: Azure login 29 | uses: azure/login@v1 30 | with: 31 | client-id: ${{ inputs.client-id }} 32 | tenant-id: ${{ inputs.tenant-id }} 33 | subscription-id: ${{ inputs.subscription-id }} 34 | 35 | - name: Install azure ml latest extension 36 | shell: bash 37 | run: | 38 | az extension remove -n azure-cli-ml || echo "azure-cli-ml extension is not installed." 39 | az extension remove -n ml || echo "ml extension is not installed." 40 | az extension add -n ml -y 41 | 42 | - name: Install python dependencies 43 | shell: bash 44 | run: pip install -r examples/pipelines/requirements.txt 45 | 46 | - name: Submit example pipeline 47 | shell: bash 48 | run: python examples/pipelines/utils/multiply_data_files/submit.py --subscription_id ${{ inputs.subscription-id }} --resource_group ${{ inputs.resource-group }} --workspace_name ${{ inputs.workspace-name }} --submit --wait || [ $? 
== 5 ] 49 | 50 | -------------------------------------------------------------------------------- /.github/actions/submit-upload-data-pipeline/action.yaml: -------------------------------------------------------------------------------- 1 | name: Submit upload data pipeline 2 | description: Submit upload data pipeline in AML 3 | inputs: 4 | client-id: 5 | description: Client ID of the service principal 6 | required: true 7 | tenant-id: 8 | description: Tenant ID of the service principal 9 | required: true 10 | subscription-id: 11 | description: Subscription to use for resources 12 | required: true 13 | resource-group: 14 | description: Resource group of the AML workspace 15 | required: true 16 | workspace-name: 17 | description: Workspace name 18 | required: true 19 | example: 20 | description: Example upload data pipeline to run 21 | required: true 22 | 23 | runs: 24 | using: composite 25 | steps: 26 | - name: Setup python 27 | uses: actions/setup-python@v2.2.1 28 | with: 29 | python-version: 3.8 30 | 31 | - name: Azure login 32 | uses: azure/login@v1 33 | with: 34 | client-id: ${{ inputs.client-id }} 35 | tenant-id: ${{ inputs.tenant-id }} 36 | subscription-id: ${{ inputs.subscription-id }} 37 | 38 | - name: Install azure ml latest extension 39 | shell: bash 40 | run: | 41 | az extension remove -n azure-cli-ml || echo "azure-cli-ml extension is not installed." 42 | az extension remove -n ml || echo "ml extension is not installed." 43 | az extension add -n ml -y 44 | 45 | - name: Install python dependencies 46 | shell: bash 47 | run: pip install -r examples/pipelines/requirements.txt 48 | 49 | - name: Submit example pipeline 50 | shell: bash 51 | run: python examples/pipelines/utils/upload_data/submit.py --subscription_id ${{ inputs.subscription-id }} --resource_group ${{ inputs.resource-group }} --workspace_name ${{ inputs.workspace-name }} --wait --example ${{ inputs.example }} || [ $? == 5 ] 52 | 53 | -------------------------------------------------------------------------------- /.github/scripts/delete-run-history.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | job_name_with_created_date=$(az ml job list -g $1 -w $2 --all-results true | jq ".[] | .name, .creation_context.created_at") 4 | echo $job_name_with_created_date 5 | job_name="" 6 | i=1 7 | for item in $job_name_with_created_date; do 8 | item=`sed -e 's/^"//' -e 's/"$//' <<< "$item"` 9 | if [[ $i%2 -eq 1 ]]; then 10 | job_name=$item 11 | i=$((i+1)) 12 | continue 13 | fi 14 | 15 | num_of_days=$((($(date +%s) - $(date -d $item +%s)) / (60 * 60 * 24) )) 16 | echo Job name: $job_name, Number of days: $num_of_days 17 | 18 | # Archive jobs that are older than 60 days 19 | if [[ $num_of_days -gt 60 ]]; then 20 | az ml job archive -g $1 -w $2 -n $job_name 21 | else 22 | echo "Number of days are less than 60." 23 | fi 24 | i=$((i+1)) 25 | done 26 | 27 | 28 | -------------------------------------------------------------------------------- /.github/workflows/build-test.yaml: -------------------------------------------------------------------------------- 1 | name: Pipeline-validation 2 | 3 | on: 4 | push: 5 | branches: 6 | - "*" 7 | pull_request: 8 | branches: 9 | - "*" 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - uses: actions/setup-python@v2.2.1 18 | with: 19 | python-version: 3.8 20 | 21 | - run: pip install black 22 | 23 | - run: black --check . 
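# github.base_ref is only populated on pull_request events, so the black formatting check below runs for PRs only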
24 | if: github.base_ref 25 | 26 | - name: Intall python dependencies 27 | run: pip install -r examples/pipelines/requirements.txt 28 | 29 | - name: Validate fl_cross_silo_literal pipeline 30 | run: python examples/pipelines/fl_cross_silo_literal/submit.py --offline 31 | 32 | - name: Run unit tests for shared component files 33 | uses: ./.github/actions/run-shared-unit-tests 34 | -------------------------------------------------------------------------------- /.github/workflows/clear-pipeline-run-history.yaml: -------------------------------------------------------------------------------- 1 | name: Delete run history 2 | on: 3 | schedule: 4 | - cron: "0 0 * * *" 5 | 6 | jobs: 7 | delete-history: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | id-token: write 11 | steps: 12 | - uses: actions/checkout@v2 13 | 14 | - name: Azure login 15 | uses: azure/login@v1 16 | with: 17 | client-id: ${{ secrets.AZURE_CLIENT_ID }} 18 | tenant-id: ${{ secrets.AZURE_TENANT_ID }} 19 | subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} 20 | 21 | - name: Install azure ml latest extension 22 | shell: bash 23 | run: | 24 | az extension remove -n azure-cli-ml || echo "azure-cli-ml extension is not installed." 25 | az extension remove -n ml || echo "ml extension is not installed." 26 | az extension add -n ml -y 27 | 28 | - name: Delete run history 29 | shell: bash 30 | run: ./.github/scripts/delete-run-history.sh ${{ secrets.RESOURCE_GROUP }} ${{ secrets.AML_WORKSPACE_NAME }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # if you use an AzureML config file locally 132 | config.json 133 | 134 | # for ignoring test jobs 135 | /examples/pipelines/test* 136 | 137 | # for ignoring local sandbox files for debugging/testing 138 | /sandbox/* 139 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE -------------------------------------------------------------------------------- /docs/concepts/confidentiality.md: -------------------------------------------------------------------------------- 1 | # Confidentiality and Federated Learning 2 | 3 | :construction: This page is under construction :construction: 4 | 5 | In Azure, there are several ways your data (customer data, models, etc) are protected. Azure services provide a flexible and comprehensive set of tools to help you meet your compliance and security requirements. This page describes a technique to ensure data encryption at rest from within your training/preprocessing code. Combined with confidential computing, it can allow you to maximize security and minimize the risk of data leakage. 6 | 7 | ## Use private storage, with service-side encryption 8 | 9 | :construction: 10 | 11 | ## Use confidential computing 12 | 13 | :construction: 14 | 15 | ## Use encryption as rest in your code 16 | 17 | :construction: 18 | -------------------------------------------------------------------------------- /docs/concepts/glossary.md: -------------------------------------------------------------------------------- 1 | # Glossary 2 | 3 | ## Data 4 | 5 | Any file or collection of files. Data will be described in terms of classification. 6 | Only three classifications are required for the context of this document. 
"Sensitive" (cannot be moved or even looked at), "intermediate" (can be moved around, but looser restrictions on visibility), and "eyes-on" (can be moved freely and seen by everyone participating in the federated training). 7 | 8 | ## Storage 9 | 10 | Wherever data is stored. In this file, storage is assumed to live in Azure. It may exist in locked-down virtual networks. 11 | 12 | ## Compute 13 | 14 | Anything that can run "code" (deliberately vague). In this file, compute is assumed to live in Azure. 15 | 16 | ## Job 17 | 18 | Execute code (a collection of files) in an environment (a Docker image) against data (from storage). A job can consume data from multiple storage instances and write back to multiple instances. 19 | 20 | ## Approval 21 | 22 | REST endpoint to which the platform "asks permission" before running any job. The platform sends the approval endpoint information including: 23 | 24 | 1. Input and output storage 25 | 2. Which compute the job wishes to run in 26 | 3. The author of the code the job is running 27 | 4. Whether or not the job has been code-signed by the configured policies 28 | 29 | The approval endpoint can either approve / reject the job based on checked-in configuration (e.g., of which storage accounts are associated with which silo) or pass this information on for manual approval. 30 | 31 | :exclamation: Note that the approval endpoints do not support 3P-facing AML yet. 32 | 33 | ## Silo 34 | 35 | Isolated collection of storage and compute. Here, "isolated" means that the platform guarantees: 36 | 37 | - Only compute within the silo can "touch" storage within the silo. 38 | - Only data of intermediate or eyes-on classification can be moved outside the silo. 39 | - Only "approved" jobs can change the classification of data or move it outside the silo. 40 | 41 | Silos are expected to be reliable (i.e., no concerns around network connectivity or uptime). 42 | 43 | :exclamation: Note that we assume a hard cap of ≤ 100 silos at current stage. 44 | 45 | ## Orchestrator 46 | 47 | Collection of storage and compute. The storage is for model parameters, rather than the actual data. A task orchestrator broadcasts the FL task, sends the current model to each silo, and aggregates the gradients from the silos. In this file, orchestrator is assumed to live in an AML workspace. 48 | 49 | ## Internal Silos 50 | 51 | Collection of silos belong to the same Azure tenant. 52 | 53 | ## External Silos 54 | 55 | Collection of silos that resides in either different Azure tenant or different cloud provider. 
56 | -------------------------------------------------------------------------------- /docs/pics/ccfraud_acc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/ccfraud_acc.jpg -------------------------------------------------------------------------------- /docs/pics/ccfraud_ddp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/ccfraud_ddp.jpg -------------------------------------------------------------------------------- /docs/pics/ccfraud_time.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/ccfraud_time.jpg -------------------------------------------------------------------------------- /docs/pics/combined-losses-silos.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/combined-losses-silos.PNG -------------------------------------------------------------------------------- /docs/pics/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/diagram.png -------------------------------------------------------------------------------- /docs/pics/fl_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/fl_fig.png -------------------------------------------------------------------------------- /docs/pics/fldatatypes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/fldatatypes.png -------------------------------------------------------------------------------- /docs/pics/industry-bank-marketing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/industry-bank-marketing.png -------------------------------------------------------------------------------- /docs/pics/industry-fraud-detection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/industry-fraud-detection.png -------------------------------------------------------------------------------- /docs/pics/industry-medical-imaging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/industry-medical-imaging.png -------------------------------------------------------------------------------- /docs/pics/industry-ner.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/industry-ner.png -------------------------------------------------------------------------------- /docs/pics/metrics.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/metrics.PNG -------------------------------------------------------------------------------- /docs/pics/ner_acc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/ner_acc.jpg -------------------------------------------------------------------------------- /docs/pics/ner_ddp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/ner_ddp.jpg -------------------------------------------------------------------------------- /docs/pics/ner_time.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/ner_time.jpg -------------------------------------------------------------------------------- /docs/pics/pipeline-aml.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/pipeline-aml.PNG -------------------------------------------------------------------------------- /docs/pics/pneumonia_acc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/pneumonia_acc.jpg -------------------------------------------------------------------------------- /docs/pics/pneumonia_ddp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/pneumonia_ddp.jpg -------------------------------------------------------------------------------- /docs/pics/pneumonia_ddp_1tb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/pneumonia_ddp_1tb.jpg -------------------------------------------------------------------------------- /docs/pics/pneumonia_time.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/pneumonia_time.jpg -------------------------------------------------------------------------------- /docs/pics/sandboxes_confidential.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/sandboxes_confidential.png -------------------------------------------------------------------------------- /docs/pics/sandboxes_eyesoff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/sandboxes_eyesoff.png -------------------------------------------------------------------------------- /docs/pics/sandboxes_eyeson.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/sandboxes_eyeson.png -------------------------------------------------------------------------------- /docs/pics/sandboxes_private.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/sandboxes_private.png -------------------------------------------------------------------------------- /docs/pics/vfltrainingloop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/vfltrainingloop.png -------------------------------------------------------------------------------- /docs/pics/vnet_silo_provisioning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/azure-ml-federated-learning/d72148a08394978db29277b922817cbe871fd2e5/docs/pics/vnet_silo_provisioning.png -------------------------------------------------------------------------------- /docs/provisioning/jumpbox_cc.md: -------------------------------------------------------------------------------- 1 | # Create a confidential compute jumpbox VM inside a vnet 2 | 3 | This tutorial will let you create a jumpbox VM inside a vnet, optionally by using Azure Bastion to connect via HTTPS. 4 | 5 | :warning: This should be used for **development purpose only**. 6 | 7 | ## Prerequisites 8 | 9 | To enjoy these quickstart, you will need to: 10 | 11 | - have an active [Azure subscription](https://azure.microsoft.com) that you can use for development purposes, 12 | - have permissions to create resources, set permissions, and create identities in this subscription (or at least in one resource group), 13 | - Note that to set permissions, you typically need _Owner_ role in the subscription or resource group - _Contributor_ role is not enough. This is key for being able to _secure_ the setup. 14 | - [install the Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli). 15 | 16 | ## Deploy a confidential compute VM inside a vNet 17 | 18 | > Check availability of [confidential compute VMS in your region.](https://azure.microsoft.com/en-us/explore/global-infrastructure/products-by-region/?products=virtual-machines®ions=all). 19 | 20 | ### Option 1 : one click deployment 21 | 22 | 1. Click on [![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure-Samples%2Fazure-ml-federated-learning%2Fmain%2Fmlops%2Farm%2Fjumpbox_cc.json) 23 | 24 | 2. 
Adjust parameters, in particular: 25 | 26 | - vnetName: name of the vNet to join. 27 | - subnetName: name of the subnet inside the vNet. 28 | - nsgName: name of the existing security group applying to the VM. 29 | 30 | ### Option 2 : deployment using az cli 31 | 32 | In the resource group of your AzureML workspace, use the following command with parameters corresponding to your setup: 33 | 34 | ```bash 35 | az deployment group create --template-file ./mlops/bicep/modules/resources/jumpbox_cc.bicep --resource-group --parameters vnetName="..." subnetName="..." nsgName="..." jumpboxOs="linux" 36 | ``` 37 | -------------------------------------------------------------------------------- /docs/provisioning/orchestrator_open.md: -------------------------------------------------------------------------------- 1 | # Create an open sandbox orchestrator 2 | 3 | :warning: This should be used for **development purpose only**. 4 | 5 | ## Prerequisites 6 | 7 | To run these deployment options, you first need: 8 | 9 | - an existing Azure ML workspace (see [cookbook](README.md#create-an-azure-ml-workspace)) 10 | - have permissions to create resources, set permissions, and create identities in this subscription (or at least in one resource group), 11 | - Note that to set permissions, you typically need _Owner_ role in the subscription or resource group - _Contributor_ role is not enough. This is key for being able to _secure_ the setup. 12 | - Optional: [install the Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli). 13 | 14 | ## Create a compute and storage pair for the orchestrator 15 | 16 | > Note: both orchestrator and [silo](./silo_open.md) can be deployed using the same arm/bicep script, changing **Pair Base Name** accordingly. 17 | 18 | ### Option 1 : one click deployment 19 | 20 | 1. Click on [![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure-Samples%2Fazure-ml-federated-learning%2Fmain%2Fmlops%2Farm%2Fopen_compute_storage_pair.json) 21 | 22 | 2. Adjust parameters, in particular: 23 | 24 | - Region: this will be set by Azure to the region of your resource group. 25 | - Machine Learning Name: need to match the name of the AzureML workspace in the resource group. 26 | - Machine Learning Region: the region in which the AzureML workspace was deployed (default: same as resource group). 27 | - Pair Region: the region where the compute and storage will be deployed (default: same as resource group). 28 | - Pair Base Name: a unique name for the **orchestrator**, example `orch`. This will be used to create all other resources (storage name, compute name, etc.). 
29 | 30 | ### Option 2 : deployment using az cli 31 | 32 | In the resource group of your AzureML workspace, use the following command with parameters corresponding to your setup: 33 | 34 | ```bash 35 | az deployment group create --template-file ./mlops/bicep/modules/fl_pairs/open_compute_storage_pair.bicep --resource-group <resource_group> --parameters pairBaseName="orch" pairRegion="eastus" machineLearningName="aml-fldemo" machineLearningRegion="eastus" 36 | ``` 37 | -------------------------------------------------------------------------------- /docs/tutorials/add-kaggle-credentials.md: -------------------------------------------------------------------------------- 1 | # Add Kaggle credentials to your FL sandbox 2 | 3 | Your Azure ML workspace has an attached key vault that can be used to store workspace-level secrets (users of the workspace will have access to it). We can use it to store the Kaggle API key so that jobs can download data from Kaggle. 4 | 5 | This tutorial shows you how to add your Kaggle credentials to your FL sandbox. 6 | 7 | ### Locate your workspace attached key vault 8 | 9 | You first need to locate your workspace key vault. It is provisioned by default in our [sandboxes](../provisioning/sandboxes.md) and is named `ws-shkv-<base-name>`. You can find the name of your workspace in the Azure portal. 10 | 11 | ### Option 1: using Azure CLI 12 | 13 | 1. Let's first obtain your AAD identifier (object id) by running the following command. We'll use it in the next step. 14 | 15 | ```bash 16 | az ad signed-in-user show --query id 17 | ``` 18 | 19 | 2. Create a new key vault policy for yourself, and grant permissions to list, set & delete secrets. 20 | 21 | ```bash 22 | az keyvault set-policy -n <key-vault-name> --secret-permissions list set delete --object-id <object-id> 23 | ``` 24 | 25 | > Note: The AML workspace you created with the aforementioned script contains the name of the key vault. Default is `ws-shkv-fldemo`. 26 | 27 | 3. With your newly created permissions, you can now create a secret to store the `kaggleusername`. 28 | 29 | ```bash 30 | az keyvault secret set --name kaggleusername --vault-name <key-vault-name> --value <kaggle-username> 31 | ``` 32 | 33 | > Make sure to provide your *Kaggle Username*. 34 | 35 | 4. Create a secret to store the `kagglekey`. 36 | 37 | ```bash 38 | az keyvault secret set --name kagglekey --vault-name <key-vault-name> --value <kaggle-api-token> 39 | ``` 40 | 41 | > Make sure to provide the *[Kaggle API Token](https://www.kaggle.com/docs/api#authentication)*. 42 | 43 | ### Option 2: using Azure UI 44 | 45 | 1. In your resource group (provisioned in the previous step), open the "Access Policies" tab in the newly created key vault and click "Create". 46 | 47 | 2. Select *List, Set & Delete* right under "Secret Management Operations" and press "Next". 48 | 49 | 3. Look up the currently logged-in user (using user id or email), select it, and press "Next". 50 | 51 | 4. Press "Next" and "Create" in the next screens. 52 | 53 | We are now able to create a secret in the key vault. 54 | 55 | 5. Open the "Secrets" tab. Create two plain text secrets: 56 | 57 | - **kaggleusername** - specifies your Kaggle user name 58 | - **kagglekey** - this is the API key that can be obtained from your account page on the Kaggle website. 
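Once both secrets exist, jobs submitted to the workspace can read them at run time. The snippet below is only a minimal sketch (it is not one of this repo's components) showing how a script running as an AzureML job could fetch the secrets and hand them to the Kaggle client. It assumes the secret names `kaggleusername` and `kagglekey` created above, and that the job environment includes the `azureml-core` and `kaggle` packages.

```python
# Minimal sketch: read the Kaggle secrets from the workspace key vault inside an AzureML job.
# Assumes the secret names "kaggleusername" and "kagglekey" created in this tutorial.
import os

from azureml.core import Run

run = Run.get_context()

# Run.get_secret retrieves a secret from the key vault attached to the workspace.
os.environ["KAGGLE_USERNAME"] = run.get_secret(name="kaggleusername")
os.environ["KAGGLE_KEY"] = run.get_secret(name="kagglekey")

# Import the Kaggle client only after the environment variables are set,
# because the kaggle package authenticates at import time.
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()
api.dataset_download_files("mlg-ulb/creditcardfraud", path="./raw", unzip=True)  # example dataset id
```

Note that `run.get_secret` goes through the submitted run's context, so it works for jobs running in the workspace rather than for plain local scripts; this is why the credentials live in the workspace key vault instead of being hard-coded in the components.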
59 | -------------------------------------------------------------------------------- /docs/tutorials/literal-scatter-gather-tutorial.md: -------------------------------------------------------------------------------- 1 | # Adapt our sample "literal" code to your needs 2 | 3 | IMPORTANT: the "literal" code available in this repo has been intentionally designed to: 4 | 5 | - provide an effortless setup to get started. 6 | - rely only on features that are currently generally available in AzureML SDK v2. 7 | 8 | This tutorial addresses the following scenarios: 9 | 10 | - To add/remove a silo: 11 | - You just need to make the changes in the "`federated_learning/silos`" section of the `examples/pipelines/fl_cross_silo_literal/config.yaml` file. 12 | 13 | - To change training hyper-parameters: 14 | - Adjust the parameters in the "`training_parameters`" section of the `examples/pipelines/fl_cross_silo_literal/config.yaml` file. 15 | 16 | ## Tutorial on how to adapt the "scatter-gather" code 17 | 18 | Please read the following points to have a better understanding of the "scatter-gather" code: 19 | 20 | - It has a `set_orchestrator` method that you can leverage to add an orchestrator to your pipeline. 21 | - The `add_silo` method lets you add `n` number of silos to the pipeline and you don't have to worry about the configuration. 22 | - It has a soft validation component that ensures that the appropriate permissions are granted for your assets. That being said, a compute `a` should not have access to dataset `b` and so on. 23 | - You can bypass the validation if you have your own custom rules. 24 | - Enabling type-check, ensures that no data is being saved and only model weights are kept in the datastore. 25 | 26 | This tutorial addresses the following scenarios: 27 | 28 | - To add/remove a silo: 29 | - You just need to make the changes in the "`strategy/horizontal`" section of the `examples/pipelines/fl_cross_silo_scatter_gather/config.yaml` file. 30 | 31 | - To change the training hyper-parameters: 32 | - Adjust the parameters in the "`inputs`" section of the `examples/pipelines/fl_cross_silo_scatter_gather/config.yaml` file. 33 | 34 | - To edit the flow of your training pipeline: 35 | - Pass your custom subgraph as a parameter to the `scatter_gather` method in the `examples/pipelines/fl_cross_silo_scatter_gather/submit.py` file. 36 | 37 | - To bypass the soft validation: 38 | - Use `--ignore_validation` argument while executing the `examples/pipelines/fl_cross_silo_scatter_gather/submit.py` file. 39 | 40 | - To enable multiple computes(CPU for preprocessing & GPU for training): 41 | - Set the `compute2` parameter to `true` while [provisioning](../quickstart.md#deploy-demo-resources-in-azure) the resources.(No further changes are required) 42 | -------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | # Hydra outputs. 2 | outputs/ 3 | -------------------------------------------------------------------------------- /examples/cli-jobs/upload-local-data/job.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | 3 | code: . 
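# '.' uploads this folder (run.py and confidential_io.py) as the job's code snapshot; the command below runs against it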
4 | 5 | command: > 6 | python run.py 7 | --local_data_folder ${{inputs.local_data_folder}} 8 | --destination_folder ${{outputs.destination_folder}} 9 | --method ${{inputs.method}} 10 | 11 | inputs: 12 | local_data_folder: 13 | type: uri_folder 14 | path: /path/to/local/data/folder # replace '/path/to/local/data/folder' by the actual path to the folder whose contents you want to upload 15 | 16 | method: 'copy' # just copy local to remote 17 | # method: 'encrypt' # to enable encryption of the outputs using your encryption keys 18 | 19 | outputs: 20 | destination_folder: 21 | type: uri_folder 22 | mode: upload 23 | path: azureml://datastores/<datastore-name>/paths/<path> # replace '<datastore-name>' by the actual datastore name for your silo, and '<path>' by the path you want to use in the silo storage account 24 | 25 | environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest 26 | 27 | environment_variables: 28 | # used only if method='encrypt' 29 | CONFIDENTIALITY_KEYVAULT: https://<keyvault-name>.vault.azure.net # url of the keyvault 30 | CONFIDENTIALITY_KEY_NAME: dev-rsa-key # name of the secret containing your encryption public key 31 | 32 | compute: azureml:<compute-name> # replace '<compute-name>' by the actual compute name for your silo 33 | -------------------------------------------------------------------------------- /examples/components/BANK_MARKETING_VERTICAL/traininsilo/conda.yaml: -------------------------------------------------------------------------------- 1 | name: bank_marketing_vertical_train_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | dependencies: 7 | - python=3.8 8 | - pip=22.3.1 9 | - pytorch=1.13.1 10 | - pytorch-cuda=11.6 11 | - pip: 12 | - azureml-core==1.47.0 13 | - azure-keyvault==4.2.0 14 | - azureml-mlflow==1.48.0 15 | - pandas==1.5.2 16 | - torchmetrics==0.10.3 17 | - redis==4.5.1 18 | -------------------------------------------------------------------------------- /examples/components/BANK_MARKETING_VERTICAL/traininsilo/datasets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class BankMarketingDataset(Dataset): 6 | """BankMarketingDataset Dataset - combination of features and labels 7 | 8 | Args: 9 | df: Pandas dataframe containing features and/or labels 10 | (a "label" column, if present, is used as the target) 11 | 12 | Returns: 13 | None 14 | """ 15 | 16 | def __init__(self, df): 17 | if "label" in df.columns: 18 | if len(df.columns) > 1: 19 | self.X = torch.tensor( 20 | df.loc[:, df.columns != "label"].values, dtype=torch.float 21 | ) 22 | else: 23 | self.X = None 24 | self.Y = torch.tensor(df.loc[:, "label"].values, dtype=torch.int) 25 | else: 26 | self.X = torch.tensor(df.values, dtype=torch.float) 27 | self.Y = None 28 | 29 | def __len__(self): 30 | if self.Y is None: 31 | return len(self.X) 32 | else: 33 | return len(self.Y) 34 | 35 | def features_count(self): 36 | if self.X is not None: 37 | return self.X.shape[1] 38 | return None 39 | 40 | def __getitem__(self, idx): 41 | if self.Y is None: 42 | return self.X[idx] 43 | elif self.X is None: 44 | return self.Y[idx] 45 | else: 46 | return self.X[idx], self.Y[idx] 47 | -------------------------------------------------------------------------------- /examples/components/BANK_MARKETING_VERTICAL/traininsilo/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class SimpleLinearBottom(nn.Module): 6 | """Bottom (Contributor) part of the model composed of only Linear model interleaved
with ReLU activations 7 | 8 | Args: 9 | input_dim (int): 10 | number of features to be consumed by the model 11 | """ 12 | 13 | def __init__(self, input_dim) -> None: 14 | super().__init__() 15 | 16 | self.input_dim = input_dim 17 | self.model = nn.Sequential( 18 | nn.Linear(input_dim, 512), 19 | nn.ReLU(), 20 | nn.Linear(512, 256), 21 | nn.ReLU(), 22 | nn.Linear(256, 128), 23 | nn.ReLU(), 24 | nn.Linear(128, 64), 25 | ) 26 | self._init_weights() 27 | 28 | def _init_weights(self): 29 | for m in self.modules(): 30 | if isinstance(m, nn.Embedding): 31 | torch.nn.init.uniform_(m.weight, -0.001, 0.001) 32 | elif isinstance(m, nn.Linear): 33 | torch.nn.init.xavier_uniform_(m.weight) 34 | m.bias.data.fill_(0.01) 35 | 36 | def forward(self, x): 37 | return self.model(x) 38 | 39 | 40 | class SimpleLinearTop(nn.Module): 41 | """Top (Host) part of the model composed of only Linear model interleaved with ReLU activations""" 42 | 43 | def __init__(self, world_size) -> None: 44 | super().__init__() 45 | 46 | self._world_size = world_size 47 | self.contributor_weights = torch.nn.ModuleList( 48 | [nn.Linear(64, 64) for _ in range(self._world_size)] 49 | ) 50 | 51 | self.model = nn.Sequential( 52 | nn.Linear(64, 1), 53 | nn.Sigmoid(), 54 | ) 55 | self._init_weights() 56 | 57 | def _init_weights(self): 58 | for m in self.modules(): 59 | if isinstance(m, nn.Embedding): 60 | torch.nn.init.uniform_(m.weight, -0.001, 0.001) 61 | elif isinstance(m, nn.Linear): 62 | torch.nn.init.xavier_uniform_(m.weight) 63 | m.bias.data.fill_(0.01) 64 | 65 | def forward(self, x): 66 | agg_x = self.contributor_weights[0](x[0]) 67 | for i in range(1, self._world_size - 1): 68 | agg_x += self.contributor_weights[i](x[i]) 69 | 70 | return self.model(agg_x).squeeze() 71 | -------------------------------------------------------------------------------- /examples/components/BANK_MARKETING_VERTICAL/traininsilo/samplers.py: -------------------------------------------------------------------------------- 1 | ########################################################################################## 2 | # WARNING # 3 | ########################################################################################## 4 | # Should this file change please update all copies of samplers.py file in the repository # 5 | ########################################################################################## 6 | 7 | import math 8 | import torch 9 | from torch.utils.data import Sampler 10 | 11 | 12 | class VerticallyDistributedBatchSampler(Sampler): 13 | """Batch sampler that uses a distributed communication backend to distribute samples indexes to each worker.""" 14 | 15 | def __init__(self, data_source, batch_size, comm, rank, world_size, shuffle=False): 16 | """Initializes the batch sampler. 17 | 18 | Args: 19 | data_source (torch.utils.data.Dataset): The dataset to sample from. 20 | batch_size (int): The size of the batch to sample. 21 | comm (AMLComm): The communicator to use for communication. 22 | rank (int): The rank of the current worker. 23 | world_size (int): The total number of workers. 24 | shuffle (bool, optional): Whether to shuffle the indices. Defaults to False. 
25 | """ 26 | self.data_source = data_source 27 | self.batch_size = batch_size 28 | self.shuffle = shuffle 29 | self.rank = rank 30 | self.world_size = world_size 31 | self.comm = comm 32 | 33 | def __iter__(self): 34 | if self.rank == 0: 35 | if self.shuffle: 36 | indices = torch.randperm(len(self.data_source)) 37 | else: 38 | indices = torch.arange(len(self.data_source)) 39 | 40 | # Split the indices into batches 41 | batches = [ 42 | indices[i : i + self.batch_size] 43 | for i in range(0, len(indices), self.batch_size) 44 | ] 45 | 46 | for batch in batches: 47 | for i in range(1, self.world_size): 48 | # Send the batch to contributor i 49 | self.comm.send(batch, i) 50 | 51 | yield batch 52 | else: 53 | for i in range(0, len(self.data_source), self.batch_size): 54 | # Receive the batch from host 55 | batch = self.comm.recv(0) 56 | yield batch 57 | 58 | def __len__(self): 59 | return math.ceil(len(self.data_source) / self.batch_size) 60 | -------------------------------------------------------------------------------- /examples/components/BANK_MARKETING_VERTICAL/upload_data/conda.yaml: -------------------------------------------------------------------------------- 1 | name: bank_marketing_vertical_upload_data_conda_env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.8 6 | - pip=22.2.2 7 | - pip: 8 | - azure-identity==1.12.0 9 | - azure-keyvault==4.2.0 10 | - azureml-core==1.47.0 11 | - kaggle==1.5.12 12 | - scikit-learn==1.1.3 13 | - numpy==1.23.5 14 | - pandas==1.3.5 15 | -------------------------------------------------------------------------------- /examples/components/BANK_MARKETING_VERTICAL/upload_data/spec.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | name: msft_fl_bank_marketing_upload_data 3 | version: 0.0.1 4 | display_name: Download Bank Marketing data from Kaggle and upload to silo storage 5 | type: command 6 | is_deterministic: true 7 | 8 | inputs: 9 | silo_count: 10 | type: number 11 | optional: false 12 | silo_index: 13 | type: number 14 | optional: false 15 | 16 | outputs: 17 | raw_train_data: 18 | type: uri_folder 19 | description: the output Bank Marketing raw training data for a given silo 20 | raw_test_data: 21 | type: uri_folder 22 | description: the output Bank Marketing raw testing data for a given silo 23 | 24 | code: . 
25 | 26 | command: >- 27 | python run.py --silo_count ${{inputs.silo_count}} --silo_index ${{inputs.silo_index}} --raw_train_data ${{outputs.raw_train_data}} --raw_test_data ${{outputs.raw_test_data}} 28 | 29 | environment: 30 | conda_file: ./conda.yaml 31 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 32 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD/preprocessing/conda.yaml: -------------------------------------------------------------------------------- 1 | name: ccfraud_preprocessing_conda_env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.8 6 | - pip=22.3.1 7 | - pip: 8 | - azure-identity==1.12.0 9 | - azure-keyvault==4.2.0 10 | - azureml-mlflow==1.48.0 11 | - scikit-learn==1.2.2 12 | - numpy==1.24.2 13 | - pandas==1.5.3 14 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD/preprocessing/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_ccfraud_preprocessing_in_silo 4 | version: 0.3.0 5 | display_name: CC Fraud Pre-Processing (in silo) 6 | type: command 7 | description: Component for preprocessing raw data from silo's blob storage 8 | is_deterministic: true 9 | 10 | inputs: 11 | raw_training_data: 12 | type: uri_file 13 | description: the raw training data in a given silo 14 | raw_testing_data: 15 | type: uri_file 16 | description: the raw testing data in a given silo 17 | metrics_prefix: 18 | type: string 19 | description: Metrics prefix 20 | default: Default-prefix 21 | optional: true 22 | 23 | 24 | outputs: 25 | processed_train_data: 26 | type: uri_folder 27 | description: the output training data after preprocessing 28 | processed_test_data: 29 | type: uri_folder 30 | description: the output testing data after preprocessing 31 | 32 | code: . 
33 | 34 | command: >- 35 | python run.py --raw_training_data ${{inputs.raw_training_data}} --raw_testing_data ${{inputs.raw_testing_data}} --train_output ${{outputs.processed_train_data}} --test_output ${{outputs.processed_test_data}} $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 36 | 37 | # NOTE: using one of Azure ML's curated environments 38 | # which has all the dependencies needed for this job 39 | environment: 40 | conda_file: ./conda.yaml 41 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 42 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD/traininsilo/conda.yaml: -------------------------------------------------------------------------------- 1 | name: ccfraud_train_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | dependencies: 7 | - python=3.8 8 | - pip=22.3.1 9 | - pytorch=1.13.1 10 | - pytorch-cuda=11.6 11 | - pip: 12 | - azure-identity==1.12.0 13 | - azure-keyvault==4.2.0 14 | - azureml-mlflow==1.48.0 15 | - pandas==1.5.3 16 | - torchmetrics==0.10.3 17 | - opacus==1.3.0 18 | - tqdm==4.64.1 19 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD/traininsilo/datasets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class FraudDataset(Dataset): 6 | """FraudDataset Dataset - combination of features and labels 7 | 8 | Args: 9 | feature: Transaction detail tensors 10 | target: Tensor of labels corresponding to features 11 | 12 | Returns: 13 | None 14 | """ 15 | 16 | def __init__(self, df): 17 | self.X = torch.tensor( 18 | df.loc[:, df.columns != "is_fraud"].values, dtype=torch.float 19 | ) 20 | self.Y = torch.tensor(df.loc[:, "is_fraud"].values, dtype=torch.int) 21 | 22 | def __len__(self): 23 | return len(self.X) 24 | 25 | def __getitem__(self, idx): 26 | if self.Y is None: 27 | return [self.X[idx]] 28 | return self.X[idx], self.Y[idx] 29 | 30 | 31 | class FraudTimeDataset(Dataset): 32 | """FraudTimeDataset Dataset - combination of features and labels retrieved sequentially 33 | 34 | Args: 35 | feature: Transaction detail tensors 36 | target: Tensor of labels corresponding to features 37 | 38 | Returns: 39 | None 40 | """ 41 | 42 | def __init__(self, df, time_steps=100): 43 | self.X = torch.tensor( 44 | df.loc[:, df.columns != "is_fraud"].values, dtype=torch.float 45 | ) 46 | self.Y = torch.tensor(df.loc[:, "is_fraud"].values, dtype=torch.int) 47 | 48 | assert time_steps >= 10 49 | 50 | self._time_steps = time_steps 51 | self._time_step_overlaps = time_steps // 5 52 | 53 | def __len__(self): 54 | return ( 55 | len(self.X) // (self._time_steps // self._time_step_overlaps) 56 | - self._time_step_overlaps 57 | ) + 1 58 | 59 | def __getitem__(self, idx): 60 | if self.Y is None: 61 | return ( 62 | self.X[ 63 | idx 64 | * (self._time_steps // self._time_step_overlaps) : idx 65 | * (self._time_steps // self._time_step_overlaps) 66 | + self._time_steps 67 | ], 68 | ) 69 | return ( 70 | self.X[ 71 | idx 72 | * (self._time_steps // self._time_step_overlaps) : idx 73 | * (self._time_steps // self._time_step_overlaps) 74 | + self._time_steps 75 | ], 76 | self.Y[ 77 | idx 78 | * (self._time_steps // self._time_step_overlaps) : idx 79 | * (self._time_steps // self._time_step_overlaps) 80 | + self._time_steps 81 | ], 82 | ) 83 | -------------------------------------------------------------------------------- 
/examples/components/CCFRAUD/upload_data/conda.yaml: -------------------------------------------------------------------------------- 1 | name: ccfraud_upload_data_conda_env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.8 6 | - pip=22.3.1 7 | - pip: 8 | - azure-identity==1.12.0 9 | - azure-keyvault==4.2.0 10 | - azureml-core==1.47.0 11 | - kaggle==1.5.12 12 | - scikit-learn==1.1.3 13 | - numpy==1.23.5 14 | - pandas==1.3.5 15 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD/upload_data/spec.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | name: msft_fl_ccfraud_upload_data 3 | version: 0.3.0 4 | display_name: Download CC Fraud data from Kaggle and upload to silo storage 5 | type: command 6 | is_deterministic: true 7 | 8 | inputs: 9 | silo_count: 10 | type: number 11 | optional: false 12 | silo_index: 13 | type: number 14 | optional: false 15 | 16 | outputs: 17 | raw_train_data: 18 | type: uri_folder 19 | description: the output CC Fraud raw training data for a given silo 20 | raw_test_data: 21 | type: uri_folder 22 | description: the output CC Fraud raw testing data for a given silo 23 | 24 | code: . 25 | 26 | command: >- 27 | python run.py 28 | --silo_count ${{inputs.silo_count}} 29 | --silo_index ${{inputs.silo_index}} 30 | --raw_train_data ${{outputs.raw_train_data}} 31 | --raw_test_data ${{outputs.raw_test_data}} 32 | 33 | environment: 34 | conda_file: ./conda.yaml 35 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 36 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD/upload_data/us_regions.csv: -------------------------------------------------------------------------------- 1 | State,StateCode,Region,Division 2 | Alaska,AK,West,Pacific 3 | Alabama,AL,South,East South Central 4 | Arkansas,AR,South,West South Central 5 | Arizona,AZ,West,Mountain 6 | California,CA,West,Pacific 7 | Colorado,CO,West,Mountain 8 | Connecticut,CT,Northeast,New England 9 | District of Columbia,DC,South,South Atlantic 10 | Delaware,DE,South,South Atlantic 11 | Florida,FL,South,South Atlantic 12 | Georgia,GA,South,South Atlantic 13 | Hawaii,HI,West,Pacific 14 | Iowa,IA,Midwest,West North Central 15 | Idaho,ID,West,Mountain 16 | Illinois,IL,Midwest,East North Central 17 | Indiana,IN,Midwest,East North Central 18 | Kansas,KS,Midwest,West North Central 19 | Kentucky,KY,South,East South Central 20 | Louisiana,LA,South,West South Central 21 | Massachusetts,MA,Northeast,New England 22 | Maryland,MD,South,South Atlantic 23 | Maine,ME,Northeast,New England 24 | Michigan,MI,Midwest,East North Central 25 | Minnesota,MN,Midwest,West North Central 26 | Missouri,MO,Midwest,West North Central 27 | Mississippi,MS,South,East South Central 28 | Montana,MT,West,Mountain 29 | North Carolina,NC,South,South Atlantic 30 | North Dakota,ND,Midwest,West North Central 31 | Nebraska,NE,Midwest,West North Central 32 | New Hampshire,NH,Northeast,New England 33 | New Jersey,NJ,Northeast,Middle Atlantic 34 | New Mexico,NM,West,Mountain 35 | Nevada,NV,West,Mountain 36 | New York,NY,Northeast,Middle Atlantic 37 | Ohio,OH,Midwest,East North Central 38 | Oklahoma,OK,South,West South Central 39 | Oregon,OR,West,Pacific 40 | Pennsylvania,PA,Northeast,Middle Atlantic 41 | Rhode Island,RI,Northeast,New England 42 | South Carolina,SC,South,South Atlantic 43 | South 
Dakota,SD,Midwest,West North Central 44 | Tennessee,TN,South,East South Central 45 | Texas,TX,South,West South Central 46 | Utah,UT,West,Mountain 47 | Virginia,VA,South,South Atlantic 48 | Vermont,VT,Northeast,New England 49 | Washington,WA,West,Pacific 50 | Wisconsin,WI,Midwest,East North Central 51 | West Virginia,WV,South,South Atlantic 52 | Wyoming,WY,West,Mountain 53 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/preprocessing/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_ccfraud_vertical_preprocessing_in_silo 4 | version: 0.0.1 5 | display_name: CC Fraud Pre-Processing (in silo) 6 | type: command 7 | description: Component for preprocessing raw data from silo's blob storage 8 | is_deterministic: true 9 | 10 | inputs: 11 | raw_training_data: 12 | type: uri_file 13 | description: the raw training data in a given silo 14 | raw_testing_data: 15 | type: uri_file 16 | description: the raw testing data in a given silo 17 | metrics_prefix: 18 | type: string 19 | description: Metrics prefix 20 | default: Default-prefix 21 | optional: true 22 | 23 | 24 | outputs: 25 | processed_train_data: 26 | type: uri_folder 27 | description: the output training data after preprocessing 28 | processed_test_data: 29 | type: uri_folder 30 | description: the output testing data after preprocessing 31 | 32 | code: . 33 | 34 | command: >- 35 | python run.py --raw_training_data ${{inputs.raw_training_data}} --raw_testing_data ${{inputs.raw_testing_data}} --train_output ${{outputs.processed_train_data}} --test_output ${{outputs.processed_test_data}} $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 36 | 37 | # NOTE: using one of Azure ML's curated environments 38 | # which has all the dependencies needed for this job 39 | environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest 40 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/psi/context/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20230412.v1 2 | 3 | COPY vcpkg.json . 4 | COPY SymmetricPSI /SymmetricPSI 5 | 6 | # Update python and install dependencies 7 | RUN conda update conda && conda install python=3.10.0 8 | RUN pip install azureml-core==1.47.0 \ 9 | azure-keyvault==4.2.0 \ 10 | azureml-mlflow==1.48.0 \ 11 | pandas==1.5.2 \ 12 | redis==4.5.1 \ 13 | numpy==1.24.2 14 | 15 | # Install vcpkg and dependencies 16 | RUN apt-get update && apt-get install -y zip pkg-config build-essential cmake 17 | RUN git clone https://github.com/microsoft/vcpkg.git /vcpkg &&\ 18 | chmod a+x /vcpkg/bootstrap-vcpkg.sh && /bin/bash /vcpkg/bootstrap-vcpkg.sh &&\ 19 | /vcpkg/vcpkg install --triplet=x64-linux --x-buildtrees-root=/vcpkg/buildtrees --x-install-root=/vcpkg/installed --x-packages-root=/vcpkg/packages 20 | 21 | # Install APSI 22 | RUN git clone https://github.com/microsoft/APSI.git /APSI &&\ 23 | cd /APSI && mkdir build &&\ 24 | cd /APSI/build &&\ 25 | cmake .. -DAPSI_USE_ASM=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake &&\ 26 | make -j$(nproc) &&\ 27 | make install 28 | 29 | # Install SymmetricPSI and create Python bindings 30 | RUN cd SymmetricPSI && mkdir build && cd build &&\ 31 | cmake .. 
-DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake -DAPSI_ROOT=/APSI/build -DVCPKG_TARGET_TRIPLET=x64-linux -DCMAKE_BUILD_TYPE=Release &&\ 32 | make -j$(nproc) 33 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/psi/context/SymmetricPSI/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13) 2 | 3 | if(NOT CMAKE_BUILD_TYPE) 4 | set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type" FORCE) 5 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY 6 | STRINGS "Release" "Debug" "MinSizeRel" "RelWithDebInfo") 7 | endif() 8 | message(STATUS "Build type (CMAKE_BUILD_TYPE): ${CMAKE_BUILD_TYPE}") 9 | 10 | project(SymmetricPSI VERSION 1.0.0 LANGUAGES CXX C) 11 | find_package(Python3 COMPONENTS Interpreter Development REQUIRED) 12 | find_package(pybind11 CONFIG REQUIRED) 13 | find_package(APSI CONFIG REQUIRED) 14 | 15 | pybind11_add_module(SymmetricPSI psi.cpp) 16 | target_link_libraries(SymmetricPSI PRIVATE APSI::apsi Python3::Python pybind11::lto pybind11::embed pybind11::module pybind11::headers) 17 | set_target_properties(SymmetricPSI PROPERTIES PREFIX "" SUFFIX ".so" OUTPUT_NAME "SymmetricPSI" POSITION_INDEPENDENT_CODE TRUE) 18 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/psi/context/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [ 3 | { 4 | "name": "seal", 5 | "features": ["no-throw-tran"] 6 | }, 7 | "kuku", 8 | "log4cplus", 9 | "cppzmq", 10 | "flatbuffers", 11 | "jsoncpp", 12 | "tclap", 13 | "pybind11", 14 | "python3" 15 | ], 16 | "overrides": [ 17 | { 18 | "name": "seal", 19 | "version": "4.1.1" 20 | }, 21 | { 22 | "name": "kuku", 23 | "version": "2.1" 24 | }, 25 | { 26 | "name": "python3", 27 | "version": "3.10.0" 28 | } 29 | ], 30 | "builtin-baseline": "a325228200d7f229f3337e612e0077f2a5307090" 31 | } -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/psi/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: mnist_vertical_psi_in_silo 4 | version: 0.0.3 5 | display_name: CC Fraud PSI (in silo) 6 | type: command 7 | description: Component for private set intersection over data in silo's blob storage 8 | is_deterministic: false 9 | 10 | inputs: 11 | train_data: 12 | type: uri_file 13 | description: the training data in a given silo 14 | test_data: 15 | type: uri_file 16 | description: the testing data in a given silo 17 | global_size: 18 | type: number 19 | optional: false 20 | global_rank: 21 | type: number 22 | optional: false 23 | communication_backend: 24 | type: string 25 | enum: 26 | - socket 27 | - redis 28 | default: socket 29 | optional: false 30 | communication_encrypted: 31 | type: boolean 32 | description: Encrypt messages exchanged between the nodes 33 | optional: false 34 | metrics_prefix: 35 | type: string 36 | description: Metrics prefix 37 | default: Default-prefix 38 | optional: true 39 | 40 | 41 | outputs: 42 | matched_train_data: 43 | type: uri_folder 44 | description: the output training data after preprocessing 45 | matched_test_data: 46 | type: uri_folder 47 | description: the output testing data after preprocessing 48 | 49 | code: . 
50 | 51 | command: >- 52 | cp /SymmetricPSI/build/SymmetricPSI.so . && python run.py 53 | --raw_training_data ${{inputs.train_data}} 54 | --raw_testing_data ${{inputs.test_data}} 55 | --train_output ${{outputs.matched_train_data}} 56 | --test_output ${{outputs.matched_test_data}} 57 | --global_size ${{inputs.global_size}} 58 | --global_rank ${{inputs.global_rank}} 59 | --communication_backend ${{inputs.communication_backend}} 60 | --communication_encrypted ${{inputs.communication_encrypted}} 61 | $[[--metrics_prefix=${{inputs.metrics_prefix}}]] 62 | 63 | environment: 64 | build: 65 | path: ./context 66 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/traininsilo/conda.yaml: -------------------------------------------------------------------------------- 1 | name: ccfraud_vertical_train_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | dependencies: 7 | - python=3.8 8 | - pip=22.3.1 9 | - pytorch=1.13.1 10 | - pytorch-cuda=11.6 11 | - pip: 12 | - azureml-core==1.47.0 13 | - azure-keyvault==4.2.0 14 | - azureml-mlflow==1.48.0 15 | - pandas==1.5.2 16 | - torchmetrics==0.10.3 17 | - redis==4.5.1 18 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/traininsilo/contributor_spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_ccfraud_vertical_train_in_silo_contributor 4 | version: 0.0.1 5 | display_name: CC Fraud Train (in silo) 6 | type: command 7 | description: Component to train a model to classify CC Fraud. 8 | is_deterministic: false 9 | 10 | inputs: 11 | train_data: 12 | type: uri_folder 13 | description: the input training data (preprocessed) 14 | test_data: 15 | type: uri_folder 16 | description: the input testing data (preprocessed) 17 | metrics_prefix: 18 | type: string 19 | description: Metrics prefix 20 | default: Default-prefix 21 | optional: true 22 | checkpoint: 23 | type: uri_folder 24 | description: a given pre-existing checkpoint 25 | optional: true 26 | lr: 27 | type: number 28 | description: learning rate 29 | default: 1e-3 30 | optional: true 31 | epochs: 32 | type: integer 33 | description: total number of epochs for local training 34 | default: 10 35 | optional: true 36 | batch_size: 37 | type: integer 38 | description: batch size 39 | default: 100 40 | optional: true 41 | runtime_args: 42 | type: string 43 | description: stringified json config for a silo 44 | optional: true 45 | global_size: 46 | type: number 47 | optional: false 48 | global_rank: 49 | type: number 50 | optional: false 51 | communication_backend: 52 | type: string 53 | enum: 54 | - socket 55 | - redis 56 | default: socket 57 | optional: false 58 | communication_encrypted: 59 | type: boolean 60 | description: Encrypt messages exchanged between the nodes 61 | default: false 62 | optional: false 63 | 64 | outputs: 65 | model: 66 | type: uri_folder 67 | description: the output checkpoint 68 | 69 | code: . 
70 | 71 | command: >- 72 | python contributor.py 73 | --train_data ${{inputs.train_data}} 74 | --test_data ${{inputs.test_data}} 75 | --model_path ${{outputs.model}} 76 | --global_size ${{inputs.global_size}} 77 | --global_rank ${{inputs.global_rank}} 78 | $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 79 | $[[--checkpoint ${{inputs.checkpoint}}]] 80 | $[[--lr ${{inputs.lr}}]] 81 | $[[--epochs ${{inputs.epochs}}]] 82 | $[[--batch_size ${{inputs.batch_size}}]] 83 | --communication_backend ${{inputs.communication_backend}} 84 | --communication_encrypted ${{inputs.communication_encrypted}} 85 | 86 | environment: 87 | conda_file: ./conda.yaml 88 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 89 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/traininsilo/datasets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class FraudDataset(Dataset): 6 | """FraudDataset Dataset - combination of features and labels 7 | 8 | Args: 9 | df: Pandas dataframe containing features and/or labels 10 | 11 | Returns: 12 | None 13 | """ 14 | 15 | def __init__(self, df): 16 | if "is_fraud" in df.columns: 17 | if len(df.columns) > 1: 18 | self.X = df.loc[:, df.columns != "is_fraud"].values 19 | else: 20 | self.X = None 21 | self.Y = df.loc[:, "is_fraud"].values 22 | else: 23 | self.X = df.values 24 | self.Y = None 25 | 26 | if self.X is not None: 27 | self.X = torch.tensor(self.X, dtype=torch.float) 28 | if self.Y is not None: 29 | self.Y = torch.tensor(self.Y, dtype=torch.int) 30 | 31 | def __len__(self): 32 | if self.Y is None: 33 | return len(self.X) 34 | else: 35 | return len(self.Y) 36 | 37 | def features_count(self): 38 | if self.X is not None: 39 | return self.X.shape[1] 40 | return None 41 | 42 | def __getitem__(self, idx): 43 | if self.Y is None: 44 | return self.X[idx] 45 | elif self.X is None: 46 | return self.Y[idx] 47 | else: 48 | return self.X[idx], self.Y[idx] 49 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/traininsilo/host_spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_ccfraud_vertical_train_in_silo_host 4 | version: 0.0.1 5 | display_name: CC Fraud Train (in silo) 6 | type: command 7 | description: Component to train a model to classify CC Fraud. 
8 | is_deterministic: false 9 | 10 | inputs: 11 | train_data: 12 | type: uri_folder 13 | description: the input training data (preprocessed) 14 | test_data: 15 | type: uri_folder 16 | description: the input testing data (preprocessed) 17 | metrics_prefix: 18 | type: string 19 | description: Metrics prefix 20 | default: Default-prefix 21 | optional: true 22 | checkpoint: 23 | type: uri_folder 24 | description: a given pre-existing checkpoint 25 | optional: true 26 | lr: 27 | type: number 28 | description: learning rate 29 | default: 1e-3 30 | optional: true 31 | epochs: 32 | type: integer 33 | description: total number of epochs for local training 34 | default: 10 35 | optional: true 36 | batch_size: 37 | type: integer 38 | description: batch size 39 | default: 100 40 | optional: true 41 | global_size: 42 | type: number 43 | optional: false 44 | global_rank: 45 | type: number 46 | optional: false 47 | communication_backend: 48 | type: string 49 | enum: 50 | - socket 51 | - redis 52 | default: socket 53 | optional: false 54 | communication_encrypted: 55 | type: boolean 56 | description: Encrypt messages exchanged between the nodes 57 | optional: false 58 | 59 | 60 | outputs: 61 | model: 62 | type: uri_folder 63 | description: the output checkpoint 64 | 65 | code: . 66 | 67 | command: >- 68 | python host.py 69 | --train_data ${{inputs.train_data}} 70 | --test_data ${{inputs.test_data}} 71 | --model_path ${{outputs.model}} 72 | --global_size ${{inputs.global_size}} 73 | --global_rank ${{inputs.global_rank}} 74 | $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 75 | $[[--checkpoint ${{inputs.checkpoint}}]] 76 | $[[--lr ${{inputs.lr}}]] 77 | $[[--epochs ${{inputs.epochs}}]] 78 | $[[--batch_size ${{inputs.batch_size}}]] 79 | --communication_backend ${{inputs.communication_backend}} 80 | --communication_encrypted ${{inputs.communication_encrypted}} 81 | 82 | environment: 83 | conda_file: ./conda.yaml 84 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 85 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/traininsilo/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class SimpleLinearBottom(nn.Module): 7 | """Bottom (Contributor) part of the model composed of only Linear model interleaved with ReLU activations 8 | 9 | Args: 10 | input_dim (int): 11 | number of features to be consumed by the model 12 | """ 13 | 14 | def __init__(self, input_dim, latent_dim=4, hidden_dim=128, layers=4) -> None: 15 | super().__init__() 16 | 17 | self.input_dim = input_dim 18 | self.latent_dim = latent_dim 19 | self.layers = nn.ModuleList( 20 | [ 21 | nn.Linear(input_dim, hidden_dim) 22 | if i == 0 23 | else ( 24 | nn.Linear(hidden_dim, latent_dim) 25 | if i == layers - 1 26 | else nn.Linear(hidden_dim, hidden_dim) 27 | ) 28 | for i in range(layers) 29 | ] 30 | ) 31 | self._init_weights() 32 | 33 | def _init_weights(self): 34 | for m in self.modules(): 35 | if isinstance(m, nn.Embedding): 36 | torch.nn.init.uniform_(m.weight, -0.001, 0.001) 37 | elif isinstance(m, nn.Linear): 38 | torch.nn.init.xavier_uniform_(m.weight) 39 | m.bias.data.fill_(0.01) 40 | 41 | def forward(self, x): 42 | for i, layer in enumerate(self.layers): 43 | if i == len(self.layers) - 1: 44 | x = layer(x) 45 | else: 46 | x = F.relu(layer(x)) 47 | return x 48 | 49 | 50 | class SimpleLinearTop(nn.Module): 51 | """Top (Host) part of the 
model composed of only Linear model interleaved with ReLU activations""" 52 | 53 | def __init__(self, latent_dim) -> None: 54 | super().__init__() 55 | 56 | self.model = nn.Sequential( 57 | nn.Linear(latent_dim, 1), 58 | nn.Sigmoid(), 59 | ) 60 | self._init_weights() 61 | 62 | def _init_weights(self): 63 | for m in self.modules(): 64 | if isinstance(m, nn.Embedding): 65 | torch.nn.init.uniform_(m.weight, -0.001, 0.001) 66 | elif isinstance(m, nn.Linear): 67 | torch.nn.init.xavier_uniform_(m.weight) 68 | m.bias.data.fill_(0.01) 69 | 70 | def forward(self, x): 71 | return self.model(x).squeeze() 72 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/traininsilo/samplers.py: -------------------------------------------------------------------------------- 1 | ########################################################################################## 2 | # WARNING # 3 | ########################################################################################## 4 | # Should this file change please update all copies of samplers.py file in the repository # 5 | ########################################################################################## 6 | 7 | import math 8 | import torch 9 | from torch.utils.data import Sampler 10 | 11 | 12 | class VerticallyDistributedBatchSampler(Sampler): 13 | """Batch sampler that uses a distributed communication backend to distribute samples indexes to each worker.""" 14 | 15 | def __init__(self, data_source, batch_size, comm, rank, world_size, shuffle=False): 16 | """Initializes the batch sampler. 17 | 18 | Args: 19 | data_source (torch.utils.data.Dataset): The dataset to sample from. 20 | batch_size (int): The size of the batch to sample. 21 | comm (AMLComm): The communicator to use for communication. 22 | rank (int): The rank of the current worker. 23 | world_size (int): The total number of workers. 24 | shuffle (bool, optional): Whether to shuffle the indices. Defaults to False. 
25 | """ 26 | self.data_source = data_source 27 | self.batch_size = batch_size 28 | self.shuffle = shuffle 29 | self.rank = rank 30 | self.world_size = world_size 31 | self.comm = comm 32 | 33 | def __iter__(self): 34 | if self.rank == 0: 35 | if self.shuffle: 36 | indices = torch.randperm(len(self.data_source)) 37 | else: 38 | indices = torch.arange(len(self.data_source)) 39 | 40 | # Split the indices into batches 41 | batches = [ 42 | indices[i : i + self.batch_size] 43 | for i in range(0, len(indices), self.batch_size) 44 | ] 45 | 46 | for batch in batches: 47 | for i in range(1, self.world_size): 48 | # Send the batch to contributor i 49 | self.comm.send(batch, i) 50 | 51 | yield batch 52 | else: 53 | for i in range(0, len(self.data_source), self.batch_size): 54 | # Receive the batch from host 55 | batch = self.comm.recv(0) 56 | yield batch 57 | 58 | def __len__(self): 59 | return math.ceil(len(self.data_source) / self.batch_size) 60 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/upload_data/conda.yaml: -------------------------------------------------------------------------------- 1 | name: ccfraud_vertical_upload_data_conda_env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.8 6 | - pip=22.2.2 7 | - pip: 8 | - azure-identity==1.12.0 9 | - azure-keyvault==4.2.0 10 | - azureml-core==1.47.0 11 | - kaggle==1.5.12 12 | - scikit-learn==1.1.3 13 | - numpy==1.23.5 14 | - pandas==1.3.5 15 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/upload_data/spec.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | name: msft_fl_ccfraud_vertical_upload_data 3 | version: 0.0.1 4 | display_name: Download CC Fraud data from Kaggle and upload to silo storage 5 | type: command 6 | is_deterministic: true 7 | 8 | inputs: 9 | silo_count: 10 | type: number 11 | optional: false 12 | silo_index: 13 | type: number 14 | optional: false 15 | 16 | outputs: 17 | raw_train_data: 18 | type: uri_folder 19 | description: the output CC Fraud raw training data for a given silo 20 | raw_test_data: 21 | type: uri_folder 22 | description: the output CC Fraud raw testing data for a given silo 23 | 24 | code: . 
25 | 26 | command: >- 27 | python run.py --silo_count ${{inputs.silo_count}} --silo_index ${{inputs.silo_index}} --raw_train_data ${{outputs.raw_train_data}} --raw_test_data ${{outputs.raw_test_data}} 28 | 29 | environment: 30 | conda_file: ./conda.yaml 31 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 32 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL/upload_data/us_regions.csv: -------------------------------------------------------------------------------- 1 | State,StateCode,Region,Division 2 | Alaska,AK,West,Pacific 3 | Alabama,AL,South,East South Central 4 | Arkansas,AR,South,West South Central 5 | Arizona,AZ,West,Mountain 6 | California,CA,West,Pacific 7 | Colorado,CO,West,Mountain 8 | Connecticut,CT,Northeast,New England 9 | District of Columbia,DC,South,South Atlantic 10 | Delaware,DE,South,South Atlantic 11 | Florida,FL,South,South Atlantic 12 | Georgia,GA,South,South Atlantic 13 | Hawaii,HI,West,Pacific 14 | Iowa,IA,Midwest,West North Central 15 | Idaho,ID,West,Mountain 16 | Illinois,IL,Midwest,East North Central 17 | Indiana,IN,Midwest,East North Central 18 | Kansas,KS,Midwest,West North Central 19 | Kentucky,KY,South,East South Central 20 | Louisiana,LA,South,West South Central 21 | Massachusetts,MA,Northeast,New England 22 | Maryland,MD,South,South Atlantic 23 | Maine,ME,Northeast,New England 24 | Michigan,MI,Midwest,East North Central 25 | Minnesota,MN,Midwest,West North Central 26 | Missouri,MO,Midwest,West North Central 27 | Mississippi,MS,South,East South Central 28 | Montana,MT,West,Mountain 29 | North Carolina,NC,South,South Atlantic 30 | North Dakota,ND,Midwest,West North Central 31 | Nebraska,NE,Midwest,West North Central 32 | New Hampshire,NH,Northeast,New England 33 | New Jersey,NJ,Northeast,Middle Atlantic 34 | New Mexico,NM,West,Mountain 35 | Nevada,NV,West,Mountain 36 | New York,NY,Northeast,Middle Atlantic 37 | Ohio,OH,Midwest,East North Central 38 | Oklahoma,OK,South,West South Central 39 | Oregon,OR,West,Pacific 40 | Pennsylvania,PA,Northeast,Middle Atlantic 41 | Rhode Island,RI,Northeast,New England 42 | South Carolina,SC,South,South Atlantic 43 | South Dakota,SD,Midwest,West North Central 44 | Tennessee,TN,South,East South Central 45 | Texas,TX,South,West South Central 46 | Utah,UT,West,Mountain 47 | Virginia,VA,South,South Atlantic 48 | Vermont,VT,Northeast,New England 49 | Washington,WA,West,Pacific 50 | Wisconsin,WI,Midwest,East North Central 51 | West Virginia,WV,South,South Atlantic 52 | Wyoming,WY,West,Mountain 53 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL_FEDONCE/preprocessing/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_ccfraud_vertical_fedonce_preprocessing_in_silo 4 | version: 0.0.1 5 | display_name: CC Fraud Pre-Processing (in silo) 6 | type: command 7 | description: Component for preprocessing raw data from silo's blob storage 8 | is_deterministic: true 9 | 10 | inputs: 11 | raw_training_data: 12 | type: uri_file 13 | description: the raw training data in a given silo 14 | raw_testing_data: 15 | type: uri_file 16 | description: the raw testing data in a given silo 17 | metrics_prefix: 18 | type: string 19 | description: Metrics prefix 20 | default: Default-prefix 21 | optional: true 22 | 23 | 24 | outputs: 25 | processed_train_data: 
26 | type: uri_folder 27 | description: the output training data after preprocessing 28 | processed_test_data: 29 | type: uri_folder 30 | description: the output testing data after preprocessing 31 | 32 | code: . 33 | 34 | command: >- 35 | python run.py --raw_training_data ${{inputs.raw_training_data}} --raw_testing_data ${{inputs.raw_testing_data}} --train_output ${{outputs.processed_train_data}} --test_output ${{outputs.processed_test_data}} $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 36 | 37 | # NOTE: using one of Azure ML's curated environments 38 | # which has all the dependencies needed for this job 39 | environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest 40 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL_FEDONCE/pretraining/conda.yaml: -------------------------------------------------------------------------------- 1 | name: ccfraud_vertical_pretrain_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | dependencies: 7 | - python=3.8 8 | - pip=22.3.1 9 | - pytorch=1.13.1 10 | - pytorch-cuda=11.6 11 | - pip: 12 | - azureml-mlflow==1.48.0 13 | - pandas==1.5.2 14 | - tqdm==4.64.1 15 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL_FEDONCE/pretraining/datasets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class FraudDataset(Dataset): 6 | """FraudDataset Dataset - combination of features and labels 7 | 8 | Args: 9 | df: Pandas dataframe containing features and/or labels 10 | 11 | Returns: 12 | None 13 | """ 14 | 15 | def __init__(self, df): 16 | if "is_fraud" in df.columns: 17 | if len(df.columns) > 1: 18 | self.X = torch.tensor( 19 | df.loc[:, df.columns != "is_fraud"].values, dtype=torch.float 20 | ) 21 | else: 22 | self.X = None 23 | self.Y = torch.tensor(df.loc[:, "is_fraud"].values, dtype=torch.int) 24 | else: 25 | self.X = torch.tensor(df.values, dtype=torch.float) 26 | self.Y = None 27 | 28 | def __len__(self): 29 | if self.Y is None: 30 | return len(self.X) 31 | else: 32 | return len(self.Y) 33 | 34 | def features_count(self): 35 | if self.X is not None: 36 | return self.X.shape[1] 37 | return None 38 | 39 | def __getitem__(self, idx): 40 | if self.Y is None: 41 | return self.X[idx] 42 | elif self.X is None: 43 | return self.Y[idx] 44 | else: 45 | return self.X[idx], self.Y[idx] 46 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL_FEDONCE/pretraining/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_ccfraud_vertical_fedonce_pretrain_contributor 4 | version: 0.0.1 5 | display_name: CC Fraud Pre-Train (in silo) 6 | type: command 7 | description: Component to train a model to generate embeddings representative of contributor samples. 
8 | is_deterministic: false 9 | 10 | inputs: 11 | train_data: 12 | type: uri_folder 13 | description: the input training data (preprocessed) 14 | test_data: 15 | type: uri_folder 16 | description: the input testing data (preprocessed) 17 | metrics_prefix: 18 | type: string 19 | description: Metrics prefix 20 | default: Default-prefix 21 | optional: true 22 | checkpoint: 23 | type: uri_folder 24 | description: a given pre-existing checkpoint 25 | optional: true 26 | lr: 27 | type: number 28 | description: learning rate 29 | default: 1e-3 30 | optional: true 31 | epochs: 32 | type: integer 33 | description: total number of epochs for local training 34 | default: 10 35 | optional: true 36 | batch_size: 37 | type: integer 38 | description: batch size 39 | default: 100 40 | optional: true 41 | 42 | outputs: 43 | model: 44 | type: uri_folder 45 | description: the output checkpoint 46 | embeddings: 47 | type: uri_folder 48 | description: the output embeddings 49 | 50 | code: . 51 | 52 | command: >- 53 | python run.py 54 | --train_data ${{inputs.train_data}} 55 | --test_data ${{inputs.test_data}} 56 | --model_path ${{outputs.model}} 57 | --embeddings_path ${{outputs.embeddings}} 58 | $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 59 | $[[--checkpoint ${{inputs.checkpoint}}]] 60 | $[[--lr ${{inputs.lr}}]] 61 | $[[--epochs ${{inputs.epochs}}]] 62 | $[[--batch_size ${{inputs.batch_size}}]] 63 | environment: 64 | conda_file: ./conda.yaml 65 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 66 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL_FEDONCE/traininsilo/conda.yaml: -------------------------------------------------------------------------------- 1 | name: ccfraud_vertical_train_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | dependencies: 7 | - python=3.8 8 | - pip=22.3.1 9 | - pytorch=1.13.1 10 | - pytorch-cuda=11.6 11 | - pip: 12 | - azureml-core==1.47.0 13 | - azure-keyvault==4.2.0 14 | - azureml-mlflow==1.48.0 15 | - pandas==1.5.2 16 | - torchmetrics==0.10.3 17 | - redis==4.5.1 18 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL_FEDONCE/traininsilo/datasets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.utils.data import Dataset 4 | 5 | 6 | class FraudDataset(Dataset): 7 | """FraudDataset Dataset - combination of features and labels 8 | 9 | Args: 10 | df: Pandas dataframe containing features and labels 11 | kwargs: 12 | embeddings: list of embeddings to be concatenated to features 13 | 14 | Returns: 15 | None 16 | """ 17 | 18 | def __init__(self, df, **kwargs): 19 | if "is_fraud" in df.columns: 20 | if len(df.columns) > 1: 21 | self.X = df.loc[:, df.columns != "is_fraud"].values 22 | else: 23 | self.X = None 24 | self.Y = df.loc[:, "is_fraud"].values 25 | else: 26 | self.X = df.values 27 | self.Y = None 28 | 29 | if "embeddings" in kwargs and len(kwargs["embeddings"]) > 0: 30 | self.X = np.load(kwargs["embeddings"][0]) 31 | for embedding in kwargs["embeddings"][1:]: 32 | np_embeddings = np.load(embedding) 33 | self.X = np.concatenate([self.X, np_embeddings], axis=1) 34 | 35 | if self.X is not None: 36 | self.X = torch.tensor(self.X, dtype=torch.float) 37 | if self.Y is not None: 38 | self.Y = torch.tensor(self.Y, dtype=torch.int) 39 | 40 | def __len__(self): 41 | if self.Y is None: 42 | return len(self.X) 43 | else: 44 | 
return len(self.Y) 45 | 46 | def features_count(self): 47 | if self.X is not None: 48 | return self.X.shape[1] 49 | return None 50 | 51 | def __getitem__(self, idx): 52 | if self.Y is None: 53 | return self.X[idx] 54 | elif self.X is None: 55 | return self.Y[idx] 56 | else: 57 | return self.X[idx], self.Y[idx] 58 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL_FEDONCE/traininsilo/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class SimpleVAETop(nn.Module): 6 | """Top (Host) part of the LSTM based VAE with head composed of Linear layers interleaved by ReLU activations""" 7 | 8 | def __init__(self, latent_dim, hidden_dim=32) -> None: 9 | super().__init__() 10 | self._latent_dim = latent_dim 11 | self._hidden_dim = hidden_dim 12 | 13 | self.seq = torch.nn.Sequential( 14 | nn.Linear(in_features=self._latent_dim, out_features=self._hidden_dim), 15 | nn.ReLU(), 16 | nn.Linear(in_features=self._hidden_dim, out_features=1), 17 | nn.Sigmoid(), 18 | ) 19 | self._init_weights() 20 | 21 | def _init_weights(self): 22 | for m in self.modules(): 23 | if isinstance(m, nn.Linear): 24 | torch.nn.init.xavier_uniform_(m.weight) 25 | m.bias.data.fill_(0.01) 26 | 27 | def forward(self, x): 28 | x = self.seq(x).squeeze() 29 | return x 30 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL_FEDONCE/traininsilo/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_ccfraud_vertical_fedonce_train_in_silo_host 4 | version: 0.0.1 5 | display_name: CC Fraud Train (in silo) 6 | type: command 7 | description: Component to train a model to classify CC Fraud. 8 | is_deterministic: false 9 | 10 | inputs: 11 | train_data: 12 | type: uri_folder 13 | description: the input training data (preprocessed) 14 | test_data: 15 | type: uri_folder 16 | description: the input testing data (preprocessed) 17 | metrics_prefix: 18 | type: string 19 | description: Metrics prefix 20 | default: Default-prefix 21 | optional: true 22 | checkpoint: 23 | type: uri_folder 24 | description: a given pre-existing checkpoint 25 | optional: true 26 | contributor_1_embeddings: 27 | type: uri_folder 28 | description: path to embeddings extracted by contributor 29 | optional: true 30 | contributor_2_embeddings: 31 | type: uri_folder 32 | description: path to embeddings extracted by contributor 33 | optional: true 34 | contributor_3_embeddings: 35 | type: uri_folder 36 | description: path to embeddings extracted by contributor 37 | optional: true 38 | lr: 39 | type: number 40 | description: learning rate 41 | default: 1e-3 42 | optional: true 43 | epochs: 44 | type: integer 45 | description: total number of epochs for local training 46 | default: 10 47 | optional: true 48 | batch_size: 49 | type: integer 50 | description: batch size 51 | default: 100 52 | optional: true 53 | 54 | outputs: 55 | model: 56 | type: uri_folder 57 | description: the output checkpoint 58 | 59 | code: . 
60 | 61 | command: >- 62 | python run.py 63 | --train_data ${{inputs.train_data}} 64 | --test_data ${{inputs.test_data}} 65 | --model_path ${{outputs.model}} 66 | $[[--contributor_1_embeddings ${{inputs.contributor_1_embeddings}}]] 67 | $[[--contributor_2_embeddings ${{inputs.contributor_2_embeddings}}]] 68 | $[[--contributor_3_embeddings ${{inputs.contributor_3_embeddings}}]] 69 | $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 70 | $[[--checkpoint ${{inputs.checkpoint}}]] 71 | $[[--lr ${{inputs.lr}}]] 72 | $[[--epochs ${{inputs.epochs}}]] 73 | $[[--batch_size ${{inputs.batch_size}}]] 74 | 75 | environment: 76 | conda_file: ./conda.yaml 77 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 78 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL_FEDONCE/upload_data/conda.yaml: -------------------------------------------------------------------------------- 1 | name: ccfraud_vertical_upload_data_conda_env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.8 6 | - pip=22.2.2 7 | - pip: 8 | - azure-identity==1.12.0 9 | - azure-keyvault==4.2.0 10 | - azureml-core==1.47.0 11 | - kaggle==1.5.12 12 | - scikit-learn==1.1.3 13 | - numpy==1.23.5 14 | - pandas==1.3.5 15 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL_FEDONCE/upload_data/spec.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | name: msft_fl_ccfraud_vertical_fedonce_upload_data 3 | version: 0.0.1 4 | display_name: Download CC Fraud data from Kaggle and upload to silo storage 5 | type: command 6 | is_deterministic: true 7 | 8 | inputs: 9 | silo_count: 10 | type: number 11 | optional: false 12 | silo_index: 13 | type: number 14 | optional: false 15 | 16 | outputs: 17 | raw_train_data: 18 | type: uri_folder 19 | description: the output CC Fraud raw training data for a given silo 20 | raw_test_data: 21 | type: uri_folder 22 | description: the output CC Fraud raw testing data for a given silo 23 | 24 | code: . 
25 | 26 | command: >- 27 | python run.py --silo_count ${{inputs.silo_count}} --silo_index ${{inputs.silo_index}} --raw_train_data ${{outputs.raw_train_data}} --raw_test_data ${{outputs.raw_test_data}} 28 | 29 | environment: 30 | conda_file: ./conda.yaml 31 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 32 | -------------------------------------------------------------------------------- /examples/components/CCFRAUD_VERTICAL_FEDONCE/upload_data/us_regions.csv: -------------------------------------------------------------------------------- 1 | State,StateCode,Region,Division 2 | Alaska,AK,West,Pacific 3 | Alabama,AL,South,East South Central 4 | Arkansas,AR,South,West South Central 5 | Arizona,AZ,West,Mountain 6 | California,CA,West,Pacific 7 | Colorado,CO,West,Mountain 8 | Connecticut,CT,Northeast,New England 9 | District of Columbia,DC,South,South Atlantic 10 | Delaware,DE,South,South Atlantic 11 | Florida,FL,South,South Atlantic 12 | Georgia,GA,South,South Atlantic 13 | Hawaii,HI,West,Pacific 14 | Iowa,IA,Midwest,West North Central 15 | Idaho,ID,West,Mountain 16 | Illinois,IL,Midwest,East North Central 17 | Indiana,IN,Midwest,East North Central 18 | Kansas,KS,Midwest,West North Central 19 | Kentucky,KY,South,East South Central 20 | Louisiana,LA,South,West South Central 21 | Massachusetts,MA,Northeast,New England 22 | Maryland,MD,South,South Atlantic 23 | Maine,ME,Northeast,New England 24 | Michigan,MI,Midwest,East North Central 25 | Minnesota,MN,Midwest,West North Central 26 | Missouri,MO,Midwest,West North Central 27 | Mississippi,MS,South,East South Central 28 | Montana,MT,West,Mountain 29 | North Carolina,NC,South,South Atlantic 30 | North Dakota,ND,Midwest,West North Central 31 | Nebraska,NE,Midwest,West North Central 32 | New Hampshire,NH,Northeast,New England 33 | New Jersey,NJ,Northeast,Middle Atlantic 34 | New Mexico,NM,West,Mountain 35 | Nevada,NV,West,Mountain 36 | New York,NY,Northeast,Middle Atlantic 37 | Ohio,OH,Midwest,East North Central 38 | Oklahoma,OK,South,West South Central 39 | Oregon,OR,West,Pacific 40 | Pennsylvania,PA,Northeast,Middle Atlantic 41 | Rhode Island,RI,Northeast,New England 42 | South Carolina,SC,South,South Atlantic 43 | South Dakota,SD,Midwest,West North Central 44 | Tennessee,TN,South,East South Central 45 | Texas,TX,South,West South Central 46 | Utah,UT,West,Mountain 47 | Virginia,VA,South,South Atlantic 48 | Vermont,VT,Northeast,New England 49 | Washington,WA,West,Pacific 50 | Wisconsin,WI,Midwest,East North Central 51 | West Virginia,WV,South,South Atlantic 52 | Wyoming,WY,West,Mountain 53 | -------------------------------------------------------------------------------- /examples/components/FLWR/client/pneumonia_network.py: -------------------------------------------------------------------------------- 1 | # This file defining the model was taken as-is from https://github.com/Azure/medical-imaging/blob/main/federated-learning/pneumonia-federated/custom/pneumonia_network.py. 
2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class PneumoniaNetwork(nn.Module): 8 | def __init__(self): 9 | super(PneumoniaNetwork, self).__init__() 10 | dropout = 0.2 11 | 12 | self.conv1 = nn.Conv2d( 13 | in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1 14 | ) 15 | self.conv2 = nn.Conv2d( 16 | in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1 17 | ) 18 | self.conv3 = nn.Conv2d( 19 | in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1 20 | ) 21 | 22 | self.dropout1 = nn.Dropout(dropout) 23 | self.dropout2 = nn.Dropout(dropout) 24 | 25 | self.fc1 = nn.Linear(28 * 28 * 128, 256) 26 | self.fc2 = nn.Linear(256, 2) 27 | 28 | def forward(self, x): 29 | x = F.relu(self.conv1(x)) # 224 x 224 x 32 30 | x = F.max_pool2d(x, 2, 2) # 112 x 112 x 32 31 | x = F.relu(self.conv2(x)) # 112 x 112 x 64 32 | x = F.max_pool2d(x, 2, 2) # 56 x 56 x 64 33 | x = self.dropout1(x) 34 | x = F.relu(self.conv3(x)) # 56 x 56 x 128 35 | x = F.max_pool2d(x, 2, 2) # 28 x 28 x 128 36 | x = self.dropout2(x) 37 | x = x.view(-1, 28 * 28 * 128) # 100.352 38 | x = F.relu(self.fc1(x)) 39 | x = self.fc2(x) 40 | return x 41 | -------------------------------------------------------------------------------- /examples/components/FLWR/client/spec.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | name: msft_fl_flower_client 3 | version: 0.0.5 4 | display_name: Flower client 5 | type: command 6 | description: This component runs a Flower client inside an AzureML job. 7 | is_deterministic: true 8 | tags: 9 | flower: 1.2.0 10 | url: https://github.com/Azure-Samples/azure-ml-federated-learning 11 | 12 | inputs: 13 | federation_identifier: 14 | type: string 15 | client_data: 16 | type: uri_folder 17 | optional: true 18 | description: "an optional folder containing data for the client to use" 19 | lr: 20 | type: number 21 | description: learning rate 22 | default: 0.01 23 | optional: true 24 | epochs: 25 | type: integer 26 | description: total number of epochs for local training 27 | default: 3 28 | optional: true 29 | checkpoint: 30 | type: uri_folder 31 | description: a given pre-existing model checkpoint 32 | optional: true 33 | metrics_prefix: 34 | type: string 35 | description: Metrics prefix 36 | default: Default-prefix 37 | optional: true 38 | 39 | code: "." 
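# The $[[ ... ]] wrappers below mark optional arguments: AzureML only renders the
# bracketed segment when the corresponding optional input is provided, otherwise
# the flag is dropped from the command line entirely.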
40 | 41 | command: >- 42 | python run.py 43 | --federation_identifier ${{inputs.federation_identifier}} 44 | $[[--client_data ${{inputs.client_data}}]] 45 | $[[--checkpoint ${{inputs.checkpoint}}]] 46 | $[[--lr ${{inputs.lr}}]] 47 | $[[--epochs ${{inputs.epochs}}]] 48 | $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 49 | 50 | environment: 51 | build: 52 | path: ../flower_pytorch_env/context/ 53 | -------------------------------------------------------------------------------- /examples/components/FLWR/flower_pytorch_env/context/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:22.09-py3 2 | FROM ${PYTORCH_IMAGE} 3 | 4 | RUN python3 -m pip install -U pip 5 | RUN python3 -m pip install -U setuptools 6 | 7 | # Install dependencies missing in this container 8 | ADD requirements.txt /tmp/requirements.txt 9 | RUN python3 -m pip install -r /tmp/requirements.txt 10 | -------------------------------------------------------------------------------- /examples/components/FLWR/flower_pytorch_env/context/requirements.txt: -------------------------------------------------------------------------------- 1 | azureml-mlflow==1.48.0 2 | flwr==1.2.0 3 | -------------------------------------------------------------------------------- /examples/components/FLWR/flower_pytorch_env/env.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json 2 | name: flower-pt 3 | version: 1.2.0-pytorch 4 | build: 5 | path: ./context/ 6 | -------------------------------------------------------------------------------- /examples/components/FLWR/server/spec.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | name: msft_fl_flower_server 3 | version: 0.0.5 4 | display_name: Flower server 5 | type: command 6 | description: This component runs a Flower server inside an AzureML job. 7 | is_deterministic: true 8 | tags: 9 | flower: 1.2.0 10 | url: https://github.com/Azure-Samples/azure-ml-federated-learning 11 | 12 | inputs: 13 | federation_identifier: 14 | type: string 15 | expected_clients: 16 | type: integer 17 | wait_for_clients_timeout: 18 | type: integer 19 | default: 600 20 | 21 | outputs: 22 | job_artefacts: 23 | type: uri_folder 24 | 25 | code: "." 26 | 27 | command: >- 28 | python run.py 29 | --federation_identifier ${{inputs.federation_identifier}} 30 | --expected_clients ${{inputs.expected_clients}} 31 | --output_dir ${{outputs.job_artefacts}} 32 | --wait_for_clients_timeout ${{inputs.wait_for_clients_timeout}} 33 | 34 | environment: 35 | build: 36 | path: ../flower_pytorch_env/context/ 37 | -------------------------------------------------------------------------------- /examples/components/HELLOWORLD/aggregatemodelweights/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | import sys 5 | import glob 6 | 7 | 8 | def get_arg_parser(parser=None): 9 | """Parse the command line arguments for merge using argparse. 
10 | 11 | Args: 12 | parser (argparse.ArgumentParser or CompliantArgumentParser): 13 | an argument parser instance 14 | 15 | Returns: 16 | ArgumentParser: the argument parser instance 17 | 18 | Notes: 19 | if parser is None, creates a new parser instance 20 | """ 21 | # add arguments that are specific to the component 22 | if parser is None: 23 | parser = argparse.ArgumentParser(description=__doc__) 24 | 25 | parser.add_argument("--input_silo_1", type=str, required=True, help="") 26 | parser.add_argument("--input_silo_2", type=str, required=False, help="") 27 | parser.add_argument("--input_silo_3", type=str, required=False, help="") 28 | parser.add_argument("--aggregated_output", type=str, required=True, help="") 29 | return parser 30 | 31 | 32 | def test_input(path): 33 | file_list = glob.glob(os.path.join(path, "*.*"), recursive=True) 34 | print(f"Found {len(file_list)} files in {path}") 35 | 36 | print(f"Reading files from {path}") 37 | for file in file_list: 38 | print(f" -- Reading {file}") 39 | with open(file, "r") as f: 40 | f.read() 41 | 42 | 43 | def test_output(path): 44 | print(f"Writing output to {path}/aggregate.txt") 45 | with open(os.path.join(path, "aggregate.txt"), "w") as f: 46 | f.write("Hello World!") 47 | 48 | 49 | def main(cli_args=None): 50 | """Component main function. 51 | 52 | It parses arguments and executes run() with the right arguments. 53 | 54 | Args: 55 | cli_args (List[str], optional): list of args to feed script, useful for debugging. Defaults to None. 56 | """ 57 | # build an arg parser 58 | parser = get_arg_parser() 59 | 60 | # run the parser on cli args 61 | args = parser.parse_args(cli_args) 62 | 63 | print(f"Running script with arguments: {args}") 64 | test_input(args.input_silo_1) 65 | if args.input_silo_2: 66 | test_input(args.input_silo_2) 67 | if args.input_silo_3: 68 | test_input(args.input_silo_3) 69 | test_output(args.aggregated_output) 70 | 71 | 72 | if __name__ == "__main__": 73 | # Set logging to sys.out 74 | logger = logging.getLogger(__name__) 75 | logger.setLevel(logging.DEBUG) 76 | log_format = logging.Formatter("[%(asctime)s] [%(levelname)s] - %(message)s") 77 | handler = logging.StreamHandler(sys.stdout) 78 | handler.setLevel(logging.DEBUG) 79 | handler.setFormatter(log_format) 80 | logger.addHandler(handler) 81 | 82 | main() 83 | -------------------------------------------------------------------------------- /examples/components/HELLOWORLD/aggregatemodelweights/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_helloworld_aggregate_model_weights 4 | version: 0.3.0 5 | display_name: Aggregate Model Weights (from all silos) 6 | type: command 7 | description: Component for aggreating model weights. 8 | is_deterministic: true 9 | 10 | inputs: 11 | input_silo_1: 12 | type: uri_folder 13 | description: input from silo 1 (e.g., model weights, or gradient updates) 14 | optional: false 15 | input_silo_2: 16 | type: uri_folder 17 | description: input from silo 2 (e.g., model weights, or gradient updates) 18 | optional: true 19 | input_silo_3: 20 | type: uri_folder 21 | description: input from silo 3 (e.g., model weights, or gradient updates) 22 | optional: true 23 | 24 | outputs: 25 | aggregated_output: 26 | type: uri_folder 27 | description: the aggregated model or gradiants, residing in the orchestrator compute. 28 | 29 | code: . 
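# For illustration, when only two silos are wired up (input_silo_3 left unset) the
# command below resolves to roughly:
#   python run.py --aggregated_output <path> --input_silo_1 <path> --input_silo_2 <path>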
30 | 31 | command: >- 32 | python run.py --aggregated_output ${{outputs.aggregated_output}} 33 | --input_silo_1 ${{inputs.input_silo_1}} 34 | $[[--input_silo_2 ${{inputs.input_silo_2}}]] 35 | $[[--input_silo_3 ${{inputs.input_silo_3}}]] 36 | 37 | environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:30 38 | -------------------------------------------------------------------------------- /examples/components/HELLOWORLD/preprocessing/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | import sys 5 | import glob 6 | 7 | 8 | def get_arg_parser(parser=None): 9 | """Parse the command line arguments for merge using argparse. 10 | 11 | Args: 12 | parser (argparse.ArgumentParser or CompliantArgumentParser): 13 | an argument parser instance 14 | 15 | Returns: 16 | ArgumentParser: the argument parser instance 17 | 18 | Notes: 19 | if parser is None, creates a new parser instance 20 | """ 21 | # add arguments that are specific to the component 22 | if parser is None: 23 | parser = argparse.ArgumentParser(description=__doc__) 24 | 25 | parser.add_argument("--raw_training_data", type=str, required=True, help="") 26 | parser.add_argument("--raw_testing_data", type=str, required=True, help="") 27 | parser.add_argument("--train_output", type=str, required=True, help="") 28 | parser.add_argument("--test_output", type=str, required=True, help="") 29 | parser.add_argument( 30 | "--metrics_prefix", type=str, required=False, help="Metrics prefix" 31 | ) 32 | return parser 33 | 34 | 35 | def test_input(path): 36 | file_list = glob.glob(os.path.join(path, "*.*"), recursive=True) 37 | print(f"Found {len(file_list)} files in {path}") 38 | 39 | print(f"Reading files from {path}") 40 | for file in file_list: 41 | print(f" -- Reading {file}") 42 | with open(file, "r") as f: 43 | f.read() 44 | 45 | 46 | def test_output(path): 47 | with open(os.path.join(path, "output.txt"), "w") as f: 48 | f.write("Hello World!") 49 | 50 | 51 | def main(cli_args=None): 52 | """Component main function. 53 | 54 | It parses arguments and executes run() with the right arguments. 55 | 56 | Args: 57 | cli_args (List[str], optional): list of args to feed script, useful for debugging. Defaults to None. 
58 | """ 59 | # build an arg parser 60 | parser = get_arg_parser() 61 | 62 | # run the parser on cli args 63 | args = parser.parse_args(cli_args) 64 | 65 | print(f"Running script with arguments: {args}") 66 | test_input(args.raw_training_data) 67 | test_input(args.raw_testing_data) 68 | test_output(args.train_output) 69 | test_output(args.test_output) 70 | 71 | 72 | if __name__ == "__main__": 73 | # Set logging to sys.out 74 | logger = logging.getLogger(__name__) 75 | logger.setLevel(logging.DEBUG) 76 | log_format = logging.Formatter("[%(asctime)s] [%(levelname)s] - %(message)s") 77 | handler = logging.StreamHandler(sys.stdout) 78 | handler.setLevel(logging.DEBUG) 79 | handler.setFormatter(log_format) 80 | logger.addHandler(handler) 81 | 82 | main() 83 | -------------------------------------------------------------------------------- /examples/components/HELLOWORLD/preprocessing/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_helloworld_preprocessing_in_silo 4 | version: 0.3.0 5 | display_name: Pre-Processing (in silo) 6 | type: command 7 | description: Component for preprocessing raw data in a given silo. The images are transformed using random affine keeping the center invariant, then normalized. 8 | is_deterministic: true 9 | 10 | inputs: 11 | raw_training_data: 12 | type: uri_file 13 | description: the raw training data in a given silo 14 | raw_testing_data: 15 | type: uri_file 16 | description: the raw testing data in a given silo 17 | metrics_prefix: 18 | type: string 19 | description: Metrics prefix 20 | default: Default-prefix 21 | optional: true 22 | 23 | 24 | outputs: 25 | processed_train_data: 26 | type: uri_folder 27 | description: the output training data after preprocessing 28 | processed_test_data: 29 | type: uri_folder 30 | description: the output testing data after preprocessing 31 | 32 | code: . 33 | 34 | command: >- 35 | python run.py --raw_training_data ${{inputs.raw_training_data}} --raw_testing_data ${{inputs.raw_testing_data}} --train_output ${{outputs.processed_train_data}} --test_output ${{outputs.processed_test_data}} $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 36 | 37 | environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:30 38 | -------------------------------------------------------------------------------- /examples/components/HELLOWORLD/traininsilo/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_helloworld_train_in_silo 4 | version: 0.3.0 5 | display_name: Train (in silo) 6 | type: command 7 | description: Component to train a model within a FL silo. 
8 | is_deterministic: true 9 | 10 | inputs: 11 | train_data: 12 | type: uri_folder 13 | description: the input training data 14 | test_data: 15 | type: uri_folder 16 | description: the input testing data 17 | metrics_prefix: 18 | type: string 19 | description: Metrics prefix 20 | default: Default-prefix 21 | optional: true 22 | iteration_num: 23 | type: integer 24 | description: Iteration number 25 | default: 1 26 | optional: true 27 | checkpoint: 28 | type: uri_folder 29 | description: a given pre-existing checkpoint 30 | optional: true 31 | lr: 32 | type: number 33 | description: learning rate 34 | default: 0.01 35 | optional: true 36 | epochs: 37 | type: integer 38 | description: total number of epochs for local training 39 | default: 3 40 | optional: true 41 | batch_size: 42 | type: integer 43 | description: batch size 44 | default: 64 45 | optional: true 46 | dp: 47 | type: boolean 48 | description: differential privacy 49 | default: false 50 | optional: true 51 | dp_target_epsilon: 52 | type: number 53 | description: DP target epsilon 54 | default: 50.0 55 | optional: true 56 | dp_target_delta: 57 | type: number 58 | description: DP target delta 59 | default: 1e-5 60 | optional: true 61 | dp_max_grad_norm: 62 | type: number 63 | description: DP max gradient norm 64 | default: 1.0 65 | optional: true 66 | total_num_of_iterations: 67 | type: integer 68 | description: Total num of iterations 69 | default: 1 70 | optional: true 71 | 72 | outputs: 73 | model: 74 | type: uri_folder 75 | description: the output checkpoint 76 | 77 | code: . 78 | 79 | command: >- 80 | python run.py --train_data ${{inputs.train_data}} --test_data ${{inputs.test_data}} $[[--metrics_prefix ${{inputs.metrics_prefix}}]] $[[--iteration_num ${{inputs.iteration_num}}]] $[[--checkpoint ${{inputs.checkpoint}}]] --model ${{outputs.model}} $[[--lr ${{inputs.lr}}]] $[[--epochs ${{inputs.epochs}}]] $[[--batch_size ${{inputs.batch_size}}]] $[[--dp ${{inputs.dp}}]] $[[--total_num_of_iterations ${{inputs.total_num_of_iterations}}]] $[[--dp_target_epsilon ${{inputs.dp_target_epsilon}}]] $[[--dp_target_delta ${{inputs.dp_target_delta}}]] $[[--dp_max_grad_norm ${{inputs.dp_max_grad_norm}}]] 81 | 82 | environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:30 83 | -------------------------------------------------------------------------------- /examples/components/MNIST/preprocessing/conda.yaml: -------------------------------------------------------------------------------- 1 | name: mnist_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | dependencies: 6 | - python=3.8 7 | - pip=22.3.1 8 | - pytorch=1.12.1 9 | - torchvision=0.13.1 10 | - cudatoolkit=11.3 11 | - pip: 12 | - azureml-mlflow==1.48.0 13 | - pandas==1.5.2 14 | -------------------------------------------------------------------------------- /examples/components/MNIST/preprocessing/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_mnist_preprocessing_in_silo 4 | version: 0.3.0 5 | display_name: MNIST Pre-Processing (in silo) 6 | type: command 7 | description: Component for preprocessing MNIST data in a given silo. The images are transformed using random affine keeping the center invariant, then normalized. 
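# A minimal sketch of the transform described above (the exact parameters live in this
# component's run.py and are assumptions here):
#   transforms.Compose([
#       transforms.RandomAffine(degrees=..., translate=...),  # affine, center kept invariant
#       transforms.ToTensor(),
#       transforms.Normalize(mean, std),
#   ])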
8 | is_deterministic: true 9 | 10 | inputs: 11 | raw_training_data: 12 | type: uri_file 13 | description: the raw training data in a given silo 14 | raw_testing_data: 15 | type: uri_file 16 | description: the raw testing data in a given silo 17 | metrics_prefix: 18 | type: string 19 | description: Metrics prefix 20 | default: Default-prefix 21 | optional: true 22 | 23 | 24 | outputs: 25 | processed_train_data: 26 | type: uri_folder 27 | description: the output training data after preprocessing 28 | processed_test_data: 29 | type: uri_folder 30 | description: the output testing data after preprocessing 31 | 32 | code: . 33 | 34 | command: >- 35 | python run.py --raw_training_data ${{inputs.raw_training_data}} --raw_testing_data ${{inputs.raw_testing_data}} --train_output ${{outputs.processed_train_data}} --test_output ${{outputs.processed_test_data}} $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 36 | 37 | environment: 38 | conda_file: ./conda.yaml 39 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 40 | -------------------------------------------------------------------------------- /examples/components/MNIST/traininsilo/conda.yaml: -------------------------------------------------------------------------------- 1 | name: mnist_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | dependencies: 7 | - python=3.8 8 | - pip=22.3.1 9 | - pytorch=2.0.0 10 | - pytorch-cuda=11.6 11 | - pip: 12 | - azureml-mlflow==1.48.0 13 | - pandas==1.5.2 14 | - opacus==1.3.0 15 | - tqdm==4.64.1 16 | - torchvision==0.15.1 17 | -------------------------------------------------------------------------------- /examples/components/MNIST_VERTICAL/traininsilo/conda.yaml: -------------------------------------------------------------------------------- 1 | name: mnist_vertical_train_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | dependencies: 7 | - python=3.8 8 | - pip=22.3.1 9 | - pytorch=1.13.1 10 | - torchvision=0.13.1 11 | - pytorch-cuda=11.6 12 | - pip: 13 | - azureml-core==1.47.0 14 | - azure-keyvault==4.2.0 15 | - azureml-mlflow==1.48.0 16 | - pandas==1.5.2 17 | - redis==4.5.1 18 | -------------------------------------------------------------------------------- /examples/components/MNIST_VERTICAL/traininsilo/contributor_spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_mnist_vertical_train_in_silo_contributor 4 | version: 0.0.1 5 | display_name: MNIST Train (in silo) 6 | type: command 7 | description: Component to train a model on MNIST dataset. 
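# Vertical-FL note: each silo holds a different slice of the features for the same
# samples; contributor.py identifies itself by global_rank (out of global_size parties)
# and talks to the host over the chosen communication_backend (socket or redis),
# typically exchanging intermediate embeddings and gradients rather than raw data.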
8 | is_deterministic: false 9 | 10 | inputs: 11 | train_data: 12 | type: uri_folder 13 | description: the input training data 14 | test_data: 15 | type: uri_folder 16 | description: the input testing data 17 | metrics_prefix: 18 | type: string 19 | description: Metrics prefix 20 | default: Default-prefix 21 | optional: true 22 | checkpoint: 23 | type: uri_folder 24 | description: a given pre-existing checkpoint 25 | optional: true 26 | lr: 27 | type: number 28 | description: learning rate 29 | default: 0.01 30 | optional: true 31 | epochs: 32 | type: integer 33 | description: total number of epochs for local training 34 | default: 3 35 | optional: true 36 | batch_size: 37 | type: integer 38 | description: batch size 39 | default: 64 40 | optional: true 41 | global_size: 42 | type: number 43 | optional: false 44 | global_rank: 45 | type: number 46 | optional: false 47 | communication_backend: 48 | type: string 49 | enum: 50 | - socket 51 | - redis 52 | default: socket 53 | optional: true 54 | communication_encrypted: 55 | type: boolean 56 | description: Encrypt messages exchanged between the nodes 57 | optional: true 58 | 59 | outputs: 60 | model: 61 | type: uri_folder 62 | description: the output checkpoint 63 | 64 | code: . 65 | 66 | command: >- 67 | python contributor.py 68 | --train_data ${{inputs.train_data}} 69 | --test_data ${{inputs.test_data}} 70 | $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 71 | $[[--checkpoint ${{inputs.checkpoint}}]] 72 | --model ${{outputs.model}} 73 | $[[--lr ${{inputs.lr}}]] 74 | $[[--epochs ${{inputs.epochs}}]] 75 | $[[--batch_size ${{inputs.batch_size}}]] 76 | --global_size ${{inputs.global_size}} 77 | --global_rank ${{inputs.global_rank}} 78 | $[[--communication_backend ${{inputs.communication_backend}}]] 79 | $[[--communication_encrypted ${{inputs.communication_encrypted}}]] 80 | 81 | environment: 82 | conda_file: ./conda.yaml 83 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 84 | -------------------------------------------------------------------------------- /examples/components/MNIST_VERTICAL/traininsilo/host_spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_mnist_vertical_train_in_silo_host 4 | version: 0.0.1 5 | display_name: MNIST Train (in silo) 6 | type: command 7 | description: Component to train a model on MNIST dataset. 
8 | is_deterministic: false 9 | 10 | inputs: 11 | train_data: 12 | type: uri_folder 13 | description: the input training data 14 | test_data: 15 | type: uri_folder 16 | description: the input testing data 17 | metrics_prefix: 18 | type: string 19 | description: Metrics prefix 20 | default: Default-prefix 21 | optional: true 22 | checkpoint: 23 | type: uri_folder 24 | description: a given pre-existing checkpoint 25 | optional: true 26 | lr: 27 | type: number 28 | description: learning rate 29 | default: 0.01 30 | optional: true 31 | epochs: 32 | type: integer 33 | description: total number of epochs for local training 34 | default: 3 35 | optional: true 36 | batch_size: 37 | type: integer 38 | description: batch size 39 | default: 64 40 | optional: true 41 | global_size: 42 | type: number 43 | optional: false 44 | global_rank: 45 | type: number 46 | optional: false 47 | communication_backend: 48 | type: string 49 | enum: 50 | - socket 51 | - redis 52 | default: socket 53 | optional: true 54 | communication_encrypted: 55 | type: boolean 56 | description: Encrypt messages exchanged between the nodes 57 | optional: true 58 | 59 | outputs: 60 | model: 61 | type: uri_folder 62 | description: the output checkpoint 63 | 64 | code: . 65 | 66 | command: >- 67 | python host.py 68 | --train_data ${{inputs.train_data}} 69 | --test_data ${{inputs.test_data}} 70 | $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 71 | $[[--checkpoint ${{inputs.checkpoint}}]] 72 | --model ${{outputs.model}} 73 | $[[--lr ${{inputs.lr}}]] 74 | $[[--epochs ${{inputs.epochs}}]] 75 | $[[--batch_size ${{inputs.batch_size}}]] 76 | --global_size ${{inputs.global_size}} 77 | --global_rank ${{inputs.global_rank}} 78 | $[[--communication_backend ${{inputs.communication_backend}}]] 79 | $[[--communication_encrypted ${{inputs.communication_encrypted}}]] 80 | 81 | environment: 82 | conda_file: ./conda.yaml 83 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 84 | -------------------------------------------------------------------------------- /examples/components/MNIST_VERTICAL/traininsilo/samplers.py: -------------------------------------------------------------------------------- 1 | ########################################################################################## 2 | # WARNING # 3 | ########################################################################################## 4 | # Should this file change please update all copies of samplers.py file in the repository # 5 | ########################################################################################## 6 | 7 | import math 8 | import torch 9 | from torch.utils.data import Sampler 10 | 11 | 12 | class VerticallyDistributedBatchSampler(Sampler): 13 | """Batch sampler that uses a distributed communication backend to distribute samples indexes to each worker.""" 14 | 15 | def __init__(self, data_source, batch_size, comm, rank, world_size, shuffle=False): 16 | """Initializes the batch sampler. 17 | 18 | Args: 19 | data_source (torch.utils.data.Dataset): The dataset to sample from. 20 | batch_size (int): The size of the batch to sample. 21 | comm (AMLComm): The communicator to use for communication. 22 | rank (int): The rank of the current worker. 23 | world_size (int): The total number of workers. 24 | shuffle (bool, optional): Whether to shuffle the indices. Defaults to False. 
25 | """ 26 | self.data_source = data_source 27 | self.batch_size = batch_size 28 | self.shuffle = shuffle 29 | self.rank = rank 30 | self.world_size = world_size 31 | self.comm = comm 32 | 33 | def __iter__(self): 34 | if self.rank == 0: 35 | if self.shuffle: 36 | indices = torch.randperm(len(self.data_source)) 37 | else: 38 | indices = torch.arange(len(self.data_source)) 39 | 40 | # Split the indices into batches 41 | batches = [ 42 | indices[i : i + self.batch_size] 43 | for i in range(0, len(indices), self.batch_size) 44 | ] 45 | 46 | for batch in batches: 47 | for i in range(1, self.world_size): 48 | # Send the batch to contributor i 49 | self.comm.send(batch, i) 50 | 51 | yield batch 52 | else: 53 | for i in range(0, len(self.data_source), self.batch_size): 54 | # Receive the batch from host 55 | batch = self.comm.recv(0) 56 | yield batch 57 | 58 | def __len__(self): 59 | return math.ceil(len(self.data_source) / self.batch_size) 60 | -------------------------------------------------------------------------------- /examples/components/MNIST_VERTICAL/upload_data/conda.yaml: -------------------------------------------------------------------------------- 1 | name: mnist_vertical_upload_data_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | dependencies: 6 | - python=3.8 7 | - pip=22.1.2 8 | - pytorch=1.12.1 9 | - torchvision=0.13.1 10 | - cudatoolkit=11.3 11 | - pip: 12 | - azureml-mlflow==1.48.0 13 | - pandas==1.5.2 14 | - tqdm==4.64.1 15 | -------------------------------------------------------------------------------- /examples/components/MNIST_VERTICAL/upload_data/spec.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | name: msft_fl_mnist_vertical_upload_data 3 | version: 0.0.1 4 | display_name: Download MNIST data and upload to silo storage partitioning vertically 5 | type: command 6 | is_deterministic: true 7 | 8 | inputs: 9 | silo_count: 10 | type: number 11 | optional: false 12 | silo_index: 13 | type: number 14 | optional: false 15 | 16 | outputs: 17 | raw_train_data: 18 | type: uri_folder 19 | description: the output CC Fraud raw training data for a given silo 20 | raw_test_data: 21 | type: uri_folder 22 | description: the output CC Fraud raw testing data for a given silo 23 | 24 | code: . 
25 | 26 | command: >- 27 | python run.py --silo_count ${{inputs.silo_count}} --silo_index ${{inputs.silo_index}} --raw_train_data ${{outputs.raw_train_data}} --raw_test_data ${{outputs.raw_test_data}} 28 | 29 | environment: 30 | conda_file: ./conda.yaml 31 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 32 | -------------------------------------------------------------------------------- /examples/components/NER/preprocessing/conda.yaml: -------------------------------------------------------------------------------- 1 | name: ner_preprocess_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | dependencies: 6 | - python=3.8 7 | - pip=22.3.1 8 | - pytorch=1.12.1 9 | - cudatoolkit=11.3 10 | - pip: 11 | - azureml-mlflow==1.48.0 12 | - pandas==1.5.2 13 | - transformers==4.25.1 14 | - datasets==2.7.1 15 | -------------------------------------------------------------------------------- /examples/components/NER/preprocessing/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_ner_preprocessing_in_silo 4 | version: 0.3.0 5 | display_name: MultiNERD Pre-Processing (in silo) 6 | type: command 7 | description: Component to preprocess the MultiNERD raw data 8 | is_deterministic: true 9 | 10 | inputs: 11 | raw_training_data: 12 | type: uri_folder 13 | description: the raw MultiNERD training data 14 | raw_testing_data: 15 | type: uri_folder 16 | description: the raw MultiNERD testing data 17 | tokenizer_name: 18 | type: string 19 | description: Tokenizer model name 20 | default: "bert-base-cased" 21 | optional: true 22 | metrics_prefix: 23 | type: string 24 | description: Metrics prefix 25 | default: Default-prefix 26 | optional: true 27 | 28 | outputs: 29 | processed_train_data: 30 | type: uri_folder 31 | description: the output training data after preprocessing 32 | processed_test_data: 33 | type: uri_folder 34 | description: the output testing data after preprocessing 35 | 36 | code: . 
37 | 38 | command: >- 39 | python run.py --raw_training_data ${{inputs.raw_training_data}} --raw_testing_data ${{inputs.raw_testing_data}} --train_output ${{outputs.processed_train_data}} --test_output ${{outputs.processed_test_data}} $[[--tokenizer_name ${{inputs.tokenizer_name}}]] $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 40 | 41 | environment: 42 | conda_file: ./conda.yaml 43 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 44 | -------------------------------------------------------------------------------- /examples/components/NER/traininsilo/conda.yaml: -------------------------------------------------------------------------------- 1 | name: ner_train_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | dependencies: 7 | - python=3.8 8 | - pip=22.3.1 9 | - pytorch=1.13.1 10 | - pytorch-cuda=11.6 11 | - pip: 12 | - azureml-mlflow==1.48.0 13 | - pandas==1.5.2 14 | - transformers==4.25.1 15 | - datasets==2.7.1 16 | - evaluate==0.3.0 17 | - numpy==1.23.5 18 | - seqeval==1.2.2 19 | - opacus==1.3.0 20 | - tqdm==4.64.1 21 | -------------------------------------------------------------------------------- /examples/components/NER/traininsilo/labels.json: -------------------------------------------------------------------------------- 1 | { 2 | "O": 0, 3 | "B-PER": 1, 4 | "I-PER": 2, 5 | "B-LOC": 3, 6 | "I-LOC": 4, 7 | "B-ORG": 5, 8 | "I-ORG": 6, 9 | "B-ANIM": 7, 10 | "I-ANIM": 8, 11 | "B-BIO": 9, 12 | "I-BIO": 10, 13 | "B-CEL": 11, 14 | "I-CEL": 12, 15 | "B-DIS": 13, 16 | "I-DIS": 14, 17 | "B-EVE": 15, 18 | "I-EVE": 16, 19 | "B-FOOD": 17, 20 | "I-FOOD": 18, 21 | "B-INST": 19, 22 | "I-INST": 20, 23 | "B-MEDIA": 21, 24 | "I-MEDIA": 22, 25 | "B-PLANT": 23, 26 | "I-PLANT": 24, 27 | "B-MYTH": 25, 28 | "I-MYTH": 26, 29 | "B-TIME": 27, 30 | "I-TIME": 28, 31 | "B-VEHI": 29, 32 | "I-VEHI": 30, 33 | "B-SUPER": 31, 34 | "I-SUPER": 32, 35 | "B-PHY": 33, 36 | "I-PHY": 34 37 | } 38 | -------------------------------------------------------------------------------- /examples/components/NER/upload_data/conda.yaml: -------------------------------------------------------------------------------- 1 | name: ner_upload_data_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | dependencies: 6 | - python=3.8 7 | - pip=22.3.1 8 | - pip: 9 | - datasets==2.7.1 10 | -------------------------------------------------------------------------------- /examples/components/NER/upload_data/spec.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | name: msft_fl_ner_upload_data 3 | version: 0.3.0 4 | display_name: Download MultiNERD data and upload to silo storage 5 | type: command 6 | is_deterministic: true 7 | 8 | inputs: 9 | silo_count: 10 | type: number 11 | optional: false 12 | silo_index: 13 | type: number 14 | optional: false 15 | 16 | outputs: 17 | raw_train_data: 18 | type: uri_folder 19 | description: the output raw training data for a given silo 20 | raw_test_data: 21 | type: uri_folder 22 | description: the output raw testing data for a given silo 23 | 24 | code: . 
25 | 26 | command: >- 27 | python run.py --silo_count ${{inputs.silo_count}} --silo_index ${{inputs.silo_index}} --raw_train_data ${{outputs.raw_train_data}} --raw_test_data ${{outputs.raw_test_data}} 28 | environment: 29 | conda_file: ./conda.yaml 30 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 31 | -------------------------------------------------------------------------------- /examples/components/NVFLARE/client/environment/context/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:22.12-py3 2 | FROM ${PYTORCH_IMAGE} 3 | 4 | RUN python3 -m pip install -U pip 5 | RUN python3 -m pip install -U setuptools 6 | RUN python3 -m pip install nvflare==2.2.3 7 | RUN python3 -m pip install azureml-mlflow==1.48.0 8 | -------------------------------------------------------------------------------- /examples/components/NVFLARE/client/environment/env.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json 2 | name: nvflare-pt 3 | version: 2.2.3 4 | build: 5 | path: ./context/ 6 | -------------------------------------------------------------------------------- /examples/components/NVFLARE/client/spec.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | name: msft_fl_nvflare_client 3 | version: 0.0.5 4 | display_name: NVFlare client 5 | type: command 6 | description: This component runs an NVFlare client inside an AzureML job. 7 | is_deterministic: true 8 | tags: 9 | nvflare: 2.2.3 10 | url: https://github.com/Azure-Samples/azure-ml-federated-learning 11 | 12 | inputs: 13 | federation_identifier: 14 | type: string 15 | description: "a unique identifier for the group of clients and server to find each other" 16 | default: "fed-0000" 17 | client_config: 18 | type: uri_folder 19 | description: "the NVFlare workspace folder for this client" 20 | client_data: 21 | type: uri_folder 22 | optional: true 23 | description: "an optional folder containing data for the client to use" 24 | client_data_env_var: 25 | type: string 26 | default: CLIENT_DATA_PATH 27 | description: "the name of the env variable to set with the mount path of the client_data folder" 28 | start: 29 | type: uri_file 30 | description: "input to trigger the job to start, not actually used" 31 | 32 | code: "." 
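# Note: when client_data is provided, run.py exposes its mount path through the
# environment variable named by client_data_env_var so the NVFlare app code can find
# the data; the start input only sequences this job after the provision step and is
# otherwise unused.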
33 | 34 | command: >- 35 | python run.py 36 | --federation_identifier ${{inputs.federation_identifier}} 37 | --client_config ${{inputs.client_config}} 38 | $[[--client_data ${{inputs.client_data}}]] 39 | --client_data_env_var ${{inputs.client_data_env_var}} 40 | 41 | environment: 42 | build: 43 | path: ./environment/context/ 44 | -------------------------------------------------------------------------------- /examples/components/NVFLARE/provision/environment/context/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:22.12-py3 2 | FROM ${PYTORCH_IMAGE} 3 | 4 | RUN python3 -m pip install -U pip 5 | RUN python3 -m pip install -U setuptools 6 | RUN python3 -m pip install nvflare==2.2.3 7 | RUN python3 -m pip install azureml-mlflow==1.48.0 8 | -------------------------------------------------------------------------------- /examples/components/NVFLARE/provision/environment/env.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json 2 | name: nvflare-sdk 3 | version: 2.2.3 4 | build: 5 | path: ./context/ 6 | -------------------------------------------------------------------------------- /examples/components/NVFLARE/provision/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_nvflare_provision 4 | version: 0.0.5 5 | display_name: NVFlare provision 6 | type: command 7 | description: Provision an NVFlare project yaml config 8 | is_deterministic: true 9 | tags: 10 | nvflare: 2.2.3 11 | url: https://github.com/Azure-Samples/azure-ml-federated-learning 12 | 13 | inputs: 14 | project_config: 15 | type: uri_file 16 | 17 | outputs: 18 | workspace: 19 | type: uri_folder 20 | start: 21 | type: uri_file 22 | 23 | command: >- 24 | nvflare provision -p ${{inputs.project_config}} -w ${{outputs.workspace}} && echo "start" >> ${{outputs.start}}/start.txt 25 | 26 | environment: 27 | build: 28 | path: ./environment/context/ 29 | -------------------------------------------------------------------------------- /examples/components/NVFLARE/server/environment/context/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:22.12-py3 2 | FROM ${PYTORCH_IMAGE} 3 | 4 | RUN python3 -m pip install -U pip 5 | RUN python3 -m pip install -U setuptools 6 | RUN python3 -m pip install nvflare==2.2.3 7 | RUN python3 -m pip install azureml-mlflow==1.48.0 8 | -------------------------------------------------------------------------------- /examples/components/NVFLARE/server/environment/env.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json 2 | name: nvflare-pt 3 | version: 2.2.3 4 | build: 5 | path: ./context/ 6 | -------------------------------------------------------------------------------- /examples/components/NVFLARE/server/spec.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | name: msft_fl_nvflare_server 3 | version: 0.0.5 4 | display_name: NVFlare server 5 | type: command 6 | description: This component runs an NVFlare server inside an AzureML job. 
7 | tags: 8 | nvflare: 2.2.3 9 | url: https://github.com/Azure-Samples/azure-ml-federated-learning 10 | 11 | is_deterministic: true 12 | 13 | inputs: 14 | federation_identifier: 15 | type: string 16 | description: "a unique identifier for the group of clients and server to find each other" 17 | default: "fed-0000" 18 | server_config: 19 | type: uri_folder 20 | description: "the NVFlare workspace folder for this server" 21 | admin_config: 22 | type: uri_folder 23 | description: "the NVFlare workspace admin folder to connect to the server" 24 | app_dir: 25 | type: uri_folder 26 | description: "the NVFlare app code directory" 27 | server_name: 28 | type: string 29 | description: "the name of the server/overseer expected by clients for hostname resolution" 30 | expected_clients: 31 | type: integer 32 | description: "the number of clients expected to connect to the server before training" 33 | start: 34 | type: uri_file 35 | description: "input to trigger the job to start, not actually used" 36 | wait_for_clients_timeout: 37 | type: integer 38 | default: 600 39 | description: "the number of seconds to wait for clients to connect before timing out" 40 | 41 | outputs: 42 | job_artefacts: 43 | type: uri_folder 44 | description: "where the NVFlare job artefacts will be saved upon completion of the job" 45 | 46 | code: "." 47 | 48 | command: >- 49 | python run.py 50 | --federation_identifier ${{inputs.federation_identifier}} 51 | --server_config ${{inputs.server_config}} 52 | --admin_config ${{inputs.admin_config}} 53 | --app_dir ${{inputs.app_dir}} 54 | --server_name ${{inputs.server_name}} 55 | --expected_clients ${{inputs.expected_clients}} 56 | --output_dir ${{outputs.job_artefacts}} 57 | --wait_for_clients_timeout ${{inputs.wait_for_clients_timeout}} 58 | 59 | environment: 60 | build: 61 | path: ./environment/context/ 62 | -------------------------------------------------------------------------------- /examples/components/PNEUMONIA/traininsilo/conda.yaml: -------------------------------------------------------------------------------- 1 | name: pneumonia_train_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | dependencies: 7 | - python=3.8 8 | - pip=22.3.1 9 | - pytorch=2.0.0 10 | - pytorch-cuda=11.6 11 | - pip: 12 | - azureml-mlflow==1.48.0 13 | - opacus==1.3.0 14 | - tqdm==4.64.1 15 | - torchvision==0.15.1 16 | -------------------------------------------------------------------------------- /examples/components/PNEUMONIA/traininsilo/pneumonia_network.py: -------------------------------------------------------------------------------- 1 | # This file defining the model was taken as-is from https://github.com/Azure/medical-imaging/blob/main/federated-learning/pneumonia-federated/custom/pneumonia_network.py. 
2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class PneumoniaNetwork(nn.Module): 8 | def __init__(self): 9 | super(PneumoniaNetwork, self).__init__() 10 | dropout = 0.2 11 | 12 | self.conv1 = nn.Conv2d( 13 | in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1 14 | ) 15 | self.conv2 = nn.Conv2d( 16 | in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1 17 | ) 18 | self.conv3 = nn.Conv2d( 19 | in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1 20 | ) 21 | 22 | self.dropout1 = nn.Dropout(dropout) 23 | self.dropout2 = nn.Dropout(dropout) 24 | 25 | self.fc1 = nn.Linear(28 * 28 * 128, 256) 26 | self.fc2 = nn.Linear(256, 2) 27 | 28 | def forward(self, x): 29 | x = F.relu(self.conv1(x)) # 224 x 224 x 32 30 | x = F.max_pool2d(x, 2, 2) # 112 x 112 x 32 31 | x = F.relu(self.conv2(x)) # 112 x 112 x 64 32 | x = F.max_pool2d(x, 2, 2) # 56 x 56 x 64 33 | x = self.dropout1(x) 34 | x = F.relu(self.conv3(x)) # 56 x 56 x 128 35 | x = F.max_pool2d(x, 2, 2) # 28 x 28 x 128 36 | x = self.dropout2(x) 37 | x = x.view(-1, 28 * 28 * 128) # 100.352 38 | x = F.relu(self.fc1(x)) 39 | x = self.fc2(x) 40 | return x 41 | -------------------------------------------------------------------------------- /examples/components/PNEUMONIA/traininsilo/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_pneumonia_train_in_silo 4 | version: 0.3.0 5 | display_name: Pneumonia Train (in silo) 6 | type: command 7 | description: Component to train a pneumonia detection model on chest radiographs 8 | is_deterministic: true 9 | 10 | distribution: 11 | type: pytorch 12 | 13 | inputs: 14 | lr: 15 | type: number 16 | description: learning rate 17 | default: 0.01 18 | optional: true 19 | epochs: 20 | type: integer 21 | description: total number of epochs for local training 22 | default: 3 23 | optional: true 24 | batch_size: 25 | type: integer 26 | description: Training batch size 27 | default: 32 28 | optional: true 29 | dp: 30 | type: boolean 31 | description: differential privacy 32 | default: false 33 | optional: true 34 | dp_target_epsilon: 35 | type: number 36 | description: DP target epsilon 37 | default: 50.0 38 | optional: true 39 | dp_target_delta: 40 | type: number 41 | description: DP target delta 42 | default: 1e-5 43 | optional: true 44 | dp_max_grad_norm: 45 | type: number 46 | description: DP max gradient norm 47 | default: 1.0 48 | optional: true 49 | total_num_of_iterations: 50 | type: integer 51 | description: Total num of iterations 52 | default: 1 53 | optional: true 54 | dataset_name: 55 | type: uri_folder 56 | description: the data asset in Azure ML 57 | iteration_num: 58 | type: integer 59 | description: Iteration number 60 | default: 1 61 | optional: true 62 | checkpoint: 63 | type: uri_folder 64 | description: a given pre-existing model checkpoint 65 | optional: true 66 | metrics_prefix: 67 | type: string 68 | description: Metrics prefix 69 | default: Default-prefix 70 | optional: true 71 | 72 | outputs: 73 | model: 74 | type: uri_folder 75 | description: the output checkpoint 76 | 77 | code: . 
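# Because of the distribution: pytorch section above, AzureML launches this command as
# a PyTorch distributed job (per-process launch with rank/world-size environment
# variables set), which is what the DDP variant of this example relies on.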
78 | 79 | command: >- 80 | python run.py --dataset_name ${{inputs.dataset_name}} $[[--iteration_num ${{inputs.iteration_num}}]] $[[--checkpoint ${{inputs.checkpoint}}]] --model ${{outputs.model}} $[[--lr ${{inputs.lr}}]] $[[--epochs ${{inputs.epochs}}]] $[[--batch_size ${{inputs.batch_size}}]] $[[--metrics_prefix ${{inputs.metrics_prefix}}]] $[[--dp ${{inputs.dp}}]] $[[--total_num_of_iterations ${{inputs.total_num_of_iterations}}]] $[[--dp_target_epsilon ${{inputs.dp_target_epsilon}}]] $[[--dp_target_delta ${{inputs.dp_target_delta}}]] $[[--dp_max_grad_norm ${{inputs.dp_max_grad_norm}}]] 81 | 82 | environment: 83 | conda_file: ./conda.yaml 84 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 85 | -------------------------------------------------------------------------------- /examples/components/PNEUMONIA/upload_data/conda.yaml: -------------------------------------------------------------------------------- 1 | name: pneumonia_upload_data_conda_env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.8 6 | - pip=22.3.1 7 | - pip: 8 | - azure-identity==1.12.0 9 | - azure-keyvault==4.2.0 10 | - azureml-core==1.47.0 11 | - kaggle==1.5.12 12 | - split-folders==0.5.1 13 | -------------------------------------------------------------------------------- /examples/components/PNEUMONIA/upload_data/spec.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | name: msft_fl_pneumonia_upload_data 3 | version: 0.3.0 4 | display_name: Download Chest dataset and upload to silo storage 5 | type: command 6 | description: Component that downloads the pneumonia dataset from Kaggle, partitions it, and then uploads each partition to one of the silos' storages. 7 | is_deterministic: true 8 | 9 | inputs: 10 | silo_count: 11 | type: number 12 | optional: false 13 | silo_index: 14 | type: number 15 | optional: false 16 | 17 | outputs: 18 | raw_data_folder: 19 | type: uri_folder 20 | description: the output folder where the raw data will be written 21 | 22 | code: . 23 | 24 | command: >- 25 | python run.py --silo_count ${{inputs.silo_count}} --silo_index ${{inputs.silo_index}} --raw_data_folder ${{outputs.raw_data_folder}} 26 | 27 | environment: 28 | conda_file: ./conda.yaml 29 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 30 | -------------------------------------------------------------------------------- /examples/components/shared/samplers.py: -------------------------------------------------------------------------------- 1 | ########################################################################################## 2 | # WARNING # 3 | ########################################################################################## 4 | # Should this file change please update all copies of samplers.py file in the repository # 5 | ########################################################################################## 6 | 7 | import math 8 | import torch 9 | from torch.utils.data import Sampler 10 | 11 | 12 | class VerticallyDistributedBatchSampler(Sampler): 13 | """Batch sampler that uses a distributed communication backend to distribute samples indexes to each worker.""" 14 | 15 | def __init__(self, data_source, batch_size, comm, rank, world_size, shuffle=False): 16 | """Initializes the batch sampler. 17 | 18 | Args: 19 | data_source (torch.utils.data.Dataset): The dataset to sample from. 20 | batch_size (int): The size of the batch to sample. 
21 | comm (AMLComm): The communicator to use for communication. 22 | rank (int): The rank of the current worker. 23 | world_size (int): The total number of workers. 24 | shuffle (bool, optional): Whether to shuffle the indices. Defaults to False. 25 | """ 26 | self.data_source = data_source 27 | self.batch_size = batch_size 28 | self.shuffle = shuffle 29 | self.rank = rank 30 | self.world_size = world_size 31 | self.comm = comm 32 | 33 | def __iter__(self): 34 | if self.rank == 0: 35 | if self.shuffle: 36 | indices = torch.randperm(len(self.data_source)) 37 | else: 38 | indices = torch.arange(len(self.data_source)) 39 | 40 | # Split the indices into batches 41 | batches = [ 42 | indices[i : i + self.batch_size] 43 | for i in range(0, len(indices), self.batch_size) 44 | ] 45 | 46 | for batch in batches: 47 | for i in range(1, self.world_size): 48 | # Send the batch to contributor i 49 | self.comm.send(batch, i) 50 | 51 | yield batch 52 | else: 53 | for i in range(0, len(self.data_source), self.batch_size): 54 | # Receive the batch from host 55 | batch = self.comm.recv(0) 56 | yield batch 57 | 58 | def __len__(self): 59 | return math.ceil(len(self.data_source) / self.batch_size) 60 | -------------------------------------------------------------------------------- /examples/components/utils/aggregatemodelweights/conda.yaml: -------------------------------------------------------------------------------- 1 | name: agg_conda_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | dependencies: 6 | - python=3.7.11 7 | - pytorch=1.12.1 8 | - torchvision=0.13.1 9 | - cudatoolkit=11.3 10 | -------------------------------------------------------------------------------- /examples/components/utils/aggregatemodelweights/spec.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | name: msft_fl_aggregate_model_weights 3 | version: 0.3.0 4 | display_name: Aggregate PyTorch Model Weights (from all silos) 5 | type: command 6 | description: Component for aggregating pytorch model weights. 7 | is_deterministic: true 8 | 9 | inputs: 10 | input_silo_1: 11 | type: uri_folder 12 | description: input from silo 1 (e.g., model weights, or gradient updates) 13 | optional: false 14 | input_silo_2: 15 | type: uri_folder 16 | description: input from silo 2 (e.g., model weights, or gradient updates) 17 | optional: true 18 | input_silo_3: 19 | type: uri_folder 20 | description: input from silo 3 (e.g., model weights, or gradient updates) 21 | optional: true 22 | input_silo_4: 23 | type: uri_folder 24 | description: input from silo 4 (e.g., model weights, or gradient updates) 25 | optional: true 26 | input_silo_5: 27 | type: uri_folder 28 | description: input from silo 5 (e.g., model weights, or gradient updates) 29 | optional: true 30 | ancillary_files: 31 | type: boolean 32 | description: Whether ancillary files need to be copied 33 | optional: true 34 | out_checkpoint_name: 35 | type: string 36 | description: the name of the output checkpoint, e.g. model, finetuned_state_dict 37 | optional: true 38 | 39 | 40 | outputs: 41 | aggregated_output: 42 | type: uri_folder 43 | description: the aggregated model or gradiants, residing in the orchestrator compute. 44 | 45 | code: . 
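# Note the pattern below: the optional silo inputs carry no flag of their own and are
# simply appended after --checkpoints when provided, so run.py receives a
# variable-length list of one to five checkpoint folders plus the pt extension hint.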
46 | 47 | command: >- 48 | python run.py --output ${{outputs.aggregated_output}} 49 | --extension pt 50 | --checkpoints ${{inputs.input_silo_1}} 51 | $[[${{inputs.input_silo_2}}]] 52 | $[[${{inputs.input_silo_3}}]] 53 | $[[${{inputs.input_silo_4}}]] 54 | $[[${{inputs.input_silo_5}}]] 55 | $[[--ancillary_files ${{inputs.ancillary_files}}]] 56 | $[[--out_checkpoint_name ${{inputs.out_checkpoint_name}}]] 57 | environment: 58 | conda_file: ./conda.yaml 59 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 60 | -------------------------------------------------------------------------------- /examples/components/utils/data_analysis/spec.yaml: -------------------------------------------------------------------------------- 1 | 2 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 3 | name: msft_fl_ccfraud_data_analysis_in_silo 4 | version: 0.0.1 5 | display_name: CC Fraud Data-analysis (in silo) 6 | type: command 7 | description: Component for data-analysis of tabular data from silo's blob storage 8 | is_deterministic: true 9 | 10 | inputs: 11 | training_data: 12 | type: uri_file 13 | description: training data in a given silo 14 | testing_data: 15 | type: uri_file 16 | description: testing data in a given silo 17 | categorical_columns: 18 | type: string 19 | description: Names of categorical columns 20 | optional: true 21 | onehot_columns_prefix: 22 | type: string 23 | description: PRefixes of one-hot encoded columns 24 | optional: true 25 | metrics_prefix: 26 | type: string 27 | description: Metrics prefix 28 | default: Default-prefix 29 | optional: true 30 | silo_index: 31 | type: integer 32 | description: Silo index 33 | optional: false 34 | 35 | code: . 36 | 37 | command: >- 38 | python run.py 39 | --training_data ${{inputs.training_data}} 40 | --testing_data ${{inputs.testing_data}} 41 | --silo_index ${{inputs.silo_index}} 42 | $[[--categorical_columns ${{inputs.categorical_columns}}]] 43 | $[[--onehot_columns_prefix ${{inputs.onehot_columns_prefix}}]] 44 | $[[--metrics_prefix ${{inputs.metrics_prefix}}]] 45 | 46 | # NOTE: using one of Azure ML's curated environments 47 | # which has all the dependencies needed for this job 48 | environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest 49 | -------------------------------------------------------------------------------- /examples/components/utils/multiply_data_files/conda.yaml: -------------------------------------------------------------------------------- 1 | name: multiply_data_conda_env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.8 6 | - pip=22.3.1 7 | - pip: 8 | - azureml-mlflow==1.48.0 9 | - tqdm==4.64.1 10 | -------------------------------------------------------------------------------- /examples/components/utils/multiply_data_files/spec.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | name: msft_fl_multiply_data_files 3 | version: 0.3.0 4 | display_name: Multiply Data Files 5 | type: command 6 | description: Component that multiplies the dataset on each silo. 
7 | is_deterministic: true 8 | 9 | inputs: 10 | input_folder: 11 | type: uri_folder 12 | description: Input directory path 13 | optional: false 14 | multiply: 15 | type: integer 16 | description: Multiplication factor 17 | default: 10 18 | optional: true 19 | 20 | outputs: 21 | output_folder: # Multiplied data directory path 22 | type: uri_folder 23 | description: Output directory path 24 | 25 | code: . 26 | 27 | command: >- 28 | python run.py --input ${{inputs.input_folder}} --output ${{outputs.output_folder}} $[[--multiply ${{inputs.multiply}}]] 29 | 30 | environment: 31 | conda_file: ./conda.yaml 32 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 -------------------------------------------------------------------------------- /examples/pipelines/bank_marketing_vertical/config.yaml: -------------------------------------------------------------------------------- 1 | # example yaml config 2 | 3 | # using this to store references to Azure ML 4 | aml: 5 | # subscription_id: "" 6 | # resource_group_name: "" 7 | # workspace_name: "" 8 | 9 | # federated learning parameters 10 | federated_learning: 11 | communication: 12 | backend: socket 13 | encrypted: false 14 | 15 | host: 16 | compute: orchestrator-01 17 | datastore: datastore_orchestrator 18 | training_data: 19 | type: uri_file 20 | mode: 'download' 21 | path: azureml://datastores/datastore_orchestrator/paths/federated_learning/bank_marketing_vertical/raw_train_data 22 | testing_data: 23 | type: uri_file 24 | mode: 'download' 25 | path: azureml://datastores/datastore_orchestrator/paths/federated_learning/bank_marketing_vertical/raw_test_data 26 | 27 | silos: 28 | - compute: silo0-01 29 | datastore: datastore_silo0 30 | training_data: 31 | type: uri_file 32 | mode: 'download' 33 | path: azureml://datastores/datastore_silo0/paths/federated_learning/bank_marketing_vertical/raw_train_data 34 | testing_data: 35 | type: uri_file 36 | mode: 'download' 37 | path: azureml://datastores/datastore_silo0/paths/federated_learning/bank_marketing_vertical/raw_test_data 38 | - compute: silo1-01 39 | datastore: datastore_silo1 40 | training_data: 41 | type: uri_file 42 | mode: 'download' 43 | path: azureml://datastores/datastore_silo1/paths/federated_learning/bank_marketing_vertical/raw_train_data 44 | testing_data: 45 | type: uri_file 46 | mode: 'download' 47 | path: azureml://datastores/datastore_silo1/paths/federated_learning/bank_marketing_vertical/raw_test_data 48 | - compute: silo2-01 49 | datastore: datastore_silo2 50 | training_data: 51 | type: uri_file 52 | mode: 'download' 53 | path: azureml://datastores/datastore_silo2/paths/federated_learning/bank_marketing_vertical/raw_train_data 54 | testing_data: 55 | type: uri_file 56 | mode: 'download' 57 | path: azureml://datastores/datastore_silo2/paths/federated_learning/bank_marketing_vertical/raw_test_data 58 | 59 | # training parameters 60 | training_parameters: 61 | epochs: 50 # number of epochs per iteration (in-silo training) 62 | lr: 1e-2 # learning rate 63 | batch_size: 1000 # batch size 64 | model_name: SimpleLinear 65 | -------------------------------------------------------------------------------- /examples/pipelines/ccfraud_vertical/config.yaml: -------------------------------------------------------------------------------- 1 | # example yaml config 2 | 3 | # using this to store references to Azure ML 4 | aml: 5 | # subscription_id: "" 6 | # resource_group_name: "" 7 | # workspace_name: "" 8 | 9 | # federated learning parameters 10 | federated_learning: 11 | communication: 12 | 
backend: socket 13 | encrypted: false 14 | 15 | host: 16 | compute: orchestrator-01 17 | datastore: datastore_orchestrator 18 | training_data: 19 | type: uri_file 20 | mode: 'download' 21 | path: azureml://datastores/datastore_orchestrator/paths/federated_learning/ccfraud_vertical/raw_train_data 22 | testing_data: 23 | type: uri_file 24 | mode: 'download' 25 | path: azureml://datastores/datastore_orchestrator/paths/federated_learning/ccfraud_vertical/raw_test_data 26 | 27 | silos: 28 | - compute: silo0-01 29 | datastore: datastore_silo0 30 | training_data: 31 | type: uri_file 32 | mode: 'download' 33 | path: azureml://datastores/datastore_silo0/paths/federated_learning/ccfraud_vertical/raw_train_data 34 | testing_data: 35 | type: uri_file 36 | mode: 'download' 37 | path: azureml://datastores/datastore_silo0/paths/federated_learning/ccfraud_vertical/raw_test_data 38 | - compute: silo1-01 39 | datastore: datastore_silo1 40 | training_data: 41 | type: uri_file 42 | mode: 'download' 43 | path: azureml://datastores/datastore_silo1/paths/federated_learning/ccfraud_vertical/raw_train_data 44 | testing_data: 45 | type: uri_file 46 | mode: 'download' 47 | path: azureml://datastores/datastore_silo1/paths/federated_learning/ccfraud_vertical/raw_test_data 48 | - compute: silo2-01 49 | datastore: datastore_silo2 50 | training_data: 51 | type: uri_file 52 | mode: 'download' 53 | path: azureml://datastores/datastore_silo2/paths/federated_learning/ccfraud_vertical/raw_train_data 54 | testing_data: 55 | type: uri_file 56 | mode: 'download' 57 | path: azureml://datastores/datastore_silo2/paths/federated_learning/ccfraud_vertical/raw_test_data 58 | 59 | # training parameters 60 | training_parameters: 61 | epochs: 10 # number of epochs per iteration (in-silo training) 62 | lr: 1e-2 # learning rate 63 | batch_size: 5000 # batch size 64 | -------------------------------------------------------------------------------- /examples/pipelines/ccfraud_vertical_fedonce/config.yaml: -------------------------------------------------------------------------------- 1 | # example yaml config 2 | 3 | # using this to store references to Azure ML 4 | aml: 5 | # subscription_id: "" 6 | # resource_group_name: "" 7 | # workspace_name: "" 8 | 9 | # federated learning parameters 10 | federated_learning: 11 | host: 12 | compute: orchestrator-01 13 | datastore: datastore_orchestrator 14 | training_data: 15 | type: uri_file 16 | mode: 'download' 17 | path: azureml://datastores/datastore_orchestrator/paths/federated_learning/ccfraud_vertical_fedonce/raw_train_data 18 | testing_data: 19 | type: uri_file 20 | mode: 'download' 21 | path: azureml://datastores/datastore_orchestrator/paths/federated_learning/ccfraud_vertical_fedonce/raw_test_data 22 | 23 | silos: 24 | - compute: silo0-01 25 | datastore: datastore_silo0 26 | training_data: 27 | type: uri_file 28 | mode: 'download' 29 | path: azureml://datastores/datastore_silo0/paths/federated_learning/ccfraud_vertical_fedonce/raw_train_data 30 | testing_data: 31 | type: uri_file 32 | mode: 'download' 33 | path: azureml://datastores/datastore_silo0/paths/federated_learning/ccfraud_vertical_fedonce/raw_test_data 34 | - compute: silo1-01 35 | datastore: datastore_silo1 36 | training_data: 37 | type: uri_file 38 | mode: 'download' 39 | path: azureml://datastores/datastore_silo1/paths/federated_learning/ccfraud_vertical_fedonce/raw_train_data 40 | testing_data: 41 | type: uri_file 42 | mode: 'download' 43 | path: 
azureml://datastores/datastore_silo1/paths/federated_learning/ccfraud_vertical_fedonce/raw_test_data 44 | - compute: silo2-01 45 | datastore: datastore_silo2 46 | training_data: 47 | type: uri_file 48 | mode: 'download' 49 | path: azureml://datastores/datastore_silo2/paths/federated_learning/ccfraud_vertical_fedonce/raw_train_data 50 | testing_data: 51 | type: uri_file 52 | mode: 'download' 53 | path: azureml://datastores/datastore_silo2/paths/federated_learning/ccfraud_vertical_fedonce/raw_test_data 54 | 55 | # training parameters 56 | training_parameters: 57 | epochs: 10 # number of epochs per iteration (in-silo training) 58 | lr: 1e-2 # learning rate 59 | batch_size: 5000 # batch size 60 | -------------------------------------------------------------------------------- /examples/pipelines/environment.yml: -------------------------------------------------------------------------------- 1 | name: fl_experiment_conda_env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.10.4 6 | - pip=22.3.1 7 | - pip: 8 | - -r requirements.txt -------------------------------------------------------------------------------- /examples/pipelines/mnist_vertical/config.yaml: -------------------------------------------------------------------------------- 1 | # example yaml config 2 | 3 | # using this to store references to Azure ML 4 | aml: 5 | # subscription_id: "" 6 | # resource_group_name: "" 7 | # workspace_name: "" 8 | 9 | # federated learning parameters 10 | federated_learning: 11 | communication: 12 | backend: socket 13 | encrypted: false 14 | 15 | host: 16 | compute: orchestrator-01 17 | datastore: datastore_orchestrator 18 | training_data: 19 | type: uri_file 20 | mode: 'download' 21 | path: azureml://datastores/datastore_orchestrator/paths/federated_learning/mnist_vertical/raw_train_data 22 | testing_data: 23 | type: uri_file 24 | mode: 'download' 25 | path: azureml://datastores/datastore_orchestrator/paths/federated_learning/mnist_vertical/raw_test_data 26 | 27 | silos: 28 | - compute: silo0-01 29 | datastore: datastore_silo0 30 | training_data: 31 | type: uri_file 32 | mode: 'download' 33 | path: azureml://datastores/datastore_silo0/paths/federated_learning/mnist_vertical/raw_train_data 34 | testing_data: 35 | type: uri_file 36 | mode: 'download' 37 | path: azureml://datastores/datastore_silo0/paths/federated_learning/mnist_vertical/raw_test_data 38 | - compute: silo1-01 39 | datastore: datastore_silo1 40 | training_data: 41 | type: uri_file 42 | mode: 'download' 43 | path: azureml://datastores/datastore_silo1/paths/federated_learning/mnist_vertical/raw_train_data 44 | testing_data: 45 | type: uri_file 46 | mode: 'download' 47 | path: azureml://datastores/datastore_silo1/paths/federated_learning/mnist_vertical/raw_test_data 48 | - compute: silo2-01 49 | datastore: datastore_silo2 50 | training_data: 51 | type: uri_file 52 | mode: 'download' 53 | path: azureml://datastores/datastore_silo2/paths/federated_learning/mnist_vertical/raw_train_data 54 | testing_data: 55 | type: uri_file 56 | mode: 'download' 57 | path: azureml://datastores/datastore_silo2/paths/federated_learning/mnist_vertical/raw_test_data 58 | 59 | # training parameters 60 | training_parameters: 61 | epochs: 10 # number of epochs per iteration (in-silo training) 62 | lr: 1e-3 # learning rate 63 | batch_size: 128 # batch size -------------------------------------------------------------------------------- /examples/pipelines/pneumonia/config.yaml: 
-------------------------------------------------------------------------------- 1 | # example yaml config 2 | 3 | # using this to store references to Azure ML 4 | aml: 5 | # subscription_id: "" 6 | # resource_group_name: "" 7 | # workspace_name: "" 8 | 9 | # federated learning parameters 10 | federated_learning: 11 | orchestrator: 12 | compute: "orchestrator-01" 13 | datastore: "datastore_orchestrator" 14 | 15 | silos: 16 | - name: silo0 17 | computes: 18 | - silo0-01 # name of the compute for silo X 19 | datastore: datastore_silo0 20 | silo_data: 21 | type: uri_folder 22 | mode: 'download' 23 | path: azureml://datastores/datastore_silo0/paths/federated_learning/pneumonia 24 | 25 | - name: silo1 26 | computes: 27 | - silo1-01 # we are repeating over the same config for silo 2 28 | datastore: datastore_silo1 29 | silo_data: 30 | type: uri_folder 31 | mode: 'download' 32 | path: azureml://datastores/datastore_silo1/paths/federated_learning/pneumonia 33 | 34 | - name: silo2 35 | computes: 36 | - silo2-01 # we are repeating over the same config for silo 3 37 | datastore: datastore_silo2 38 | silo_data: 39 | type: uri_folder 40 | mode: 'download' 41 | path: azureml://datastores/datastore_silo2/paths/federated_learning/pneumonia 42 | 43 | # training parameters 44 | training_parameters: 45 | num_of_iterations: 2 46 | epochs: 5 47 | lr: 0.01 48 | batch_size: 32 49 | 50 | # Differential privacy 51 | dp: false # Flag to enable/disable differential privacy 52 | dp_target_epsilon: 50.0 # Smaller epsilon means more privacy, more noise (it depends on the size of the training dataset. For more info, please visit https://opacus.ai/docs/faq#what-does-epsilon11-really-mean-how-about-delta ) 53 | dp_target_delta: 1e-5 # The target δ of the (ϵ,δ)-differential privacy guarantee. Generally, it should be set to be less than the inverse of the size of the training dataset. 54 | dp_max_grad_norm: 1.0 # Clip per-sample gradients to this norm (DP) 55 | 56 | # if you want to use the privacy_engine.make_private method, please set the value of dp_noise_multiplier parameter 57 | # dp_noise_multiplier: 1.0 # Noise multiplier - to add noise to gradients (DP) -------------------------------------------------------------------------------- /examples/pipelines/pneumonia_flwr/config.yaml: -------------------------------------------------------------------------------- 1 | # EXAMPLE CONFIG FILE 2 | 3 | # This file is intended to contain all the parameters required 4 | # to orchestrate our sample federated learning experiments. 5 | # It is by no means necessary to run an FL experiment, just helpful. 6 | # See submit.py for details on how to consume this file in Python. 7 | 8 | # This should work out of the box when running an experiment 9 | # on one of our sandbox environments. 10 | 11 | # Follow the instructions in the comments to adapt to your settings.
12 | 13 | # References to Azure ML workspace (use cli args to override) 14 | aml: 15 | # subscription_id: "" 16 | # resource_group_name: "" 17 | # workspace_name: "" 18 | 19 | # Parameters to generate the FL graph 20 | federated_learning: 21 | orchestrator: 22 | # name of compute for orchestrator 23 | compute: "orchestrator-01" 24 | # name of datastore for orchestrator (saving model weights + aggregate) 25 | datastore: "datastore_orchestrator" 26 | 27 | silos: # silos are provided as a list 28 | - name: silo0 29 | computes: 30 | - silo0-01 # name of the compute for silo X 31 | datastore: datastore_silo0 # name of the datastore for silo X 32 | # training inputs are specified below 33 | # NOTE: in this demo, we're using public data from a url instead 34 | silo_data: 35 | type: uri_folder 36 | mode: 'download' 37 | path: azureml://datastores/datastore_silo0/paths/federated_learning/pneumonia 38 | 39 | - name: silo1 40 | computes: 41 | - silo1-01 # name of the compute for silo X 42 | datastore: datastore_silo1 # name of the datastore for silo X 43 | # training inputs are specified below 44 | # NOTE: in this demo, we're using public data from a url instead 45 | silo_data: 46 | type: uri_folder 47 | mode: 'download' 48 | path: azureml://datastores/datastore_silo1/paths/federated_learning/pneumonia 49 | 50 | - name: silo2 51 | computes: 52 | - silo2-01 # name of the compute for silo X 53 | datastore: datastore_silo2 # name of the datastore for silo X 54 | # training inputs are specified below 55 | # NOTE: in this demo, we're using public data from a url instead 56 | silo_data: 57 | type: uri_folder 58 | mode: 'download' 59 | path: azureml://datastores/datastore_silo2/paths/federated_learning/pneumonia 60 | 61 | # Training parameters 62 | training_parameters: 63 | # how many loops of scatter-gather to run 64 | num_of_iterations: 2 65 | 66 | # then typical training parameters 67 | epochs: 3 # number of epochs per iteration (in-silo training) 68 | lr: 0.01 # learning rate 69 | batch_size: 64 # batch size 70 | -------------------------------------------------------------------------------- /examples/pipelines/pneumonia_nvflare/pneumonia_federated/config/config_fed_client.json: -------------------------------------------------------------------------------- 1 | { 2 | "format_version": 2, 3 | "executors": [ 4 | { 5 | "tasks": [ 6 | "train", 7 | "submit_model", 8 | "validate" 9 | ], 10 | "executor": { 11 | "id": "Executor", 12 | "path": "nvflare.app_common.executors.learner_executor.LearnerExecutor", 13 | "args": { 14 | "learner_id": "pt_learner" 15 | } 16 | } 17 | } 18 | ], 19 | "task_result_filters": [], 20 | "task_data_filters": [], 21 | "components": [ 22 | { 23 | "id": "pt_learner", 24 | "path": "pt_learner.PTLearner", 25 | "args": { 26 | "lr": 0.0005, 27 | "epochs": 5, 28 | "dataset_path_env_var": "CLIENT_DATA_PATH", 29 | "analytic_sender_id": "analytic_sender" 30 | } 31 | }, 32 | { 33 | "id": "analytic_sender", 34 | "name": "AnalyticsSender", 35 | "args": {} 36 | }, 37 | { 38 | "id": "event_to_fed", 39 | "name": "ConvertToFedEvent", 40 | "args": { 41 | "events_to_convert": [ 42 | "analytix_log_stats" 43 | ], 44 | "fed_event_prefix": "fed." 
45 | } 46 | } 47 | ] 48 | } 49 | -------------------------------------------------------------------------------- /examples/pipelines/pneumonia_nvflare/pneumonia_federated/config/config_fed_server.json: -------------------------------------------------------------------------------- 1 | { 2 | "format_version": 2, 3 | "server": { 4 | "heart_beat_timeout": 600 5 | }, 6 | "task_data_filters": [], 7 | "task_result_filters": [], 8 | "components": [ 9 | { 10 | "id": "persistor", 11 | "name": "PTFileModelPersistor", 12 | "args": { 13 | "model": { 14 | "path": "pneumonia_network.PneumoniaNetwork" 15 | } 16 | } 17 | }, 18 | { 19 | "id": "shareable_generator", 20 | "path": "nvflare.app_common.shareablegenerators.full_model_shareable_generator.FullModelShareableGenerator", 21 | "args": {} 22 | }, 23 | { 24 | "id": "aggregator", 25 | "path": "nvflare.app_common.aggregators.intime_accumulate_model_aggregator.InTimeAccumulateWeightedAggregator", 26 | "args": { 27 | "expected_data_kind": "WEIGHTS" 28 | } 29 | }, 30 | { 31 | "id": "model_locator", 32 | "path": "nvflare.app_common.pt.pt_file_model_locator.PTFileModelLocator", 33 | "args": { 34 | "pt_persistor_id": "persistor" 35 | } 36 | }, 37 | { 38 | "id": "json_generator", 39 | "path": "nvflare.app_common.widgets.validation_json_generator.ValidationJsonGenerator", 40 | "args": {} 41 | }, 42 | { 43 | "id": "tb_analytics_receiver", 44 | "name": "MLFlowAnalyticsReceiver", 45 | "path": "mlflow_receiver.MLFlowAnalyticsReceiver", 46 | "args": { 47 | "events": [ 48 | "fed.analytix_log_stats" 49 | ] 50 | } 51 | } 52 | ], 53 | "workflows": [ 54 | { 55 | "id": "scatter_and_gather", 56 | "name": "ScatterAndGather", 57 | "args": { 58 | "min_clients": 1, 59 | "num_rounds": 1, 60 | "start_round": 0, 61 | "wait_time_after_min_received": 10, 62 | "aggregator_id": "aggregator", 63 | "persistor_id": "persistor", 64 | "shareable_generator_id": "shareable_generator", 65 | "train_task_name": "train", 66 | "train_timeout": 0 67 | } 68 | }, 69 | { 70 | "id": "cross_site_validate", 71 | "name": "CrossSiteModelEval", 72 | "args": { 73 | "model_locator_id": "model_locator" 74 | } 75 | } 76 | ] 77 | } -------------------------------------------------------------------------------- /examples/pipelines/pneumonia_nvflare/pneumonia_federated/custom/pneumonia_network.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | 19 | 20 | class PneumoniaNetwork(nn.Module): 21 | def __init__(self): 22 | super(PneumoniaNetwork, self).__init__() 23 | dropout = 0.2 24 | 25 | self.conv1 = nn.Conv2d( 26 | in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1 27 | ) 28 | self.conv2 = nn.Conv2d( 29 | in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1 30 | ) 31 | self.conv3 = nn.Conv2d( 32 | in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1 33 | ) 34 | 35 | self.dropout1 = nn.Dropout(dropout) 36 | self.dropout2 = nn.Dropout(dropout) 37 | 38 | self.fc1 = nn.Linear(28 * 28 * 128, 256) 39 | self.fc2 = nn.Linear(256, 2) 40 | 41 | def forward(self, x): 42 | x = F.relu(self.conv1(x)) # 224 x 224 x 32 43 | x = F.max_pool2d(x, 2, 2) # 112 x 112 x 32 44 | x = F.relu(self.conv2(x)) # 112 x 112 x 64 45 | x = F.max_pool2d(x, 2, 2) # 56 x 56 x 64 46 | x = self.dropout1(x) 47 | x = F.relu(self.conv3(x)) # 56 x 56 x 128 48 | x = F.max_pool2d(x, 2, 2) # 28 x 28 x 128 49 | x = self.dropout2(x) 50 | x = x.view(-1, 28 * 28 * 128) # 100.352 51 | x = F.relu(self.fc1(x)) 52 | x = self.fc2(x) 53 | return x 54 | -------------------------------------------------------------------------------- /examples/pipelines/pneumonia_nvflare/pneumonia_federated/custom/pt_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | class PTConstants: 17 | PTServerName = "server" 18 | PTFileModelName = "FL_global_model.pt" 19 | PTLocalModelName = "local_model.pt" 20 | 21 | PTModelsDir = "models" 22 | CrossValResultsJsonFilename = "cross_val_results.json" 23 | -------------------------------------------------------------------------------- /examples/pipelines/requirements.txt: -------------------------------------------------------------------------------- 1 | azure-identity 2 | azure-ai-ml==1.4.0 3 | omegaconf 4 | -------------------------------------------------------------------------------- /examples/pipelines/utils/multiply_data_files/config.yaml: -------------------------------------------------------------------------------- 1 | # CONFIG FILE FOR MULTIPLYING DATASETS 2 | 3 | # References to Azure ML workspace (use cli args to override) 4 | aml: 5 | # subscription_id: "" 6 | # resource_group_name: "" 7 | # workspace_name: "" 8 | 9 | # Parameters to generate the FL graph 10 | federated_learning: 11 | silos: # silos are provided as a list 12 | - compute: silo0-01 # name of the compute for silo X 13 | datastore: datastore_silo0 # name of the datastore for silo X 14 | input_data: 15 | type: uri_folder 16 | mode: 'download' 17 | path: azureml://datastores/datastore_silo0/paths/federated_learning/pneumonia 18 | output_data: 19 | type: uri_folder 20 | mode: 'upload' 21 | path: azureml://datastores/datastore_silo0/paths/federated_learning/multiplied_data/pneumonia 22 | - compute: silo1-01 # we are repeating over the same config for silo 2 23 | datastore: datastore_silo1 24 | input_data: 25 | type: uri_folder 26 | mode: 'download' 27 | path: azureml://datastores/datastore_silo1/paths/federated_learning/pneumonia 28 | output_data: 29 | type: uri_folder 30 | mode: 'upload' 31 | path: azureml://datastores/datastore_silo1/paths/federated_learning/multiplied_data/pneumonia 32 | - compute: silo2-01 # we are repeating over the same config for silo 3 33 | datastore: datastore_silo2 34 | input_data: 35 | type: uri_folder 36 | mode: 'download' 37 | path: azureml://datastores/datastore_silo2/paths/federated_learning/pneumonia 38 | output_data: 39 | type: uri_folder 40 | mode: 'upload' 41 | path: azureml://datastores/datastore_silo2/paths/federated_learning/multiplied_data/pneumonia 42 | -------------------------------------------------------------------------------- /examples/pipelines/utils/upload_data/config.yaml: -------------------------------------------------------------------------------- 1 | # CONFIG FILE FOR UPLOADING DATASETS TO CORRESPONDING REGIONS 2 | 3 | # References to Azure ML workspace (use cli args to override) 4 | aml: 5 | # subscription_id: "" 6 | # resource_group_name: "" 7 | # workspace_name: "" 8 | 9 | # Parameters to generate the FL graph 10 | federated_learning: 11 | host: 12 | compute: orchestrator-01 13 | datastore: datastore_orchestrator 14 | silos: # silos are provided as a list 15 | - compute: silo0-01 # name of the compute for silo X 16 | datastore: datastore_silo0 # name of the datastore for silo X 17 | 18 | - compute: silo1-01 # we are repeating over the same config for silo 2 19 | datastore: datastore_silo1 20 | 21 | - compute: silo2-01 # we are repeating over the same config for silo 3 22 | datastore: datastore_silo2 23 | 24 | confidentiality: 25 | enable: false 26 | keyvault: https://kv-fldemo.vault.azure.net # url of the keyvault 27 | key_name: dev-rsa-key # name of the secret containing your encryption public key 28 | 
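The pipeline config files above are consumed by the submit scripts that sit next to them (note the "use cli args to override" comments). As a rough, hypothetical illustration only, and not the repository's actual submit.py, a minimal consumer built from the packages pinned in examples/pipelines/requirements.txt (omegaconf, azure-identity, azure-ai-ml) might look like this:

# Hypothetical sketch (not the actual submit.py): load one of the configs above
# with OmegaConf and connect to the Azure ML workspace it references.
import argparse

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from omegaconf import OmegaConf

parser = argparse.ArgumentParser()
parser.add_argument("--config", default="config.yaml")
# CLI args take precedence over the (optional) aml section of the config
parser.add_argument("--subscription_id", default=None)
parser.add_argument("--resource_group_name", default=None)
parser.add_argument("--workspace_name", default=None)
args = parser.parse_args()

config = OmegaConf.load(args.config)

ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=args.subscription_id or OmegaConf.select(config, "aml.subscription_id"),
    resource_group_name=args.resource_group_name or OmegaConf.select(config, "aml.resource_group_name"),
    workspace_name=args.workspace_name or OmegaConf.select(config, "aml.workspace_name"),
)

# the silos list drives one in-silo step per entry,
# each pinned to its own compute and datastore
for silo in config.federated_learning.silos:
    print(f"silo compute={silo.compute}, datastore={silo.datastore}")

Keeping the workspace references optional in the config and overridable from the command line is what lets the same file be reused across sandboxes.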
-------------------------------------------------------------------------------- /mlops/arm/README.md: -------------------------------------------------------------------------------- 1 | These ARM templates are generated automatically from the [bicep scripts](../bicep/). Please refer to the bicep scripts for the source of truth. -------------------------------------------------------------------------------- /mlops/bicep/modules/azureml/attach_aks_training_to_azureml.bicep: -------------------------------------------------------------------------------- 1 | // This BICEP script will attach an AKS cluster 2 | // to a given AzureML workspace for training (NOT inferencing). 3 | 4 | // resource group must be specified as scope in az cli or module call 5 | targetScope = 'resourceGroup' 6 | 7 | // required parameters 8 | @description('Name of AzureML workspace to attach compute+storage to.') 9 | param machineLearningName string 10 | 11 | @description('The region of the machine learning workspace') 12 | param machineLearningRegion string = resourceGroup().location 13 | 14 | @description('Resource ID of the AKS cluster.') 15 | param aksResourceId string 16 | 17 | @description('Region of the AKS cluster.') 18 | param aksRegion string 19 | 20 | @description('How to name this compute in Azure ML') 21 | param amlComputeName string 22 | 23 | @description('Name of the existing UAI for the compute cluster.') 24 | param computeUaiName string 25 | 26 | // reference the existing user assigned identity for this silo 27 | resource uai 'Microsoft.ManagedIdentity/userAssignedIdentities@2022-01-31-preview' existing = { 28 | name: computeUaiName 29 | scope: resourceGroup() 30 | } 31 | 32 | var identityPrincipalId = uai.properties.principalId 33 | var userAssignedIdentities = {'/subscriptions/${subscription().subscriptionId}/resourceGroups/${resourceGroup().name}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/${uai.name}': {}} 34 | 35 | resource workspace 'Microsoft.MachineLearningServices/workspaces@2022-05-01' existing = { 36 | name: machineLearningName 37 | scope: resourceGroup() 38 | } 39 | 40 | // attach the AKS cluster to the workspace 41 | resource aksAzuremlCompute 'Microsoft.MachineLearningServices/workspaces/computes@2021-01-01' = { 42 | name: amlComputeName 43 | parent: workspace 44 | location: machineLearningRegion 45 | identity: { 46 | type: 'UserAssigned' 47 | userAssignedIdentities: userAssignedIdentities 48 | } 49 | properties: { 50 | computeType: 'Kubernetes' 51 | computeLocation: aksRegion 52 | resourceId: aksResourceId 53 | description: 'AKS cluster attached to AzureML workspace' 54 | properties: { 55 | } 56 | } 57 | } 58 | 59 | // output the compute config for next actions (permission model) 60 | output identityPrincipalId string = identityPrincipalId 61 | output compute string = aksAzuremlCompute.name 62 | -------------------------------------------------------------------------------- /mlops/bicep/modules/networking/private_dns_zone.bicep: -------------------------------------------------------------------------------- 1 | // Provision a private DNS Zone 2 | 3 | @description('Name of the private DNS zone') 4 | param name string 5 | 6 | @description('Location of the private DNS zone (default: global)') 7 | param location string = 'global' 8 | 9 | @description('Tags for curation of resources') 10 | param tags object = {} 11 | 12 | @description('Optional: link the private DNS zone to a given virtual network') 13 | param linkToVirtualNetworkId string = '' 14 | 15 | // create the dns zone 16 | resource
privateDnsZone 'Microsoft.Network/privateDnsZones@2020-06-01' = { 17 | name: name 18 | location: location 19 | tags: tags 20 | } 21 | 22 | // create the link 23 | resource privateDnsZoneVnetLink 'Microsoft.Network/privateDnsZones/virtualNetworkLinks@2020-06-01' = if (!empty(linkToVirtualNetworkId)) { 24 | name: uniqueString(linkToVirtualNetworkId, name, location) 25 | parent: privateDnsZone 26 | location: location 27 | properties: { 28 | registrationEnabled: false 29 | virtualNetwork: { 30 | id: linkToVirtualNetworkId 31 | } 32 | } 33 | } 34 | 35 | output name string = privateDnsZone.name 36 | output id string = privateDnsZone.id 37 | -------------------------------------------------------------------------------- /mlops/bicep/modules/networking/vnet.bicep: -------------------------------------------------------------------------------- 1 | // Creates a virtual network 2 | 3 | targetScope = 'resourceGroup' 4 | 5 | @description('Name of the virtual network resource') 6 | param virtualNetworkName string 7 | 8 | @description('Group ID of the network security group') 9 | param networkSecurityGroupId string 10 | 11 | @description('Azure region of the deployment') 12 | param location string = resourceGroup().location 13 | 14 | @description('Virtual network address prefix') 15 | param vnetAddressPrefix string = '10.0.0.0/16' 16 | 17 | @description('Training subnets names and address prefix') 18 | param subnets array = [ 19 | { 20 | name: 'snet-training' 21 | addressPrefix: '10.0.0.0/24' 22 | } 23 | ] 24 | 25 | @description('List of service endpoints expected on this vnet') 26 | param serviceEndpoints array = [ 27 | 'Microsoft.KeyVault' 28 | 'Microsoft.ContainerRegistry' 29 | 'Microsoft.Storage' 30 | ] 31 | 32 | @description('Tags to add to the resources') 33 | param tags object = {} 34 | 35 | var serviceEndpointsDefinition = [for service in serviceEndpoints: { service: service }] 36 | var subnetsDefinition = [for subnet in subnets: { 37 | name: subnet.name 38 | properties: { 39 | addressPrefix: subnet.addressPrefix 40 | privateEndpointNetworkPolicies: 'Disabled' 41 | privateLinkServiceNetworkPolicies: 'Disabled' 42 | serviceEndpoints: serviceEndpointsDefinition 43 | networkSecurityGroup: { 44 | id: networkSecurityGroupId 45 | } 46 | } 47 | }] 48 | 49 | resource virtualNetwork 'Microsoft.Network/virtualNetworks@2022-01-01' = { 50 | name: virtualNetworkName 51 | location: location 52 | tags: tags 53 | properties: { 54 | addressSpace: { 55 | addressPrefixes: [ 56 | vnetAddressPrefix 57 | ] 58 | } 59 | subnets: subnetsDefinition 60 | } 61 | } 62 | 63 | output id string = virtualNetwork.id 64 | output name string = virtualNetwork.name 65 | -------------------------------------------------------------------------------- /mlops/bicep/modules/networking/vnet_peering.bicep: -------------------------------------------------------------------------------- 1 | // Peers two vnet (from different regions) 2 | // see https://learn.microsoft.com/en-us/azure/virtual-network/virtual-network-peering-overview 3 | 4 | targetScope = 'resourceGroup' 5 | 6 | @description('Set the local VNet name') 7 | param existingVirtualNetworkNameSource string 8 | 9 | @description('Set the remote VNet name') 10 | param existingVirtualNetworkNameTarget string 11 | 12 | @description('Sets the remote VNet Resource group') 13 | param existingVirtualNetworkNameTargetResourceGroupName string = resourceGroup().name 14 | 15 | param useGatewayFromSourceToTarget bool = false 16 | param allowVirtualNetworkAccess bool = true 17 | 18 | resource 
_vnet_peering 'Microsoft.Network/virtualNetworks/virtualNetworkPeerings@2022-01-01' = { 19 | name: '${existingVirtualNetworkNameSource}/peering-to-${existingVirtualNetworkNameTarget}' 20 | properties: { 21 | allowVirtualNetworkAccess: allowVirtualNetworkAccess 22 | allowForwardedTraffic: false 23 | allowGatewayTransit: false 24 | useRemoteGateways: useGatewayFromSourceToTarget 25 | remoteVirtualNetwork: { 26 | id: resourceId(existingVirtualNetworkNameTargetResourceGroupName, 'Microsoft.Network/virtualNetworks', existingVirtualNetworkNameTarget) 27 | } 28 | } 29 | } 30 | 31 | resource _vnet_peering_back 'Microsoft.Network/virtualNetworks/virtualNetworkPeerings@2022-01-01' = { 32 | name: '${existingVirtualNetworkNameTarget}/peering-to-${existingVirtualNetworkNameSource}' 33 | properties: { 34 | allowVirtualNetworkAccess: allowVirtualNetworkAccess 35 | allowForwardedTraffic: false 36 | allowGatewayTransit: useGatewayFromSourceToTarget 37 | useRemoteGateways: false 38 | remoteVirtualNetwork: { 39 | id: resourceId(resourceGroup().name, 'Microsoft.Network/virtualNetworks', existingVirtualNetworkNameSource) 40 | } 41 | } 42 | } 43 | 44 | output id string = _vnet_peering.id 45 | -------------------------------------------------------------------------------- /mlops/bicep/modules/permissions/msi_storage_rw.bicep: -------------------------------------------------------------------------------- 1 | // Assigns roles to a given User Assigned Identity 2 | // towards a given storage account 3 | 4 | @description('Full path to storage') 5 | param storageAccountName string 6 | 7 | @description('PrincipalId of the managed identity') 8 | param identityPrincipalId string 9 | 10 | @description('Role definition IDs for the compute towards the internal storage') 11 | param computeToStorageRoles array = [ 12 | // see https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles 13 | 'ba92f5b4-2d11-453d-a403-e96b0029c9fe' // Storage Blob Data Contributor 14 | '81a9662b-bebf-436f-a333-f67b29880f12' // Storage Account Key Operator Service Role 15 | 'c12c1c16-33a1-487b-954d-41c89c60f349' // Reader and Data Access 16 | ] 17 | 18 | resource storage 'Microsoft.Storage/storageAccounts@2022-05-01' existing = { 19 | name: storageAccountName 20 | } 21 | 22 | resource roleAssignments 'Microsoft.Authorization/roleAssignments@2022-04-01' = [ for roleId in computeToStorageRoles: { 23 | scope: storage 24 | name: guid(resourceGroup().id, storage.id, identityPrincipalId, roleId) 25 | properties: { 26 | roleDefinitionId: '/subscriptions/${subscription().subscriptionId}/providers/Microsoft.Authorization/roleDefinitions/${roleId}' 27 | principalId: identityPrincipalId 28 | principalType: 'ServicePrincipal' 29 | } 30 | dependsOn: [ 31 | storage 32 | ] 33 | }] 34 | -------------------------------------------------------------------------------- /mlops/bicep/modules/resources/private_acr.bicep: -------------------------------------------------------------------------------- 1 | // Creates an Azure Container Registry with Azure Private Link endpoint 2 | 3 | // resource group must be specified as scope in az cli or module call 4 | targetScope = 'resourceGroup' 5 | 6 | @description('Azure region of the deployment') 7 | param location string 8 | 9 | @description('Tags to add to the resources') 10 | param tags object = {} 11 | 12 | @description('Container registry name') 13 | param containerRegistryName string 14 | 15 | @description('Resource ID of the subnet') 16 | param subnetId string 17 | 18 | @description('Name of the 
private DNS zone') 19 | param privateDNSZoneName string = 'privatelink${environment().suffixes.acrLoginServer}' 20 | 21 | @description('Optional: static IPs for the 2 PLEs (comma separated)') 22 | param acrPLEStaticIPs string = '' 23 | 24 | var containerRegistryNameCleaned = replace(containerRegistryName, '-', '') 25 | 26 | resource containerRegistry 'Microsoft.ContainerRegistry/registries@2021-09-01' = { 27 | name: containerRegistryNameCleaned 28 | location: location 29 | tags: tags 30 | sku: { 31 | name: 'Premium' 32 | } 33 | properties: { 34 | adminUserEnabled: true 35 | dataEndpointEnabled: false 36 | networkRuleBypassOptions: 'AzureServices' 37 | networkRuleSet: { 38 | defaultAction: 'Deny' 39 | } 40 | policies: { 41 | quarantinePolicy: { 42 | status: 'disabled' 43 | } 44 | retentionPolicy: { 45 | status: 'enabled' 46 | days: 7 47 | } 48 | trustPolicy: { 49 | status: 'disabled' 50 | type: 'Notary' 51 | } 52 | } 53 | publicNetworkAccess: 'Disabled' 54 | zoneRedundancy: 'Disabled' 55 | } 56 | } 57 | 58 | module privateEndpoint '../networking/private_endpoint.bicep' = { 59 | name: '${containerRegistry.name}-endpoint-to-vnet' 60 | scope: resourceGroup() 61 | params: { 62 | tags: tags 63 | location: location 64 | resourceServiceId: containerRegistry.id 65 | pleRootName: 'ple-${containerRegistry.name}' 66 | subnetId: subnetId 67 | privateDNSZoneName: privateDNSZoneName 68 | groupId: 'registry' 69 | memberNames: [ 'registry', 'registry_data_${location}' ] 70 | useStaticIPAddress: !empty(acrPLEStaticIPs) 71 | privateIPAddress: acrPLEStaticIPs 72 | } 73 | } 74 | 75 | output containerRegistryId string = containerRegistry.id 76 | -------------------------------------------------------------------------------- /mlops/bicep/modules/resources/private_appinsights.bicep: -------------------------------------------------------------------------------- 1 | // Creates an Application Insights instance as dependency for Azure ML 2 | 3 | // resource group must be specified as scope in az cli or module call 4 | targetScope = 'resourceGroup' 5 | 6 | @description('Azure region of the deployment') 7 | param location string = resourceGroup().location 8 | 9 | @description('Tags to add to the resources') 10 | param tags object = {} 11 | 12 | @description('Application Insights resource name') 13 | param applicationInsightsName string 14 | 15 | resource applicationInsights 'Microsoft.Insights/components@2020-02-02' = { 16 | name: applicationInsightsName 17 | location: location 18 | tags: tags 19 | kind: 'web' 20 | properties: { 21 | Application_Type: 'web' 22 | DisableIpMasking: false 23 | DisableLocalAuth: false 24 | Flow_Type: 'Bluefield' 25 | ForceCustomerStorageForProfiler: false 26 | ImmediatePurgeDataOn30Days: true 27 | IngestionMode: 'ApplicationInsights' 28 | publicNetworkAccessForIngestion: 'Enabled' 29 | publicNetworkAccessForQuery: 'Disabled' 30 | Request_Source: 'rest' 31 | } 32 | } 33 | 34 | output applicationInsightsId string = applicationInsights.id 35 | -------------------------------------------------------------------------------- /mlops/bicep/modules/resources/private_keyvault.bicep: -------------------------------------------------------------------------------- 1 | // Creates a KeyVault with Private Link Endpoint 2 | 3 | // resource group must be specified as scope in az cli or module call 4 | targetScope = 'resourceGroup' 5 | 6 | @description('The Azure Region to deploy the resources into') 7 | param location string = resourceGroup().location 8 | 9 | @description('Tags to apply to the Key 
Vault Instance') 10 | param tags object = {} 11 | 12 | @description('The name of the Key Vault') 13 | param keyvaultName string 14 | 15 | @description('The Subnet ID where the Key Vault Private Link is to be created') 16 | param subnetId string 17 | 18 | @description('Name of the private DNS zone') 19 | param privateDNSZoneName string = 'privatelink${environment().suffixes.keyvaultDns}' 20 | 21 | resource keyVault 'Microsoft.KeyVault/vaults@2022-07-01' = { 22 | name: keyvaultName 23 | location: location 24 | tags: tags 25 | properties: { 26 | tenantId: subscription().tenantId 27 | createMode: 'default' 28 | sku: { 29 | name: 'standard' 30 | family: 'A' 31 | } 32 | 33 | // usage 34 | enabledForDeployment: false 35 | enabledForDiskEncryption: true 36 | enabledForTemplateDeployment: false 37 | enableRbacAuthorization: true 38 | 39 | // loss protection 40 | enablePurgeProtection: true 41 | enableSoftDelete: true 42 | softDeleteRetentionInDays: 7 43 | 44 | // networking 45 | publicNetworkAccess: 'Disabled' 46 | networkAcls: { 47 | bypass: 'AzureServices' 48 | defaultAction: 'Deny' 49 | } 50 | } 51 | } 52 | 53 | module privateEndpoint '../networking/private_endpoint.bicep' = { 54 | name: '${keyVault.name}-endpoint-to-vnet' 55 | scope: resourceGroup() 56 | params: { 57 | tags: tags 58 | location: keyVault.location 59 | resourceServiceId: keyVault.id 60 | pleRootName: 'ple-${keyVault.name}' 61 | subnetId: subnetId 62 | privateDNSZoneName: privateDNSZoneName 63 | groupId: 'vault' 64 | } 65 | } 66 | 67 | output keyvaultId string = keyVault.id 68 | -------------------------------------------------------------------------------- /mlops/bicep/modules/storages/existing_blob_storage_datastore.bicep: -------------------------------------------------------------------------------- 1 | // Creates a datastore for an existing storage account in the same tenant 2 | @description('Name of AzureML workspace to attach compute+storage to.') 3 | param machineLearningName string 4 | 5 | @description('Existing storage account name to attach to the pair.') 6 | param storageAccountName string 7 | 8 | @description('Azure region of the storage to create') 9 | param storageRegion string 10 | 11 | @description('Resource group of the existing storage account to attach to the pair.') 12 | param storageAccountResourceGroup string = resourceGroup().name 13 | 14 | @description('SubscriptionId of the existing storage account to attach to the pair.') 15 | param storageAccountSubscriptionId string = subscription().subscriptionId 16 | 17 | @description('Name of the storage container resource to create for the pair') 18 | param containerName string = 'private' 19 | 20 | @description('Name of the datastore for attaching the storage to the AzureML workspace.') 21 | param datastoreName string = replace('datastore_${storageAccountName}','-','_') 22 | 23 | @description('Tags to add to the resources') 24 | param tags object = {} 25 | 26 | var storageId = '/subscriptions/${storageAccountSubscriptionId}/resourceGroups/${storageAccountResourceGroup}/providers/Microsoft.Storage/storageAccounts/${storageAccountName}' 27 | 28 | // attach as a datastore in AzureML 29 | resource datastore 'Microsoft.MachineLearningServices/workspaces/datastores@2022-06-01-preview' = { 30 | name: '${machineLearningName}/${datastoreName}' 31 | properties: { 32 | credentials: { 33 | credentialsType: 'None' 34 | } 35 | description: 'Private storage in region ${storageRegion}' 36 | properties: {} 37 | datastoreType: 'AzureBlob' 38 | 39 | accountName: storageAccountName 
40 | containerName: containerName 41 | resourceGroup: storageAccountResourceGroup 42 | subscriptionId: storageAccountSubscriptionId 43 | tags: tags 44 | } 45 | } 46 | 47 | // output storage references 48 | output storageId string = storageId 49 | output storageName string = storageAccountName 50 | output containerName string = containerName 51 | output datastoreName string = datastore.name 52 | -------------------------------------------------------------------------------- /mlops/bicep/sandbox_fl_confidential.bicep: -------------------------------------------------------------------------------- 1 | // This BICEP script will fully provision a federated learning sandbox 2 | // with eyes-off orchestrator and silos storages 3 | // and only confidential compute clusters in orchestrator and silos 4 | 5 | targetScope = 'resourceGroup' 6 | 7 | // please specify the base name for all resources 8 | @description('Base name of the demo, used for creating all resources as prefix') 9 | param demoBaseName string = 'fldemo' 10 | 11 | @description('Region of the orchestrator (workspace, central storage and compute).') 12 | param orchestratorRegion string = 'eastus' 13 | 14 | @description('List of each region in which to create an internal silo.') 15 | param siloRegions array = [ 16 | 'eastus' 17 | 'eastus' 18 | 'eastus' 19 | ] 20 | 21 | @description('The VM used for creating compute clusters in orchestrator and silos.') 22 | param computeSKU string = 'Standard_DC4as_v5' 23 | 24 | @description('Uses public network access for the orchestrator storage, allowing it to be eyes-on.') 25 | param orchestratorEyesOn bool = false 26 | 27 | @description('Apply vnet peering to allow for vertical FL') 28 | param applyVNetPeering bool = true 29 | 30 | @description('Provide your Kaggle API user name to run our samples relying on Kaggle datasets.') 31 | param kaggleUsername string = '' 32 | 33 | @description('Provide your Kaggle API key to run our samples relying on Kaggle datasets.') 34 | @secure() 35 | param kaggleKey string = '' 36 | 37 | 38 | // run the generic sandbox bicep script with proper arguments 39 | module sandbox 'vnet_publicip_sandbox_aks_confcomp_setup.bicep' = { 40 | name: 'sandbox-${demoBaseName}' 41 | params: { 42 | demoBaseName: demoBaseName 43 | orchestratorRegion: orchestratorRegion 44 | siloRegions: siloRegions 45 | 46 | // computes 47 | computeSKU: computeSKU 48 | 49 | // eyes-on/eyes-off settings 50 | orchestratorStorageNetworkAccess: orchestratorEyesOn ? 
'public' : 'private' 51 | siloStorageNetworkAccess: 'private' 52 | 53 | // ready for vertical FL 54 | applyVNetPeering: applyVNetPeering 55 | } 56 | } 57 | 58 | // Add kaggle secrets if given 59 | resource kaggleSecretUsername 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = if (!empty(kaggleUsername)) { 60 | name: 'ws-shkv-${demoBaseName}/kaggleusername' 61 | properties: { 62 | value: kaggleUsername 63 | } 64 | dependsOn: [ 65 | sandbox 66 | ] 67 | } 68 | 69 | resource kaggleSecretKey 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = if (!empty(kaggleUsername)) { 70 | name: 'ws-shkv-${demoBaseName}/kagglekey' 71 | properties: { 72 | value: kaggleKey 73 | } 74 | dependsOn: [ 75 | sandbox 76 | ] 77 | } 78 | -------------------------------------------------------------------------------- /mlops/bicep/sandbox_fl_eyesoff_cpu.bicep: -------------------------------------------------------------------------------- 1 | // This BICEP script will fully provision a federated learning sandbox 2 | // with eyes-off orchestrator and silos storages 3 | // and only one compute (cpu by default) 4 | 5 | targetScope = 'resourceGroup' 6 | 7 | // please specify the base name for all resources 8 | @description('Base name of the demo, used for creating all resources as prefix') 9 | param demoBaseName string = 'fldemo' 10 | 11 | @description('Region of the orchestrator (workspace, central storage and compute).') 12 | param orchestratorRegion string = resourceGroup().location 13 | 14 | @description('List of each region in which to create an internal silo.') 15 | param siloRegions array = [ 16 | 'australiaeast' 17 | 'eastus' 18 | 'westeurope' 19 | ] 20 | 21 | @description('The VM used for creating compute clusters in orchestrator and silos.') 22 | param computeSKU string = 'Standard_DS4_v2' 23 | 24 | @description('Uses public network access for the orchestrator storage, allowing it to be eyes-on.') 25 | param orchestratorEyesOn bool = false 26 | 27 | @description('Apply vnet peering to allow for vertical FL') 28 | param applyVNetPeering bool = true 29 | 30 | @description('Provide your Kaggle API user name to run our samples relying on Kaggle datasets.') 31 | param kaggleUsername string = '' 32 | 33 | @description('Provide your Kaggle API key to run our samples relying on Kaggle datasets.') 34 | @secure() 35 | param kaggleKey string = '' 36 | 37 | 38 | // run the generic sandbox bicep script with proper arguments 39 | module sandbox 'vnet_publicip_sandbox_setup.bicep' = { 40 | name: 'sandbox-${demoBaseName}' 41 | params: { 42 | demoBaseName: demoBaseName 43 | orchestratorRegion: orchestratorRegion 44 | siloRegions: siloRegions 45 | 46 | // computes 47 | compute1SKU: computeSKU 48 | compute2: false 49 | 50 | // eyes-on/eyes-off settings 51 | orchestratorStorageNetworkAccess: orchestratorEyesOn ? 
'public' : 'private' 52 | siloStorageNetworkAccess: 'private' 53 | 54 | // ready for vertical FL 55 | applyVNetPeering: applyVNetPeering 56 | } 57 | } 58 | 59 | // Add kaggle secrets if given 60 | resource kaggleSecretUsername 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = if (!empty(kaggleUsername)) { 61 | name: 'ws-shkv-${demoBaseName}/kaggleusername' 62 | properties: { 63 | value: kaggleUsername 64 | } 65 | dependsOn: [ 66 | sandbox 67 | ] 68 | } 69 | 70 | resource kaggleSecretKey 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = if (!empty(kaggleUsername)) { 71 | name: 'ws-shkv-${demoBaseName}/kagglekey' 72 | properties: { 73 | value: kaggleKey 74 | } 75 | dependsOn: [ 76 | sandbox 77 | ] 78 | } 79 | -------------------------------------------------------------------------------- /mlops/bicep/sandbox_fl_eyesoff_gpu.bicep: -------------------------------------------------------------------------------- 1 | // This BICEP script will fully provision a federated learning sandbox 2 | // with eyes-off orchestrator and silos storages 3 | // and only one compute (gpu by default) 4 | 5 | targetScope = 'resourceGroup' 6 | 7 | // please specify the base name for all resources 8 | @description('Base name of the demo, used for creating all resources as prefix') 9 | param demoBaseName string = 'fldemo' 10 | 11 | @description('Region of the orchestrator (workspace, central storage and compute).') 12 | param orchestratorRegion string = resourceGroup().location 13 | 14 | @description('List of each region in which to create an internal silo.') 15 | param siloRegions array = [ 16 | 'australiaeast' 17 | 'eastus' 18 | 'westeurope' 19 | ] 20 | 21 | @description('The VM used for creating compute clusters in orchestrator and silos.') 22 | param computeSKU string = 'Standard_NC6' 23 | 24 | @description('Uses public network access for the orchestrator storage, allowing it to be eyes-on.') 25 | param orchestratorEyesOn bool = false 26 | 27 | @description('Apply vnet peering to allow for vertical FL') 28 | param applyVNetPeering bool = true 29 | 30 | @description('Provide your Kaggle API user name to run our samples relying on Kaggle datasets.') 31 | param kaggleUsername string = '' 32 | 33 | @description('Provide your Kaggle API key to run our samples relying on Kaggle datasets.') 34 | @secure() 35 | param kaggleKey string = '' 36 | 37 | 38 | // run the generic sandbox bicep script with proper arguments 39 | module sandbox 'vnet_publicip_sandbox_setup.bicep' = { 40 | name: 'sandbox-${demoBaseName}' 41 | params: { 42 | demoBaseName: demoBaseName 43 | orchestratorRegion: orchestratorRegion 44 | siloRegions: siloRegions 45 | 46 | // computes 47 | compute1SKU: computeSKU 48 | compute2: false 49 | 50 | // eyes-on/eyes-off settings 51 | orchestratorStorageNetworkAccess: orchestratorEyesOn ? 
'public' : 'private' 52 | siloStorageNetworkAccess: 'private' 53 | 54 | // ready for vertical FL 55 | applyVNetPeering: applyVNetPeering 56 | } 57 | } 58 | 59 | // Add kaggle secrets if given 60 | resource kaggleSecretUsername 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = if (!empty(kaggleUsername)) { 61 | name: 'ws-shkv-${demoBaseName}/kaggleusername' 62 | properties: { 63 | value: kaggleUsername 64 | } 65 | dependsOn: [ 66 | sandbox 67 | ] 68 | } 69 | 70 | resource kaggleSecretKey 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = if (!empty(kaggleUsername)) { 71 | name: 'ws-shkv-${demoBaseName}/kagglekey' 72 | properties: { 73 | value: kaggleKey 74 | } 75 | dependsOn: [ 76 | sandbox 77 | ] 78 | } 79 | -------------------------------------------------------------------------------- /mlops/bicep/sandbox_fl_eyeson_cpu.bicep: -------------------------------------------------------------------------------- 1 | // This BICEP script will fully provision a federated learning sandbox 2 | // with eyes-on access to the orchestrator and silos. 3 | // and only one compute (cpu by default) 4 | 5 | targetScope = 'resourceGroup' 6 | 7 | // please specify the base name for all resources 8 | @description('Base name of the demo, used for creating all resources as prefix') 9 | param demoBaseName string = 'fldemo' 10 | 11 | @description('Region of the orchestrator (workspace, central storage and compute).') 12 | param orchestratorRegion string = resourceGroup().location 13 | 14 | @description('List of each region in which to create an internal silo.') 15 | param siloRegions array = [ 16 | 'australiaeast' 17 | 'eastus' 18 | 'westeurope' 19 | ] 20 | 21 | @description('The VM used for creating compute clusters in orchestrator and silos.') 22 | param computeSKU string = 'Standard_DS4_v2' 23 | 24 | @description('Apply vnet peering to allow for vertical FL') 25 | param applyVNetPeering bool = true 26 | 27 | @description('Provide your Kaggle API user name to run our samples relying on Kaggle datasets.') 28 | param kaggleUsername string = '' 29 | 30 | @description('Provide your Kaggle API key to run our samples relying on Kaggle datasets.') 31 | @secure() 32 | param kaggleKey string = '' 33 | 34 | 35 | // run the generic sandbox bicep script with proper arguments 36 | module sandbox 'vnet_publicip_sandbox_setup.bicep' = { 37 | name: 'sandbox-${demoBaseName}' 38 | params: { 39 | demoBaseName: demoBaseName 40 | orchestratorRegion: orchestratorRegion 41 | siloRegions: siloRegions 42 | 43 | // computes 44 | compute1SKU: computeSKU 45 | compute2: false 46 | 47 | // eyes-on/eyes-off settings 48 | orchestratorStorageNetworkAccess: 'public' 49 | siloStorageNetworkAccess: 'public' 50 | 51 | // ready for vertical FL 52 | applyVNetPeering: applyVNetPeering 53 | } 54 | } 55 | 56 | // Add kaggle secrets if given 57 | resource kaggleSecretUsername 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = if (!empty(kaggleUsername)) { 58 | name: 'ws-shkv-${demoBaseName}/kaggleusername' 59 | properties: { 60 | value: kaggleUsername 61 | } 62 | dependsOn: [ 63 | sandbox 64 | ] 65 | } 66 | 67 | resource kaggleSecretKey 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = if (!empty(kaggleUsername)) { 68 | name: 'ws-shkv-${demoBaseName}/kagglekey' 69 | properties: { 70 | value: kaggleKey 71 | } 72 | dependsOn: [ 73 | sandbox 74 | ] 75 | } 76 | -------------------------------------------------------------------------------- /mlops/bicep/sandbox_fl_eyeson_cpu_gpu.bicep: 
-------------------------------------------------------------------------------- 1 | // This BICEP script will fully provision a federated learning sandbox 2 | // with eyes-on access to the orchestrator and silos. 3 | // and two computes (cpu and gpu) 4 | 5 | targetScope = 'resourceGroup' 6 | 7 | // please specify the base name for all resources 8 | @description('Base name of the demo, used for creating all resources as prefix') 9 | param demoBaseName string = 'fldemo' 10 | 11 | @description('Region of the orchestrator (workspace, central storage and compute).') 12 | param orchestratorRegion string = resourceGroup().location 13 | 14 | @description('List of each region in which to create an internal silo.') 15 | param siloRegions array = [ 16 | 'australiaeast' 17 | 'eastus' 18 | 'westeurope' 19 | ] 20 | 21 | @description('The VM used for creating compute clusters in orchestrator and silos.') 22 | param primarySKU string = 'Standard_DS4_v2' 23 | 24 | @description('The VM used for creating a second compute cluster in orchestrator and silos.') 25 | param secondarySKU string = 'Standard_NC6' 26 | 27 | @description('Apply vnet peering to allow for vertical FL') 28 | param applyVNetPeering bool = true 29 | 30 | @description('Provide your Kaggle API user name to run our samples relying on Kaggle datasets.') 31 | param kaggleUsername string = '' 32 | 33 | @description('Provide your Kaggle API key to run our samples relying on Kaggle datasets.') 34 | @secure() 35 | param kaggleKey string = '' 36 | 37 | 38 | // run the generic sandbox bicep script with proper arguments 39 | module sandbox 'vnet_publicip_sandbox_setup.bicep' = { 40 | name: 'sandbox-${demoBaseName}' 41 | params: { 42 | demoBaseName: demoBaseName 43 | orchestratorRegion: orchestratorRegion 44 | siloRegions: siloRegions 45 | 46 | // computes 47 | compute1SKU: primarySKU 48 | compute2: true 49 | compute2SKU: secondarySKU 50 | 51 | // eyes-on/eyes-off settings 52 | orchestratorStorageNetworkAccess: 'public' 53 | siloStorageNetworkAccess: 'public' 54 | 55 | // ready for vertical FL 56 | applyVNetPeering: applyVNetPeering 57 | } 58 | } 59 | 60 | // Add kaggle secrets if given 61 | resource kaggleSecretUsername 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = if (!empty(kaggleUsername)) { 62 | name: 'ws-shkv-${demoBaseName}/kaggleusername' 63 | properties: { 64 | value: kaggleUsername 65 | } 66 | dependsOn: [ 67 | sandbox 68 | ] 69 | } 70 | 71 | resource kaggleSecretKey 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = if (!empty(kaggleUsername)) { 72 | name: 'ws-shkv-${demoBaseName}/kagglekey' 73 | properties: { 74 | value: kaggleKey 75 | } 76 | dependsOn: [ 77 | sandbox 78 | ] 79 | } 80 | -------------------------------------------------------------------------------- /mlops/bicep/sandbox_fl_eyeson_gpu.bicep: -------------------------------------------------------------------------------- 1 | // This BICEP script will fully provision a federated learning sandbox 2 | // with eyes-on access to the orchestrator and silos. 
3 | // and only one compute (gpu by default) 4 | 5 | targetScope = 'resourceGroup' 6 | 7 | // please specify the base name for all resources 8 | @description('Base name of the demo, used for creating all resources as prefix') 9 | param demoBaseName string = 'fldemo' 10 | 11 | @description('Region of the orchestrator (workspace, central storage and compute).') 12 | param orchestratorRegion string = resourceGroup().location 13 | 14 | @description('List of each region in which to create an internal silo.') 15 | param siloRegions array = [ 16 | 'australiaeast' 17 | 'eastus' 18 | 'westeurope' 19 | ] 20 | 21 | @description('The VM used for creating compute clusters in orchestrator and silos.') 22 | param computeSKU string = 'Standard_NC6' 23 | 24 | @description('Apply vnet peering to allow for vertical FL') 25 | param applyVNetPeering bool = true 26 | 27 | @description('Provide your Kaggle API user name to run our samples relying on Kaggle datasets.') 28 | param kaggleUsername string = '' 29 | 30 | @description('Provide your Kaggle API key to run our samples relying on Kaggle datasets.') 31 | @secure() 32 | param kaggleKey string = '' 33 | 34 | 35 | // run the generic sandbox bicep script with proper arguments 36 | module sandbox 'vnet_publicip_sandbox_setup.bicep' = { 37 | name: 'sandbox-${demoBaseName}' 38 | params: { 39 | demoBaseName: demoBaseName 40 | orchestratorRegion: orchestratorRegion 41 | siloRegions: siloRegions 42 | 43 | // computes 44 | compute1SKU: computeSKU 45 | compute2: false 46 | 47 | // eyes-on/eyes-off settings 48 | orchestratorStorageNetworkAccess: 'public' 49 | siloStorageNetworkAccess: 'public' 50 | 51 | // ready for vertical FL 52 | applyVNetPeering: applyVNetPeering 53 | } 54 | } 55 | 56 | // Add kaggle secrets if given 57 | resource kaggleSecretUsername 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = if (!empty(kaggleUsername)) { 58 | name: 'ws-shkv-${demoBaseName}/kaggleusername' 59 | properties: { 60 | value: kaggleUsername 61 | } 62 | dependsOn: [ 63 | sandbox 64 | ] 65 | } 66 | 67 | resource kaggleSecretKey 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = if (!empty(kaggleUsername)) { 68 | name: 'ws-shkv-${demoBaseName}/kagglekey' 69 | properties: { 70 | value: kaggleKey 71 | } 72 | dependsOn: [ 73 | sandbox 74 | ] 75 | } 76 | -------------------------------------------------------------------------------- /mlops/k8s_templates/README.md: -------------------------------------------------------------------------------- 1 | # Kubernetes Templates 2 | 3 | ## Contents 4 | This folder contains example yaml templates you can use for creating kubernetes (k8s) clusters meeting different types of requirements. 5 | 6 | > Note: Please keep in mind that for both use cases you need to create an AzureML instance type; the process is documented at the end of the [following document](../../docs/provisioning/silo_open_aks_with_cc.md). 7 | 8 | ## Templates for creating on-premises k8s clusters with access to local data 9 | The use of templates is documented in [this tutorial](../../docs/tutorials/read-local-data-in-k8s-silo.md). 10 | - [k8s_config.yaml](./k8s_config.yaml): for creating a k8s cluster using [kind](https://kind.sigs.k8s.io/). There is an extra mount added to the cluster, which is used to access the local data. 11 | - [pv.yaml](./pv.yaml), [pvc.yaml](./pvc.yaml), [deploy_pvc](./deploy_pvc.yaml): for creating a [Persistent Volume](https://kubernetes.io/docs/concepts/storage/persistent-volumes/), claiming it, and deploying it.
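Once a silo is set up this way, an Azure ML job running on it can read the mounted folder directly. As a purely hypothetical illustration (the mount path is whatever pvc.yaml declares in its ml.azure.com/mountpath annotation, /mnt/localdata in the template below):

# Hypothetical snippet for a component running on such a k8s silo:
# list the files exposed through the persistent volume claim.
import os

LOCAL_DATA_PATH = "/mnt/localdata"  # must match the ml.azure.com/mountpath annotation in pvc.yaml

for root, _, files in os.walk(LOCAL_DATA_PATH):
    for name in files:
        print(os.path.join(root, name))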
12 | 13 | ## Templates for creating k8s clusters using Confidential Compute 14 | The use of these templates is documented in [this tutorial](../../docs/provisioning/silo_open_aks_with_cc.md). 15 | 16 | - ... -------------------------------------------------------------------------------- /mlops/k8s_templates/deploy_pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: pvc 5 | namespace: default 6 | labels: 7 | app: demolocaldata 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: pvc 13 | template: 14 | metadata: 15 | labels: 16 | app: pvc 17 | spec: 18 | containers: 19 | - name: name 20 | image: gcr.io/google-containers/echoserver:1.10 21 | volumeMounts: 22 | - mountPath: /localdata # Path on the docker file system where the local data folder was mounted. 23 | name: localdata 24 | volumes: 25 | - name: localdata 26 | persistentVolumeClaim: 27 | claimName: demolocaldata -------------------------------------------------------------------------------- /mlops/k8s_templates/instance-type.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: amlarc.azureml.com/v1alpha1 2 | kind: InstanceType 3 | metadata: 4 | name: computeinstancetype 5 | spec: 6 | resources: 7 | limits: 8 | cpu: "2" 9 | memory: "8Gi" 10 | requests: 11 | cpu: "1" 12 | memory: "4Gi" -------------------------------------------------------------------------------- /mlops/k8s_templates/k8s_config.yaml: -------------------------------------------------------------------------------- 1 | kind: Cluster 2 | apiVersion: kind.x-k8s.io/v1alpha4 3 | name: my-kind-cluster 4 | nodes: 5 | - role: control-plane 6 | image: kindest/node:v1.24.6@sha256:97e8d00bc37a7598a0b32d1fabd155a96355c49fa0d4d4790aab0f161bf31be1 # We recommend keeping the 1.24.6 k8s version that we used in our tests. 7 | extraMounts: 8 | - hostPath: /path/to/data # The path to the folder you want to expose. If you're using Linux, this is the verbatim path; if you're using Windows and your data are located at C:\path\to\data, the path should be: /run/desktop/mnt/host/c/path/to/data 9 | containerPath: /localdata # Path on the docker file system where to mount the local folder. 10 | -------------------------------------------------------------------------------- /mlops/k8s_templates/pv.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: demolocaldata 5 | labels: 6 | app: demolocaldata 7 | spec: 8 | storageClassName: manual 9 | 10 | capacity: 11 | storage: 1Gi 12 | accessModes: 13 | - ReadWriteMany 14 | claimRef: # the claimRef is here to provide a "connection" between this pv and the pvc. 15 | apiVersion: v1 16 | kind: PersistentVolumeClaim 17 | name: demolocaldata 18 | namespace: default 19 | hostPath: 20 | path: /localdata # the path in the docker file system that we want to expose in k8s -------------------------------------------------------------------------------- /mlops/k8s_templates/pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: demolocaldata 5 | namespace: default 6 | # The label and annotation below are required for the Azure ML job to have access to the mounted folders.
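# (Illustrative aside, not required by the template: with the default mountpath annotation below, code running in the Azure ML job would read the local data under /mnt/localdata, e.g. /mnt/localdata/my_dataset.csv; the file name here is hypothetical.)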
7 | labels: 8 | app: demolocaldata 9 | ml.azure.com/pvc: "true" 10 | annotations: 11 | ml.azure.com/mountpath: "/mnt/localdata" # The path from which the local data will be accessed during the Azure ML job. You can change that to a different path if you want. 12 | spec: 13 | storageClassName: manual 14 | accessModes: 15 | - ReadWriteMany 16 | resources: 17 | requests: 18 | # The amount of the volume's storage to request. 19 | storage: 1Gi -------------------------------------------------------------------------------- /tests/examples/components/shared/utils.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | 4 | def get_free_port(): 5 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 6 | s.bind(("", 0)) 7 | s.listen(1) 8 | port = s.getsockname()[1] 9 | s.close() 10 | return str(port) 11 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | coverage==7.2.2 3 | torch 4 | redis==4.5.4 5 | azure-keyvault==4.2.0 6 | azureml-core==1.47.0 7 | azureml-mlflow==1.48.0 8 | --------------------------------------------------------------------------------