├── .gitignore ├── AMLNotebooks ├── 01_Create_CreditRisk_AML_Pipeline.ipynb ├── 02_Create_CreditRisk_AML_Pipeline_Lineage.ipynb ├── Authenticate_to_Purview_AML.py ├── Create_ML_Lineage_Functions.py ├── Create_ML_Lineage_Types.py └── Data │ ├── borrower.csv │ └── loan.csv ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Data ├── borrower.csv └── loan.csv ├── Deployment ├── deploy.json ├── img │ ├── ADLSGen2Scanning.PNG │ ├── AMLPipeline.PNG │ ├── AMLPipelineLineage.PNG │ ├── Architecture.PNG │ ├── MLLineageScreenshot.PNG │ ├── ManageSparkPool.png │ ├── PurviewMLLineageIntroduction.PNG │ ├── PurviewMLLineageSolutionAccelerator.PNG │ ├── PurviewScreenshot.png │ ├── Requirements.png │ ├── add-role-assignment-page.png │ └── deploy-firewall.png └── requirements.txt ├── LICENSE ├── NOTICE.txt ├── PRIVACY.md ├── README.md ├── SECURITY.md ├── SUPPORT.md └── SynapseNotebooks ├── 01_Authenticate_to_Purview_AML.ipynb ├── 02_Create_ML_Lineage_Types.ipynb ├── 03_Create_ML_Lineage_Functions.ipynb └── 04_Create_CreditRisk_Experiment.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | *.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 | *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # TeamCity is a build add-in 131 | _TeamCity* 132 | 133 | # DotCover is a Code Coverage 
Tool 134 | *.dotCover 135 | 136 | # AxoCover is a Code Coverage Tool 137 | .axoCover/* 138 | !.axoCover/settings.json 139 | 140 | # Visual Studio code coverage results 141 | *.coverage 142 | *.coveragexml 143 | 144 | # NCrunch 145 | _NCrunch_* 146 | .*crunch*.local.xml 147 | nCrunchTemp_* 148 | 149 | # MightyMoose 150 | *.mm.* 151 | AutoTest.Net/ 152 | 153 | # Web workbench (sass) 154 | .sass-cache/ 155 | 156 | # Installshield output folder 157 | [Ee]xpress/ 158 | 159 | # DocProject is a documentation generator add-in 160 | DocProject/buildhelp/ 161 | DocProject/Help/*.HxT 162 | DocProject/Help/*.HxC 163 | DocProject/Help/*.hhc 164 | DocProject/Help/*.hhk 165 | DocProject/Help/*.hhp 166 | DocProject/Help/Html2 167 | DocProject/Help/html 168 | 169 | # Click-Once directory 170 | publish/ 171 | 172 | # Publish Web Output 173 | *.[Pp]ublish.xml 174 | *.azurePubxml 175 | # Note: Comment the next line if you want to checkin your web deploy settings, 176 | # but database connection strings (with potential passwords) will be unencrypted 177 | *.pubxml 178 | *.publishproj 179 | 180 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 181 | # checkin your Azure Web App publish settings, but sensitive information contained 182 | # in these scripts will be unencrypted 183 | PublishScripts/ 184 | 185 | # NuGet Packages 186 | *.nupkg 187 | # NuGet Symbol Packages 188 | *.snupkg 189 | # The packages folder can be ignored because of Package Restore 190 | **/[Pp]ackages/* 191 | # except build/, which is used as an MSBuild target. 192 | !**/[Pp]ackages/build/ 193 | # Uncomment if necessary however generally it will be regenerated when needed 194 | #!**/[Pp]ackages/repositories.config 195 | # NuGet v3's project.json files produces more ignorable files 196 | *.nuget.props 197 | *.nuget.targets 198 | 199 | # Microsoft Azure Build Output 200 | csx/ 201 | *.build.csdef 202 | 203 | # Microsoft Azure Emulator 204 | ecf/ 205 | rcf/ 206 | 207 | # Windows Store app package directories and files 208 | AppPackages/ 209 | BundleArtifacts/ 210 | Package.StoreAssociation.xml 211 | _pkginfo.txt 212 | *.appx 213 | *.appxbundle 214 | *.appxupload 215 | 216 | # Visual Studio cache files 217 | # files ending in .cache can be ignored 218 | *.[Cc]ache 219 | # but keep track of directories ending in .cache 220 | !?*.[Cc]ache/ 221 | 222 | # Others 223 | ClientBin/ 224 | ~$* 225 | *~ 226 | *.dbmdl 227 | *.dbproj.schemaview 228 | *.jfm 229 | *.pfx 230 | *.publishsettings 231 | orleans.codegen.cs 232 | 233 | # Including strong name files can present a security risk 234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 235 | #*.snk 236 | 237 | # Since there are multiple workflows, uncomment next line to ignore bower_components 238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 239 | #bower_components/ 240 | 241 | # RIA/Silverlight projects 242 | Generated_Code/ 243 | 244 | # Backup & report files from converting an old project file 245 | # to a newer Visual Studio version. 
Backup files are not needed, 246 | # because we have git ;-) 247 | _UpgradeReport_Files/ 248 | Backup*/ 249 | UpgradeLog*.XML 250 | UpgradeLog*.htm 251 | ServiceFabricBackup/ 252 | *.rptproj.bak 253 | 254 | # SQL Server files 255 | *.mdf 256 | *.ldf 257 | *.ndf 258 | 259 | # Business Intelligence projects 260 | *.rdl.data 261 | *.bim.layout 262 | *.bim_*.settings 263 | *.rptproj.rsuser 264 | *- [Bb]ackup.rdl 265 | *- [Bb]ackup ([0-9]).rdl 266 | *- [Bb]ackup ([0-9][0-9]).rdl 267 | 268 | # Microsoft Fakes 269 | FakesAssemblies/ 270 | 271 | # GhostDoc plugin setting file 272 | *.GhostDoc.xml 273 | 274 | # Node.js Tools for Visual Studio 275 | .ntvs_analysis.dat 276 | node_modules/ 277 | 278 | # Visual Studio 6 build log 279 | *.plg 280 | 281 | # Visual Studio 6 workspace options file 282 | *.opt 283 | 284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 285 | *.vbw 286 | 287 | # Visual Studio LightSwitch build output 288 | **/*.HTMLClient/GeneratedArtifacts 289 | **/*.DesktopClient/GeneratedArtifacts 290 | **/*.DesktopClient/ModelManifest.xml 291 | **/*.Server/GeneratedArtifacts 292 | **/*.Server/ModelManifest.xml 293 | _Pvt_Extensions 294 | 295 | # Paket dependency manager 296 | .paket/paket.exe 297 | paket-files/ 298 | 299 | # FAKE - F# Make 300 | .fake/ 301 | 302 | # CodeRush personal settings 303 | .cr/personal 304 | 305 | # Python Tools for Visual Studio (PTVS) 306 | __pycache__/ 307 | *.pyc 308 | 309 | # Cake - Uncomment if you are using it 310 | # tools/** 311 | # !tools/packages.config 312 | 313 | # Tabs Studio 314 | *.tss 315 | 316 | # Telerik's JustMock configuration file 317 | *.jmconfig 318 | 319 | # BizTalk build output 320 | *.btp.cs 321 | *.btm.cs 322 | *.odx.cs 323 | *.xsd.cs 324 | 325 | # OpenCover UI analysis results 326 | OpenCover/ 327 | 328 | # Azure Stream Analytics local run output 329 | ASALocalRun/ 330 | 331 | # MSBuild Binary and Structured Log 332 | *.binlog 333 | 334 | # NVidia Nsight GPU debugger configuration file 335 | *.nvuser 336 | 337 | # MFractors (Xamarin productivity tool) working folder 338 | .mfractor/ 339 | 340 | # Local History for Visual Studio 341 | .localhistory/ 342 | 343 | # BeatPulse healthcheck temp database 344 | healthchecksdb 345 | 346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 347 | MigrationBackup/ 348 | 349 | # Ionide (cross platform F# VS Code tools) working folder 350 | .ionide/ 351 | -------------------------------------------------------------------------------- /AMLNotebooks/01_Create_CreditRisk_AML_Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import azureml.core\n", 10 | "from azureml.core import Workspace\n", 11 | "\n", 12 | "ws = Workspace.from_config()\n", 13 | "\n", 14 | "# Get the default datastore\n", 15 | "default_ds = ws.get_default_datastore()\n", 16 | "\n", 17 | "default_ds.upload_files(files=['./Data/borrower.csv', './Data/loan.csv'], # Upload the diabetes csv files in /data\n", 18 | " target_path='creditrisk-data/', # Put it in a folder path in the datastore\n", 19 | " overwrite=True, # Replace existing files of the same name\n", 20 | " show_progress=True)\n", 21 | "\n", 22 | "#Create a Tabular dataset from the path on the datastore\n", 23 | "from azureml.core import Dataset\n", 24 | "\n", 25 | "tab_data_set = 
Dataset.Tabular.from_delimited_files(path=(default_ds, 'creditrisk-data/borrower.csv'))\n", 26 | "\n", 27 | "tab_data_set = tab_data_set.register(workspace=ws,\n", 28 | " name='BorrowerData',\n", 29 | " description='Borrower Data',\n", 30 | " tags = {'format':'CSV'},\n", 31 | " create_new_version=True)\n", 32 | "\n", 33 | "#Create a Tabular dataset from the path on the datastore\n", 34 | "tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'creditrisk-data/loan.csv'))\n", 35 | "\n", 36 | "tab_data_set = tab_data_set.register(workspace=ws,\n", 37 | " name='LoanData',\n", 38 | " description='Loans Data',\n", 39 | " tags = {'format':'CSV'},\n", 40 | " create_new_version=True)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "from azureml.core import Workspace, Dataset, Datastore, ScriptRunConfig, Experiment\n", 50 | "from azureml.data.data_reference import DataReference\n", 51 | "import os\n", 52 | "import azureml.dataprep as dprep\n", 53 | "import pandas as pd\n", 54 | "import numpy as np\n", 55 | "\n", 56 | "import azureml.core\n", 57 | "from azureml.core import Workspace\n", 58 | "\n", 59 | "ws = Workspace.from_config()\n", 60 | "\n", 61 | "borrowerData = Dataset.get_by_name(ws, name='BorrowerData')\n", 62 | "loanData = Dataset.get_by_name(ws, name='LoanData')" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "from azureml.core import Datastore\n", 72 | "from azureml.core.compute import AmlCompute, ComputeTarget\n", 73 | "\n", 74 | "datastore = ws.get_default_datastore()\n", 75 | "\n", 76 | "# Create a compute cluster\n", 77 | "compute_name = 'cpu-cluster'\n", 78 | "if not compute_name in ws.compute_targets :\n", 79 | " print('creating a new compute target...')\n", 80 | " provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS2_V2',\n", 81 | " min_nodes=0,\n", 82 | " max_nodes=1)\n", 83 | " compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)\n", 84 | "\n", 85 | " compute_target.wait_for_completion(\n", 86 | " show_output=True, min_node_count=None, timeout_in_minutes=20)\n", 87 | "\n", 88 | " # Show the result\n", 89 | " print(compute_target.get_status().serialize())\n", 90 | "\n", 91 | "compute_target = ws.compute_targets[compute_name]" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "from azureml.core.runconfig import RunConfiguration\n", 101 | "\n", 102 | "from azureml.core import Environment\n", 103 | "from azureml.core.conda_dependencies import CondaDependencies\n", 104 | "\n", 105 | "# Create a Python environment for the experiment\n", 106 | "creditrisk_env = Environment(\"creditrisk-pipeline-env\")\n", 107 | "creditrisk_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies\n", 108 | "creditrisk_env.docker.enabled = True # Use a docker container\n", 109 | "\n", 110 | "# Create a set of package dependencies\n", 111 | "creditrisk_packages = CondaDependencies.create(conda_packages=['scikit-learn','joblib','pandas','numpy','pip'],\n", 112 | " pip_packages=['azureml-defaults','azureml-dataprep[pandas]'])\n", 113 | "\n", 114 | "\n", 115 | "# Add the dependencies to the environment\n", 116 | "creditrisk_env.python.conda_dependencies = creditrisk_packages\n", 117 | "\n", 118 | "# Register the environment \n", 119 | 
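"# (registration stores the environment in the workspace so it can be retrieved later by name with Environment.get)\n",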
"creditrisk_env.register(workspace=ws)\n", 120 | "registered_env = Environment.get(ws, 'creditrisk-pipeline-env')\n", 121 | "\n", 122 | "# Create a new runconfig object for the pipeline\n", 123 | "aml_run_config = RunConfiguration()\n", 124 | "\n", 125 | "# Use the compute you created above. \n", 126 | "aml_run_config.target = compute_target\n", 127 | "\n", 128 | "# Assign the environment to the run configuration\n", 129 | "aml_run_config.environment = registered_env\n", 130 | "\n", 131 | "print (\"Run configuration created.\")" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "%%writefile PrepareData.py\n", 141 | "from azureml.core import Run\n", 142 | "\n", 143 | "import pandas as pd \n", 144 | "import numpy as np \n", 145 | "import argparse\n", 146 | "\n", 147 | "parser = argparse.ArgumentParser()\n", 148 | "parser.add_argument('--prepared_data', dest='prepared_data', required=True)\n", 149 | "args = parser.parse_args()\n", 150 | " \n", 151 | "borrowerData = Run.get_context().input_datasets['BorrowerData']\n", 152 | "loanData = Run.get_context().input_datasets['LoanData']\n", 153 | "\n", 154 | "df_borrower = borrowerData.to_pandas_dataframe()\n", 155 | "df_loan = loanData.to_pandas_dataframe()\n", 156 | "\n", 157 | "# Join data and do some transformations\n", 158 | "df_data = df_borrower.merge(df_loan,on='memberId',how='inner')\n", 159 | "df_data.shape\n", 160 | "\n", 161 | "df_data['homeOwnership'] = df_data['homeOwnership'].replace('nan', np.nan).fillna(0)\n", 162 | "df_data['isJointApplication'] = df_data['isJointApplication'].replace('nan', np.nan).fillna(0)\n", 163 | "\n", 164 | "drop_cols = ['memberId', 'loanId', 'date','grade','residentialState']\n", 165 | "df_data = df_data.drop(drop_cols, axis=1)\n", 166 | "\n", 167 | "df_data['loanStatus'] = np.where(df_data['loanStatus'] == 'Default', 1, 0) # change label column to 0/1\n", 168 | "\n", 169 | "df_data.to_csv(os.path.join(args.prepared_data,\"prepared_data.csv\"),index=False)\n", 170 | "\n", 171 | "print(f\"Wrote prepped data to {args.prepared_data}/prepared_data.csv\")" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "from azureml.data import OutputFileDatasetConfig\n", 181 | "from azureml.pipeline.steps import PythonScriptStep\n", 182 | "\n", 183 | "prepared_data = OutputFileDatasetConfig(name=\"prepared_data\")\n", 184 | "\n", 185 | "dataprep_step = PythonScriptStep(\n", 186 | " name=\"PrepareData\", \n", 187 | " script_name=\"PrepareData.py\", \n", 188 | " compute_target=compute_target, \n", 189 | " runconfig=aml_run_config,\n", 190 | " arguments=[\"--prepared_data\", prepared_data],\n", 191 | " inputs=[borrowerData.as_named_input('BorrowerData'),loanData.as_named_input('LoanData')],\n", 192 | " allow_reuse=True\n", 193 | ")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "# prepared_data = prepared_data_path.read_delimited_files()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "%%writefile TrainTestDataSplit.py\n", 212 | "from azureml.core import Run\n", 213 | "\n", 214 | "import pandas as pd \n", 215 | "import numpy as np \n", 216 | "import argparse\n", 217 | "\n", 218 | "parser = argparse.ArgumentParser()\n", 219 | 
"parser.add_argument('--prepared_data', dest='prepared_data', required=True)\n", 220 | "parser.add_argument('--train_data', dest='train_data', required=True)\n", 221 | "parser.add_argument('--test_data', dest='test_data', required=True)\n", 222 | "args = parser.parse_args()\n", 223 | "\n", 224 | "df_data = pd.read_csv(args.prepared_data + '/prepared_data.csv')\n", 225 | "\n", 226 | "df_train=df_data.sample(frac=0.8,random_state=200) #random state is a seed value\n", 227 | "df_train=df_data.drop(df_train.index)\n", 228 | "\n", 229 | "df_train.to_csv(os.path.join(args.train_data,\"train_data.csv\"),index=False)\n", 230 | "df_train.to_csv(os.path.join(args.test_data,\"test_data.csv\"),index=False)\n", 231 | "\n", 232 | "print(f\"Wrote prepped data to {args.train_data}/train_data.csv\")\n", 233 | "print(f\"Wrote prepped data to {args.test_data}/test_data.csv\")" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "# test train split the data \n", 243 | "train_data = OutputFileDatasetConfig(name=\"train_data\")\n", 244 | "test_data = OutputFileDatasetConfig(name=\"test_data\")\n", 245 | "\n", 246 | "test_train_step = PythonScriptStep(name = \"TestTrainDataSplit\",\n", 247 | " script_name =\"TrainTestDataSplit.py\",\n", 248 | " arguments = [\"--prepared_data\", prepared_data.as_input(),\n", 249 | " \"--train_data\", train_data,\n", 250 | " \"--test_data\", test_data],\n", 251 | " outputs = [train_data,test_data],\n", 252 | " compute_target = compute_target, \n", 253 | " runconfig = aml_run_config, \n", 254 | " allow_reuse = True\n", 255 | " )" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "training_data = train_data.read_delimited_files()\n", 265 | "training_data\n", 266 | "\n", 267 | "testing_data = test_data.read_delimited_files()\n", 268 | "testing_data" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "%%writefile TrainModel.py\n", 278 | "\n", 279 | "from azureml.core import Run\n", 280 | "from azureml.core.model import Model\n", 281 | "import joblib\n", 282 | "\n", 283 | "import pandas as pd \n", 284 | "import numpy as np \n", 285 | "import argparse\n", 286 | "\n", 287 | "from sklearn.linear_model import LogisticRegression\n", 288 | "\n", 289 | "import pandas as pd\n", 290 | "import numpy as np\n", 291 | "from sklearn.preprocessing import OneHotEncoder\n", 292 | "from sklearn.impute import SimpleImputer\n", 293 | "\n", 294 | "def creditrisk_onehot_encoder(df_data):\n", 295 | " catColumns = df_data.select_dtypes(['object']).columns\n", 296 | " df_data[catColumns] = df_data[catColumns].fillna(value='Unknown')\n", 297 | " \n", 298 | " df_data = df_data.fillna(df_data.mean())\n", 299 | " OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)\n", 300 | " OH_cols= pd.DataFrame(OH_encoder.fit_transform(df_data[catColumns]),columns = list(OH_encoder.get_feature_names(catColumns)))\n", 301 | " \n", 302 | " # Remove categorical columns (will replace with one-hot encoding)\n", 303 | " numeric_cols = df_data.drop(catColumns, axis=1)\n", 304 | " \n", 305 | " # Add one-hot encoded columns to numerical features\n", 306 | " df_result = pd.concat([numeric_cols, OH_cols], axis=1)\n", 307 | " \n", 308 | " # impute missing numeric values with mean\n", 309 | " fill_NaN = 
SimpleImputer(missing_values=np.nan, strategy='mean')\n", 310 | " imputed_df = pd.DataFrame(fill_NaN.fit_transform(df_result))\n", 311 | " imputed_df.columns = df_result.columns\n", 312 | " imputed_df.index = df_result.index\n", 313 | " df_result = imputed_df\n", 314 | "\n", 315 | " return(df_result)\n", 316 | "\n", 317 | "# Get the experiment run context\n", 318 | "run = Run.get_context()\n", 319 | "\n", 320 | "parser = argparse.ArgumentParser()\n", 321 | "parser.add_argument('--train_data', dest='train_data', required=True)\n", 322 | "parser.add_argument('--test_data', dest='test_data', required=True)\n", 323 | "parser.add_argument('--metrics_data', dest='metrics_data', required=True)\n", 324 | "parser.add_argument('--model_data', dest='model_data', required=True)\n", 325 | "args = parser.parse_args()\n", 326 | "\n", 327 | "df_train = pd.read_csv(args.train_data + '/train_data.csv')\n", 328 | "df_test = pd.read_csv(args.test_data + '/test_data.csv')\n", 329 | "\n", 330 | "df_train = creditrisk_onehot_encoder(df_train)\n", 331 | "df_test = creditrisk_onehot_encoder(df_test)\n", 332 | "\n", 333 | "cols = [col for col in df_train.columns if col not in [\"loanStatus\"]]\n", 334 | "\n", 335 | "clf = LogisticRegression()\n", 336 | "clf.fit(df_train[cols].values, df_train[\"loanStatus\"].values)\n", 337 | "\n", 338 | "print('predicting ...')\n", 339 | "y_hat = clf.predict(df_test[cols].astype(int).values)\n", 340 | "\n", 341 | "acc = np.average(y_hat == df_test[\"loanStatus\"].values)\n", 342 | "print('Accuracy is', acc)\n", 343 | "\n", 344 | "print(\"save model\")\n", 345 | "os.makedirs('models', exist_ok=True) \n", 346 | "joblib.dump(value=clf, filename= 'models/creditrisk_model.pkl')\n", 347 | "\n", 348 | "model = Model.register(model_path = 'models/creditrisk_model.pkl',\n", 349 | " model_name = 'creditrisk_model',\n", 350 | " description = 'creditrisk model',\n", 351 | " workspace = run.experiment.workspace,\n", 352 | " properties={'Accuracy': np.float(acc)})\n", 353 | "\n", 354 | "modeldata = []\n", 355 | "modeldata.append(('models/creditrisk_model.pkl','creditrisk_model'))\n", 356 | "df_model = pd.DataFrame(modeldata, columns=('modelfile', 'model_name'))\n", 357 | "\n", 358 | "metricsdata = []\n", 359 | "metricsdata.append(('Accuracy',acc))\n", 360 | "df_metrics = pd.DataFrame(metricsdata, columns=('Metric', 'Value'))\n", 361 | "\n", 362 | "df_model.to_csv(os.path.join(args.model_data,\"model_data.csv\"),index=False)\n", 363 | "df_metrics.to_csv(os.path.join(args.metrics_data,\"metrics_data.csv\"),index=False)\n", 364 | "\n", 365 | "print(f\"Wrote model data to {args.model_data}/model_data.csv\")\n", 366 | "print(f\"Wrote metrics data to {args.metrics_data}/metrics_data.csv\")\n" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "# train the model\n", 376 | "model_data = OutputFileDatasetConfig(name=\"model_data\")\n", 377 | "metrics_data = OutputFileDatasetConfig(name=\"metrics_data\")\n", 378 | "\n", 379 | "train_step = PythonScriptStep(name = \"TrainModel\",\n", 380 | " script_name =\"TrainModel.py\",\n", 381 | " arguments = [\"--train_data\", train_data.as_input(),\n", 382 | " \"--test_data\", test_data.as_input(),\n", 383 | " \"--model_data\", model_data,\n", 384 | " \"--metrics_data\", metrics_data],\n", 385 | " outputs = [model_data,metrics_data],\n", 386 | " compute_target = compute_target, \n", 387 | " runconfig = aml_run_config, \n", 388 | " allow_reuse = True\n", 389 | " )" 390 
| ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "%%writefile BatchInference.py\n", 399 | "from azureml.core import Run\n", 400 | "from azureml.core.model import Model\n", 401 | "import joblib\n", 402 | "\n", 403 | "import pandas as pd \n", 404 | "import numpy as np \n", 405 | "import argparse\n", 406 | "\n", 407 | "from sklearn.preprocessing import OneHotEncoder\n", 408 | "from sklearn.impute import SimpleImputer\n", 409 | "\n", 410 | "def creditrisk_onehot_encoder(df_data):\n", 411 | " catColumns = df_data.select_dtypes(['object']).columns\n", 412 | " df_data[catColumns] = df_data[catColumns].fillna(value='Unknown')\n", 413 | " \n", 414 | " df_data = df_data.fillna(df_data.mean())\n", 415 | " OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)\n", 416 | " OH_cols= pd.DataFrame(OH_encoder.fit_transform(df_data[catColumns]),columns = list(OH_encoder.get_feature_names(catColumns)))\n", 417 | " \n", 418 | " # Remove categorical columns (will replace with one-hot encoding)\n", 419 | " numeric_cols = df_data.drop(catColumns, axis=1)\n", 420 | " \n", 421 | " # Add one-hot encoded columns to numerical features\n", 422 | " df_result = pd.concat([numeric_cols, OH_cols], axis=1)\n", 423 | " \n", 424 | " # impute missing numeric values with mean\n", 425 | " fill_NaN = SimpleImputer(missing_values=np.nan, strategy='mean')\n", 426 | " imputed_df = pd.DataFrame(fill_NaN.fit_transform(df_result))\n", 427 | " imputed_df.columns = df_result.columns\n", 428 | " imputed_df.index = df_result.index\n", 429 | " df_result = imputed_df\n", 430 | "\n", 431 | " return(df_result)\n", 432 | "\n", 433 | "parser = argparse.ArgumentParser()\n", 434 | "parser.add_argument('--test_data', dest=\"test_data\", type=str, required=True)\n", 435 | "parser.add_argument('--model_data', dest=\"model_data\", type=str, required=True)\n", 436 | "parser.add_argument('--batchinfer_data', dest='batchinfer_data', required=True)\n", 437 | "\n", 438 | "args = parser.parse_args()\n", 439 | "\n", 440 | "# Get the experiment run context\n", 441 | "run = Run.get_context()\n", 442 | "\n", 443 | "df_model = pd.read_csv(args.model_data + '/model_data.csv')\n", 444 | "# model_path = Model.get_model_path(model_name = 'best_model_data')\n", 445 | "model_name = df_model['model_name'][0]\n", 446 | "\n", 447 | "model_path = Model.get_model_path(model_name=model_name, _workspace=run.experiment.workspace)\n", 448 | "model = joblib.load(model_path)\n", 449 | "\n", 450 | "df_test = pd.read_csv(args.test_data + '/test_data.csv')\n", 451 | "df_test = creditrisk_onehot_encoder(df_test)\n", 452 | "\n", 453 | "x_test = df_test.drop(['loanStatus'], axis=1)\n", 454 | "\n", 455 | "y_predict = model.predict(x_test)\n", 456 | "\n", 457 | "df_test['Prediction'] = y_predict\n", 458 | "\n", 459 | "df_test.to_csv(os.path.join(args.batchinfer_data,\"batchinfer_data.csv\"),index=False)\n", 460 | "\n", 461 | "print(f\"Wrote prediction data with to {args.batchinfer_data}/batchinfer_data.csv\")" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "from azureml.data import OutputFileDatasetConfig\n", 471 | "from azureml.pipeline.steps import PythonScriptStep\n", 472 | "\n", 473 | "batchinfer_data = OutputFileDatasetConfig(name=\"batchinfer_data\").register_on_complete(name=\"CreditRiskBatchInferenceData\",description = 'Batch Inference Data Output')\n", 474 | "\n", 475 
| "batchinfer_step = PythonScriptStep(\n", 476 | " name=\"RunBatchInference\", \n", 477 | " script_name=\"BatchInference.py\", \n", 478 | " compute_target=compute_target, \n", 479 | " runconfig=aml_run_config,\n", 480 | " arguments=[\"--test_data\", test_data.as_input(),\"--model_data\", model_data.as_input(),\"--batchinfer_data\", batchinfer_data],\n", 481 | " outputs = [batchinfer_data],\n", 482 | " allow_reuse=True\n", 483 | ")" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "from azureml.pipeline.core import Pipeline\n", 493 | "from azureml.core import Experiment\n", 494 | "\n", 495 | "pipeline = Pipeline(ws, [dataprep_step, test_train_step, train_step,batchinfer_step])\n", 496 | "\n", 497 | "experiment = Experiment(workspace=ws, name='CreditRiskPipeline')\n", 498 | "\n", 499 | "run = experiment.submit(pipeline, show_output=True)\n", 500 | "run.wait_for_completion()" 501 | ] 502 | } 503 | ], 504 | "metadata": { 505 | "kernelspec": { 506 | "display_name": "Python 3", 507 | "language": "python", 508 | "name": "python3" 509 | }, 510 | "language_info": { 511 | "codemirror_mode": { 512 | "name": "ipython", 513 | "version": 3 514 | }, 515 | "file_extension": ".py", 516 | "mimetype": "text/x-python", 517 | "name": "python", 518 | "nbconvert_exporter": "python", 519 | "pygments_lexer": "ipython3", 520 | "version": "3.7.4" 521 | } 522 | }, 523 | "nbformat": 4, 524 | "nbformat_minor": 5 525 | } 526 | -------------------------------------------------------------------------------- /AMLNotebooks/02_Create_CreditRisk_AML_Pipeline_Lineage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "pip install pyapacheatlas" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#!pip install pyapacheatlas #run this cell if the above cell runs into any issues" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from Authenticate_to_Purview_AML import *\n", 28 | "ws,guid,client = authentitae_to_purview_AML()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from Create_ML_Lineage_Types import *\n", 38 | "create_ml_lineage_types(client)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "%run Create_ML_Lineage_Functions" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "create_workspace_entities(ws)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 5, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "create_datastore_entities(ws)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "create_dataset_entities(ws)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "create_experiment_entities(ws)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 
90 | "outputs": [], 91 | "source": [ 92 | "## uncomment below code to link PowerBI Dataset and Report in lineage if you have access to a PBI workspace \n", 93 | "# #The PowerBI entities will populate with more details if you set up a scan for PBI workspaces in Purview\n", 94 | "# #We are just creating a placeholders and links for lineage below\n", 95 | "\n", 96 | "# #get batch inference data entity name and exprimentname\n", 97 | "# batchpred_data_ent_name = 'batchinfer_data.csv_CreditRiskPipeline'\n", 98 | "# experimentname = \"CreditRiskPipeline\"\n", 99 | "\n", 100 | "# #create PowerBI dataset entity and lineage \n", 101 | "# pbi_workspace = '' #'https://xxx.powerbi.com/groups/7c555287-f9b8-45ff-be6c-9909afe9df40'\n", 102 | "# pbi_datasetid = '' #'c4a30c22-466d-4a30-a1ac-8736ed6567cc' \n", 103 | "\n", 104 | "# pbidata_ent_name = 'creditriskpbidataset' \n", 105 | "# create_powerbi_dataset_and_lineage(experimentname,pbi_workspace,pbi_datasetid,pbidata_ent_name,batchpred_data_ent_name,'custom_ml_dataset')\n", 106 | "\n", 107 | "\n", 108 | "# #create PowerBI report entity and lineage\n", 109 | "# pbi_reportid = '' #'e495453d-6c0c-4fb9-bdc4-556319f6a57b'\n", 110 | "# pbi_ent_name = 'creditriskpbireport'\n", 111 | " \n", 112 | "# create_powerbi_report_and_lineage(experimentname,pbi_workspace,pbi_reportid,pbi_ent_name,pbi_datasetid)" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.7.4" 133 | }, 134 | "save_output": true, 135 | "synapse_widget": { 136 | "state": {}, 137 | "version": "0.1" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 2 142 | } 143 | -------------------------------------------------------------------------------- /AMLNotebooks/Authenticate_to_Purview_AML.py: -------------------------------------------------------------------------------- 1 | def authentitae_to_purview_AML(): 2 | from pyapacheatlas.auth import ServicePrincipalAuthentication 3 | from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess 4 | from pyapacheatlas.readers import ExcelConfiguration, ExcelReader 5 | from pyapacheatlas.core.util import GuidTracker 6 | from pyapacheatlas.core import AtlasAttributeDef, AtlasEntity, PurviewClient 7 | from pyapacheatlas.core.typedef import EntityTypeDef 8 | 9 | # get SPN details you created in step 2.1 of solution accelerator setup 10 | tenant_id = "" 11 | client_id = "" 12 | client_secret = "" 13 | 14 | # get Purview account name from azure portal 15 | purview_name = "" 16 | 17 | # get AML workspace details from azure portal 18 | subscription_id = "" 19 | resource_group = "" 20 | workspace_name = "" 21 | workspace_region = "" 22 | 23 | from pyapacheatlas.auth import ServicePrincipalAuthentication 24 | from pyapacheatlas.core import PurviewClient 25 | from pyapacheatlas.core.util import GuidTracker 26 | 27 | # Authenticate to your Atlas server using a Service Principal 28 | oauth = ServicePrincipalAuthentication( 29 | tenant_id= tenant_id, 30 | client_id= client_id, 31 | client_secret= client_secret 32 | ) 33 | client = PurviewClient( 34 | account_name = purview_name, 35 | authentication=oauth 36 | ) 37 | guid = GuidTracker() 38 | 
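# GuidTracker hands out temporary negative placeholder GUIDs for new entities; Purview assigns the real GUIDs when a batch is uploaded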
39 | 40 | # get SPN details you created in step 3.1 of solution accelerator setup 41 | aml_client_id = "" 42 | aml_client_secret = "" 43 | 44 | 45 | from azureml.core.authentication import ServicePrincipalAuthentication 46 | 47 | sp = ServicePrincipalAuthentication(tenant_id=tenant_id, 48 | service_principal_id=aml_client_id, 49 | service_principal_password=aml_client_secret) 50 | 51 | from azureml.core import Workspace 52 | 53 | ws = Workspace.get(name=workspace_name, 54 | resource_group = resource_group, 55 | auth=sp, 56 | subscription_id=subscription_id) 57 | return ws,guid,client 58 | -------------------------------------------------------------------------------- /AMLNotebooks/Create_ML_Lineage_Functions.py: -------------------------------------------------------------------------------- 1 | # + 2 | from pyapacheatlas.auth import ServicePrincipalAuthentication 3 | from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess 4 | from pyapacheatlas.readers import ExcelConfiguration, ExcelReader 5 | from pyapacheatlas.core.util import GuidTracker 6 | from pyapacheatlas.core import AtlasAttributeDef, AtlasEntity, PurviewClient 7 | from pyapacheatlas.core.typedef import EntityTypeDef 8 | 9 | from Authenticate_to_Purview_AML import * 10 | ws,guid,client = authentitae_to_purview_AML() 11 | 12 | def get_entity_details(qualifiedName,typeName): 13 | entities = client.get_entity( 14 | qualifiedName=[qualifiedName], 15 | typeName=typeName 16 | ) 17 | for entity in entities.get("entities"): 18 | entity = entity 19 | break 20 | return entity 21 | #get_entity_details('https://sampledataadls.dfs.core.windows.net/masterdata/employees.csv','azure_datalake_gen2_path') 22 | 23 | def get_entity_guid(qualifiedName,typeName): 24 | entities = client.get_entity( 25 | qualifiedName=[qualifiedName], 26 | typeName=typeName 27 | ) 28 | for entity in entities.get("entities"): 29 | entity_guid = entity.get("guid") 30 | break 31 | return entity_guid 32 | #get_entity_guid('https://sampledataadls.dfs.core.windows.net/creditriskdata/borrower.csv','azure_datalake_gen2_path') 33 | 34 | def get_entity_schema(guid): 35 | columns = [] 36 | results = client.get_entity(guid) 37 | for entity in results["entities"]: 38 | if "tabular_schema" in entity["relationshipAttributes"]: 39 | ts = entity["relationshipAttributes"]["tabular_schema"] 40 | ts_entity = client.get_entity(ts["guid"]) 41 | for schema in ts_entity["entities"]: 42 | for col in schema["relationshipAttributes"]["columns"]: 43 | if col['displayText'] != ':csv': 44 | columns.append(col['displayText']) 45 | return(columns) 46 | 47 | # ent_guid = 'a8698a33-9174-43cb-8835-26968862e2bf' 48 | # get_entity_schema(ent_guid) 49 | 50 | def create_data_entity_with_schema_and_parent(df_data,entityname,entitytype='custom_ml_dataset',parent_entityname=None,parent_entitytype='custom_ml_datastore'): 51 | # Create an asset for the output data schema. 
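# A dataset is modeled as three linked pieces: a tabular_schema entity, one column entity per DataFrame column (attached through the composeSchema relationship), and the dataset entity itself, which points at the schema via its tabular_schema relationship attribute.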
52 | output_schema_entity = AtlasEntity( 53 | name="schema-" + entityname, 54 | qualified_name = "pyapacheatlas://"+"schema-" + entityname, 55 | typeName="tabular_schema", 56 | guid=guid.get_guid() 57 | ) 58 | 59 | df_data_schema = pd.DataFrame(list(zip(list(df_data.columns), list(df_data.dtypes))),columns=['column','dtype']) 60 | 61 | #Iterate over the out data frame's columns and create entities 62 | output_entity_schema_columns = [] 63 | #for column in df.schema: 64 | for index, row in df_data_schema.iterrows(): 65 | temp_column = AtlasEntity( 66 | name = row.column, 67 | typeName = "column", 68 | qualified_name = "pyapacheatlas://schema-" + entityname + "#" + row.column, 69 | guid=guid.get_guid(), 70 | attributes = {"type":str(row.dtype),"description": row.column}, 71 | relationshipAttributes = {"composeSchema":output_schema_entity.to_json(minimum=True)} 72 | ) 73 | output_entity_schema_columns.append(temp_column) 74 | 75 | 76 | if parent_entityname: 77 | dstore_entity = get_entity_details("pyapacheatlas://"+parent_entityname, parent_entitytype) 78 | # Create a entity for dataset 79 | dataset_output_entity = AtlasEntity( 80 | name=entityname, 81 | typeName=entitytype, 82 | qualified_name="pyapacheatlas://" + entityname, 83 | guid = guid.get_guid(), 84 | relationshipAttributes = { 85 | "tabular_schema": output_schema_entity.to_json(minimum=True), 86 | "datastore":dstore_entity 87 | } 88 | ) 89 | else: 90 | # Create a entity for dataset 91 | dataset_output_entity = AtlasEntity( 92 | name=entityname, 93 | typeName=entitytype, 94 | qualified_name="pyapacheatlas://" + entityname, 95 | guid = guid.get_guid(), 96 | relationshipAttributes = { 97 | "tabular_schema": output_schema_entity.to_json(minimum=True) 98 | } 99 | ) 100 | 101 | # Prepare all the entities as a batch to be uploaded. 102 | batch = [dataset_output_entity, output_schema_entity] + output_entity_schema_columns 103 | batch 104 | 105 | # Upload all entities! 106 | client.upload_entities(batch=batch) 107 | 108 | def create_data_entity_with_schema(df_data,entityname,entitytype='custom_ml_dataset'): 109 | # Create an asset for the output data schema. 110 | output_schema_entity = AtlasEntity( 111 | name="schema-" + entityname, 112 | qualified_name = "pyapacheatlas://"+"schema-" + entityname, 113 | typeName="tabular_schema", 114 | guid=guid.get_guid() 115 | ) 116 | 117 | df_data_schema = pd.DataFrame(list(zip(list(df_data.columns), list(df_data.dtypes))),columns=['column','dtype']) 118 | 119 | #Iterate over the out data frame's columns and create entities 120 | output_entity_schema_columns = [] 121 | #for column in df.schema: 122 | for index, row in df_data_schema.iterrows(): 123 | temp_column = AtlasEntity( 124 | name = row.column, 125 | typeName = "column", 126 | qualified_name = "pyapacheatlas://schema-" + entityname + "#" + row.column, 127 | guid=guid.get_guid(), 128 | attributes = {"type":str(row.dtype),"description": row.column}, 129 | relationshipAttributes = {"composeSchema":output_schema_entity.to_json(minimum=True)} 130 | ) 131 | output_entity_schema_columns.append(temp_column) 132 | 133 | # Create a entity for dataset 134 | dataset_output_entity = AtlasEntity( 135 | name=entityname, 136 | typeName=entitytype, 137 | qualified_name="pyapacheatlas://" + entityname, 138 | guid = guid.get_guid(), 139 | relationshipAttributes = { 140 | "tabular_schema": output_schema_entity.to_json(minimum=True) 141 | } 142 | ) 143 | 144 | # Prepare all the entities as a batch to be uploaded. 
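# Sending the dataset, schema and column entities as one batch lets their placeholder GUIDs resolve against each other in a single upload.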
145 | batch = [dataset_output_entity, output_schema_entity] + output_entity_schema_columns 146 | batch 147 | 148 | # Upload all entities! 149 | client.upload_entities(batch=batch) 150 | 151 | def create_lineage_for_entities(experimentname,processname,in_ent_qns,out_ent_qns,process_type_name='Process',ColumnMapping=False): 152 | # create a process 153 | # inputs: list of (entity,type) tuples 154 | # outputs: list of (entity,type) tuples 155 | 156 | from pyapacheatlas.core import AtlasProcess 157 | 158 | in_ent_guids = [] 159 | for in_ent_qn in in_ent_qns: 160 | #print(in_ent_qn,in_ent_qns[in_ent_qn]) 161 | in_ent_guid = get_entity_guid(in_ent_qn,in_ent_qns[in_ent_qn]) 162 | in_ent_guids.append({'guid':in_ent_guid}) 163 | 164 | out_ent_guids = [] 165 | for out_ent_qn in out_ent_qns: 166 | #print(in_ent_qn,in_ent_qns[in_ent_qn]) 167 | out_ent_guid = get_entity_guid(out_ent_qn,out_ent_qns[out_ent_qn]) 168 | out_ent_guids.append({'guid':out_ent_guid}) 169 | 170 | process_name = experimentname + processname 171 | process_qn = "pyapacheatlas://" + process_name 172 | 173 | if ColumnMapping == False: 174 | process_type_name = process_type_name 175 | 176 | process = AtlasProcess( 177 | name=process_name, 178 | typeName=process_type_name, 179 | qualified_name=process_qn, 180 | inputs = in_ent_guids, 181 | outputs = out_ent_guids, 182 | guid=guid.get_guid() 183 | ) 184 | else: 185 | process_type_name = "ProcessWithColumnMapping" 186 | 187 | column_mapping_attributes = [] 188 | for in_ent_qn in in_ent_qns: 189 | cl_mapping = [] 190 | in_ent_columns = get_entity_schema(get_entity_guid(in_ent_qn,in_ent_qns[in_ent_qn])) 191 | for in_col in in_ent_columns: 192 | cl_mapping.append({"Source":in_col,"Sink":in_col}) 193 | #break 194 | mapping = { 195 | 'DatasetMapping': {'Source':in_ent_qn,'Sink':list(out_ent_qns.keys())[0]}, 196 | 'ColumnMapping': cl_mapping 197 | } 198 | column_mapping_attributes.append(mapping) 199 | 200 | process = AtlasProcess( 201 | name=process_name, 202 | typeName=process_type_name, 203 | qualified_name=process_qn, 204 | inputs = in_ent_guids, 205 | outputs = out_ent_guids, 206 | guid=guid.get_guid(), 207 | attributes={"columnMapping":json.dumps(column_mapping_attributes)} 208 | ) 209 | 210 | # Prepare all the entities as a batch to be uploaded. 211 | batch = [process] 212 | batch 213 | 214 | # Upload all entities! 215 | client.upload_entities(batch=batch) 216 | 217 | def create_entity(name,typeName,config_attibutes): 218 | # Create an entity 219 | name = name 220 | qn = "pyapacheatlas://" + name 221 | 222 | exp_config_entity = AtlasEntity( 223 | name=name, 224 | typeName=typeName, 225 | qualified_name=qn, 226 | guid = guid.get_guid(), 227 | attributes = config_attibutes 228 | ) 229 | 230 | # Upload all entities! 
231 | client.upload_entities(batch=[exp_config_entity.to_json()]) 232 | 233 | 234 | def get_dataset_details(indataset,experiment_name=''): 235 | result = [] 236 | #print(indataset) 237 | if 'FileDataset' in str(type((indataset))): 238 | dssource = eval(json.loads(str(indataset).replace('FileDataset',''))['source'][0]) 239 | sourcestore = dssource[0] 240 | sourcepath = dssource[1] 241 | sourcepathfiles = indataset.to_path() 242 | for sourcepathfile in sourcepathfiles: 243 | entityname = sourcepath.split('/')[-1] + sourcepathfile.replace('/','_') #.replace('.parquet','').replace('.csv','') 244 | #print('\nFileDataset:',entityname) 245 | 246 | dsdatastore = Datastore.get(ws, sourcestore) 247 | datastore_path = [DataPath(dsdatastore, sourcepath+sourcepathfile.replace('/',''))] 248 | 249 | if '.parquet' in sourcepathfile: 250 | tabular_dataset = Dataset.Tabular.from_parquet_files(path=datastore_path) 251 | df_data = tabular_dataset.take(10).to_pandas_dataframe() 252 | 253 | elif '.csv' in sourcepathfile: 254 | tabular_dataset = Dataset.Tabular.from_delimited_files(path=datastore_path,encoding ='iso88591') 255 | #'utf8', 'iso88591', 'latin1', 'ascii', 'utf16', 'utf32', 'utf8bom' and 'windows1252' 256 | df_data = tabular_dataset.take(10).to_pandas_dataframe() 257 | 258 | if experiment_name != '': 259 | result.append((entityname + '_' + experiment_name,df_data)) 260 | else: 261 | result.append((entityname,df_data)) 262 | 263 | elif 'TabularDataset' in str(type((indataset))): 264 | tabular_dataset = indataset 265 | entityname = json.loads(str(indataset).replace('TabularDataset',''))['registration']['name'] 266 | 267 | # dataset = Dataset.get_by_name(ws, name=entityname) 268 | # try: 269 | # sourcestore = json.loads(dataset._definition)['blocks'][0]['arguments']['datastore']['datastoreName'] 270 | # except: 271 | # sourcestore = json.loads(dataset._definition)['blocks'][0]['arguments']['datastores'][0]['datastoreName'] 272 | df_data = tabular_dataset.take(10).to_pandas_dataframe() 273 | #print('TabularDataset:', entityname) 274 | result.append((entityname,df_data)) 275 | return result 276 | 277 | 278 | from azureml.core import Experiment 279 | from azureml.pipeline.core import PipelineRun 280 | 281 | from azureml.core import Workspace, Datastore, Dataset 282 | from azureml.data.datapath import DataPath 283 | import json 284 | import pandas as pd 285 | 286 | def create_aml_experiment_steps(ws,experiment_name): 287 | experiments_lst = Experiment.list(ws) 288 | for experiment in experiments_lst: 289 | if experiment.name == experiment_name: 290 | print(experiment) 291 | exp = Experiment(ws,experiment.name) 292 | for run in exp.get_runs(): 293 | rundetails = run.get_details() 294 | 295 | if rundetails['status'] != 'Completed': #continue until we find a completed run 296 | continue 297 | pipeline_run = PipelineRun(exp, rundetails['runId']) 298 | 299 | steps = pipeline_run.get_steps() 300 | for step_run in steps: 301 | step_run_details = step_run.get_details_with_logs() 302 | 303 | #print(step_run_details['runDefinition']['script']) 304 | 305 | purview_basepath = 'pyapacheatlas://' 306 | in_ent_qns = {} 307 | out_ent_qns = {} 308 | 309 | step_name = step_run.name #step_run_details['runDefinition']['script'] 310 | 311 | #print('\n Input Datasets:\n') 312 | for indataset in step_run_details['inputDatasets']: 313 | in_result = get_dataset_details(indataset['dataset'],experiment_name) 314 | #print(in_result) 315 | #create entities 316 | for in_res in in_result: 317 | data_ent_name = in_res[0].strip('_') 318 | 
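# create (or refresh) a custom_ml_dataset entity for this input, then record its qualified name so it is wired up as a lineage input of the step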
create_data_entity_with_schema(in_res[1],data_ent_name,'custom_ml_dataset') 319 | in_ent_qns[purview_basepath + data_ent_name] = 'custom_ml_dataset' 320 | #break 321 | #print('\n Output Datasets:\n') 322 | for outdataset in step_run_details['outputDatasets']: 323 | out_result = get_dataset_details(outdataset['dataset'],experiment_name) 324 | #print(out_result) 325 | #create entities 326 | for out_res in out_result: 327 | data_ent_name = out_res[0].strip('_') 328 | create_data_entity_with_schema(out_res[1],data_ent_name,'custom_ml_dataset') 329 | out_ent_qns[purview_basepath + data_ent_name] = 'custom_ml_dataset' 330 | #break 331 | #print(in_ent_qns,out_ent_qns) 332 | create_lineage_for_entities(experiment_name + '_',step_name, in_ent_qns,out_ent_qns,process_type_name='custom_ml_experiment_step',ColumnMapping=False) 333 | #break 334 | 335 | break # break after processing one completed run 336 | break #after finding the experiment 337 | 338 | 339 | #create workspace entity 340 | def create_workspace_entities(ws): 341 | 342 | config_attibutes={} 343 | temp_column={} 344 | 345 | temp_column['name'] = ws.name 346 | config_attibutes.update(temp_column) 347 | temp_column['subscription_id'] = ws.subscription_id 348 | config_attibutes.update(temp_column) 349 | temp_column['resource_group'] = ws.resource_group 350 | config_attibutes.update(temp_column) 351 | 352 | create_entity(ws.name,'custom_ml_workspace',config_attibutes) 353 | #break 354 | 355 | #create all datastore entities 356 | def create_datastore_entities(ws): 357 | for datastore in ws.datastores.values(): 358 | config_attibutes={} 359 | temp_column={} 360 | 361 | temp_column['name'] = datastore.name 362 | config_attibutes.update(temp_column) 363 | 364 | if ('AzureDataLakeGen2Datastore' in str(type(datastore))) or ('AzureBlobDatastore' in str(type(datastore))): 365 | temp_column['container_name'] = datastore.container_name 366 | config_attibutes.update(temp_column) 367 | temp_column['account_name'] = datastore.account_name 368 | config_attibutes.update(temp_column) 369 | temp_column['protocol'] = datastore.protocol 370 | config_attibutes.update(temp_column) 371 | temp_column['endpoint'] = datastore.endpoint 372 | config_attibutes.update(temp_column) 373 | elif 'AzureSqlDatabaseDatastore' in str(type(datastore)): 374 | #print('sql',datastore.server_name) 375 | temp_column['server_name'] = datastore.server_name 376 | config_attibutes.update(temp_column) 377 | temp_column['database_name'] = datastore.database_name 378 | config_attibutes.update(temp_column) 379 | elif 'AzureBlobDatastore' in str(type(datastore)): 380 | pass 381 | 382 | create_entity(datastore.name,'custom_ml_datastore',config_attibutes) 383 | #break 384 | 385 | #create workspace and datastore relationship 386 | purview_basepath = 'pyapacheatlas://' 387 | for datastore in ws.datastores.values(): 388 | relationshiptype = 'custom_ml_workspace_datastore' 389 | end1type = 'custom_ml_workspace' 390 | end2type = 'custom_ml_datastore' 391 | end1_qn = purview_basepath + ws.name 392 | end2_qn = purview_basepath + datastore.name 393 | try: 394 | create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn) 395 | except: 396 | pass # ignore if relationship exists 397 | 398 | #create all dataset entities (with datastore as parent) 399 | from azureml.core import Workspace, Datastore, Dataset 400 | import pandas as pd 401 | def create_dataset_entities(ws,parent_flag=True): 402 | purview_basepath = 'pyapacheatlas://' 403 | for dsname in ws.datasets: 404 | dataset = 
ws.datasets[dsname] 405 | try: 406 | if 'FileDataset' in str(type((dataset))): 407 | datasetsource = eval(json.loads(str(dataset).replace('FileDataset',''))['source'][0])[0] 408 | elif 'TabularDataset' in str(type((dataset))): 409 | datasetsource = eval(json.loads(str(dataset).replace('TabularDataset',''))['source'][0])[0] 410 | 411 | dsdetails = get_dataset_details(dataset) 412 | #print(dsdetails) 413 | for ds in dsdetails: 414 | if parent_flag == False: 415 | 416 | create_data_entity_with_schema(ds[1],dsname,'custom_ml_dataset') 417 | create_lineage_for_entities('',('register_' + dsname), {(purview_basepath+datasetsource):'custom_ml_datastore'}, 418 | {(purview_basepath+ds[0]):'custom_ml_dataset'},ColumnMapping=False) 419 | else: 420 | create_data_entity_with_schema_and_parent(ds[1],dsname,entitytype='custom_ml_dataset', 421 | parent_entityname=datasetsource,parent_entitytype='custom_ml_datastore') 422 | except: 423 | print('Error:',dsname) 424 | #break 425 | 426 | 427 | #create experiment entity 428 | from azureml.core import Experiment 429 | 430 | def create_experiment_entities(ws): 431 | for experiment in Experiment.list(ws): 432 | #create experiment entity 433 | config_attibutes={} 434 | temp_column={} 435 | 436 | temp_column['name'] = experiment.name 437 | config_attibutes.update(temp_column) 438 | 439 | create_entity(experiment.name,'custom_ml_experiment',config_attibutes) 440 | #break 441 | 442 | purview_basepath = 'pyapacheatlas://' 443 | 444 | #create experiment relationship to workspace 445 | relationshiptype = 'custom_ml_workspace_experiment' 446 | end1type = 'custom_ml_workspace' 447 | end2type = 'custom_ml_experiment' 448 | end1_qn = purview_basepath + ws.name 449 | end2_qn = purview_basepath + experiment.name 450 | try: 451 | create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn) 452 | except: 453 | pass # ignore if relationship exists 454 | 455 | for run in experiment.get_runs(): 456 | rundetails = run.get_details() 457 | #print(rundetails) 458 | if rundetails['status'] != 'Completed': #continue until we find a completed run 459 | continue 460 | #create experiment steps 461 | if rundetails['properties']['azureml.runsource'] == 'azureml.PipelineRun': 462 | print(experiment.name) 463 | create_aml_experiment_steps(ws,experiment.name) 464 | 465 | pipeline_run = PipelineRun(experiment, rundetails['runId']) 466 | 467 | steps = pipeline_run.get_steps() 468 | for step_run in steps: 469 | #create experiment relationship to workspace 470 | relationshiptype = 'custom_ml_experiment_to_experimentstep' 471 | end1type = 'custom_ml_experiment' 472 | end2type = 'custom_ml_experiment_step' 473 | end1_qn = purview_basepath + experiment.name 474 | end2_qn = purview_basepath + experiment.name + '_' + step_run.name 475 | try: 476 | create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn) 477 | except: 478 | pass # ignore if relationship exists 479 | 480 | break # break after processing one completed run 481 | #break 482 | 483 | def create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn): 484 | relationship = {} 485 | end1 = {} 486 | end2 = {} 487 | 488 | end1["guid"] = get_entity_guid(end1_qn,end1type) 489 | end1["typeName"] = end1type 490 | end1["uniqueAttributes"] = {"qualifiedName": end1_qn} 491 | 492 | end2["guid"] = get_entity_guid(end2_qn,end2type) 493 | end2["typeName"] = end2type 494 | end2["uniqueAttributes"] = {"qualifiedName": end2_qn} 495 | 496 | relationship["typeName"] = relationshiptype 497 | 
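# assemble the raw Atlas relationship payload: each end carries the entity guid, type name and qualified name, and upload_relationship creates the link in Purview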
relationship["attributes"] = {} 498 | relationship["guid"] = guid.get_guid() 499 | relationship["provenanceType"] = 0 500 | relationship["end1"] = end1 501 | relationship["end2"] = end2 502 | relationship 503 | 504 | client.upload_relationship(relationship) 505 | 506 | def create_package_entities(experimentname,packageslist): 507 | packages_name = experimentname + '-packages' 508 | packages_qn = "pyapacheatlas://" + packages_name 509 | 510 | # Create an asset for the packages. 511 | packages_entity = AtlasEntity( 512 | name = packages_name, 513 | qualified_name = packages_qn, 514 | typeName="custom_ml_packages", 515 | attributes = {"notes":"test note"}, 516 | guid=guid.get_guid() 517 | ) 518 | 519 | packages_entity.to_json(minimum=True) 520 | 521 | atlas_packages = [] 522 | relationships = [] 523 | for package in packageslist: 524 | package_attibutes={} 525 | temp_column={} 526 | temp_column['programming_language'] = str(package[0]) 527 | package_attibutes.update(temp_column) 528 | temp_column['package_name'] = str(package[1]) 529 | package_attibutes.update(temp_column) 530 | temp_column['version'] = str(package[2]) 531 | package_attibutes.update(temp_column) 532 | temp_column['notes'] = str(package[3]) 533 | package_attibutes.update(temp_column) 534 | 535 | # Create an entity for each package 536 | name = str(package[1]) #experimentname + '-package-' + package[1] 537 | qn = packages_qn + '#' + str(package[1]) #"pyapacheatlas://" + name 538 | 539 | package_entity = AtlasEntity( 540 | name= name, 541 | typeName="custom_ml_package", 542 | qualified_name=qn, 543 | guid = guid.get_guid(), 544 | attributes = package_attibutes, 545 | relationshipAttributes = {"packages":packages_entity.to_json(minimum=True)} 546 | ) 547 | atlas_packages.append(package_entity) 548 | 549 | atlas_packages 550 | 551 | # Prepare all the entities as a batch to be uploaded. 552 | batch = [packages_entity] + atlas_packages 553 | client.upload_entities(batch=batch) 554 | 555 | def create_experiment_config_entity(ws,experiment_name,automl_run): 556 | # Get experiment config from AML run 557 | import json 558 | import pandas as pd 559 | run_properties = automl_run.get_properties() 560 | run_properties 561 | 562 | AMLSettingsJsonString = run_properties['AMLSettingsJsonString'] 563 | AMLSettings = json.loads(AMLSettingsJsonString) 564 | 565 | df_config = pd.DataFrame(list(AMLSettings.items()),columns = ['key','value']) 566 | 567 | keys = ['task_type','enable_early_stopping','experiment_timeout_minutes','primary_metric','compute_target','label_column_name','n_cross_validations','model_explainability'] 568 | 569 | df_config = df_config[df_config['key'].isin(keys)] 570 | 571 | dict_config = df_config.to_dict(orient = 'records') 572 | dict_config 573 | 574 | config_attibutes={} 575 | for attibutes in dict_config: 576 | temp_column={} 577 | temp_column[attibutes['key']] = attibutes['value'] 578 | config_attibutes.update(temp_column) 579 | config_attibutes 580 | 581 | # Create a entity for exp config 582 | name = experiment_name + "-config" 583 | qn = "pyapacheatlas://" + name 584 | 585 | exp_config_entity = AtlasEntity( 586 | name=name, 587 | typeName="custom_ml_exp_config", 588 | qualified_name=qn, 589 | guid = guid.get_guid(), 590 | attributes = config_attibutes 591 | ) 592 | 593 | # Upload all entities! 
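# upload_entities accepts AtlasEntity objects as well as their to_json() dicts, so a single entity can be sent as a one-item batch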
594 | client.upload_entities(batch=[exp_config_entity.to_json()]) 595 | 596 | def create_model_entity(ws,experiment_name,modelname): 597 | # get deployed model 598 | from azureml.core.model import Model 599 | model = Model(ws, modelname) 600 | 601 | config_attibutes={} 602 | temp_column={} 603 | temp_column['workspace_name'] = model.workspace.name 604 | config_attibutes.update(temp_column) 605 | temp_column['workspace_subscription_id'] = model.workspace.subscription_id 606 | config_attibutes.update(temp_column) 607 | temp_column['workspace_subscription_id'] = model.workspace.subscription_id 608 | config_attibutes.update(temp_column) 609 | temp_column['workspace_resource_group'] = model.workspace.resource_group 610 | config_attibutes.update(temp_column) 611 | temp_column['name'] = model.name 612 | config_attibutes.update(temp_column) 613 | temp_column['id'] = model.id 614 | config_attibutes.update(temp_column) 615 | temp_column['version'] = model.version 616 | config_attibutes.update(temp_column) 617 | temp_column['tags'] = model.tags 618 | config_attibutes.update(temp_column) 619 | temp_column['properties'] = model.properties 620 | config_attibutes.update(temp_column) 621 | 622 | # Create a entity for Model 623 | name = modelname 624 | qn = "pyapacheatlas://" + name 625 | 626 | exp_config_entity = AtlasEntity( 627 | name=name, 628 | typeName="custom_ml_model", 629 | qualified_name=qn, 630 | guid = guid.get_guid(), 631 | attributes = config_attibutes 632 | ) 633 | 634 | # Upload all entities! 635 | client.upload_entities(batch=[exp_config_entity.to_json()]) 636 | 637 | def create_model_metrics_entity(experiment_name,best_run): 638 | metrics = best_run.get_metrics() 639 | 640 | # select relevant metrics 641 | auc = metrics.get('AUC_weighted') 642 | accuracy = metrics.get('accuracy') 643 | precision = metrics.get('precision_score_weighted') 644 | recall = metrics.get('recall_score_weighted') 645 | f1 = metrics.get('f1_score_weighted') 646 | 647 | # # combine into single dataframe 648 | # metrics_df = sc.parallelize([['AUC', auc], ['Accuracy', accuracy], ['Precision', precision], ['Recall', recall], ['F1', f1]]).toDF(('Metric', 'Value')) 649 | metrics = ['AUC','Accuracy','Precision','Recall','F1'] 650 | metricslist= [auc,accuracy,precision,recall,f1] 651 | columns = ['Metric','Value'] 652 | metrics_df = pd.DataFrame(zip(metrics, metricslist),columns=columns) 653 | 654 | 655 | dict_metrics = metrics_df.to_dict(orient = 'records') 656 | dict_metrics 657 | 658 | config_attibutes={} 659 | for attibutes in dict_metrics: 660 | temp_column={} 661 | temp_column[attibutes['Metric']] = attibutes['Value'] 662 | config_attibutes.update(temp_column) 663 | config_attibutes 664 | 665 | name = experiment_name + "-modelmetrics" 666 | qn = "pyapacheatlas://" + name 667 | 668 | # Create a entity for model metrics 669 | exp_config_entity = AtlasEntity( 670 | name=name, 671 | typeName="custom_ml_model_metrics", 672 | qualified_name=qn, 673 | guid = guid.get_guid(), 674 | attributes = config_attibutes 675 | ) 676 | 677 | # Upload all entities! 
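# Usage sketch (illustrative; variable names are placeholders): once an AutoML run
# completes, record the best child run's metrics alongside the registered model.
#
#   best_run, fitted_model = automl_run.get_output()
#   create_model_metrics_entity("creditrisk-experiment", best_run)
#
# The metric keys read above (AUC_weighted, accuracy, precision_score_weighted,
# recall_score_weighted, f1_score_weighted) are the names AutoML logs for
# classification runs; metrics.get() returns None for anything the run did not log,
# so some attributes may be empty. The renamed columns match the
# custom_ml_model_metrics type, which declares AUC/Accuracy/Precision/Recall/F1 as
# float attributes.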
678 | client.upload_entities(batch=[exp_config_entity.to_json()]) 679 | 680 | def create_experiment_lineage(experimentname,exp_data_qn,exp_config_qn,model_metrics_qn,model_qn): 681 | # create experiment process 682 | # inputs: prepareddata, modelconfig 683 | # outputs: model metrics and registered model 684 | 685 | from pyapacheatlas.core import AtlasProcess 686 | 687 | in_data_ent_guid = get_entity_guid(exp_data_qn,'custom_dataset') 688 | in_exp_config_guid = get_entity_guid(exp_config_qn,'custom_ml_exp_config') 689 | out_model_metrics_guid = get_entity_guid(model_metrics_qn,'custom_ml_model_metrics') 690 | out_model_guid = get_entity_guid(model_qn,'custom_ml_model') 691 | 692 | process_name = experimentname + '-train' 693 | process_qn = "pyapacheatlas://" + process_name 694 | process_type_name = "Process" 695 | 696 | process = AtlasProcess( 697 | name=process_name, 698 | typeName=process_type_name, 699 | qualified_name=process_qn, 700 | inputs = [{"guid":in_data_ent_guid},{"guid":in_exp_config_guid}], 701 | outputs = [{"guid":out_model_metrics_guid},{"guid":out_model_guid}], 702 | guid=guid.get_guid() 703 | ) 704 | 705 | # Prepare all the entities as a batch to be uploaded. 706 | batch = [process] 707 | batch 708 | 709 | # Upload all entities! 710 | client.upload_entities(batch=batch) 711 | 712 | def create_model_service_entity(ws,experimentname,aci_service_name,samplejson): 713 | # get deployed ACI Web Service 714 | from azureml.core.webservice import AciWebservice 715 | aciws = AciWebservice(ws, aci_service_name) 716 | 717 | config_attibutes={} 718 | temp_column={} 719 | temp_column['workspace_name'] = aciws.workspace.name 720 | config_attibutes.update(temp_column) 721 | temp_column['workspace_subscription_id'] = aciws.workspace.subscription_id 722 | config_attibutes.update(temp_column) 723 | temp_column['workspace_resource_group'] = aciws.workspace.resource_group 724 | config_attibutes.update(temp_column) 725 | temp_column['name'] = aciws.name 726 | config_attibutes.update(temp_column) 727 | temp_column['image_id'] = aciws.image_id 728 | config_attibutes.update(temp_column) 729 | temp_column['compute_type'] = aciws.compute_type 730 | config_attibutes.update(temp_column) 731 | temp_column['state'] = aciws.state 732 | config_attibutes.update(temp_column) 733 | temp_column['scoring_uri'] = aciws.scoring_uri 734 | config_attibutes.update(temp_column) 735 | temp_column['tags'] = aciws.tags 736 | config_attibutes.update(temp_column) 737 | temp_column['state'] = aciws.state 738 | config_attibutes.update(temp_column) 739 | temp_column['properties'] = aciws.properties 740 | config_attibutes.update(temp_column) 741 | temp_column['created_by'] = aciws.created_by 742 | config_attibutes.update(temp_column) 743 | temp_column['sample_json'] = samplejson 744 | config_attibutes.update(temp_column) 745 | 746 | name = experimentname + "-model_endpoint" 747 | qn = "pyapacheatlas://" + name 748 | 749 | # Create a entity for ACI Web Service 750 | endpoint_entity = AtlasEntity( 751 | name=name, 752 | typeName="custom_ml_model_endpoint", 753 | qualified_name=qn, 754 | guid = guid.get_guid(), 755 | attributes = config_attibutes 756 | ) 757 | 758 | # Upload all entities! 
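# Usage sketch (illustrative; the service name and payload are placeholders): after
# deploying the registered model to Azure Container Instances, record the endpoint
# together with a sample scoring request.
#
#   sample_json = '{"data": [[35, 60000, 0.4, 12000]]}'   # hypothetical input row
#   create_model_service_entity(ws, "creditrisk-experiment",
#                               "creditrisk-aci-service", sample_json)
#
# AciWebservice(ws, aci_service_name) fails if no service with that name exists in
# the workspace, so call this helper only after the deployment has completed.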
759 | client.upload_entities(batch=[endpoint_entity.to_json()]) 760 | 761 | def create_powerbi_dataset_and_lineage(experiment_name,pbi_workspace,pbi_datasetid,pbidata_ent_name,ml_dataset_ent_name,ml_dataset_ent_type): 762 | 763 | pbidata_entity_type = 'powerbi_dataset' 764 | pbidata_ent_qn = pbi_workspace + '/datasets/' + pbi_datasetid 765 | purview_basepath = 'pyapacheatlas://' 766 | #"https://msit.powerbi.com/groups/7d666287-f9b8-45ff-be6c-9909afe9df40/datasets/e5a30c22-466d-4a30-a1ac-8736ed6567cc" 767 | 768 | pbidata_ent = AtlasEntity( 769 | name=pbidata_ent_name, 770 | typeName=pbidata_entity_type, 771 | qualified_name= pbidata_ent_qn, 772 | workspace = pbi_workspace, 773 | guid = guid.get_guid() 774 | ) 775 | 776 | # Prepare all the entities as a batch to be uploaded. 777 | batch = [pbidata_ent] 778 | batch 779 | 780 | # Upload all entities! 781 | client.upload_entities(batch=batch) 782 | 783 | #cretae powerbi_dataset_process lineage 784 | in_ent_guids = [] 785 | in_ent_guid = get_entity_guid(purview_basepath + ml_dataset_ent_name,ml_dataset_ent_type) 786 | in_ent_guids.append({'guid':in_ent_guid}) 787 | 788 | out_ent_guids = [] 789 | out_ent_guid = get_entity_guid(pbidata_ent_qn,pbidata_entity_type) 790 | out_ent_guids.append({'guid':out_ent_guid}) 791 | 792 | process_name = 'createpowerbidataset' + pbidata_ent_name + experiment_name 793 | process_qn = "pyapacheatlas://" + process_name 794 | process_type_name = "powerbi_dataset_process" 795 | 796 | process = AtlasProcess( 797 | name=process_name, 798 | typeName=process_type_name, 799 | qualified_name=process_qn, 800 | inputs = in_ent_guids, 801 | outputs = out_ent_guids, 802 | guid=guid.get_guid() 803 | ) 804 | 805 | # Prepare all the entities as a batch to be uploaded. 806 | batch = [process] 807 | batch 808 | 809 | # Upload all entities! 810 | client.upload_entities(batch=batch) 811 | 812 | def create_powerbi_report_and_lineage(experiment_name,pbi_workspace,pbi_reportid,pbi_ent_name,pbi_datasetid): 813 | 814 | #create powerbi report 815 | pbi_entity_type = 'powerbi_report' 816 | pbi_ent_qn = pbi_workspace + '/reports/' + pbi_reportid 817 | purview_basepath = 'pyapacheatlas://' 818 | 819 | pbi_ent = AtlasEntity( 820 | name=pbi_ent_name, 821 | typeName=pbi_entity_type, 822 | qualified_name= pbi_ent_qn, 823 | workspace = pbi_workspace, 824 | guid = guid.get_guid() 825 | ) 826 | 827 | # Prepare all the entities as a batch to be uploaded. 828 | batch = [pbi_ent] 829 | batch 830 | 831 | # Upload all entities! 832 | client.upload_entities(batch=batch) 833 | 834 | #create powerbi dashboard process lineage 835 | pbidata_ent_qn = pbi_workspace + '/datasets/' + pbi_datasetid 836 | in_ent_guids = [] 837 | in_ent_guid = get_entity_guid(pbidata_ent_qn,'powerbi_dataset') 838 | in_ent_guids.append({'guid':in_ent_guid}) 839 | 840 | out_ent_guids = [] 841 | out_ent_guid = get_entity_guid(pbi_ent_qn,'powerbi_report') 842 | out_ent_guids.append({'guid':out_ent_guid}) 843 | 844 | process_name = 'createpowerbireport' + pbi_ent_name + experiment_name 845 | process_qn = "pyapacheatlas://" + process_name 846 | process_type_name = "powerbi_report_process" 847 | 848 | process = AtlasProcess( 849 | name=process_name, 850 | typeName=process_type_name, 851 | qualified_name=process_qn, 852 | inputs = in_ent_guids, 853 | outputs = out_ent_guids, 854 | guid=guid.get_guid() 855 | ) 856 | 857 | # Prepare all the entities as a batch to be uploaded. 858 | batch = [process] 859 | batch 860 | 861 | # Upload all entities! 
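# Usage sketch (illustrative; the workspace URL, GUIDs and entity names are
# placeholders): connect a scored dataset to the Power BI dataset and report built
# on top of it.
#
#   pbi_ws = "https://msit.powerbi.com/groups/<workspace-guid>"
#   create_powerbi_dataset_and_lineage("creditrisk-experiment", pbi_ws,
#       "<dataset-guid>", "creditrisk-predictions",
#       "creditrisk_scored_data", "custom_dataset")
#   create_powerbi_report_and_lineage("creditrisk-experiment", pbi_ws,
#       "<report-guid>", "CreditRisk Report", "<dataset-guid>")
#
# Qualified names are built as <workspace>/datasets/<id> and <workspace>/reports/<id>,
# mirroring the example URL shown earlier in create_powerbi_dataset_and_lineage, so
# this custom lineage can point at the same assets a Purview Power BI scan would
# register.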
862 | client.upload_entities(batch=batch) 863 | 864 | # clean up datasets 865 | def cleanup_entities(typename, entitytype): 866 | filter_setup = {"typeName": typename, "includeSubTypes": True} 867 | search = client.search_entities("*", search_filter=filter_setup) 868 | for entity in search: 869 | #print(entity) 870 | if entity.get("entityType") == entitytype: 871 | print(entity.get("id"),entity.get("qualifiedName"),entity.get("entityType")) 872 | guid = entity.get("id") 873 | client.delete_entity(guid=guid) 874 | 875 | -------------------------------------------------------------------------------- /AMLNotebooks/Create_ML_Lineage_Types.py: -------------------------------------------------------------------------------- 1 | def create_ml_lineage_types(client): 2 | from pyapacheatlas.core.typedef import AtlasAttributeDef, EntityTypeDef, RelationshipTypeDef 3 | try: 4 | #-----------------------------------------------------------------------------------# 5 | #create custom dataset type 6 | type_df = EntityTypeDef( 7 | name="custom_dataset", 8 | attributeDefs=[ 9 | AtlasAttributeDef(name="format") 10 | ], 11 | superTypes = ["DataSet"] 12 | ) 13 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 14 | 15 | #-----------------------------------------------------------------------------------# 16 | #create process with column mapping type 17 | type_df = EntityTypeDef( 18 | name="ProcessWithColumnMapping", 19 | attributeDefs=[ 20 | AtlasAttributeDef(name="columnMapping") 21 | ], 22 | superTypes = ["Process"] 23 | ) 24 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 25 | 26 | #-----------------------------------------------------------------------------------# 27 | #create AML workspace type 28 | type_df = EntityTypeDef( 29 | name="custom_ml_workspace", 30 | attributeDefs=[ 31 | AtlasAttributeDef(name='name',typename='string'), 32 | AtlasAttributeDef(name='description',typename='string'), 33 | AtlasAttributeDef(name='subscription_id',typename='string'), 34 | AtlasAttributeDef(name='resource_group',typename='string') 35 | ], 36 | superTypes = ["DataSet"] 37 | ) 38 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 39 | #-----------------------------------------------------------------------------------# 40 | #create types for datastore and dataset 41 | 42 | #create AML datastore type 43 | datastore_type_df = EntityTypeDef( 44 | name="custom_ml_datastore", 45 | attributeDefs=[ 46 | AtlasAttributeDef(name="name",typename='string'), 47 | AtlasAttributeDef(name='container_name',typename='string'), 48 | AtlasAttributeDef(name='account_name',typename='string'), 49 | AtlasAttributeDef(name='protocol',typename='string'), 50 | AtlasAttributeDef(name='endpoint',typename='string'), 51 | AtlasAttributeDef(name='server_name',typename='string'), 52 | AtlasAttributeDef(name='database_name',typename='string'), 53 | AtlasAttributeDef(name="createdby",typename='string') 54 | ], 55 | superTypes = ["DataSet"], 56 | options = {"schemaElementAttribute":"dataset"} 57 | ) 58 | 59 | #create AML dataset type 60 | dataset_type_df = EntityTypeDef( 61 | name="custom_ml_dataset", 62 | attributeDefs=[ 63 | AtlasAttributeDef(name="name",typename='string'), 64 | AtlasAttributeDef(name="description",typename='string'), 65 | AtlasAttributeDef(name="createdby",typename='string'), 66 | AtlasAttributeDef(name="createdtime",typename='string') 67 | ], 68 | superTypes = ["DataSet"] 69 | ) 70 | 71 | # create relationsip between 
datastore and dataset 72 | dataset_to_datastore_relationship = RelationshipTypeDef( 73 | name="custom_ml_datastore_to_dataset", 74 | relationshipCategory="COMPOSITION", 75 | endDef1={ 76 | "type": "custom_ml_datastore", 77 | "name": "dataset", 78 | "isContainer": True, 79 | "cardinality": "SET", 80 | "isLegacyAttribute": False 81 | }, 82 | endDef2={ 83 | "type": "custom_ml_dataset", 84 | "name": "datastore", 85 | "isContainer": False, 86 | "cardinality": "SINGLE", 87 | "isLegacyAttribute": False 88 | } 89 | ) 90 | 91 | typedef_results = client.upload_typedefs( 92 | entityDefs = [datastore_type_df, dataset_type_df], 93 | relationshipDefs = [dataset_to_datastore_relationship], 94 | force_update=True 95 | ) 96 | #-----------------------------------------------------------------------------------# 97 | #create types for experiment and experimentstep 98 | 99 | #create process for Ml Experiment Step 100 | exp_type_df = EntityTypeDef( 101 | name="custom_ml_experiment", 102 | attributeDefs=[ 103 | AtlasAttributeDef(name='name',typename='string'), 104 | AtlasAttributeDef(name='notes',typename='string'), 105 | AtlasAttributeDef(name="createdby",typename='string'), 106 | AtlasAttributeDef(name="createdtime",typename='string') 107 | ], 108 | superTypes = ["Process"] 109 | ) 110 | 111 | #create process for Ml Experiment Step 112 | exp_step_type_df = EntityTypeDef( 113 | name="custom_ml_experiment_step", 114 | attributeDefs=[ 115 | AtlasAttributeDef(name='notes',typename='string') 116 | ], 117 | superTypes = ["Process"] 118 | ) 119 | 120 | # create relationsip between experiment and experimentstep 121 | step_to_exp_relationship = RelationshipTypeDef( 122 | name="custom_ml_experiment_to_experimentstep", 123 | relationshipCategory="COMPOSITION", 124 | endDef1={ 125 | "type": "custom_ml_experiment", 126 | "name": "experimentstep", 127 | "isContainer": True, 128 | "cardinality": "SET", 129 | "isLegacyAttribute": False 130 | }, 131 | endDef2={ 132 | "type": "custom_ml_experiment_step", 133 | "name": "experiment", 134 | "isContainer": False, 135 | "cardinality": "SINGLE", 136 | "isLegacyAttribute": False 137 | } 138 | ) 139 | 140 | typedef_results = client.upload_typedefs( 141 | entityDefs = [exp_type_df, exp_step_type_df], 142 | relationshipDefs = [step_to_exp_relationship], 143 | force_update=True 144 | ) 145 | #-----------------------------------------------------------------------------------# 146 | 147 | rd = RelationshipTypeDef( 148 | name="custom_ml_workspace_datastore", 149 | attributeDefs=[], 150 | relationshipCategory="COMPOSITION", # Means the child can't exist without the parent 151 | endDef1={ # endDef1 decribes what the parent will have as an attribute 152 | "type":"custom_ml_workspace", # Type of the parent 153 | "name":"datastores", # What the parent will have 154 | "isContainer": True, 155 | "cardinality":"SET", # This is related to the cardinality, in this case the parent Server will have a SET of Models. 
156 | "isLegacyAttribute":False 157 | }, 158 | endDef2={ # endDef2 decribes what the child will have as an attribute 159 | "type":"custom_ml_datastore", # Type of the child 160 | "name":"workspace", # What the child will have 161 | "isContainer":False, 162 | "cardinality":"SINGLE", 163 | "isLegacyAttribute":False 164 | } 165 | ) 166 | client.upload_typedefs(relationshipDefs=[rd]) 167 | 168 | #-----------------------------------------------------------------------------------# 169 | rd = RelationshipTypeDef( 170 | name="custom_ml_workspace_experiment", 171 | attributeDefs=[], 172 | relationshipCategory="COMPOSITION", # Means the child can't exist without the parent 173 | endDef1={ # endDef1 decribes what the parent will have as an attribute 174 | "type":"custom_ml_workspace", # Type of the parent 175 | "name":"experiments", # What the parent will have 176 | "isContainer": True, 177 | "cardinality":"SET", # This is related to the cardinality, in this case the parent Server will have a SET of Models. 178 | "isLegacyAttribute":False 179 | }, 180 | endDef2={ # endDef2 decribes what the child will have as an attribute 181 | "type":"custom_ml_experiment", # Type of the child 182 | "name":"workspace", # What the child will have 183 | "isContainer":False, 184 | "cardinality":"SINGLE", 185 | "isLegacyAttribute":False 186 | } 187 | ) 188 | client.upload_typedefs(relationshipDefs=[rd]) 189 | 190 | #-----------------------------------------------------------------------------------# 191 | #create types for packages and package 192 | 193 | #create packages type 194 | packages_type_df = EntityTypeDef( 195 | name="custom_ml_packages", 196 | attributeDefs=[ 197 | AtlasAttributeDef(name='notes',typename='string') 198 | ], 199 | superTypes = ["DataSet"], 200 | options = {"schemaElementAttribute":"package"} 201 | ) 202 | 203 | package_type_df = EntityTypeDef( 204 | name="custom_ml_package", 205 | attributeDefs=[ 206 | AtlasAttributeDef(name='programming_language',typename='string'), 207 | AtlasAttributeDef(name='package_name',typename='string'), 208 | AtlasAttributeDef(name='version',typename='string'), 209 | AtlasAttributeDef(name='notes',typename='string') 210 | ], 211 | superTypes = ["DataSet"] 212 | ) 213 | 214 | # create relationsip between packages and package 215 | package_to_packages_relationship = RelationshipTypeDef( 216 | name="custom_ml_packages_to_package", 217 | relationshipCategory="COMPOSITION", 218 | endDef1={ 219 | "type": "custom_ml_packages", 220 | "name": "package", 221 | "isContainer": True, 222 | "cardinality": "SET", 223 | "isLegacyAttribute": False 224 | }, 225 | endDef2={ 226 | "type": "custom_ml_package", 227 | "name": "packages", 228 | "isContainer": False, 229 | "cardinality": "SINGLE", 230 | "isLegacyAttribute": False 231 | } 232 | ) 233 | 234 | typedef_results = client.upload_typedefs( 235 | entityDefs = [packages_type_df, package_type_df], 236 | relationshipDefs = [package_to_packages_relationship], 237 | force_update=True 238 | ) 239 | #-----------------------------------------------------------------------------------# 240 | 241 | #create experiemnt config type 242 | type_df = EntityTypeDef( 243 | name="custom_ml_exp_config", 244 | attributeDefs=[ 245 | AtlasAttributeDef(name='task_type',typename='string'), 246 | AtlasAttributeDef(name='enable_early_stopping',typename='bool'), 247 | AtlasAttributeDef(name='experiment_timeout_minutes',typename='int'), 248 | AtlasAttributeDef(name='primary_metric',typename='string'), 249 | 
AtlasAttributeDef(name='compute_target',typename='string'), 250 | AtlasAttributeDef(name='label_column_name',typename='string'), 251 | AtlasAttributeDef(name='n_cross_validations',typename='int'), 252 | AtlasAttributeDef(name='model_explainability',typename='bool') 253 | ], 254 | superTypes = ["DataSet"] 255 | ) 256 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 257 | 258 | #-----------------------------------------------------------------------------------# 259 | 260 | #create model metrics type 261 | type_df = EntityTypeDef( 262 | name="custom_ml_model_metrics", 263 | attributeDefs=[ 264 | AtlasAttributeDef(name='AUC',typename='float'), 265 | AtlasAttributeDef(name='Accuracy',typename='float'), 266 | AtlasAttributeDef(name='Precision',typename='float'), 267 | AtlasAttributeDef(name='Recall',typename='float'), 268 | AtlasAttributeDef(name='F1',typename='float') 269 | ], 270 | superTypes = ["DataSet"] 271 | ) 272 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 273 | 274 | #-----------------------------------------------------------------------------------# 275 | 276 | #create model type 277 | type_df = EntityTypeDef( 278 | name="custom_ml_model", 279 | attributeDefs=[ 280 | AtlasAttributeDef(name='workspace_name',typename='string'), 281 | AtlasAttributeDef(name='workspace_subscription_id',typename='string'), 282 | AtlasAttributeDef(name='workspace_resource_group',typename='string'), 283 | AtlasAttributeDef(name='name',typename='string'), 284 | AtlasAttributeDef(name='id',typename='string'), 285 | AtlasAttributeDef(name='version',typename='string'), 286 | AtlasAttributeDef(name='tags',typename='string'), 287 | AtlasAttributeDef(name='properties',typename='string') 288 | ], 289 | superTypes = ["DataSet"] 290 | ) 291 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 292 | 293 | #-----------------------------------------------------------------------------------# 294 | 295 | #create endpoint type 296 | type_df = EntityTypeDef( 297 | name="custom_ml_model_endpoint", 298 | attributeDefs=[ 299 | AtlasAttributeDef(name='workspace_name',typename='string'), 300 | AtlasAttributeDef(name='workspace_subscription_id',typename='string'), 301 | AtlasAttributeDef(name='workspace_resource_group',typename='string'), 302 | AtlasAttributeDef(name='name',typename='string'), 303 | AtlasAttributeDef(name='image_id',typename='string'), 304 | AtlasAttributeDef(name='compute_type',typename='string'), 305 | AtlasAttributeDef(name='state',typename='string'), 306 | AtlasAttributeDef(name='scoring_uri',typename='string'), 307 | AtlasAttributeDef(name='tags',typename='string'), 308 | AtlasAttributeDef(name='state',typename='string'), 309 | AtlasAttributeDef(name='properties',typename='string'), 310 | AtlasAttributeDef(name='created_by',typename='string'), 311 | AtlasAttributeDef(name='sample_json',typename='string') 312 | ], 313 | superTypes = ["DataSet"] 314 | ) 315 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 316 | 317 | #-----------------------------------------------------------------------------------# 318 | except: 319 | print('types already created') 320 | 321 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of 
Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Welcome, and thank you for your interest in contributing. There are many ways to contribute: 4 | * [Submit issues](https://github.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/issues) to report bugs and make suggestions. 5 | * Review the [source code changes](https://github.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/pulls). 6 | * Contribute features and fixes by forking the repository and creating a [pull request](https://github.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/compare). 7 | 8 | ## Contributor License Agreement 9 | This project welcomes contributors and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit [https://cla.opensource.microsoft.com](https://cla.opensource.microsoft.com). 10 | 11 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status checks, comments). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. 12 | 13 | ## Microsoft Open Source Code of Conduct 14 | This project has adopted the [Microsoft Open Source Code](https://opensource.microsoft.com/codeofconduct/) of Conduct. For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 
15 | -------------------------------------------------------------------------------- /Deployment/deploy.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | 5 | "parameters": { 6 | "prefixName": { 7 | "type": "string", 8 | "defaultValue": "zzmlpr", 9 | "minLength": 3, 10 | "maxLength": 10, 11 | "metadata": { 12 | "description": "Name prefix between 3-6 characters with only characters and numbers" 13 | } 14 | }, 15 | "AllowAll": { 16 | "type": "string", 17 | "allowedValues": [ 18 | "true", 19 | "false" 20 | ], 21 | "defaultValue": "true" 22 | } 23 | }, 24 | 25 | "variables": { 26 | "subscriptionId": "[subscription().subscriptionId]", 27 | "location": "[resourceGroup().location]", 28 | "rgId": "[resourceGroup().id]", 29 | "rgName": "[resourceGroup().name]", 30 | 31 | "tenantId": "[subscription().tenantId]", 32 | "paramName": "[parameters('prefixName')]", 33 | "storageContainer": "data", 34 | 35 | "uniqueName": "[substring(uniqueString(variables('rgId')),0,4)]", 36 | 37 | "synapseWorkspaceName": "[concat('synapse-ws-',variables('paramName'))]", 38 | "storageName": "[replace(replace(toLower(concat(concat('synapsestrg',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", 39 | 40 | "machinelearningName": "[concat('ml-', variables('paramName'))]", 41 | "storageMLname": "[replace(replace(toLower(concat(concat('mlstrg',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", 42 | 43 | "appinsightsname": "[concat(variables('machinelearningName'), 'ai')]", 44 | "keyvaultname": "[replace(replace(toLower(concat('keyvault',variables('paramName'))),'-',''),'_','')]", 45 | "keyvaultname": "[replace(replace(toLower(concat(concat('keyvault',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", 46 | "purviewName": "[concat('purview-', variables('paramName'))]", 47 | 48 | "StorageBlobDataContributor": "ba92f5b4-2d11-453d-a403-e96b0029c9fe" 49 | }, 50 | 51 | "resources": [ 52 | { 53 | "name": "[concat(variables('purviewname'), 'dev')]", 54 | "type": "Microsoft.Purview/accounts", 55 | "apiVersion": "2020-12-01-preview", 56 | "location": "[variables('location')]", 57 | "identity": { 58 | "type": "SystemAssigned" 59 | }, 60 | "properties": { 61 | "networkAcls": { 62 | "defaultAction": "Allow" 63 | } 64 | }, 65 | "dependsOn": [], 66 | "sku": { 67 | "name": "Standard", 68 | "capacity": "4" 69 | }, 70 | "tags": {} 71 | }, 72 | { 73 | "type": "Microsoft.Storage/storageAccounts", 74 | "apiVersion": "2019-06-01", 75 | "name": "[variables('storageName')]", 76 | "location": "[variables('location')]", 77 | "sku": { 78 | "name": "Standard_LRS", 79 | "tier": "Standard" 80 | }, 81 | "kind": "StorageV2", 82 | "properties": { 83 | "isHnsEnabled": true, 84 | "networkAcls": { 85 | "bypass": "AzureServices", 86 | "virtualNetworkRules": [], 87 | "ipRules": [], 88 | "defaultAction": "Allow" 89 | }, 90 | "supportsHttpsTrafficOnly": true, 91 | "encryption": { 92 | "services": { 93 | "file": { 94 | "enabled": true 95 | }, 96 | "blob": { 97 | "enabled": true 98 | } 99 | }, 100 | "keySource": "Microsoft.Storage" 101 | }, 102 | "accessTier": "Hot" 103 | } 104 | }, 105 | { 106 | "type": "Microsoft.Storage/storageAccounts/blobServices", 107 | "apiVersion": "2019-06-01", 108 | "name": "[concat(variables('storageName'), '/default')]", 109 | "dependsOn": [ 110 | "[resourceId('Microsoft.Storage/storageAccounts', 
variables('storageName'))]" 111 | ], 112 | "properties": { 113 | "cors": { 114 | "corsRules": [] 115 | }, 116 | "deleteRetentionPolicy": { 117 | "enabled": false 118 | } 119 | } 120 | }, 121 | { 122 | "type": "Microsoft.Storage/storageAccounts/blobServices/containers", 123 | "apiVersion": "2019-06-01", 124 | "name": "[concat(variables('storageName'), '/default/', variables('storageContainer'))]", 125 | "dependsOn": [ 126 | "[resourceId('Microsoft.Storage/storageAccounts/blobServices', variables('storageName'), 'default')]", 127 | "[resourceId('Microsoft.Storage/storageAccounts', variables('storageName'))]" 128 | ], 129 | "properties": { 130 | "publicAccess": "None" 131 | } 132 | }, 133 | { 134 | "type": "Microsoft.Synapse/workspaces", 135 | "apiVersion": "2020-12-01", 136 | "name": "[variables('synapseWorkspaceName')]", 137 | "location": "[variables('location')]", 138 | "identity": { 139 | "type": "SystemAssigned" 140 | }, 141 | "properties": { 142 | "defaultDataLakeStorage": { 143 | "accountUrl": "[concat('https://', variables('storageName') , '.dfs.core.windows.net')]", 144 | "filesystem": "[variables('storageContainer')]" 145 | }, 146 | "virtualNetworkProfile": { 147 | "computeSubnetId": "" 148 | }, 149 | "sqlAdministratorLogin": "sqladminuser" 150 | }, 151 | "resources": [ 152 | { 153 | "condition": "[equals(parameters('AllowAll'),'true')]", 154 | "type": "firewallrules", 155 | "apiVersion": "2019-06-01-preview", 156 | "name": "allowAll", 157 | "location": "[variables('location')]", 158 | "dependsOn": [ "[variables('synapseWorkspaceName')]" ], 159 | "properties": { 160 | "startIpAddress": "0.0.0.0", 161 | "endIpAddress": "255.255.255.255" 162 | } 163 | } 164 | ] 165 | }, 166 | { 167 | "type": "Microsoft.Synapse/workspaces/bigDataPools", 168 | "apiVersion": "2020-12-01", 169 | "name": "[concat(variables('synapseWorkspaceName'), '/spark1')]", 170 | "location": "[variables('location')]", 171 | "dependsOn": [ 172 | "[resourceId('Microsoft.Synapse/workspaces', variables('synapseWorkspaceName'))]" 173 | ], 174 | "properties": { 175 | "sparkVersion": "2.4", 176 | "nodeCount": 3, 177 | "nodeSize": "Medium", 178 | "nodeSizeFamily": "MemoryOptimized", 179 | "autoScale": { 180 | "enabled": true, 181 | "minNodeCount": 3, 182 | "maxNodeCount": 6 183 | }, 184 | "autoPause": { 185 | "enabled": true, 186 | "delayInMinutes": 15 187 | }, 188 | "isComputeIsolationEnabled": false, 189 | "sessionLevelPackagesEnabled": false, 190 | "cacheSize": 0, 191 | "dynamicExecutorAllocation": { 192 | "enabled": true 193 | }, 194 | "provisioningState": "Succeeded" 195 | } 196 | }, 197 | 198 | { 199 | "type": "microsoft.insights/components", 200 | "apiVersion": "2020-02-02-preview", 201 | "name": "[variables('appinsightsName')]", 202 | "location": "[variables('location')]", 203 | "kind": "web", 204 | "properties": { 205 | "Application_Type": "web", 206 | "IngestionMode": "ApplicationInsights", 207 | "publicNetworkAccessForIngestion": "Enabled", 208 | "publicNetworkAccessForQuery": "Enabled" 209 | } 210 | }, 211 | { 212 | "type": "Microsoft.KeyVault/vaults", 213 | "apiVersion": "2020-04-01-preview", 214 | "name": "[variables('keyvaultName')]", 215 | "location": "[variables('location')]", 216 | "properties": { 217 | "sku": { 218 | "family": "A", 219 | "name": "standard" 220 | }, 221 | "tenantId": "[variables('tenantId')]", 222 | "accessPolicies": [ 223 | ], 224 | "enabledForDeployment": false, 225 | "enableSoftDelete": true, 226 | "enablePurgeProtection": true, 227 | "vaultUri": "[concat('https://', 
variables('keyvaultName'), '.vault.azure.net/')]", 228 | "provisioningState": "Succeeded" 229 | } 230 | }, 231 | { 232 | "type": "Microsoft.Storage/storageAccounts", 233 | "apiVersion": "2021-01-01", 234 | "name": "[variables('storageMLName')]", 235 | "location": "[variables('location')]", 236 | "sku": { 237 | "name": "Standard_LRS", 238 | "tier": "Standard" 239 | }, 240 | "kind": "StorageV2", 241 | "properties": { 242 | "networkAcls": { 243 | "bypass": "AzureServices", 244 | "virtualNetworkRules": [], 245 | "ipRules": [], 246 | "defaultAction": "Allow" 247 | }, 248 | "supportsHttpsTrafficOnly": true, 249 | "encryption": { 250 | "services": { 251 | "file": { 252 | "keyType": "Account", 253 | "enabled": true 254 | }, 255 | "blob": { 256 | "keyType": "Account", 257 | "enabled": true 258 | } 259 | }, 260 | "keySource": "Microsoft.Storage" 261 | }, 262 | "accessTier": "Hot" 263 | } 264 | }, 265 | { 266 | "type": "Microsoft.MachineLearningServices/workspaces", 267 | "apiVersion": "2021-01-01", 268 | "name": "[variables('machinelearningName')]", 269 | "location": "[variables('location')]", 270 | "dependsOn": [ 271 | "[resourceId('Microsoft.Storage/storageAccounts', variables('storageMLname'))]", 272 | "[resourceId('Microsoft.KeyVault/vaults', variables('keyvaultname'))]", 273 | "[resourceId('microsoft.insights/components', variables('appinsightsname'))]" 274 | ], 275 | "sku": { 276 | "name": "Basic", 277 | "tier": "Basic" 278 | }, 279 | "identity": { 280 | "type": "SystemAssigned" 281 | }, 282 | "properties": { 283 | "friendlyName": "[variables('machinelearningName')]", 284 | "storageAccount": "[resourceId('Microsoft.Storage/storageAccounts', variables('storageMLname'))]", 285 | "keyVault": "[resourceId('Microsoft.KeyVault/vaults', variables('keyvaultname'))]", 286 | "applicationInsights": "[resourceId('microsoft.insights/components', variables('appinsightsname'))]", 287 | "hbiWorkspace": false, 288 | "allowPublicAccessWhenBehindVnet": false 289 | } 290 | }, 291 | { 292 | "scope": "[concat('Microsoft.Storage/storageAccounts/', variables('storageName'))]", 293 | "type": "Microsoft.Authorization/roleAssignments", 294 | "apiVersion": "2020-04-01-preview", 295 | "name": "[guid(uniqueString(variables('storageName')))]", 296 | "location": "[variables('location')]", 297 | "dependsOn": [ 298 | "[variables('synapseWorkspaceName')]" 299 | ], 300 | "properties": { 301 | "roleDefinitionId": "[resourceId('Microsoft.Authorization/roleDefinitions', variables('StorageBlobDataContributor'))]", 302 | "principalId": "[reference(resourceId('Microsoft.Synapse/workspaces', variables('synapseWorkspaceName')), '2019-06-01-preview', 'Full').identity.principalId]", 303 | "principalType": "ServicePrincipal" 304 | } 305 | }, 306 | { 307 | "apiVersion": "2020-10-01", 308 | "name": "pid-2da55a03-dd52-561e-8690-cae328ce0200", 309 | "type": "Microsoft.Resources/deployments", 310 | "properties": { 311 | "mode": "Incremental", 312 | "template": { 313 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", 314 | "contentVersion": "1.0.0.0", 315 | "resources": [] 316 | } 317 | } 318 | } 319 | ] 320 | } -------------------------------------------------------------------------------- /Deployment/img/ADLSGen2Scanning.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/ADLSGen2Scanning.PNG 
-------------------------------------------------------------------------------- /Deployment/img/AMLPipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/AMLPipeline.PNG -------------------------------------------------------------------------------- /Deployment/img/AMLPipelineLineage.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/AMLPipelineLineage.PNG -------------------------------------------------------------------------------- /Deployment/img/Architecture.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/Architecture.PNG -------------------------------------------------------------------------------- /Deployment/img/MLLineageScreenshot.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/MLLineageScreenshot.PNG -------------------------------------------------------------------------------- /Deployment/img/ManageSparkPool.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/ManageSparkPool.png -------------------------------------------------------------------------------- /Deployment/img/PurviewMLLineageIntroduction.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/PurviewMLLineageIntroduction.PNG -------------------------------------------------------------------------------- /Deployment/img/PurviewMLLineageSolutionAccelerator.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/PurviewMLLineageSolutionAccelerator.PNG -------------------------------------------------------------------------------- /Deployment/img/PurviewScreenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/PurviewScreenshot.png -------------------------------------------------------------------------------- /Deployment/img/Requirements.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/Requirements.png -------------------------------------------------------------------------------- /Deployment/img/add-role-assignment-page.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/add-role-assignment-page.png -------------------------------------------------------------------------------- /Deployment/img/deploy-firewall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/deploy-firewall.png -------------------------------------------------------------------------------- /Deployment/requirements.txt: -------------------------------------------------------------------------------- 1 | pyapacheatlas 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | 23 | ## Note about Libraries with MPL-2.0 and LGPL-2.1 Licenses 24 | The following libraries are not **explicitly included** in this repository, but users who use this Solution Accelerator may need to install them locally and in Azure Synapse and Azure Machine Learning to fully utilize this Solution Accelerator. However, the actual binaries and files associated with the libraries **are not included** as part of this repository, but they are available for installation via the PyPI library using the pip installation tool. 25 | 26 | Libraries: chardet, certifi 27 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | NOTICES AND INFORMATION 2 | Do Not Translate or Localize 3 | 4 | This software incorporates material from third parties. 
5 | Microsoft makes certain open source code available at https://3rdpartysource.microsoft.com, 6 | or you may send a check or money order for US $5.00, including the product name, 7 | the open source component name, platform, and version number, to: 8 | 9 | Source Code Compliance Team 10 | Microsoft Corporation 11 | One Microsoft Way 12 | Redmond, WA 98052 13 | USA 14 | 15 | Notwithstanding any other terms, you may reverse engineer this software to the extent 16 | required to debug changes to any libraries licensed under the GNU Lesser General Public License. 17 | 18 | --------------------------------------------------------- 19 | -------------------------------------------------------------------------------- /PRIVACY.md: -------------------------------------------------------------------------------- 1 | # Privacy 2 | 3 | When you deploy this template, Microsoft is able to identify the installation of the software with the Azure resources that are deployed. Microsoft is able to correlate the Azure resources that are used to support the software. Microsoft collects this information to provide the best experiences with their products and to operate their business. The data is collected and governed by Microsoft's privacy policies, which can be found at [Microsoft Privacy Statement](https://go.microsoft.com/fwlink/?LinkID=824704). 4 | 5 | To disable this, simply remove the following section from [deploy.json](./Deployment/deploy.json) before deploying the resources to Azure: 6 | 7 | ```json 8 | { 9 | "apiVersion": "2018-02-01", 10 | "name": "pid-2da55a03-dd52-561e-8690-cae328ce0200", 11 | "type": "Microsoft.Resources/deployments", 12 | "properties": { 13 | "mode": "Incremental", 14 | "template": { 15 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 16 | "contentVersion": "1.0.0.0", 17 | "resources": [] 18 | } 19 | } 20 | } 21 | ``` 22 | 23 | You can see more information on this at https://docs.microsoft.com/en-us/azure/marketplace/azure-partner-customer-usage-attribution. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | page_type: sample 3 | languages: 4 | - python 5 | - bash 6 | products: 7 | - microsoft-purview 8 | - azure-synapse-analytics 9 | - azure-machine-learning 10 | --- 11 | ![Purview Machine Learning Lineage Solution Accelerator](./Deployment/img/PurviewMLLineageSolutionAccelerator.PNG) 12 | 13 | # Purview Machine Learning Lineage Solution Accelerator 14 | 15 | Microsoft Purview is a unified data governance service that helps you manage and govern data across different sources. 16 | 17 | Machine Learning project life cycle involves many steps to transform raw data into insights. This process usually requires individuals with different roles/skillsets across multiple teams to collaborate effectively. Microsoft Purview helps simplify this complex process by providing an end-to-end lineage of ML entities and processes to enable better collaboration, auditing and debugging capabilities. 18 | 19 | This solution accelerator helps developers with the resources needed to build an end-to-end lineage in Purview for Machine Learning scenarios. 
20 | 21 | ## Sample Credit Risk Prediction ML Process Flow 22 | ![Purview Machine Learning Lineage Introduction](./Deployment/img/PurviewMLLineageIntroduction.PNG) 23 | 24 | ## Purview ML Process Lineage 25 | ![ML Lineage](./Deployment/img/MLLineageScreenshot.PNG) 26 | 27 | ## Prerequisites 28 | To use this solution accelerator, you will need access to an [Azure subscription](https://azure.microsoft.com/free/). While not required, a prior understanding of Microsoft Purview, Azure Synapse Analytics and Machine Learning will be helpful. 29 | 30 | For additional training and support, please see: 31 | 1. [Microsoft Purview](https://azure.microsoft.com/en-us/services/purview/) 32 | 2. [Azure Synapse Analytics](https://azure.microsoft.com/en-us/services/synapse-analytics/) 33 | 3. [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning/) 34 | 35 | ## Getting Started 36 | Start by deploying the required resources to Azure. The button below will deploy Microsoft Purview, Azure Synapse Analytics, Azure Machine Learning and its related resources: 37 | 38 | [![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fmicrosoft%2FPurview-Machine-Learning-Lineage-Solution-Accelerator%2Fmain%2FDeployment%2Fdeploy.json) 39 | 40 | If you prefer to setup manually, you need to deploy Microsoft Purview, Azure Synapse Analytics, Azure Machine Learning. 41 | 42 | Note: To minimize Azure costs, consider deleting the Purview instance at the end of this exercise if you do not plan to use this instance actively. 43 | 44 | ### Step 1. Download Files 45 | Clone or download this repository and navigate to the project's root directory. 46 | 47 | ### Step 2. Purview Security Access 48 | 49 | #### Step 2.1 Create a Service Principal for Purview Rest API access 50 | [Create a service principal](https://docs.microsoft.com/en-us/azure/purview/tutorial-using-rest-apis#create-a-service-principal-application) 51 | 52 | #### Step 2.2 Configure your Purview catalog to trust the service principal 53 | [Configure your Purview catalog to trust the service principal](https://docs.microsoft.com/en-us/azure/purview/tutorial-using-rest-apis#configure-your-catalog-to-trust-the-service-principal-application) 54 | 55 | ### Step 3. Azure Machine Learning Security Access 56 | 57 | #### Step 3.1 Create a Service Principal for AML access 58 | [Create a service principal](https://docs.microsoft.com/en-us/azure/purview/tutorial-using-rest-apis#create-a-service-principal-application) 59 | 60 | #### Step 3.2 Configure your Azure Machine Learning to trust the service principal 61 | 1. From the [Azure portal](https://portal.azure.com/), select your AML workspace 62 | 2. select Access Control (IAM) 63 | 3. Select Add, Add Role Assignment to open the Add role assignment page 64 | 65 | 3.1 For the `Role` type in `Contributor` 66 | 67 | 3.2 For `Assign access to` leave the default, `User, group, or service principal` 68 | 69 | 3.2 For `Select` enter the name of the previosly created service principal in step 3.1 and then click on their name in the results pane 70 | 71 | 3.3 Click on Save 72 | You've now configured the service principal as a contributor on Azure Machine Learning resource. 73 | 74 | ### Step 4. Synapse Security Access 75 | 76 | #### Step 4.1 Add your IP address to Synapse firewall 77 | Before you can upload assests to the Synapse Workspace you will need to add your IP address: 78 | 1. 
Go to the Synapse resouce you created in the previous step 79 | 2. Navigate to `Firewalls` under `Security` on the left hand side of the page 80 | 3. At the top of the screen click `+ Add client IP` 81 | ![Update Firewalls](./Deployment/img/deploy-firewall.png) 82 | 4. Your IP address should now be visable in the IP list 83 | 84 | #### Step 4.2: Update storage account permisions 85 | In order to perform the necessary actions in Synapse workspace, you will need to grant more access. 86 | 1. Go to the Azure Data Lake Storage Account created above 87 | 2. Go to the `Access Control (IAM) > + Add > Add role assignment` 88 | 3. Now click the Role dropdown and select `Storage Blob Data Contributor` 89 | - Search for your username and add 90 | 4. Click `Save` at the bottom 91 | 92 | [Learn more](https://docs.microsoft.com/azure/synapse-analytics/security/how-to-set-up-access-control) 93 | 94 | ### Step 5. Upload CreditRisk Sample Dataset 95 | 1. Launch the Synapse workspace [Synapse Workspace](https://ms.web.azuresynapse.net/) 96 | 2. Select the `subscription` and `workspace` name you are using for this solution accelerator 97 | 3. In Synapse Studio, navigate to the `Data` Hub 98 | 4. Select `Linked` 99 | 5. Under the category `Azure Data Lake Storage Gen2` you'll see an item with a name like `xxxxx(xxxxx- Primary)` 100 | 6. Select the container named `data (Primary)` 101 | 7. Create a new folder `creditriskdata` 102 | 8. Select `Upload` and select `loan.csv` and `borrower.csv` files downloaded from [Data](./Data/) folder 103 | 104 | ### Step 6. Register and scan uploaded data in Purview 105 | 106 | 1. [Setting up authentication for a scan](https://docs.microsoft.com/en-us/azure/purview/register-scan-adls-gen2#managed-identity-recommended) 107 | 108 | 2. [Register and scan adls gen2](https://docs.microsoft.com/en-us/azure/purview/register-scan-adls-gen2#register-azure-data-lake-storage-gen2-data-source) 109 | 110 | select only the `creditriskdata` folder while creating the scan. 111 | 112 | ![ADLSGen2 Scanning folder selection](./Deployment/img/ADLSGen2Scanning.PNG) 113 | 114 | Wait for scan run status to change to `Completed` before running next step. 115 | 116 | ### Step 7. Upload Assets and Run Noteboks 117 | 1. Launch the Synapse workspace [Synapse Workspace](https://ms.web.azuresynapse.net/) 118 | 2. Select the `subscription` and `workspace` name you are using for this solution accelerator 119 | 3. Go to the `Manage` tab in the Synapse workspace and click on the `Apache Spark pools` 120 | 121 | - ![Spark Pool](./Deployment/img/ManageSparkPool.png) 122 | 4. Click `...` on the deployed Spark Pool and select `Packages` 123 | 5. Click `Upload` and select [requirements.txt](/Deployment/requirements.txt) from the cloned repo and click `Apply` 124 | 125 | - ![Requirements File](./Deployment/img/Requirements.png) 126 | 127 | 6. Go to `Develop`, click the `+`, and click `Import` to select all notebooks from the repository's `/SynapseNotebooks/` folder 128 | 7. For each of the notebooks, select `Attach to > spark1` in the top dropdown 129 | 8. Update Purview Tenant, Client Id and Secret from step `2.1` in `01_Authenticate_to_Purview_AML.ipynb` 130 | 9. Update Azure Machine Learning Tenant, Client Id and Secret from step `3.1` in `01_Authenticate_to_Purview_AML.ipynb` 131 | 10. Update `account_name` variable to your ADLS in `04_Create_CreditRisk_Experiment.ipynb` 132 | 11. Click `Publish all` to publish the notebook changes 133 | 12. 
Run the following notebook: 134 | - `04_Create_CreditRisk_Experiment.ipynb` (This notebook runs other notebooks you imported) 135 | 136 | ### Step 8. Check Machine Learning Lineage in Purview Studio 137 | 1. Launch [Purview Studio](https://ms.web.purview.azure.com/) 138 | 2. Click on `Browse Assets` 139 | 3. Click on `Custom Model` and select the model we created from running notebooks in `Step 7` 140 | 4. Click on `Lineage` to see Machine Learning process Lineage 141 | ![ML Lineage](./Deployment/img/PurviewScreenshot.png) 142 | 143 | ### Step 9. Upload Assets and Run Azure Machine Learning Noteboks (Optional) 144 | 1. Launch the Azure Machine Learning studio [AML Studio](https://ml.azure.com/) 145 | 2. Select the `subscription` and `workspace` name you are using for this solution accelerator 146 | 3. Go to the `Notebooks` tab in the AML Studio and upload the notebooks and scripts in `AML Notebooks` folder including `Data` folder 147 | 4. Go to the `Compute` tab in the AML Studio and click on the `Compute Instances` 148 | 5. Click `New` and create a new compute instance 149 | 6. Click `Jupyter` and launch the compute instance 150 | 7. In the browser window that opens, click the folders to see the notebooks you uploaded in step `9.3` 151 | 7. Update Purview Tenant, Client Id and Secret from step `2.1` in `Authenticate_to_Purview_AML.py` 152 | 8. Update Azure Machine Learning Tenant, Client Id and Secret from step `3.1` in `Authenticate_to_Purview_AML.py` 153 | 9. Run the following notebooks in order: 154 | - `01_Create_CreditRisk_AML_Pipeline.ipynb` ( Pipeline run might take few minutes so please wait for completion before running the next notebook) 155 | - `02_Create_CreditRisk_AML_Pipeline_Lineage.ipynb` 156 | 157 | ![ML Pipeline](./Deployment/img/AMLPipeline.PNG) 158 | 159 | ### Step 10. Check Machine Learning pipeline Lineage in Purview Studio (Optional) 160 | 1. Launch [Purview Studio](https://ms.web.purview.azure.com/) 161 | 2. Click on `Browse Assets` 162 | 3. Click on `Custom ML Experiment Step` and select any step we created from running notebooks in `Step 9` 163 | 4. Click on `Lineage` to see Machine Learning pipeline Lineage 164 | 165 | ![ML Pipeline Lineage](./Deployment/img/AMLPipelineLineage.PNG) 166 | 167 | ## Architecture 168 | The architecture diagram below details what you will be building for this Solution Accelerator. 169 | ![Architecture](./Deployment/img/Architecture.PNG) 170 | 171 | 172 | ## License 173 | MIT License 174 | 175 | Copyright (c) Microsoft Corporation. 176 | 177 | Permission is hereby granted, free of charge, to any person obtaining a copy 178 | of this software and associated documentation files (the "Software"), to deal 179 | in the Software without restriction, including without limitation the rights 180 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 181 | copies of the Software, and to permit persons to whom the Software is 182 | furnished to do so, subject to the following conditions: 183 | 184 | The above copyright notice and this permission notice shall be included in all 185 | copies or substantial portions of the Software. 186 | 187 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 188 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 189 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 190 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 191 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 192 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 193 | SOFTWARE 194 | 195 | ## Note about Libraries with MPL-2.0 and LGPL-2.1 Licenses 196 | The following libraries are not **explicitly included** in this repository, but users who use this Solution Accelerator may need to install them locally and in Azure Synapse and Azure Machine Learning to fully utilize this Solution Accelerator. However, the actual binaries and files associated with the libraries **are not included** as part of this repository, but they are available for installation via the PyPI library using the pip installation tool. 197 | 198 | Libraries: chardet, certifi 199 | 200 | ## Contributing 201 | This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 202 | 203 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. 204 | 205 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 206 | 207 | ## Trademarks 208 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies. 209 | 210 | ## Data Collection 211 | The software may collect information about you and your use of the software and send it to Microsoft. Microsoft may use this information to provide services and improve our products and services. You may turn off the telemetry as described in the repository. There are also some features in the software that may enable you and Microsoft to collect data from users of your applications. If you use these features, you must comply with applicable law, including providing appropriate notices to users of your applications together with a copy of Microsoft's privacy statement. Our privacy statement is located at https://go.microsoft.com/fwlink/?LinkID=824704. You can learn more about data collection and use in the help documentation and our privacy statement. Your use of the software operates as your consent to these practices. 
212 | 213 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new Issue. 
8 | 9 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 10 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 11 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 12 | 13 | ## Microsoft Support Policy 14 | 15 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 16 | -------------------------------------------------------------------------------- /SynapseNotebooks/01_Authenticate_to_Purview_AML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyapacheatlas.auth import ServicePrincipalAuthentication\n", 10 | "from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess \n", 11 | "from pyapacheatlas.readers import ExcelConfiguration, ExcelReader\n", 12 | "from pyapacheatlas.core.util import GuidTracker\n", 13 | "from pyapacheatlas.core import AtlasAttributeDef, AtlasEntity, PurviewClient\n", 14 | "from pyapacheatlas.core.typedef import EntityTypeDef" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# get SPN details you created in step 2.1 of solution accelerator setup\n", 24 | "tenant_id = \"\"\n", 25 | "client_id = \"\"\n", 26 | "client_secret = \"\"\n", 27 | "\n", 28 | "# get Purview account name from azure portal\n", 29 | "purview_name = \"\"\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# get AML workspace details from azure portal\n", 39 | "subscription_id = \"\" \n", 40 | "resource_group = \"\"\n", 41 | "workspace_name = \"\"\n", 42 | "workspace_region = \"\"\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "from pyapacheatlas.auth import ServicePrincipalAuthentication\n", 52 | "from pyapacheatlas.core import PurviewClient\n", 53 | "from pyapacheatlas.core.util import GuidTracker\n", 54 | "\n", 55 | "# Authenticate to your Atlas server using a Service Principal\n", 56 | "oauth = ServicePrincipalAuthentication(\n", 57 | " tenant_id= tenant_id,\n", 58 | " client_id= client_id,\n", 59 | " client_secret= client_secret\n", 60 | ")\n", 61 | "client = PurviewClient(\n", 62 | " account_name = purview_name,\n", 63 | " authentication=oauth\n", 64 | ")\n", 65 | "guid = GuidTracker()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# get SPN details you created in step 3.1 of solution accelerator setup\n", 75 | "aml_client_id = \"\"\n", 76 | "aml_client_secret = \"\"" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "from azureml.core.authentication import ServicePrincipalAuthentication\n", 86 | "\n", 87 | "sp = ServicePrincipalAuthentication(tenant_id=tenant_id, \n", 88 | " service_principal_id=aml_client_id, \n", 89 | " service_principal_password=aml_client_secret)\n", 90 | "\n", 91 | "from azureml.core import Workspace\n", 92 | "\n", 93 | "ws = Workspace.get(name=workspace_name,\n", 94 | " resource_group = resource_group,\n", 95 | " auth=sp,\n", 96 | " subscription_id=subscription_id)" 97 | ] 98 | }, 
99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# We recommend you add your service principal secrets in KeyVault instead of hardcoded values above\n", 113 | "# Create a linked service for key vault in Synapse Studio \n", 114 | "# See below code snippet how to access secrets from KeyVault " 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# linked_service = \"AzureKeyVault1\" # Azure Key Vault Linked Service name \n", 124 | "# akv_name = \"\" # Azure Key Vault name\n", 125 | "# secret_name = \"\" # Azure Key Vault Secret name\n", 126 | "\n", 127 | "# # Fetch the key from Azure Key Vault\n", 128 | "# aml_spn = mssparkutils.credentials.getSecret(\n", 129 | "# linkedService=linked_service,\n", 130 | "# akvName=akv_name, \n", 131 | "# secret=secret_name)\n", 132 | "\n", 133 | "# linked_service = \"AzureKeyVault1\" # Azure Key Vault Linked Service name \n", 134 | "# akv_name = \"\" # Azure Key Vault name\n", 135 | "# secret_name = \"\" # Azure Key Vault Secret name\n", 136 | "\n", 137 | "# # Fetch the key from Azure Key Vault\n", 138 | "# purview_spn = mssparkutils.credentials.getSecret(\n", 139 | "# linkedService=linked_service,\n", 140 | "# akvName=akv_name, \n", 141 | "# # secret=secret_name)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# # get SPN details you created in step 2.1 of solution accelerator installation\n", 151 | "# tenant_id = \"\"\n", 152 | "# purview_client_id = \"\"\n", 153 | "# purview_name = \"\"\n", 154 | "# purview_client_secret = purview_spn" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "# import json\n", 164 | "# import os\n", 165 | "# import sys\n", 166 | "\n", 167 | "# from pyapacheatlas.auth import ServicePrincipalAuthentication\n", 168 | "# from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess \n", 169 | "# from pyapacheatlas.readers import ExcelConfiguration, ExcelReader\n", 170 | "# from pyapacheatlas.core.util import GuidTracker\n", 171 | "# from pyapacheatlas.core import AtlasAttributeDef, AtlasEntity, PurviewClient\n", 172 | "# from pyapacheatlas.core.typedef import EntityTypeDef\n", 173 | "\n", 174 | "# # Authenticate to your Atlas server using a Service Principal\n", 175 | "# oauth = ServicePrincipalAuthentication(\n", 176 | "# tenant_id= tenant_id,\n", 177 | "# client_id= purview_client_id,\n", 178 | "# client_secret= purview_client_secret\n", 179 | "# )\n", 180 | "# client = PurviewClient(\n", 181 | "# account_name = purview_name,\n", 182 | "# authentication=oauth\n", 183 | "# )\n", 184 | "# guid = GuidTracker()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "# # get AML workspace details from azure portal\n", 194 | "# subscription_id = \"\" \n", 195 | "# resource_group = \"\"\n", 196 | "# workspace_name = \"\"" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "# # get SPN details you 
created in step 2.1 of solution accelerator installation\n", 206 | "# tenant_id = \"\"\n", 207 | "# aml_client_id = \"\"\n", 208 | "# aml_client_secret = aml_spn" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "# # Authentiate to AML\n", 218 | "\n", 219 | "# from azureml.core.authentication import ServicePrincipalAuthentication\n", 220 | "\n", 221 | "# sp = ServicePrincipalAuthentication(tenant_id=tenant_id, \n", 222 | "# service_principal_id=aml_client_id, \n", 223 | "# service_principal_password=aml_client_secret) \n", 224 | "\n", 225 | "# from azureml.core import Workspace\n", 226 | "\n", 227 | "# ws = Workspace.get(name=workspace_name,\n", 228 | "# resource_group = resource_group,\n", 229 | "# auth=sp,\n", 230 | "# subscription_id=subscription_id)" 231 | ] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 3", 237 | "language": "python", 238 | "name": "python3" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 3 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython3", 250 | "version": "3.7.4" 251 | }, 252 | "save_output": true, 253 | "synapse_widget": { 254 | "state": {}, 255 | "version": "0.1" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 2 260 | } 261 | -------------------------------------------------------------------------------- /SynapseNotebooks/02_Create_ML_Lineage_Types.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true, 8 | "jupyter": { 9 | "outputs_hidden": false, 10 | "source_hidden": false 11 | }, 12 | "nteract": { 13 | "transient": { 14 | "deleting": false 15 | } 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "from pyapacheatlas.core.typedef import AtlasAttributeDef, EntityTypeDef, RelationshipTypeDef\n", 21 | "\n", 22 | "try:\n", 23 | " #-----------------------------------------------------------------------------------# \n", 24 | " #create custom dataset type\n", 25 | " type_df = EntityTypeDef(\n", 26 | " name=\"custom_dataset\",\n", 27 | " attributeDefs=[\n", 28 | " AtlasAttributeDef(name=\"format\")\n", 29 | " ],\n", 30 | " superTypes = [\"DataSet\"]\n", 31 | " )\n", 32 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 33 | "\n", 34 | " #-----------------------------------------------------------------------------------# \n", 35 | " #create process with column mapping type\n", 36 | " type_df = EntityTypeDef(\n", 37 | " name=\"ProcessWithColumnMapping\",\n", 38 | " attributeDefs=[\n", 39 | " AtlasAttributeDef(name=\"columnMapping\")\n", 40 | " ],\n", 41 | " superTypes = [\"Process\"]\n", 42 | " )\n", 43 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 44 | "\n", 45 | " #-----------------------------------------------------------------------------------# \n", 46 | " #create AML workspace type\n", 47 | " type_df = EntityTypeDef(\n", 48 | " name=\"custom_ml_workspace\",\n", 49 | " attributeDefs=[\n", 50 | " AtlasAttributeDef(name='name',typename='string'),\n", 51 | " AtlasAttributeDef(name='description',typename='string'),\n", 52 | " AtlasAttributeDef(name='subscription_id',typename='string'),\n", 53 | " 
AtlasAttributeDef(name='resource_group',typename='string')\n", 54 | " ],\n", 55 | " superTypes = [\"DataSet\"]\n", 56 | " )\n", 57 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 58 | " #-----------------------------------------------------------------------------------# \n", 59 | " #create types for datastore and dataset\n", 60 | "\n", 61 | " #create AML datastore type\n", 62 | " datastore_type_df = EntityTypeDef(\n", 63 | " name=\"custom_ml_datastore\",\n", 64 | " attributeDefs=[\n", 65 | " AtlasAttributeDef(name=\"name\",typename='string'),\n", 66 | " AtlasAttributeDef(name='container_name',typename='string'),\n", 67 | " AtlasAttributeDef(name='account_name',typename='string'),\n", 68 | " AtlasAttributeDef(name='protocol',typename='string'),\n", 69 | " AtlasAttributeDef(name='endpoint',typename='string'),\n", 70 | " AtlasAttributeDef(name='server_name',typename='string'),\n", 71 | " AtlasAttributeDef(name='database_name',typename='string'),\n", 72 | " AtlasAttributeDef(name=\"createdby\",typename='string')\n", 73 | " ],\n", 74 | " superTypes = [\"DataSet\"],\n", 75 | " options = {\"schemaElementAttribute\":\"dataset\"}\n", 76 | " )\n", 77 | "\n", 78 | " #create AML dataset type\n", 79 | " dataset_type_df = EntityTypeDef(\n", 80 | " name=\"custom_ml_dataset\",\n", 81 | " attributeDefs=[\n", 82 | " AtlasAttributeDef(name=\"name\",typename='string'),\n", 83 | " AtlasAttributeDef(name=\"description\",typename='string'),\n", 84 | " AtlasAttributeDef(name=\"createdby\",typename='string'),\n", 85 | " AtlasAttributeDef(name=\"createdtime\",typename='string')\n", 86 | " ],\n", 87 | " superTypes = [\"DataSet\"]\n", 88 | " )\n", 89 | "\n", 90 | " # create relationsip between datastore and dataset\n", 91 | " dataset_to_datastore_relationship = RelationshipTypeDef(\n", 92 | " name=\"custom_ml_datastore_to_dataset\",\n", 93 | " relationshipCategory=\"COMPOSITION\",\n", 94 | " endDef1={\n", 95 | " \"type\": \"custom_ml_datastore\",\n", 96 | " \"name\": \"dataset\",\n", 97 | " \"isContainer\": True,\n", 98 | " \"cardinality\": \"SET\",\n", 99 | " \"isLegacyAttribute\": False\n", 100 | " },\n", 101 | " endDef2={\n", 102 | " \"type\": \"custom_ml_dataset\",\n", 103 | " \"name\": \"datastore\",\n", 104 | " \"isContainer\": False,\n", 105 | " \"cardinality\": \"SINGLE\",\n", 106 | " \"isLegacyAttribute\": False\n", 107 | " }\n", 108 | " )\n", 109 | "\n", 110 | " typedef_results = client.upload_typedefs(\n", 111 | " entityDefs = [datastore_type_df, dataset_type_df],\n", 112 | " relationshipDefs = [dataset_to_datastore_relationship],\n", 113 | " force_update=True\n", 114 | " )\n", 115 | " #-----------------------------------------------------------------------------------# \n", 116 | " #create types for experiment and experimentstep\n", 117 | " \n", 118 | " #create process for Ml Experiment Step\n", 119 | " exp_type_df = EntityTypeDef(\n", 120 | " name=\"custom_ml_experiment\",\n", 121 | " attributeDefs=[\n", 122 | " AtlasAttributeDef(name='name',typename='string'),\n", 123 | " AtlasAttributeDef(name='notes',typename='string'),\n", 124 | " AtlasAttributeDef(name=\"createdby\",typename='string'),\n", 125 | " AtlasAttributeDef(name=\"createdtime\",typename='string')\n", 126 | " ],\n", 127 | " superTypes = [\"Process\"]\n", 128 | " )\n", 129 | "\n", 130 | " #create process for Ml Experiment Step\n", 131 | " exp_step_type_df = EntityTypeDef(\n", 132 | " name=\"custom_ml_experiment_step\",\n", 133 | " attributeDefs=[\n", 134 | " 
AtlasAttributeDef(name='notes',typename='string')\n", 135 | " ],\n", 136 | " superTypes = [\"Process\"]\n", 137 | " )\n", 138 | "\n", 139 | " # create relationsip between experiment and experimentstep\n", 140 | " step_to_exp_relationship = RelationshipTypeDef(\n", 141 | " name=\"custom_ml_experiment_to_experimentstep\",\n", 142 | " relationshipCategory=\"COMPOSITION\",\n", 143 | " endDef1={\n", 144 | " \"type\": \"custom_ml_experiment\",\n", 145 | " \"name\": \"experimentstep\",\n", 146 | " \"isContainer\": True,\n", 147 | " \"cardinality\": \"SET\",\n", 148 | " \"isLegacyAttribute\": False\n", 149 | " },\n", 150 | " endDef2={\n", 151 | " \"type\": \"custom_ml_experiment_step\",\n", 152 | " \"name\": \"experiment\",\n", 153 | " \"isContainer\": False,\n", 154 | " \"cardinality\": \"SINGLE\",\n", 155 | " \"isLegacyAttribute\": False\n", 156 | " }\n", 157 | " )\n", 158 | "\n", 159 | " typedef_results = client.upload_typedefs(\n", 160 | " entityDefs = [exp_type_df, exp_step_type_df],\n", 161 | " relationshipDefs = [step_to_exp_relationship],\n", 162 | " force_update=True\n", 163 | " )\n", 164 | " #-----------------------------------------------------------------------------------# \n", 165 | " \n", 166 | " rd = RelationshipTypeDef(\n", 167 | " name=\"custom_ml_workspace_datastore\",\n", 168 | " attributeDefs=[],\n", 169 | " relationshipCategory=\"COMPOSITION\", # Means the child can't exist without the parent\n", 170 | " endDef1={ # endDef1 decribes what the parent will have as an attribute\n", 171 | " \"type\":\"custom_ml_workspace\", # Type of the parent\n", 172 | " \"name\":\"datastores\", # What the parent will have\n", 173 | " \"isContainer\": True,\n", 174 | " \"cardinality\":\"SET\", # This is related to the cardinality, in this case the parent Server will have a SET of Models.\n", 175 | " \"isLegacyAttribute\":False\n", 176 | " },\n", 177 | " endDef2={ # endDef2 decribes what the child will have as an attribute\n", 178 | " \"type\":\"custom_ml_datastore\", # Type of the child\n", 179 | " \"name\":\"workspace\", # What the child will have\n", 180 | " \"isContainer\":False,\n", 181 | " \"cardinality\":\"SINGLE\",\n", 182 | " \"isLegacyAttribute\":False\n", 183 | " }\n", 184 | " )\n", 185 | " client.upload_typedefs(relationshipDefs=[rd])\n", 186 | " \n", 187 | " #-----------------------------------------------------------------------------------# \n", 188 | " rd = RelationshipTypeDef(\n", 189 | " name=\"custom_ml_workspace_experiment\",\n", 190 | " attributeDefs=[],\n", 191 | " relationshipCategory=\"COMPOSITION\", # Means the child can't exist without the parent\n", 192 | " endDef1={ # endDef1 decribes what the parent will have as an attribute\n", 193 | " \"type\":\"custom_ml_workspace\", # Type of the parent\n", 194 | " \"name\":\"experiments\", # What the parent will have\n", 195 | " \"isContainer\": True,\n", 196 | " \"cardinality\":\"SET\", # This is related to the cardinality, in this case the parent Server will have a SET of Models.\n", 197 | " \"isLegacyAttribute\":False\n", 198 | " },\n", 199 | " endDef2={ # endDef2 decribes what the child will have as an attribute\n", 200 | " \"type\":\"custom_ml_experiment\", # Type of the child\n", 201 | " \"name\":\"workspace\", # What the child will have\n", 202 | " \"isContainer\":False,\n", 203 | " \"cardinality\":\"SINGLE\",\n", 204 | " \"isLegacyAttribute\":False\n", 205 | " }\n", 206 | " )\n", 207 | " client.upload_typedefs(relationshipDefs=[rd])\n", 208 | "\n", 209 | " 
#-----------------------------------------------------------------------------------# \n", 210 | " #create types for packages and package\n", 211 | "\n", 212 | " #create packages type\n", 213 | " packages_type_df = EntityTypeDef(\n", 214 | " name=\"custom_ml_packages\",\n", 215 | " attributeDefs=[\n", 216 | " AtlasAttributeDef(name='notes',typename='string')\n", 217 | " ],\n", 218 | " superTypes = [\"DataSet\"],\n", 219 | " options = {\"schemaElementAttribute\":\"package\"}\n", 220 | " )\n", 221 | "\n", 222 | " package_type_df = EntityTypeDef(\n", 223 | " name=\"custom_ml_package\",\n", 224 | " attributeDefs=[\n", 225 | " AtlasAttributeDef(name='programming_language',typename='string'),\n", 226 | " AtlasAttributeDef(name='package_name',typename='string'),\n", 227 | " AtlasAttributeDef(name='version',typename='string'),\n", 228 | " AtlasAttributeDef(name='notes',typename='string')\n", 229 | " ],\n", 230 | " superTypes = [\"DataSet\"]\n", 231 | " )\n", 232 | "\n", 233 | " # create relationsip between packages and package\n", 234 | " package_to_packages_relationship = RelationshipTypeDef(\n", 235 | " name=\"custom_ml_packages_to_package\",\n", 236 | " relationshipCategory=\"COMPOSITION\",\n", 237 | " endDef1={\n", 238 | " \"type\": \"custom_ml_packages\",\n", 239 | " \"name\": \"package\",\n", 240 | " \"isContainer\": True,\n", 241 | " \"cardinality\": \"SET\",\n", 242 | " \"isLegacyAttribute\": False\n", 243 | " },\n", 244 | " endDef2={\n", 245 | " \"type\": \"custom_ml_package\",\n", 246 | " \"name\": \"packages\",\n", 247 | " \"isContainer\": False,\n", 248 | " \"cardinality\": \"SINGLE\",\n", 249 | " \"isLegacyAttribute\": False\n", 250 | " }\n", 251 | " )\n", 252 | "\n", 253 | " typedef_results = client.upload_typedefs(\n", 254 | " entityDefs = [packages_type_df, package_type_df],\n", 255 | " relationshipDefs = [package_to_packages_relationship],\n", 256 | " force_update=True\n", 257 | " )\n", 258 | " #-----------------------------------------------------------------------------------# \n", 259 | " \n", 260 | " #create experiemnt config type\n", 261 | " type_df = EntityTypeDef(\n", 262 | " name=\"custom_ml_exp_config\",\n", 263 | " attributeDefs=[\n", 264 | " AtlasAttributeDef(name='task_type',typename='string'),\n", 265 | " AtlasAttributeDef(name='enable_early_stopping',typename='bool'),\n", 266 | " AtlasAttributeDef(name='experiment_timeout_minutes',typename='int'),\n", 267 | " AtlasAttributeDef(name='primary_metric',typename='string'),\n", 268 | " AtlasAttributeDef(name='compute_target',typename='string'),\n", 269 | " AtlasAttributeDef(name='label_column_name',typename='string'),\n", 270 | " AtlasAttributeDef(name='n_cross_validations',typename='int'),\n", 271 | " AtlasAttributeDef(name='model_explainability',typename='bool')\n", 272 | " ],\n", 273 | " superTypes = [\"DataSet\"]\n", 274 | " )\n", 275 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 276 | " \n", 277 | " #-----------------------------------------------------------------------------------# \n", 278 | "\n", 279 | " #create model metrics type\n", 280 | " type_df = EntityTypeDef(\n", 281 | " name=\"custom_ml_model_metrics\",\n", 282 | " attributeDefs=[\n", 283 | " AtlasAttributeDef(name='AUC',typename='float'),\n", 284 | " AtlasAttributeDef(name='Accuracy',typename='float'),\n", 285 | " AtlasAttributeDef(name='Precision',typename='float'),\n", 286 | " AtlasAttributeDef(name='Recall',typename='float'),\n", 287 | " AtlasAttributeDef(name='F1',typename='float')\n", 288 | " ],\n", 289 
| " superTypes = [\"DataSet\"]\n", 290 | " )\n", 291 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 292 | " \n", 293 | " #-----------------------------------------------------------------------------------# \n", 294 | "\n", 295 | " #create model type\n", 296 | " type_df = EntityTypeDef(\n", 297 | " name=\"custom_ml_model\",\n", 298 | " attributeDefs=[\n", 299 | " AtlasAttributeDef(name='workspace_name',typename='string'),\n", 300 | " AtlasAttributeDef(name='workspace_subscription_id',typename='string'),\n", 301 | " AtlasAttributeDef(name='workspace_resource_group',typename='string'),\n", 302 | " AtlasAttributeDef(name='name',typename='string'),\n", 303 | " AtlasAttributeDef(name='id',typename='string'),\n", 304 | " AtlasAttributeDef(name='version',typename='string'),\n", 305 | " AtlasAttributeDef(name='tags',typename='string'),\n", 306 | " AtlasAttributeDef(name='properties',typename='string')\n", 307 | " ],\n", 308 | " superTypes = [\"DataSet\"]\n", 309 | " )\n", 310 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 311 | " \n", 312 | " #-----------------------------------------------------------------------------------# \n", 313 | "\n", 314 | " #create endpoint type\n", 315 | " type_df = EntityTypeDef(\n", 316 | " name=\"custom_ml_model_endpoint\",\n", 317 | " attributeDefs=[\n", 318 | " AtlasAttributeDef(name='workspace_name',typename='string'),\n", 319 | " AtlasAttributeDef(name='workspace_subscription_id',typename='string'),\n", 320 | " AtlasAttributeDef(name='workspace_resource_group',typename='string'),\n", 321 | " AtlasAttributeDef(name='name',typename='string'),\n", 322 | " AtlasAttributeDef(name='image_id',typename='string'),\n", 323 | " AtlasAttributeDef(name='compute_type',typename='string'),\n", 324 | " AtlasAttributeDef(name='state',typename='string'),\n", 325 | " AtlasAttributeDef(name='scoring_uri',typename='string'),\n", 326 | " AtlasAttributeDef(name='tags',typename='string'),\n", 327 | " AtlasAttributeDef(name='state',typename='string'),\n", 328 | " AtlasAttributeDef(name='properties',typename='string'),\n", 329 | " AtlasAttributeDef(name='created_by',typename='string'),\n", 330 | " AtlasAttributeDef(name='sample_json',typename='string')\n", 331 | " ],\n", 332 | " superTypes = [\"DataSet\"]\n", 333 | " )\n", 334 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 335 | " \n", 336 | " #-----------------------------------------------------------------------------------# \n", 337 | "except:\n", 338 | " print('types already created') " 339 | ] 340 | } 341 | ], 342 | "metadata": { 343 | "kernelspec": { 344 | "display_name": "Python 3", 345 | "language": "python", 346 | "name": "python3" 347 | }, 348 | "language_info": { 349 | "codemirror_mode": { 350 | "name": "ipython", 351 | "version": 3 352 | }, 353 | "file_extension": ".py", 354 | "mimetype": "text/x-python", 355 | "name": "python", 356 | "nbconvert_exporter": "python", 357 | "pygments_lexer": "ipython3", 358 | "version": "3.7.4" 359 | }, 360 | "save_output": true, 361 | "synapse_widget": { 362 | "state": {}, 363 | "version": "0.1" 364 | } 365 | }, 366 | "nbformat": 4, 367 | "nbformat_minor": 2 368 | } 369 | -------------------------------------------------------------------------------- /SynapseNotebooks/03_Create_ML_Lineage_Functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 
null, 6 | "metadata": { 7 | "collapsed": true, 8 | "jupyter": { 9 | "outputs_hidden": false, 10 | "source_hidden": false 11 | }, 12 | "nteract": { 13 | "transient": { 14 | "deleting": false 15 | } 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "def get_entity_details(qualifiedName,typeName):\n", 21 | " entities = client.get_entity(\n", 22 | " qualifiedName=[qualifiedName],\n", 23 | " typeName=typeName\n", 24 | " )\n", 25 | " for entity in entities.get(\"entities\"):\n", 26 | " entity = entity\n", 27 | " break\n", 28 | " return entity\n", 29 | "#get_entity_details('https://sampledataadls.dfs.core.windows.net/masterdata/employees.csv','azure_datalake_gen2_path')\n", 30 | "\n", 31 | "def get_entity_guid(qualifiedName,typeName):\n", 32 | " entities = client.get_entity(\n", 33 | " qualifiedName=[qualifiedName],\n", 34 | " typeName=typeName\n", 35 | " )\n", 36 | " for entity in entities.get(\"entities\"):\n", 37 | " entity_guid = entity.get(\"guid\")\n", 38 | " break\n", 39 | " return entity_guid\n", 40 | "#get_entity_guid('https://sampledataadls.dfs.core.windows.net/creditriskdata/borrower.csv','azure_datalake_gen2_path')\n", 41 | "\n", 42 | "def get_entity_schema(guid):\n", 43 | " columns = []\n", 44 | " results = client.get_entity(guid)\n", 45 | " for entity in results[\"entities\"]:\n", 46 | " if \"tabular_schema\" in entity[\"relationshipAttributes\"]:\n", 47 | " ts = entity[\"relationshipAttributes\"][\"tabular_schema\"]\n", 48 | " ts_entity = client.get_entity(ts[\"guid\"])\n", 49 | " for schema in ts_entity[\"entities\"]:\n", 50 | " for col in schema[\"relationshipAttributes\"][\"columns\"]:\n", 51 | " if col['displayText'] != ':csv':\n", 52 | " columns.append(col['displayText'])\n", 53 | " return(columns)\n", 54 | " \n", 55 | "# ent_guid = 'a8698a33-9174-43cb-8835-26968862e2bf'\n", 56 | "# get_entity_schema(ent_guid)\n", 57 | "\n", 58 | "def create_data_entity_with_schema_and_parent(df_data,entityname,entitytype='custom_ml_dataset',parent_entityname=None,parent_entitytype='custom_ml_datastore'):\n", 59 | " # Create an asset for the output data schema.\n", 60 | " output_schema_entity = AtlasEntity(\n", 61 | " name=\"schema-\" + entityname,\n", 62 | " qualified_name = \"pyapacheatlas://\"+\"schema-\" + entityname,\n", 63 | " typeName=\"tabular_schema\",\n", 64 | " guid=guid.get_guid()\n", 65 | " )\n", 66 | "\n", 67 | " df_data_schema = pd.DataFrame(list(zip(list(df_data.columns), list(df_data.dtypes))),columns=['column','dtype'])\n", 68 | "\n", 69 | " #Iterate over the out data frame's columns and create entities\n", 70 | " output_entity_schema_columns = []\n", 71 | " #for column in df.schema:\n", 72 | " for index, row in df_data_schema.iterrows(): \n", 73 | " temp_column = AtlasEntity(\n", 74 | " name = row.column,\n", 75 | " typeName = \"column\",\n", 76 | " qualified_name = \"pyapacheatlas://schema-\" + entityname + \"#\" + row.column,\n", 77 | " guid=guid.get_guid(),\n", 78 | " attributes = {\"type\":str(row.dtype),\"description\": row.column},\n", 79 | " relationshipAttributes = {\"composeSchema\":output_schema_entity.to_json(minimum=True)}\n", 80 | " )\n", 81 | " output_entity_schema_columns.append(temp_column)\n", 82 | "\n", 83 | "\n", 84 | " if parent_entityname:\n", 85 | " dstore_entity = get_entity_details(\"pyapacheatlas://\"+parent_entityname, parent_entitytype)\n", 86 | " # Create a entity for dataset \n", 87 | " dataset_output_entity = AtlasEntity(\n", 88 | " name=entityname,\n", 89 | " typeName=entitytype,\n", 90 | " qualified_name=\"pyapacheatlas://\" + 
entityname,\n", 91 | " guid = guid.get_guid(),\n", 92 | " relationshipAttributes = {\n", 93 | " \"tabular_schema\": output_schema_entity.to_json(minimum=True),\n", 94 | " \"datastore\":dstore_entity\n", 95 | " }\n", 96 | " )\n", 97 | " else:\n", 98 | " # Create a entity for dataset \n", 99 | " dataset_output_entity = AtlasEntity(\n", 100 | " name=entityname,\n", 101 | " typeName=entitytype,\n", 102 | " qualified_name=\"pyapacheatlas://\" + entityname,\n", 103 | " guid = guid.get_guid(),\n", 104 | " relationshipAttributes = {\n", 105 | " \"tabular_schema\": output_schema_entity.to_json(minimum=True)\n", 106 | " }\n", 107 | " )\n", 108 | "\n", 109 | " # Prepare all the entities as a batch to be uploaded.\n", 110 | " batch = [dataset_output_entity, output_schema_entity] + output_entity_schema_columns\n", 111 | " batch\n", 112 | "\n", 113 | " # Upload all entities!\n", 114 | " client.upload_entities(batch=batch)\n", 115 | " \n", 116 | "def create_data_entity_with_schema(df_data,entityname,entitytype='custom_ml_dataset'):\n", 117 | " # Create an asset for the output data schema.\n", 118 | " output_schema_entity = AtlasEntity(\n", 119 | " name=\"schema-\" + entityname,\n", 120 | " qualified_name = \"pyapacheatlas://\"+\"schema-\" + entityname,\n", 121 | " typeName=\"tabular_schema\",\n", 122 | " guid=guid.get_guid()\n", 123 | " )\n", 124 | "\n", 125 | " df_data_schema = pd.DataFrame(list(zip(list(df_data.columns), list(df_data.dtypes))),columns=['column','dtype'])\n", 126 | "\n", 127 | " #Iterate over the out data frame's columns and create entities\n", 128 | " output_entity_schema_columns = []\n", 129 | " #for column in df.schema:\n", 130 | " for index, row in df_data_schema.iterrows(): \n", 131 | " temp_column = AtlasEntity(\n", 132 | " name = row.column,\n", 133 | " typeName = \"column\",\n", 134 | " qualified_name = \"pyapacheatlas://schema-\" + entityname + \"#\" + row.column,\n", 135 | " guid=guid.get_guid(),\n", 136 | " attributes = {\"type\":str(row.dtype),\"description\": row.column},\n", 137 | " relationshipAttributes = {\"composeSchema\":output_schema_entity.to_json(minimum=True)}\n", 138 | " )\n", 139 | " output_entity_schema_columns.append(temp_column)\n", 140 | "\n", 141 | " # Create a entity for dataset \n", 142 | " dataset_output_entity = AtlasEntity(\n", 143 | " name=entityname,\n", 144 | " typeName=entitytype,\n", 145 | " qualified_name=\"pyapacheatlas://\" + entityname,\n", 146 | " guid = guid.get_guid(),\n", 147 | " relationshipAttributes = {\n", 148 | " \"tabular_schema\": output_schema_entity.to_json(minimum=True)\n", 149 | " }\n", 150 | " )\n", 151 | "\n", 152 | " # Prepare all the entities as a batch to be uploaded.\n", 153 | " batch = [dataset_output_entity, output_schema_entity] + output_entity_schema_columns\n", 154 | " batch\n", 155 | "\n", 156 | " # Upload all entities!\n", 157 | " client.upload_entities(batch=batch)\n", 158 | " \n", 159 | "def create_lineage_for_entities(experimentname,processname,in_ent_qns,out_ent_qns,process_type_name='Process',ColumnMapping=False):\n", 160 | " # create a process \n", 161 | " # inputs: list of (entity,type) tuples\n", 162 | " # outputs: list of (entity,type) tuples\n", 163 | "\n", 164 | " from pyapacheatlas.core import AtlasProcess\n", 165 | "\n", 166 | " in_ent_guids = []\n", 167 | " for in_ent_qn in in_ent_qns:\n", 168 | " #print(in_ent_qn,in_ent_qns[in_ent_qn])\n", 169 | " in_ent_guid = get_entity_guid(in_ent_qn,in_ent_qns[in_ent_qn])\n", 170 | " in_ent_guids.append({'guid':in_ent_guid})\n", 171 | " \n", 172 | " out_ent_guids 
= []\n", 173 | " for out_ent_qn in out_ent_qns:\n", 174 | " #print(in_ent_qn,in_ent_qns[in_ent_qn])\n", 175 | " out_ent_guid = get_entity_guid(out_ent_qn,out_ent_qns[out_ent_qn])\n", 176 | " out_ent_guids.append({'guid':out_ent_guid})\n", 177 | "\n", 178 | " process_name = experimentname + processname\n", 179 | " process_qn = \"pyapacheatlas://\" + process_name\n", 180 | "\n", 181 | " if ColumnMapping == False:\n", 182 | " process_type_name = process_type_name\n", 183 | "\n", 184 | " process = AtlasProcess(\n", 185 | " name=process_name,\n", 186 | " typeName=process_type_name,\n", 187 | " qualified_name=process_qn,\n", 188 | " inputs = in_ent_guids,\n", 189 | " outputs = out_ent_guids,\n", 190 | " guid=guid.get_guid()\n", 191 | " )\n", 192 | " else:\n", 193 | " process_type_name = \"ProcessWithColumnMapping\"\n", 194 | "\n", 195 | " column_mapping_attributes = []\n", 196 | " for in_ent_qn in in_ent_qns:\n", 197 | " cl_mapping = []\n", 198 | " in_ent_columns = get_entity_schema(get_entity_guid(in_ent_qn,in_ent_qns[in_ent_qn]))\n", 199 | " for in_col in in_ent_columns:\n", 200 | " cl_mapping.append({\"Source\":in_col,\"Sink\":in_col})\n", 201 | " #break\n", 202 | " mapping = {\n", 203 | " 'DatasetMapping': {'Source':in_ent_qn,'Sink':list(out_ent_qns.keys())[0]},\n", 204 | " 'ColumnMapping': cl_mapping\n", 205 | " }\n", 206 | " column_mapping_attributes.append(mapping)\n", 207 | "\n", 208 | " process = AtlasProcess(\n", 209 | " name=process_name,\n", 210 | " typeName=process_type_name,\n", 211 | " qualified_name=process_qn,\n", 212 | " inputs = in_ent_guids,\n", 213 | " outputs = out_ent_guids,\n", 214 | " guid=guid.get_guid(),\n", 215 | " attributes={\"columnMapping\":json.dumps(column_mapping_attributes)}\n", 216 | " )\n", 217 | "\n", 218 | " # Prepare all the entities as a batch to be uploaded.\n", 219 | " batch = [process]\n", 220 | " batch\n", 221 | "\n", 222 | " # Upload all entities!\n", 223 | " client.upload_entities(batch=batch)\n", 224 | " \n", 225 | "def create_entity(name,typeName,config_attibutes):\n", 226 | " # Create an entity\n", 227 | " name = name \n", 228 | " qn = \"pyapacheatlas://\" + name\n", 229 | "\n", 230 | " exp_config_entity = AtlasEntity(\n", 231 | " name=name,\n", 232 | " typeName=typeName,\n", 233 | " qualified_name=qn,\n", 234 | " guid = guid.get_guid(),\n", 235 | " attributes = config_attibutes\n", 236 | " )\n", 237 | "\n", 238 | " # Upload all entities!\n", 239 | " client.upload_entities(batch=[exp_config_entity.to_json()])\n", 240 | "\n", 241 | " \n", 242 | "def get_dataset_details(indataset,experiment_name=''):\n", 243 | " result = []\n", 244 | " #print(indataset)\n", 245 | " if 'FileDataset' in str(type((indataset))):\n", 246 | " dssource = eval(json.loads(str(indataset).replace('FileDataset',''))['source'][0])\n", 247 | " sourcestore = dssource[0]\n", 248 | " sourcepath = dssource[1]\n", 249 | " sourcepathfiles = indataset.to_path()\n", 250 | " for sourcepathfile in sourcepathfiles:\n", 251 | " entityname = sourcepath.split('/')[-1] + sourcepathfile.replace('/','_') #.replace('.parquet','').replace('.csv','')\n", 252 | " #print('\\nFileDataset:',entityname)\n", 253 | "\n", 254 | " dsdatastore = Datastore.get(ws, sourcestore)\n", 255 | " datastore_path = [DataPath(dsdatastore, sourcepath+sourcepathfile.replace('/',''))]\n", 256 | " \n", 257 | " if '.parquet' in sourcepathfile:\n", 258 | " tabular_dataset = Dataset.Tabular.from_parquet_files(path=datastore_path)\n", 259 | " df_data = tabular_dataset.take(10).to_pandas_dataframe()\n", 260 | " \n", 261 | " 
elif '.csv' in sourcepathfile:\n", 262 | " tabular_dataset = Dataset.Tabular.from_delimited_files(path=datastore_path,encoding ='iso88591') \n", 263 | " #'utf8', 'iso88591', 'latin1', 'ascii', 'utf16', 'utf32', 'utf8bom' and 'windows1252'\n", 264 | " df_data = tabular_dataset.take(10).to_pandas_dataframe()\n", 265 | " \n", 266 | " if experiment_name != '':\n", 267 | " result.append((entityname + '_' + experiment_name,df_data))\n", 268 | " else:\n", 269 | " result.append((entityname,df_data))\n", 270 | "\n", 271 | " elif 'TabularDataset' in str(type((indataset))):\n", 272 | " tabular_dataset = indataset\n", 273 | " entityname = json.loads(str(indataset).replace('TabularDataset',''))['registration']['name']\n", 274 | " \n", 275 | " # dataset = Dataset.get_by_name(ws, name=entityname)\n", 276 | " # try:\n", 277 | " # sourcestore = json.loads(dataset._definition)['blocks'][0]['arguments']['datastore']['datastoreName']\n", 278 | " # except:\n", 279 | " # sourcestore = json.loads(dataset._definition)['blocks'][0]['arguments']['datastores'][0]['datastoreName']\n", 280 | " df_data = tabular_dataset.take(10).to_pandas_dataframe()\n", 281 | " #print('TabularDataset:', entityname)\n", 282 | " result.append((entityname,df_data))\n", 283 | " return result\n", 284 | "\n", 285 | "\n", 286 | "from azureml.core import Experiment\n", 287 | "from azureml.pipeline.core import PipelineRun\n", 288 | "\n", 289 | "from azureml.core import Workspace, Datastore, Dataset\n", 290 | "from azureml.data.datapath import DataPath\n", 291 | "import json \n", 292 | "import pandas as pd\n", 293 | "\n", 294 | "def create_aml_experiment_steps(ws,experiment_name):\n", 295 | " experiments_lst = Experiment.list(ws)\n", 296 | " for experiment in experiments_lst:\n", 297 | " if experiment.name == experiment_name:\n", 298 | " #print(experiment)\n", 299 | " exp = Experiment(ws,experiment.name)\n", 300 | " for run in exp.get_runs(): \n", 301 | " rundetails = run.get_details()\n", 302 | " #print(rundetails)\n", 303 | " if rundetails['status'] != 'Completed': #continue until we find a completed run \n", 304 | " continue\n", 305 | " pipeline_run = PipelineRun(exp, rundetails['runId'])\n", 306 | "\n", 307 | " steps = pipeline_run.get_steps()\n", 308 | " for step_run in steps:\n", 309 | " step_run_details = step_run.get_details_with_logs()\n", 310 | " #print(step_run_details)\n", 311 | " #print(step_run_details['runDefinition']['script'])\n", 312 | "\n", 313 | " purview_basepath = 'pyapacheatlas://'\n", 314 | " in_ent_qns = {}\n", 315 | " out_ent_qns = {}\n", 316 | " #print(step_run_details)\n", 317 | " step_name = step_run.name #step_run_details['runDefinition']['script']\n", 318 | " #print(step_name)\n", 319 | " \n", 320 | " #print('\\n Input Datasets:\\n')\n", 321 | " for indataset in step_run_details['inputDatasets']:\n", 322 | " in_result = get_dataset_details(indataset['dataset'],experiment_name)\n", 323 | " #print(in_result)\n", 324 | " #create entities \n", 325 | " for in_res in in_result:\n", 326 | " data_ent_name = in_res[0].strip('_')\n", 327 | " create_data_entity_with_schema(in_res[1],data_ent_name,'custom_ml_dataset')\n", 328 | " in_ent_qns[purview_basepath + data_ent_name] = 'custom_ml_dataset'\n", 329 | " #break\n", 330 | " #print('\\n Output Datasets:\\n')\n", 331 | " for outdataset in step_run_details['outputDatasets']:\n", 332 | " out_result = get_dataset_details(outdataset['dataset'],experiment_name)\n", 333 | " #print(out_result)\n", 334 | " #create entities\n", 335 | " for out_res in out_result:\n", 336 | " 
data_ent_name = out_res[0].strip('_')\n", 337 | " create_data_entity_with_schema(out_res[1],data_ent_name,'custom_ml_dataset')\n", 338 | " out_ent_qns[purview_basepath + data_ent_name] = 'custom_ml_dataset'\n", 339 | " #break\n", 340 | " #print(in_ent_qns,out_ent_qns)\n", 341 | " create_lineage_for_entities(experiment_name + '_',step_name, in_ent_qns,out_ent_qns,process_type_name='custom_ml_experiment_step',ColumnMapping=False)\n", 342 | " #break \n", 343 | " \n", 344 | " break # break after processing one completed run\n", 345 | " break #after finding the experiment\n", 346 | "\n", 347 | "\n", 348 | "#create workspace entity\n", 349 | "def create_workspace_entities(ws):\n", 350 | "\n", 351 | " config_attibutes={}\n", 352 | " temp_column={}\n", 353 | "\n", 354 | " temp_column['name'] = ws.name\n", 355 | " config_attibutes.update(temp_column)\n", 356 | " temp_column['subscription_id'] = ws.subscription_id\n", 357 | " config_attibutes.update(temp_column)\n", 358 | " temp_column['resource_group'] = ws.resource_group\n", 359 | " config_attibutes.update(temp_column)\n", 360 | "\n", 361 | " create_entity(ws.name,'custom_ml_workspace',config_attibutes)\n", 362 | " #break\n", 363 | "\n", 364 | "\n", 365 | "#create all datastore entities\n", 366 | "def create_datastore_entities(ws):\n", 367 | " for datastore in ws.datastores.values():\n", 368 | " config_attibutes={}\n", 369 | " temp_column={}\n", 370 | " \n", 371 | " temp_column['name'] = datastore.name\n", 372 | " config_attibutes.update(temp_column)\n", 373 | "\n", 374 | " if ('AzureDataLakeGen2Datastore' in str(type(datastore))) or ('AzureBlobDatastore' in str(type(datastore))):\n", 375 | " temp_column['container_name'] = datastore.container_name\n", 376 | " config_attibutes.update(temp_column)\n", 377 | " temp_column['account_name'] = datastore.account_name\n", 378 | " config_attibutes.update(temp_column)\n", 379 | " temp_column['protocol'] = datastore.protocol\n", 380 | " config_attibutes.update(temp_column)\n", 381 | " temp_column['endpoint'] = datastore.endpoint\n", 382 | " config_attibutes.update(temp_column)\n", 383 | " elif 'AzureSqlDatabaseDatastore' in str(type(datastore)):\n", 384 | " #print('sql',datastore.server_name)\n", 385 | " temp_column['server_name'] = datastore.server_name\n", 386 | " config_attibutes.update(temp_column)\n", 387 | " temp_column['database_name'] = datastore.database_name\n", 388 | " config_attibutes.update(temp_column)\n", 389 | " elif 'AzureBlobDatastore' in str(type(datastore)): \n", 390 | " pass\n", 391 | "\n", 392 | " create_entity(datastore.name,'custom_ml_datastore',config_attibutes)\n", 393 | " #break\n", 394 | "\n", 395 | " #create workspace and datastore relationship\n", 396 | " purview_basepath = 'pyapacheatlas://'\n", 397 | " for datastore in ws.datastores.values():\n", 398 | " relationshiptype = 'custom_ml_workspace_datastore'\n", 399 | " end1type = 'custom_ml_workspace'\n", 400 | " end2type = 'custom_ml_datastore'\n", 401 | " end1_qn = purview_basepath + ws.name\n", 402 | " end2_qn = purview_basepath + datastore.name\n", 403 | " try:\n", 404 | " create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn)\n", 405 | " except:\n", 406 | " pass # ignore if relationship exists\n", 407 | "\n", 408 | "#create all dataset entities (with datastore as parent)\n", 409 | "from azureml.core import Workspace, Datastore, Dataset\n", 410 | "import pandas as pd\n", 411 | "def create_dataset_entities(ws,parent_flag=True):\n", 412 | " purview_basepath = 'pyapacheatlas://'\n", 413 | " for dsname 
in ws.datasets:\n", 414 | " dataset = ws.datasets[dsname]\n", 415 | " try:\n", 416 | " if 'FileDataset' in str(type((dataset))):\n", 417 | " datasetsource = eval(json.loads(str(dataset).replace('FileDataset',''))['source'][0])[0]\n", 418 | " elif 'TabularDataset' in str(type((dataset))):\n", 419 | " datasetsource = eval(json.loads(str(dataset).replace('TabularDataset',''))['source'][0])[0]\n", 420 | " dsdetails = get_dataset_details(dataset)\n", 421 | " #print(dsdetails)\n", 422 | " for ds in dsdetails:\n", 423 | " if parent_flag == False:\n", 424 | " create_data_entity_with_schema(ds[1],dsname,'custom_ml_dataset')\n", 425 | " create_lineage_for_entities('',('register_' + dsname), {(purview_basepath+datasetsource):'custom_ml_datastore'},\n", 426 | " {(purview_basepath+ds[0]):'custom_ml_dataset'},ColumnMapping=False)\n", 427 | " else: \n", 428 | " create_data_entity_with_schema_and_parent(ds[1],dsname,entitytype='custom_ml_dataset',\n", 429 | " parent_entityname=datasetsource,parent_entitytype='custom_ml_datastore') \n", 430 | " except:\n", 431 | " print('Error:',dsname) \n", 432 | " #break\n", 433 | " \n", 434 | " \n", 435 | "#create experiment entity\n", 436 | "from azureml.core import Experiment\n", 437 | "\n", 438 | "def create_experiment_entities(ws):\n", 439 | " for experiment in Experiment.list(ws):\n", 440 | " #create experiment entity\n", 441 | " config_attibutes={}\n", 442 | " temp_column={}\n", 443 | "\n", 444 | " temp_column['name'] = experiment.name\n", 445 | " config_attibutes.update(temp_column)\n", 446 | "\n", 447 | " create_entity(experiment.name,'custom_ml_experiment',config_attibutes)\n", 448 | " #break\n", 449 | " \n", 450 | " purview_basepath = 'pyapacheatlas://'\n", 451 | "\n", 452 | " #create experiment relationship to workspace\n", 453 | " relationshiptype = 'custom_ml_workspace_experiment'\n", 454 | " end1type = 'custom_ml_workspace'\n", 455 | " end2type = 'custom_ml_experiment'\n", 456 | " end1_qn = purview_basepath + ws.name\n", 457 | " end2_qn = purview_basepath + experiment.name\n", 458 | " try:\n", 459 | " create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn)\n", 460 | " except:\n", 461 | " pass # ignore if relationship exists\n", 462 | " \n", 463 | " for run in experiment.get_runs(): \n", 464 | " rundetails = run.get_details()\n", 465 | " #print(rundetails)\n", 466 | " if rundetails['status'] != 'Completed': #continue until we find a completed run \n", 467 | " continue\n", 468 | " # print(rundetails['properties']['azureml.runsource'])\n", 469 | " #create experiment steps\n", 470 | " if rundetails['properties']['azureml.runsource'] == 'azureml.PipelineRun':\n", 471 | " print(experiment.name)\n", 472 | " create_aml_experiment_steps(ws,experiment.name)\n", 473 | "\n", 474 | " pipeline_run = PipelineRun(experiment, rundetails['runId'])\n", 475 | "\n", 476 | " steps = pipeline_run.get_steps()\n", 477 | " for step_run in steps:\n", 478 | " #print(experiment.name + '_' + step_run.name)\n", 479 | " \n", 480 | " #create experiment relationship to workspace\n", 481 | " relationshiptype = 'custom_ml_experiment_to_experimentstep'\n", 482 | " end1type = 'custom_ml_experiment'\n", 483 | " end2type = 'custom_ml_experiment_step'\n", 484 | " end1_qn = purview_basepath + experiment.name\n", 485 | " end2_qn = purview_basepath + experiment.name + '_' + step_run.name\n", 486 | " try:\n", 487 | " create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn)\n", 488 | " except:\n", 489 | " pass # ignore if relationship exists\n", 490 | 
"\n", 491 | " break # break after processing one completed run\n", 492 | " #break\n", 493 | "\n", 494 | "def create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn):\n", 495 | " relationship = {}\n", 496 | " end1 = {}\n", 497 | " end2 = {}\n", 498 | "\n", 499 | " end1[\"guid\"] = get_entity_guid(end1_qn,end1type)\n", 500 | " end1[\"typeName\"] = end1type\n", 501 | " end1[\"uniqueAttributes\"] = {\"qualifiedName\": end1_qn}\n", 502 | "\n", 503 | " end2[\"guid\"] = get_entity_guid(end2_qn,end2type)\n", 504 | " end2[\"typeName\"] = end2type\n", 505 | " end2[\"uniqueAttributes\"] = {\"qualifiedName\": end2_qn}\n", 506 | "\n", 507 | " relationship[\"typeName\"] = relationshiptype\n", 508 | " relationship[\"attributes\"] = {}\n", 509 | " relationship[\"guid\"] = guid.get_guid()\n", 510 | " relationship[\"provenanceType\"] = 0\n", 511 | " relationship[\"end1\"] = end1\n", 512 | " relationship[\"end2\"] = end2\n", 513 | " relationship\n", 514 | " \n", 515 | " client.upload_relationship(relationship) \n", 516 | " \n", 517 | "def create_package_entities(experimentname,packageslist):\n", 518 | " packages_name = experimentname + '-packages' \n", 519 | " packages_qn = \"pyapacheatlas://\" + packages_name\n", 520 | "\n", 521 | " # Create an asset for the packages.\n", 522 | " packages_entity = AtlasEntity(\n", 523 | " name = packages_name,\n", 524 | " qualified_name = packages_qn,\n", 525 | " typeName=\"custom_ml_packages\",\n", 526 | " attributes = {\"notes\":\"test note\"},\n", 527 | " guid=guid.get_guid()\n", 528 | " )\n", 529 | "\n", 530 | " packages_entity.to_json(minimum=True)\n", 531 | "\n", 532 | " atlas_packages = []\n", 533 | " relationships = []\n", 534 | " for package in packageslist:\n", 535 | " package_attibutes={}\n", 536 | " temp_column={}\n", 537 | " temp_column['programming_language'] = str(package[0])\n", 538 | " package_attibutes.update(temp_column)\n", 539 | " temp_column['package_name'] = str(package[1])\n", 540 | " package_attibutes.update(temp_column)\n", 541 | " temp_column['version'] = str(package[2])\n", 542 | " package_attibutes.update(temp_column)\n", 543 | " temp_column['notes'] = str(package[3])\n", 544 | " package_attibutes.update(temp_column)\n", 545 | "\n", 546 | " # Create an entity for each package\n", 547 | " name = str(package[1]) #experimentname + '-package-' + package[1] \n", 548 | " qn = packages_qn + '#' + str(package[1]) #\"pyapacheatlas://\" + name\n", 549 | "\n", 550 | " package_entity = AtlasEntity(\n", 551 | " name= name,\n", 552 | " typeName=\"custom_ml_package\",\n", 553 | " qualified_name=qn,\n", 554 | " guid = guid.get_guid(),\n", 555 | " attributes = package_attibutes,\n", 556 | " relationshipAttributes = {\"packages\":packages_entity.to_json(minimum=True)}\n", 557 | " )\n", 558 | " atlas_packages.append(package_entity)\n", 559 | "\n", 560 | " atlas_packages\n", 561 | "\n", 562 | " # Prepare all the entities as a batch to be uploaded.\n", 563 | " batch = [packages_entity] + atlas_packages\n", 564 | " client.upload_entities(batch=batch) \n", 565 | " \n", 566 | "def create_experiment_config_entity(ws,experiment_name,automl_run):\n", 567 | " # Get experiment config from AML run\n", 568 | " import json\n", 569 | " import pandas as pd\n", 570 | " run_properties = automl_run.get_properties()\n", 571 | " run_properties\n", 572 | "\n", 573 | " AMLSettingsJsonString = run_properties['AMLSettingsJsonString']\n", 574 | " AMLSettings = json.loads(AMLSettingsJsonString)\n", 575 | "\n", 576 | " df_config = 
pd.DataFrame(list(AMLSettings.items()),columns = ['key','value']) \n", 577 | "\n", 578 | " keys = ['task_type','enable_early_stopping','experiment_timeout_minutes','primary_metric','compute_target','label_column_name','n_cross_validations','model_explainability']\n", 579 | "\n", 580 | " df_config = df_config[df_config['key'].isin(keys)]\n", 581 | "\n", 582 | " dict_config = df_config.to_dict(orient = 'records')\n", 583 | " dict_config\n", 584 | "\n", 585 | " config_attibutes={}\n", 586 | " for attibutes in dict_config:\n", 587 | " temp_column={}\n", 588 | " temp_column[attibutes['key']] = attibutes['value']\n", 589 | " config_attibutes.update(temp_column)\n", 590 | " config_attibutes\n", 591 | "\n", 592 | " # Create a entity for exp config \n", 593 | " name = experiment_name + \"-config\"\n", 594 | " qn = \"pyapacheatlas://\" + name\n", 595 | "\n", 596 | " exp_config_entity = AtlasEntity(\n", 597 | " name=name,\n", 598 | " typeName=\"custom_ml_exp_config\",\n", 599 | " qualified_name=qn,\n", 600 | " guid = guid.get_guid(),\n", 601 | " attributes = config_attibutes\n", 602 | " )\n", 603 | "\n", 604 | " # Upload all entities!\n", 605 | " client.upload_entities(batch=[exp_config_entity.to_json()])\n", 606 | " \n", 607 | "def create_model_entity(ws,experiment_name,modelname):\n", 608 | " # get deployed model\n", 609 | " from azureml.core.model import Model\n", 610 | " model = Model(ws, modelname)\n", 611 | "\n", 612 | " config_attibutes={}\n", 613 | " temp_column={}\n", 614 | " temp_column['workspace_name'] = model.workspace.name\n", 615 | " config_attibutes.update(temp_column)\n", 616 | " temp_column['workspace_subscription_id'] = model.workspace.subscription_id\n", 617 | " config_attibutes.update(temp_column)\n", 618 | " temp_column['workspace_subscription_id'] = model.workspace.subscription_id\n", 619 | " config_attibutes.update(temp_column)\n", 620 | " temp_column['workspace_resource_group'] = model.workspace.resource_group\n", 621 | " config_attibutes.update(temp_column)\n", 622 | " temp_column['name'] = model.name\n", 623 | " config_attibutes.update(temp_column)\n", 624 | " temp_column['id'] = model.id\n", 625 | " config_attibutes.update(temp_column)\n", 626 | " temp_column['version'] = model.version\n", 627 | " config_attibutes.update(temp_column)\n", 628 | " temp_column['tags'] = model.tags\n", 629 | " config_attibutes.update(temp_column)\n", 630 | " temp_column['properties'] = model.properties\n", 631 | " config_attibutes.update(temp_column)\n", 632 | "\n", 633 | " # Create a entity for Model\n", 634 | " name = modelname \n", 635 | " qn = \"pyapacheatlas://\" + name\n", 636 | "\n", 637 | " exp_config_entity = AtlasEntity(\n", 638 | " name=name,\n", 639 | " typeName=\"custom_ml_model\",\n", 640 | " qualified_name=qn,\n", 641 | " guid = guid.get_guid(),\n", 642 | " attributes = config_attibutes\n", 643 | " )\n", 644 | "\n", 645 | " # Upload all entities!\n", 646 | " client.upload_entities(batch=[exp_config_entity.to_json()]) \n", 647 | " \n", 648 | "def create_model_metrics_entity(experiment_name,best_run):\n", 649 | " metrics = best_run.get_metrics()\n", 650 | "\n", 651 | " # select relevant metrics\n", 652 | " auc = metrics.get('AUC_weighted')\n", 653 | " accuracy = metrics.get('accuracy')\n", 654 | " precision = metrics.get('precision_score_weighted')\n", 655 | " recall = metrics.get('recall_score_weighted')\n", 656 | " f1 = metrics.get('f1_score_weighted')\n", 657 | "\n", 658 | " # # combine into single dataframe\n", 659 | " # metrics_df = sc.parallelize([['AUC', auc], ['Accuracy', 
accuracy], ['Precision', precision], ['Recall', recall], ['F1', f1]]).toDF(('Metric', 'Value'))\n", 660 | " metrics = ['AUC','Accuracy','Precision','Recall','F1']\n", 661 | " metricslist= [auc,accuracy,precision,recall,f1]\n", 662 | " columns = ['Metric','Value']\n", 663 | " metrics_df = pd.DataFrame(zip(metrics, metricslist),columns=columns)\n", 664 | "\n", 665 | "\n", 666 | " dict_metrics = metrics_df.to_dict(orient = 'records')\n", 667 | " dict_metrics\n", 668 | "\n", 669 | " config_attibutes={}\n", 670 | " for attibutes in dict_metrics:\n", 671 | " temp_column={}\n", 672 | " temp_column[attibutes['Metric']] = attibutes['Value']\n", 673 | " config_attibutes.update(temp_column)\n", 674 | " config_attibutes\n", 675 | "\n", 676 | " name = experiment_name + \"-modelmetrics\"\n", 677 | " qn = \"pyapacheatlas://\" + name\n", 678 | "\n", 679 | " # Create a entity for model metrics\n", 680 | " exp_config_entity = AtlasEntity(\n", 681 | " name=name,\n", 682 | " typeName=\"custom_ml_model_metrics\",\n", 683 | " qualified_name=qn,\n", 684 | " guid = guid.get_guid(),\n", 685 | " attributes = config_attibutes\n", 686 | " )\n", 687 | "\n", 688 | " # Upload all entities!\n", 689 | " client.upload_entities(batch=[exp_config_entity.to_json()])\n", 690 | " \n", 691 | "def create_experiment_lineage(experimentname,exp_data_qn,exp_config_qn,model_metrics_qn,model_qn): \n", 692 | " # create experiment process \n", 693 | " # inputs: prepareddata, modelconfig \n", 694 | " # outputs: model metrics and registered model\n", 695 | "\n", 696 | " from pyapacheatlas.core import AtlasProcess\n", 697 | "\n", 698 | " in_data_ent_guid = get_entity_guid(exp_data_qn,'custom_dataset')\n", 699 | " in_exp_config_guid = get_entity_guid(exp_config_qn,'custom_ml_exp_config')\n", 700 | " out_model_metrics_guid = get_entity_guid(model_metrics_qn,'custom_ml_model_metrics')\n", 701 | " out_model_guid = get_entity_guid(model_qn,'custom_ml_model')\n", 702 | "\n", 703 | " process_name = experimentname + '-train'\n", 704 | " process_qn = \"pyapacheatlas://\" + process_name\n", 705 | " process_type_name = \"Process\"\n", 706 | "\n", 707 | " process = AtlasProcess(\n", 708 | " name=process_name,\n", 709 | " typeName=process_type_name,\n", 710 | " qualified_name=process_qn,\n", 711 | " inputs = [{\"guid\":in_data_ent_guid},{\"guid\":in_exp_config_guid}],\n", 712 | " outputs = [{\"guid\":out_model_metrics_guid},{\"guid\":out_model_guid}],\n", 713 | " guid=guid.get_guid()\n", 714 | " )\n", 715 | "\n", 716 | " # Prepare all the entities as a batch to be uploaded.\n", 717 | " batch = [process]\n", 718 | " batch\n", 719 | "\n", 720 | " # Upload all entities!\n", 721 | " client.upload_entities(batch=batch) \n", 722 | " \n", 723 | "def create_model_service_entity(ws,experimentname,aci_service_name,samplejson):\n", 724 | " # get deployed ACI Web Service\n", 725 | " from azureml.core.webservice import AciWebservice\n", 726 | " aciws = AciWebservice(ws, aci_service_name)\n", 727 | "\n", 728 | " config_attibutes={}\n", 729 | " temp_column={}\n", 730 | " temp_column['workspace_name'] = aciws.workspace.name\n", 731 | " config_attibutes.update(temp_column)\n", 732 | " temp_column['workspace_subscription_id'] = aciws.workspace.subscription_id\n", 733 | " config_attibutes.update(temp_column)\n", 734 | " temp_column['workspace_resource_group'] = aciws.workspace.resource_group\n", 735 | " config_attibutes.update(temp_column)\n", 736 | " temp_column['name'] = aciws.name\n", 737 | " config_attibutes.update(temp_column)\n", 738 | " temp_column['image_id'] = 
aciws.image_id\n", 739 | " config_attibutes.update(temp_column)\n", 740 | " temp_column['compute_type'] = aciws.compute_type\n", 741 | " config_attibutes.update(temp_column)\n", 742 | " temp_column['state'] = aciws.state\n", 743 | " config_attibutes.update(temp_column)\n", 744 | " temp_column['scoring_uri'] = aciws.scoring_uri\n", 745 | " config_attibutes.update(temp_column)\n", 746 | " temp_column['tags'] = aciws.tags\n", 747 | " config_attibutes.update(temp_column)\n", 748 | " temp_column['state'] = aciws.state\n", 749 | " config_attibutes.update(temp_column)\n", 750 | " temp_column['properties'] = aciws.properties\n", 751 | " config_attibutes.update(temp_column)\n", 752 | " temp_column['created_by'] = aciws.created_by\n", 753 | " config_attibutes.update(temp_column)\n", 754 | " temp_column['sample_json'] = samplejson\n", 755 | " config_attibutes.update(temp_column)\n", 756 | "\n", 757 | " name = experimentname + \"-model_endpoint\"\n", 758 | " qn = \"pyapacheatlas://\" + name\n", 759 | "\n", 760 | " # Create a entity for ACI Web Service\n", 761 | " endpoint_entity = AtlasEntity(\n", 762 | " name=name,\n", 763 | " typeName=\"custom_ml_model_endpoint\",\n", 764 | " qualified_name=qn,\n", 765 | " guid = guid.get_guid(),\n", 766 | " attributes = config_attibutes\n", 767 | " )\n", 768 | "\n", 769 | " # Upload all entities!\n", 770 | " client.upload_entities(batch=[endpoint_entity.to_json()]) \n", 771 | " \n", 772 | "def create_powerbi_dataset_and_lineage(experiment_name,pbi_workspace,pbi_datasetid,pbidata_ent_name,ml_dataset_ent_name,ml_dataset_ent_type):\n", 773 | " \n", 774 | " pbidata_entity_type = 'powerbi_dataset'\n", 775 | " pbidata_ent_qn = pbi_workspace + '/datasets/' + pbi_datasetid \n", 776 | " purview_basepath = 'pyapacheatlas://'\n", 777 | " #\"https://msit.powerbi.com/groups/7d666287-f9b8-45ff-be6c-9909afe9df40/datasets/e5a30c22-466d-4a30-a1ac-8736ed6567cc\"\n", 778 | "\n", 779 | " pbidata_ent = AtlasEntity(\n", 780 | " name=pbidata_ent_name,\n", 781 | " typeName=pbidata_entity_type,\n", 782 | " qualified_name= pbidata_ent_qn,\n", 783 | " workspace = pbi_workspace,\n", 784 | " guid = guid.get_guid()\n", 785 | " )\n", 786 | "\n", 787 | " # Prepare all the entities as a batch to be uploaded.\n", 788 | " batch = [pbidata_ent]\n", 789 | " batch\n", 790 | "\n", 791 | " # Upload all entities!\n", 792 | " client.upload_entities(batch=batch)\n", 793 | "\n", 794 | " #cretae powerbi_dataset_process lineage\n", 795 | " in_ent_guids = []\n", 796 | " in_ent_guid = get_entity_guid(purview_basepath + ml_dataset_ent_name,ml_dataset_ent_type)\n", 797 | " in_ent_guids.append({'guid':in_ent_guid})\n", 798 | "\n", 799 | " out_ent_guids = []\n", 800 | " out_ent_guid = get_entity_guid(pbidata_ent_qn,pbidata_entity_type)\n", 801 | " out_ent_guids.append({'guid':out_ent_guid})\n", 802 | "\n", 803 | " process_name = 'createpowerbidataset' + pbidata_ent_name + experiment_name\n", 804 | " process_qn = \"pyapacheatlas://\" + process_name\n", 805 | " process_type_name = \"powerbi_dataset_process\"\n", 806 | "\n", 807 | " process = AtlasProcess(\n", 808 | " name=process_name,\n", 809 | " typeName=process_type_name,\n", 810 | " qualified_name=process_qn,\n", 811 | " inputs = in_ent_guids,\n", 812 | " outputs = out_ent_guids,\n", 813 | " guid=guid.get_guid()\n", 814 | " )\n", 815 | "\n", 816 | " # Prepare all the entities as a batch to be uploaded.\n", 817 | " batch = [process]\n", 818 | " batch\n", 819 | "\n", 820 | " # Upload all entities!\n", 821 | " client.upload_entities(batch=batch)\n", 822 | " \n", 823 
| "def create_powerbi_report_and_lineage(experiment_name,pbi_workspace,pbi_reportid,pbi_ent_name,pbi_datasetid):\n", 824 | "\n", 825 | " #create powerbi report\n", 826 | " pbi_entity_type = 'powerbi_report'\n", 827 | " pbi_ent_qn = pbi_workspace + '/reports/' + pbi_reportid \n", 828 | " purview_basepath = 'pyapacheatlas://'\n", 829 | " \n", 830 | " pbi_ent = AtlasEntity(\n", 831 | " name=pbi_ent_name,\n", 832 | " typeName=pbi_entity_type,\n", 833 | " qualified_name= pbi_ent_qn, \n", 834 | " workspace = pbi_workspace,\n", 835 | " guid = guid.get_guid()\n", 836 | " )\n", 837 | "\n", 838 | " # Prepare all the entities as a batch to be uploaded.\n", 839 | " batch = [pbi_ent]\n", 840 | " batch\n", 841 | "\n", 842 | " # Upload all entities!\n", 843 | " client.upload_entities(batch=batch)\n", 844 | "\n", 845 | " #create powerbi dashboard process lineage\n", 846 | " pbidata_ent_qn = pbi_workspace + '/datasets/' + pbi_datasetid \n", 847 | " in_ent_guids = []\n", 848 | " in_ent_guid = get_entity_guid(pbidata_ent_qn,'powerbi_dataset')\n", 849 | " in_ent_guids.append({'guid':in_ent_guid})\n", 850 | "\n", 851 | " out_ent_guids = []\n", 852 | " out_ent_guid = get_entity_guid(pbi_ent_qn,'powerbi_report')\n", 853 | " out_ent_guids.append({'guid':out_ent_guid})\n", 854 | "\n", 855 | " process_name = 'createpowerbireport' + pbi_ent_name + experiment_name\n", 856 | " process_qn = \"pyapacheatlas://\" + process_name\n", 857 | " process_type_name = \"powerbi_report_process\"\n", 858 | "\n", 859 | " process = AtlasProcess(\n", 860 | " name=process_name,\n", 861 | " typeName=process_type_name,\n", 862 | " qualified_name=process_qn,\n", 863 | " inputs = in_ent_guids,\n", 864 | " outputs = out_ent_guids,\n", 865 | " guid=guid.get_guid()\n", 866 | " )\n", 867 | "\n", 868 | " # Prepare all the entities as a batch to be uploaded.\n", 869 | " batch = [process]\n", 870 | " batch\n", 871 | "\n", 872 | " # Upload all entities!\n", 873 | " client.upload_entities(batch=batch)\n", 874 | " \n", 875 | "# clean up datasets\n", 876 | "def cleanup_entities(typename, entitytype):\n", 877 | " filter_setup = {\"typeName\": typename, \"includeSubTypes\": True}\n", 878 | " search = client.search_entities(\"*\", search_filter=filter_setup)\n", 879 | " for entity in search:\n", 880 | " #print(entity)\n", 881 | " if entity.get(\"entityType\") == entitytype:\n", 882 | " print(entity.get(\"id\"),entity.get(\"qualifiedName\"),entity.get(\"entityType\"))\n", 883 | " guid = entity.get(\"id\")\n", 884 | " client.delete_entity(guid=guid)\n", 885 | "\n" 886 | ] 887 | } 888 | ], 889 | "metadata": { 890 | "kernelspec": { 891 | "display_name": "Python 3", 892 | "language": "python", 893 | "name": "python3" 894 | }, 895 | "language_info": { 896 | "codemirror_mode": { 897 | "name": "ipython", 898 | "version": 3 899 | }, 900 | "file_extension": ".py", 901 | "mimetype": "text/x-python", 902 | "name": "python", 903 | "nbconvert_exporter": "python", 904 | "pygments_lexer": "ipython3", 905 | "version": "3.7.4" 906 | }, 907 | "save_output": true, 908 | "synapse_widget": { 909 | "state": {}, 910 | "version": "0.1" 911 | } 912 | }, 913 | "nbformat": 4, 914 | "nbformat_minor": 2 915 | } 916 | -------------------------------------------------------------------------------- /SynapseNotebooks/04_Create_CreditRisk_Experiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "jupyter": { 9 | 
"outputs_hidden": false, 10 | "source_hidden": false 11 | }, 12 | "nteract": { 13 | "transient": { 14 | "deleting": false 15 | } 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "%run /01_Authenticate_to_Purview_AML" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": { 27 | "collapsed": true, 28 | "jupyter": { 29 | "outputs_hidden": false, 30 | "source_hidden": false 31 | }, 32 | "nteract": { 33 | "transient": { 34 | "deleting": false 35 | } 36 | } 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "%run /02_Create_ML_Lineage_Types" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": { 47 | "collapsed": true, 48 | "jupyter": { 49 | "outputs_hidden": false, 50 | "source_hidden": false 51 | }, 52 | "nteract": { 53 | "transient": { 54 | "deleting": false 55 | } 56 | } 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "%run /03_Create_ML_Lineage_Functions" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 2, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "import json" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "#update below variables with synapse adls name and container/filesystem name\n", 79 | "data_lake_account_name = \"\"\n", 80 | "file_system_name = \"data\"" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "metadata": { 87 | "jupyter": { 88 | "outputs_hidden": false, 89 | "source_hidden": false 90 | }, 91 | "nteract": { 92 | "transient": { 93 | "deleting": false 94 | } 95 | } 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "import pandas as pd\n", 100 | "import numpy as np\n", 101 | "\n", 102 | "synapse_base_path = 'abfss://' + file_system_name + '@' + data_lake_account_name + '.dfs.core.windows.net'\n", 103 | "df_borrower = spark.read.load(synapse_base_path+ '/creditriskdata/borrower.csv', format='csv', header=True).toPandas()\n", 104 | "#display(df_borrower.head(10))\n", 105 | "\n", 106 | "df_loan = spark.read.load(synapse_base_path + '/creditriskdata/loan.csv', format='csv', header=True).toPandas()\n", 107 | "#display(df_loan.head(1))\n", 108 | "\n", 109 | "# Join data and do some transformations\n", 110 | "df_data = df_borrower.merge(df_loan,on='memberId',how='inner')\n", 111 | "df_data.shape\n", 112 | "\n", 113 | "df_sp = spark.createDataFrame(df_data)\n", 114 | "df_sp = df_sp.drop('loanStatus')\n", 115 | "\n", 116 | "df_sp.write.option('header', 'true').mode('overwrite').csv(synapse_base_path + '/creditriskdata/testdata/')\n", 117 | "\n", 118 | "df_data['homeOwnership'] = df_data['homeOwnership'].replace('nan', np.nan).fillna(0)\n", 119 | "df_data['isJointApplication'] = df_data['isJointApplication'].replace('nan', np.nan).fillna(0)\n", 120 | "\n", 121 | "drop_cols = ['memberId', 'loanId', 'date','grade']\n", 122 | "df_data = df_data.drop(drop_cols, axis=1)\n", 123 | "#df_data.dtypes" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 6, 129 | "metadata": { 130 | "collapsed": true, 131 | "jupyter": { 132 | "outputs_hidden": false, 133 | "source_hidden": false 134 | }, 135 | "nteract": { 136 | "transient": { 137 | "deleting": false 138 | } 139 | } 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "experimentname = \"CreditRiskExperiment\"" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": { 150 | "collapsed": true, 151 | "jupyter": { 152 | "outputs_hidden": 
false, 153 | "source_hidden": false 154 | }, 155 | "nteract": { 156 | "transient": { 157 | "deleting": false 158 | } 159 | } 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "#create an entity for prepated data\n", 164 | "data_ent_name = 'creditriskdata'\n", 165 | "create_data_entity_with_schema(df_data,data_ent_name,'custom_dataset')\n", 166 | "\n", 167 | "#create preprocess lineage \n", 168 | "\n", 169 | "syn_basepath = 'https://' + data_lake_account_name + '.dfs.core.windows.net/' + file_system_name + '/creditriskdata'\n", 170 | "purview_basepath = 'pyapacheatlas://'\n", 171 | "\n", 172 | "in_ent_qns = {syn_basepath + '/borrower.csv':'azure_datalake_gen2_path',syn_basepath + '/loan.csv':'azure_datalake_gen2_path'}\n", 173 | "out_ent_qns = {purview_basepath + data_ent_name:'custom_dataset'}\n", 174 | "\n", 175 | "processname = '-preprocess'\n", 176 | "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=True)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 8, 182 | "metadata": { 183 | "collapsed": true, 184 | "jupyter": { 185 | "outputs_hidden": false, 186 | "source_hidden": false 187 | }, 188 | "nteract": { 189 | "transient": { 190 | "deleting": false 191 | } 192 | } 193 | }, 194 | "outputs": [], 195 | "source": [ 196 | "from azureml.core.experiment import Experiment\n", 197 | "from azureml.train.automl.run import AutoMLRun\n", 198 | "from azureml.train.automl import AutoMLConfig\n", 199 | "\n", 200 | "##run only once\n", 201 | "experiment = Experiment(ws, experimentname)\n", 202 | "\n", 203 | "automl_classifier_config = AutoMLConfig(\n", 204 | " task='classification', \n", 205 | " enable_early_stopping = True, \n", 206 | " iterations = 2, \n", 207 | " experiment_timeout_minutes=15,\n", 208 | " primary_metric='AUC_weighted',\n", 209 | " training_data= df_data,\n", 210 | " #compute = 'local',\n", 211 | " label_column_name='loanStatus',\n", 212 | " n_cross_validations=5,\n", 213 | " model_explainability=True,\n", 214 | " enable_onnx_compatible_models=True,\n", 215 | " enable_voting_ensemble=False,\n", 216 | " enable_stack_ensemble=False\n", 217 | " )\n", 218 | "local_run = experiment.submit(automl_classifier_config, show_output=True)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 10, 224 | "metadata": { 225 | "collapsed": true, 226 | "jupyter": { 227 | "outputs_hidden": false, 228 | "source_hidden": false 229 | }, 230 | "nteract": { 231 | "transient": { 232 | "deleting": false 233 | } 234 | } 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "# get experiment run, get the best model and register\n", 239 | "\n", 240 | "from azureml.core.experiment import Experiment\n", 241 | "from azureml.core.workspace import Workspace\n", 242 | "from azureml.train.automl.run import AutoMLRun\n", 243 | "from azureml.train.automl import AutoMLConfig\n", 244 | "from azureml.core.model import Model\n", 245 | "import joblib\n", 246 | "\n", 247 | "# get experiment run, get the best model and register\n", 248 | "experimentname = \"CreditRiskExperiment\"\n", 249 | "\n", 250 | "for automl_run in ws.experiments[experimentname].get_runs():\n", 251 | " best_run, fitted_model = automl_run.get_output() # We are taking the first run. 
You can update this if you'd like to take a different run\n", 252 | "    break\n", 253 | "\n", 254 | "#save the model to a local file\n", 255 | "model_path = 'creditrisk_model'\n", 256 | "joblib.dump(fitted_model, model_path)\n", 257 | "\n", 258 | "model_name = \"creditrisk_model\"\n", 259 | "registered_model = Model.register(model_path = model_path, # this points to a local file\n", 260 | "                       model_name = model_name, # name the model is registered as\n", 261 | "                       tags = {'type': \"classification\"}, \n", 262 | "                       description = \"Credit Risk Classifier\", \n", 263 | "                       workspace = ws)\n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 11, 269 | "metadata": { 270 | "collapsed": true, 271 | "jupyter": { 272 | "outputs_hidden": false, 273 | "source_hidden": false 274 | }, 275 | "nteract": { 276 | "transient": { 277 | "deleting": false 278 | } 279 | } 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "#create packages entities\n", 284 | "#[programming_language,package_name,version,notes]\n", 285 | "packageslist = [['python','mmlspark','v0.0.11','older versions before 0.0.10 give error'],\n", 286 | "                ['python','scikit-learn','0.22rc2.post1','latest version 0.24.x gives error if you call the model from Azure Function']]\n", 287 | "create_package_entities(experimentname,packageslist)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 12, 293 | "metadata": { 294 | "collapsed": true, 295 | "jupyter": { 296 | "outputs_hidden": false, 297 | "source_hidden": false 298 | }, 299 | "nteract": { 300 | "transient": { 301 | "deleting": false 302 | } 303 | } 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "#create experiment train lineage\n", 308 | "create_experiment_config_entity(ws,experimentname,automl_run)\n", 309 | "create_model_entity(ws,experimentname,model_name)\n", 310 | "create_model_metrics_entity(experimentname,best_run)\n", 311 | "\n", 312 | "pbasepath = 'pyapacheatlas://'\n", 313 | "\n", 314 | "in_ent_qns = {pbasepath + data_ent_name:'custom_dataset',pbasepath + experimentname + \"-config\":'custom_ml_exp_config',pbasepath + experimentname + '-packages':'custom_ml_packages'}\n", 315 | "out_ent_qns = {pbasepath + model_name:'custom_ml_model',pbasepath + experimentname + \"-modelmetrics\":'custom_ml_model_metrics'}\n", 316 | "\n", 317 | "processname = '-train'\n", 318 | "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=False)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 13, 324 | "metadata": { 325 | "collapsed": true, 326 | "jupyter": { 327 | "outputs_hidden": false, 328 | "source_hidden": false 329 | }, 330 | "nteract": { 331 | "transient": { 332 | "deleting": false 333 | } 334 | } 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "scoring_script = \"\"\"\n", 339 | "import json\n", 340 | "import pickle\n", 341 | "import numpy as np\n", 342 | "import pandas as pd\n", 343 | "import azureml.train.automl\n", 344 | "from sklearn.externals import joblib\n", 345 | "from azureml.core.model import Model\n", 346 | "\n", 347 | "def init():\n", 348 | "    global model\n", 349 | "    # This name is the model.id of the model we want to deploy; deserialize the model file back\n", 350 | "    model_path = Model.get_model_path(model_name = 'creditrisk_model')\n", 351 | "    model = joblib.load(model_path)\n", 352 | "\n", 353 | "def run(input_json): \n", 354 | "    try:\n", 355 | "        data_df = pd.read_json(input_json) \n", 356 | "        # Get the predictions...\n", 357 | "        prediction = 
model.predict(data_df)\n", 358 | " prediction = json.dumps(prediction.tolist())\n", 359 | " except Exception as e:\n", 360 | " prediction = str(e)\n", 361 | " return prediction\n", 362 | "\"\"\"\n", 363 | "exec(scoring_script)\n", 364 | "with open(\"scoring_script.py\", \"w\") as file:\n", 365 | " file.write(scoring_script)\n", 366 | " \n", 367 | "scoring_script_file_name = 'scoring_script.py'\n", 368 | "\n", 369 | "#test locally\n", 370 | "import numpy as np\n", 371 | "# X_test = spark.sql('select * from default.creditrisk_data limit 20').toPandas()\n", 372 | "drop_cols = ['loanStatus']\n", 373 | "X_test = df_data.drop(drop_cols, axis=1)\n", 374 | "X_test = X_test.head(1)\n", 375 | "json_test_data = X_test.to_json(orient='records')\n", 376 | "print(json_test_data)\n", 377 | "init()\n", 378 | "run(json_test_data)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 14, 384 | "metadata": { 385 | "collapsed": true, 386 | "jupyter": { 387 | "outputs_hidden": false, 388 | "source_hidden": false 389 | }, 390 | "nteract": { 391 | "transient": { 392 | "deleting": false 393 | } 394 | } 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "# obtain conda dependencies from the automl run and save the file locally\n", 399 | "from azureml.core import Environment\n", 400 | "environment_config_file = 'creditrisk_conda_env.yml'\n", 401 | "best_run.download_file('outputs/conda_env_v_1_0_0.yml', environment_config_file)\n", 402 | "# with open('creditrisk_conda_env.yml', 'r') as f:\n", 403 | "# print(f.read())\n", 404 | "\n", 405 | "# create the environment based on the saved conda dependencies file\n", 406 | "myenv = Environment.from_conda_specification(name=\"creditriskenv\", file_path=environment_config_file)\n", 407 | "myenv.register(workspace=ws)\n", 408 | "\n", 409 | "from azureml.core.model import InferenceConfig\n", 410 | "from azureml.core.webservice import AciWebservice\n", 411 | "from azureml.core.webservice import Webservice\n", 412 | "\n", 413 | "# Configure and deploy the web service to Azure Container Instances\n", 414 | "inference_config = InferenceConfig(environment=myenv, entry_script=scoring_script_file_name)\n", 415 | "aci_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb= 2, tags = { 'type' : 'automl-classification'}, description='AutoML Credit Risk Classifier Service')\n", 416 | "aci_service_name = 'creditrisk-automl-service'\n", 417 | "aci_service = Model.deploy(ws, aci_service_name, [registered_model], inference_config, aci_config)\n", 418 | "aci_service.wait_for_deployment(show_output = True)\n", 419 | "print(aci_service.state)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 16, 425 | "metadata": { 426 | "collapsed": true, 427 | "jupyter": { 428 | "outputs_hidden": false, 429 | "source_hidden": false 430 | }, 431 | "nteract": { 432 | "transient": { 433 | "deleting": false 434 | } 435 | } 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "aci_service_name = 'creditrisk-automl-service'\n", 440 | "create_model_service_entity(ws,experimentname,aci_service_name,json_test_data)\n", 441 | "\n", 442 | "pbasepath = 'pyapacheatlas://'\n", 443 | "\n", 444 | "in_ent_qns = {pbasepath + model_name:'custom_ml_model'}\n", 445 | "out_ent_qns = {pbasepath + experimentname + \"-model_endpoint\":'custom_ml_model_endpoint'}\n", 446 | "\n", 447 | "processname = '-deploymodel'\n", 448 | "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=False)" 449 | ] 450 | }, 451 | { 452 | 
"cell_type": "code", 453 | "execution_count": 17, 454 | "metadata": { 455 | "jupyter": { 456 | "outputs_hidden": false, 457 | "source_hidden": false 458 | }, 459 | "nteract": { 460 | "transient": { 461 | "deleting": false 462 | } 463 | } 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "#batch inferencing\n", 468 | "df_test = spark.read.load(synapse_base_path +'/creditriskdata/testdata', format='csv', header=True).toPandas()\n", 469 | "\n", 470 | "drop_cols = ['memberId', 'loanId', 'date','grade']\n", 471 | "df_test1 = df_test.drop(drop_cols, axis=1)\n", 472 | "\n", 473 | "model_path = Model.get_model_path(model_name = 'creditrisk_model')\n", 474 | "model = joblib.load(model_path)\n", 475 | "\n", 476 | "prediction = model.predict(df_test1)\n", 477 | "prediction\n", 478 | "\n", 479 | "df_result = df_test \n", 480 | "df_result['prediction'] = prediction\n", 481 | "df_result\n", 482 | "\n", 483 | "data_lake_account_name = 'purviewaccdl'\n", 484 | "file_system_name = 'purviewaccfs'\n", 485 | "df_sp = spark.createDataFrame(df_result)\n", 486 | "df_sp.write.option('header', 'true').mode('overwrite').csv(synapse_base_path + '/creditriskdata/batchpredictions/')\n", 487 | "\n", 488 | "df_sp.write.mode(\"overwrite\").saveAsTable(\"default.creditrisk_predictions\")" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 22, 494 | "metadata": { 495 | "collapsed": true, 496 | "jupyter": { 497 | "outputs_hidden": false, 498 | "source_hidden": false 499 | }, 500 | "nteract": { 501 | "transient": { 502 | "deleting": false 503 | } 504 | } 505 | }, 506 | "outputs": [], 507 | "source": [ 508 | "#create an entity for test data\n", 509 | "test_data_ent_name = 'creditrisktestdata'\n", 510 | "create_data_entity_with_schema(df_test,test_data_ent_name,entitytype='custom_dataset')\n", 511 | "\n", 512 | "#create an entity for batch inference data\n", 513 | "batchpred_data_ent_name = 'creditriskbatchpredictions'\n", 514 | "create_data_entity_with_schema(df_result,batchpred_data_ent_name,entitytype='custom_dataset')\n", 515 | "\n", 516 | "#create batch inference lineage \n", 517 | "syn_basepath = 'https://' + data_lake_account_name + 'dfs.core.windows.net' + file_system_name + '/'\n", 518 | "pbasepath = 'pyapacheatlas://'\n", 519 | "\n", 520 | "in_ent_qns = {pbasepath + test_data_ent_name:'custom_dataset',pbasepath + model_name:'custom_ml_model'}\n", 521 | "out_ent_qns = {pbasepath + batchpred_data_ent_name:'custom_dataset'}\n", 522 | "\n", 523 | "processname = '-batchinference'\n", 524 | "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=True)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "## uncomment below code to link PowerBI Dataset and Report in lineage if you have access to a PBI workspace " 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 23, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "# #The PowerBI entities will populate with more details if you set up a scan for PBI workspaces in Purview\n", 543 | "# #We are only creating placeholders and links for lineage below\n", 544 | "\n", 545 | "# #create PowerBI dataset entity and lineage \n", 546 | "# pbi_workspace = '' #'https://xxx.powerbi.com/groups/7c555287-f9b8-45ff-be6c-9909afe9df40'\n", 547 | "# pbi_datasetid = '' #'c4a30c22-466d-4a30-a1ac-8736ed6567cc' \n", 548 | "\n", 549 | "# pbidata_ent_name = 'creditriskpbidataset' \n", 550 | "\n", 
551 | "# create_powerbi_dataset_and_lineage(experimentname,pbi_workspace,pbi_datasetid,pbidata_ent_name,batchpred_data_ent_name,'custom_dataset')" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 24, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "# #create PowerBI report entity and lineage\n", 561 | "# pbi_reportid = '' #'e495453d-6c0c-4fb9-bdc4-556319f6a57b'\n", 562 | "# pbi_ent_name = 'creditriskpbireport'\n", 563 | " \n", 564 | "# create_powerbi_report_and_lineage(experimentname,pbi_workspace,pbi_reportid,pbi_ent_name,pbi_datasetid)" 565 | ] 566 | } 567 | ], 568 | "metadata": { 569 | "kernelspec": { 570 | "display_name": "Python 3", 571 | "language": "python", 572 | "name": "python3" 573 | }, 574 | "language_info": { 575 | "codemirror_mode": { 576 | "name": "ipython", 577 | "version": 3 578 | }, 579 | "file_extension": ".py", 580 | "mimetype": "text/x-python", 581 | "name": "python", 582 | "nbconvert_exporter": "python", 583 | "pygments_lexer": "ipython3", 584 | "version": "3.7.4" 585 | }, 586 | "save_output": true 587 | }, 588 | "nbformat": 4, 589 | "nbformat_minor": 2 590 | } 591 | --------------------------------------------------------------------------------