├── .gitignore ├── AMLNotebooks ├── 01_Create_CreditRisk_AML_Pipeline.ipynb ├── 02_Create_CreditRisk_AML_Pipeline_Lineage.ipynb ├── Authenticate_to_Purview_AML.py ├── Create_ML_Lineage_Functions.py ├── Create_ML_Lineage_Types.py └── Data │ ├── borrower.csv │ └── loan.csv ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Data ├── borrower.csv └── loan.csv ├── Deployment ├── deploy.json ├── img │ ├── ADLSGen2Scanning.PNG │ ├── AMLPipeline.PNG │ ├── AMLPipelineLineage.PNG │ ├── Architecture.PNG │ ├── MLLineageScreenshot.PNG │ ├── ManageSparkPool.png │ ├── PurviewMLLineageIntroduction.PNG │ ├── PurviewMLLineageSolutionAccelerator.PNG │ ├── PurviewScreenshot.png │ ├── Requirements.png │ ├── add-role-assignment-page.png │ └── deploy-firewall.png └── requirements.txt ├── LICENSE ├── NOTICE.txt ├── PRIVACY.md ├── README.md ├── SECURITY.md ├── SUPPORT.md └── SynapseNotebooks ├── 01_Authenticate_to_Purview_AML.ipynb ├── 02_Create_ML_Lineage_Types.ipynb ├── 03_Create_ML_Lineage_Functions.ipynb └── 04_Create_CreditRisk_Experiment.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | *.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 | *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # TeamCity is a build add-in 131 | _TeamCity* 132 | 133 | # DotCover is a Code Coverage 
Tool 134 | *.dotCover 135 | 136 | # AxoCover is a Code Coverage Tool 137 | .axoCover/* 138 | !.axoCover/settings.json 139 | 140 | # Visual Studio code coverage results 141 | *.coverage 142 | *.coveragexml 143 | 144 | # NCrunch 145 | _NCrunch_* 146 | .*crunch*.local.xml 147 | nCrunchTemp_* 148 | 149 | # MightyMoose 150 | *.mm.* 151 | AutoTest.Net/ 152 | 153 | # Web workbench (sass) 154 | .sass-cache/ 155 | 156 | # Installshield output folder 157 | [Ee]xpress/ 158 | 159 | # DocProject is a documentation generator add-in 160 | DocProject/buildhelp/ 161 | DocProject/Help/*.HxT 162 | DocProject/Help/*.HxC 163 | DocProject/Help/*.hhc 164 | DocProject/Help/*.hhk 165 | DocProject/Help/*.hhp 166 | DocProject/Help/Html2 167 | DocProject/Help/html 168 | 169 | # Click-Once directory 170 | publish/ 171 | 172 | # Publish Web Output 173 | *.[Pp]ublish.xml 174 | *.azurePubxml 175 | # Note: Comment the next line if you want to checkin your web deploy settings, 176 | # but database connection strings (with potential passwords) will be unencrypted 177 | *.pubxml 178 | *.publishproj 179 | 180 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 181 | # checkin your Azure Web App publish settings, but sensitive information contained 182 | # in these scripts will be unencrypted 183 | PublishScripts/ 184 | 185 | # NuGet Packages 186 | *.nupkg 187 | # NuGet Symbol Packages 188 | *.snupkg 189 | # The packages folder can be ignored because of Package Restore 190 | **/[Pp]ackages/* 191 | # except build/, which is used as an MSBuild target. 192 | !**/[Pp]ackages/build/ 193 | # Uncomment if necessary however generally it will be regenerated when needed 194 | #!**/[Pp]ackages/repositories.config 195 | # NuGet v3's project.json files produces more ignorable files 196 | *.nuget.props 197 | *.nuget.targets 198 | 199 | # Microsoft Azure Build Output 200 | csx/ 201 | *.build.csdef 202 | 203 | # Microsoft Azure Emulator 204 | ecf/ 205 | rcf/ 206 | 207 | # Windows Store app package directories and files 208 | AppPackages/ 209 | BundleArtifacts/ 210 | Package.StoreAssociation.xml 211 | _pkginfo.txt 212 | *.appx 213 | *.appxbundle 214 | *.appxupload 215 | 216 | # Visual Studio cache files 217 | # files ending in .cache can be ignored 218 | *.[Cc]ache 219 | # but keep track of directories ending in .cache 220 | !?*.[Cc]ache/ 221 | 222 | # Others 223 | ClientBin/ 224 | ~$* 225 | *~ 226 | *.dbmdl 227 | *.dbproj.schemaview 228 | *.jfm 229 | *.pfx 230 | *.publishsettings 231 | orleans.codegen.cs 232 | 233 | # Including strong name files can present a security risk 234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 235 | #*.snk 236 | 237 | # Since there are multiple workflows, uncomment next line to ignore bower_components 238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 239 | #bower_components/ 240 | 241 | # RIA/Silverlight projects 242 | Generated_Code/ 243 | 244 | # Backup & report files from converting an old project file 245 | # to a newer Visual Studio version. 
Backup files are not needed, 246 | # because we have git ;-) 247 | _UpgradeReport_Files/ 248 | Backup*/ 249 | UpgradeLog*.XML 250 | UpgradeLog*.htm 251 | ServiceFabricBackup/ 252 | *.rptproj.bak 253 | 254 | # SQL Server files 255 | *.mdf 256 | *.ldf 257 | *.ndf 258 | 259 | # Business Intelligence projects 260 | *.rdl.data 261 | *.bim.layout 262 | *.bim_*.settings 263 | *.rptproj.rsuser 264 | *- [Bb]ackup.rdl 265 | *- [Bb]ackup ([0-9]).rdl 266 | *- [Bb]ackup ([0-9][0-9]).rdl 267 | 268 | # Microsoft Fakes 269 | FakesAssemblies/ 270 | 271 | # GhostDoc plugin setting file 272 | *.GhostDoc.xml 273 | 274 | # Node.js Tools for Visual Studio 275 | .ntvs_analysis.dat 276 | node_modules/ 277 | 278 | # Visual Studio 6 build log 279 | *.plg 280 | 281 | # Visual Studio 6 workspace options file 282 | *.opt 283 | 284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 285 | *.vbw 286 | 287 | # Visual Studio LightSwitch build output 288 | **/*.HTMLClient/GeneratedArtifacts 289 | **/*.DesktopClient/GeneratedArtifacts 290 | **/*.DesktopClient/ModelManifest.xml 291 | **/*.Server/GeneratedArtifacts 292 | **/*.Server/ModelManifest.xml 293 | _Pvt_Extensions 294 | 295 | # Paket dependency manager 296 | .paket/paket.exe 297 | paket-files/ 298 | 299 | # FAKE - F# Make 300 | .fake/ 301 | 302 | # CodeRush personal settings 303 | .cr/personal 304 | 305 | # Python Tools for Visual Studio (PTVS) 306 | __pycache__/ 307 | *.pyc 308 | 309 | # Cake - Uncomment if you are using it 310 | # tools/** 311 | # !tools/packages.config 312 | 313 | # Tabs Studio 314 | *.tss 315 | 316 | # Telerik's JustMock configuration file 317 | *.jmconfig 318 | 319 | # BizTalk build output 320 | *.btp.cs 321 | *.btm.cs 322 | *.odx.cs 323 | *.xsd.cs 324 | 325 | # OpenCover UI analysis results 326 | OpenCover/ 327 | 328 | # Azure Stream Analytics local run output 329 | ASALocalRun/ 330 | 331 | # MSBuild Binary and Structured Log 332 | *.binlog 333 | 334 | # NVidia Nsight GPU debugger configuration file 335 | *.nvuser 336 | 337 | # MFractors (Xamarin productivity tool) working folder 338 | .mfractor/ 339 | 340 | # Local History for Visual Studio 341 | .localhistory/ 342 | 343 | # BeatPulse healthcheck temp database 344 | healthchecksdb 345 | 346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 347 | MigrationBackup/ 348 | 349 | # Ionide (cross platform F# VS Code tools) working folder 350 | .ionide/ 351 | -------------------------------------------------------------------------------- /AMLNotebooks/01_Create_CreditRisk_AML_Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import azureml.core\n", 10 | "from azureml.core import Workspace\n", 11 | "\n", 12 | "ws = Workspace.from_config()\n", 13 | "\n", 14 | "# Get the default datastore\n", 15 | "default_ds = ws.get_default_datastore()\n", 16 | "\n", 17 | "default_ds.upload_files(files=['./Data/borrower.csv', './Data/loan.csv'], # Upload the diabetes csv files in /data\n", 18 | " target_path='creditrisk-data/', # Put it in a folder path in the datastore\n", 19 | " overwrite=True, # Replace existing files of the same name\n", 20 | " show_progress=True)\n", 21 | "\n", 22 | "#Create a Tabular dataset from the path on the datastore\n", 23 | "from azureml.core import Dataset\n", 24 | "\n", 25 | "tab_data_set = 
Dataset.Tabular.from_delimited_files(path=(default_ds, 'creditrisk-data/borrower.csv'))\n", 26 | "\n", 27 | "tab_data_set = tab_data_set.register(workspace=ws,\n", 28 | " name='BorrowerData',\n", 29 | " description='Borrower Data',\n", 30 | " tags = {'format':'CSV'},\n", 31 | " create_new_version=True)\n", 32 | "\n", 33 | "#Create a Tabular dataset from the path on the datastore\n", 34 | "tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'creditrisk-data/loan.csv'))\n", 35 | "\n", 36 | "tab_data_set = tab_data_set.register(workspace=ws,\n", 37 | " name='LoanData',\n", 38 | " description='Loans Data',\n", 39 | " tags = {'format':'CSV'},\n", 40 | " create_new_version=True)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "from azureml.core import Workspace, Dataset, Datastore, ScriptRunConfig, Experiment\n", 50 | "from azureml.data.data_reference import DataReference\n", 51 | "import os\n", 52 | "import azureml.dataprep as dprep\n", 53 | "import pandas as pd\n", 54 | "import numpy as np\n", 55 | "\n", 56 | "import azureml.core\n", 57 | "from azureml.core import Workspace\n", 58 | "\n", 59 | "ws = Workspace.from_config()\n", 60 | "\n", 61 | "borrowerData = Dataset.get_by_name(ws, name='BorrowerData')\n", 62 | "loanData = Dataset.get_by_name(ws, name='LoanData')" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "from azureml.core import Datastore\n", 72 | "from azureml.core.compute import AmlCompute, ComputeTarget\n", 73 | "\n", 74 | "datastore = ws.get_default_datastore()\n", 75 | "\n", 76 | "# Create a compute cluster\n", 77 | "compute_name = 'cpu-cluster'\n", 78 | "if not compute_name in ws.compute_targets :\n", 79 | " print('creating a new compute target...')\n", 80 | " provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS2_V2',\n", 81 | " min_nodes=0,\n", 82 | " max_nodes=1)\n", 83 | " compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)\n", 84 | "\n", 85 | " compute_target.wait_for_completion(\n", 86 | " show_output=True, min_node_count=None, timeout_in_minutes=20)\n", 87 | "\n", 88 | " # Show the result\n", 89 | " print(compute_target.get_status().serialize())\n", 90 | "\n", 91 | "compute_target = ws.compute_targets[compute_name]" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "from azureml.core.runconfig import RunConfiguration\n", 101 | "\n", 102 | "from azureml.core import Environment\n", 103 | "from azureml.core.conda_dependencies import CondaDependencies\n", 104 | "\n", 105 | "# Create a Python environment for the experiment\n", 106 | "creditrisk_env = Environment(\"creditrisk-pipeline-env\")\n", 107 | "creditrisk_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies\n", 108 | "creditrisk_env.docker.enabled = True # Use a docker container\n", 109 | "\n", 110 | "# Create a set of package dependencies\n", 111 | "creditrisk_packages = CondaDependencies.create(conda_packages=['scikit-learn','joblib','pandas','numpy','pip'],\n", 112 | " pip_packages=['azureml-defaults','azureml-dataprep[pandas]'])\n", 113 | "\n", 114 | "\n", 115 | "# Add the dependencies to the environment\n", 116 | "creditrisk_env.python.conda_dependencies = creditrisk_packages\n", 117 | "\n", 118 | "# Register the environment \n", 119 | 
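"# (registration stores the environment in the workspace so it can be retrieved later by name with Environment.get)\n",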
"creditrisk_env.register(workspace=ws)\n", 120 | "registered_env = Environment.get(ws, 'creditrisk-pipeline-env')\n", 121 | "\n", 122 | "# Create a new runconfig object for the pipeline\n", 123 | "aml_run_config = RunConfiguration()\n", 124 | "\n", 125 | "# Use the compute you created above. \n", 126 | "aml_run_config.target = compute_target\n", 127 | "\n", 128 | "# Assign the environment to the run configuration\n", 129 | "aml_run_config.environment = registered_env\n", 130 | "\n", 131 | "print (\"Run configuration created.\")" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "%%writefile PrepareData.py\n", 141 | "from azureml.core import Run\n", 142 | "\n", 143 | "import pandas as pd \n", 144 | "import numpy as np \n", 145 | "import argparse\n", 146 | "\n", 147 | "parser = argparse.ArgumentParser()\n", 148 | "parser.add_argument('--prepared_data', dest='prepared_data', required=True)\n", 149 | "args = parser.parse_args()\n", 150 | " \n", 151 | "borrowerData = Run.get_context().input_datasets['BorrowerData']\n", 152 | "loanData = Run.get_context().input_datasets['LoanData']\n", 153 | "\n", 154 | "df_borrower = borrowerData.to_pandas_dataframe()\n", 155 | "df_loan = loanData.to_pandas_dataframe()\n", 156 | "\n", 157 | "# Join data and do some transformations\n", 158 | "df_data = df_borrower.merge(df_loan,on='memberId',how='inner')\n", 159 | "df_data.shape\n", 160 | "\n", 161 | "df_data['homeOwnership'] = df_data['homeOwnership'].replace('nan', np.nan).fillna(0)\n", 162 | "df_data['isJointApplication'] = df_data['isJointApplication'].replace('nan', np.nan).fillna(0)\n", 163 | "\n", 164 | "drop_cols = ['memberId', 'loanId', 'date','grade','residentialState']\n", 165 | "df_data = df_data.drop(drop_cols, axis=1)\n", 166 | "\n", 167 | "df_data['loanStatus'] = np.where(df_data['loanStatus'] == 'Default', 1, 0) # change label column to 0/1\n", 168 | "\n", 169 | "df_data.to_csv(os.path.join(args.prepared_data,\"prepared_data.csv\"),index=False)\n", 170 | "\n", 171 | "print(f\"Wrote prepped data to {args.prepared_data}/prepared_data.csv\")" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "from azureml.data import OutputFileDatasetConfig\n", 181 | "from azureml.pipeline.steps import PythonScriptStep\n", 182 | "\n", 183 | "prepared_data = OutputFileDatasetConfig(name=\"prepared_data\")\n", 184 | "\n", 185 | "dataprep_step = PythonScriptStep(\n", 186 | " name=\"PrepareData\", \n", 187 | " script_name=\"PrepareData.py\", \n", 188 | " compute_target=compute_target, \n", 189 | " runconfig=aml_run_config,\n", 190 | " arguments=[\"--prepared_data\", prepared_data],\n", 191 | " inputs=[borrowerData.as_named_input('BorrowerData'),loanData.as_named_input('LoanData')],\n", 192 | " allow_reuse=True\n", 193 | ")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "# prepared_data = prepared_data_path.read_delimited_files()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "%%writefile TrainTestDataSplit.py\n", 212 | "from azureml.core import Run\n", 213 | "\n", 214 | "import pandas as pd \n", 215 | "import numpy as np \n", 216 | "import argparse\n", 217 | "\n", 218 | "parser = argparse.ArgumentParser()\n", 219 | 
"parser.add_argument('--prepared_data', dest='prepared_data', required=True)\n", 220 | "parser.add_argument('--train_data', dest='train_data', required=True)\n", 221 | "parser.add_argument('--test_data', dest='test_data', required=True)\n", 222 | "args = parser.parse_args()\n", 223 | "\n", 224 | "df_data = pd.read_csv(args.prepared_data + '/prepared_data.csv')\n", 225 | "\n", 226 | "df_train=df_data.sample(frac=0.8,random_state=200) #random state is a seed value\n", 227 | "df_train=df_data.drop(df_train.index)\n", 228 | "\n", 229 | "df_train.to_csv(os.path.join(args.train_data,\"train_data.csv\"),index=False)\n", 230 | "df_train.to_csv(os.path.join(args.test_data,\"test_data.csv\"),index=False)\n", 231 | "\n", 232 | "print(f\"Wrote prepped data to {args.train_data}/train_data.csv\")\n", 233 | "print(f\"Wrote prepped data to {args.test_data}/test_data.csv\")" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "# test train split the data \n", 243 | "train_data = OutputFileDatasetConfig(name=\"train_data\")\n", 244 | "test_data = OutputFileDatasetConfig(name=\"test_data\")\n", 245 | "\n", 246 | "test_train_step = PythonScriptStep(name = \"TestTrainDataSplit\",\n", 247 | " script_name =\"TrainTestDataSplit.py\",\n", 248 | " arguments = [\"--prepared_data\", prepared_data.as_input(),\n", 249 | " \"--train_data\", train_data,\n", 250 | " \"--test_data\", test_data],\n", 251 | " outputs = [train_data,test_data],\n", 252 | " compute_target = compute_target, \n", 253 | " runconfig = aml_run_config, \n", 254 | " allow_reuse = True\n", 255 | " )" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "training_data = train_data.read_delimited_files()\n", 265 | "training_data\n", 266 | "\n", 267 | "testing_data = test_data.read_delimited_files()\n", 268 | "testing_data" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "%%writefile TrainModel.py\n", 278 | "\n", 279 | "from azureml.core import Run\n", 280 | "from azureml.core.model import Model\n", 281 | "import joblib\n", 282 | "\n", 283 | "import pandas as pd \n", 284 | "import numpy as np \n", 285 | "import argparse\n", 286 | "\n", 287 | "from sklearn.linear_model import LogisticRegression\n", 288 | "\n", 289 | "import pandas as pd\n", 290 | "import numpy as np\n", 291 | "from sklearn.preprocessing import OneHotEncoder\n", 292 | "from sklearn.impute import SimpleImputer\n", 293 | "\n", 294 | "def creditrisk_onehot_encoder(df_data):\n", 295 | " catColumns = df_data.select_dtypes(['object']).columns\n", 296 | " df_data[catColumns] = df_data[catColumns].fillna(value='Unknown')\n", 297 | " \n", 298 | " df_data = df_data.fillna(df_data.mean())\n", 299 | " OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)\n", 300 | " OH_cols= pd.DataFrame(OH_encoder.fit_transform(df_data[catColumns]),columns = list(OH_encoder.get_feature_names(catColumns)))\n", 301 | " \n", 302 | " # Remove categorical columns (will replace with one-hot encoding)\n", 303 | " numeric_cols = df_data.drop(catColumns, axis=1)\n", 304 | " \n", 305 | " # Add one-hot encoded columns to numerical features\n", 306 | " df_result = pd.concat([numeric_cols, OH_cols], axis=1)\n", 307 | " \n", 308 | " # impute missing numeric values with mean\n", 309 | " fill_NaN = 
SimpleImputer(missing_values=np.nan, strategy='mean')\n", 310 | " imputed_df = pd.DataFrame(fill_NaN.fit_transform(df_result))\n", 311 | " imputed_df.columns = df_result.columns\n", 312 | " imputed_df.index = df_result.index\n", 313 | " df_result = imputed_df\n", 314 | "\n", 315 | " return(df_result)\n", 316 | "\n", 317 | "# Get the experiment run context\n", 318 | "run = Run.get_context()\n", 319 | "\n", 320 | "parser = argparse.ArgumentParser()\n", 321 | "parser.add_argument('--train_data', dest='train_data', required=True)\n", 322 | "parser.add_argument('--test_data', dest='test_data', required=True)\n", 323 | "parser.add_argument('--metrics_data', dest='metrics_data', required=True)\n", 324 | "parser.add_argument('--model_data', dest='model_data', required=True)\n", 325 | "args = parser.parse_args()\n", 326 | "\n", 327 | "df_train = pd.read_csv(args.train_data + '/train_data.csv')\n", 328 | "df_test = pd.read_csv(args.test_data + '/test_data.csv')\n", 329 | "\n", 330 | "df_train = creditrisk_onehot_encoder(df_train)\n", 331 | "df_test = creditrisk_onehot_encoder(df_test)\n", 332 | "\n", 333 | "cols = [col for col in df_train.columns if col not in [\"loanStatus\"]]\n", 334 | "\n", 335 | "clf = LogisticRegression()\n", 336 | "clf.fit(df_train[cols].values, df_train[\"loanStatus\"].values)\n", 337 | "\n", 338 | "print('predicting ...')\n", 339 | "y_hat = clf.predict(df_test[cols].astype(int).values)\n", 340 | "\n", 341 | "acc = np.average(y_hat == df_test[\"loanStatus\"].values)\n", 342 | "print('Accuracy is', acc)\n", 343 | "\n", 344 | "print(\"save model\")\n", 345 | "os.makedirs('models', exist_ok=True) \n", 346 | "joblib.dump(value=clf, filename= 'models/creditrisk_model.pkl')\n", 347 | "\n", 348 | "model = Model.register(model_path = 'models/creditrisk_model.pkl',\n", 349 | " model_name = 'creditrisk_model',\n", 350 | " description = 'creditrisk model',\n", 351 | " workspace = run.experiment.workspace,\n", 352 | " properties={'Accuracy': np.float(acc)})\n", 353 | "\n", 354 | "modeldata = []\n", 355 | "modeldata.append(('models/creditrisk_model.pkl','creditrisk_model'))\n", 356 | "df_model = pd.DataFrame(modeldata, columns=('modelfile', 'model_name'))\n", 357 | "\n", 358 | "metricsdata = []\n", 359 | "metricsdata.append(('Accuracy',acc))\n", 360 | "df_metrics = pd.DataFrame(metricsdata, columns=('Metric', 'Value'))\n", 361 | "\n", 362 | "df_model.to_csv(os.path.join(args.model_data,\"model_data.csv\"),index=False)\n", 363 | "df_metrics.to_csv(os.path.join(args.metrics_data,\"metrics_data.csv\"),index=False)\n", 364 | "\n", 365 | "print(f\"Wrote model data to {args.model_data}/model_data.csv\")\n", 366 | "print(f\"Wrote metrics data to {args.metrics_data}/metrics_data.csv\")\n" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "# train the model\n", 376 | "model_data = OutputFileDatasetConfig(name=\"model_data\")\n", 377 | "metrics_data = OutputFileDatasetConfig(name=\"metrics_data\")\n", 378 | "\n", 379 | "train_step = PythonScriptStep(name = \"TrainModel\",\n", 380 | " script_name =\"TrainModel.py\",\n", 381 | " arguments = [\"--train_data\", train_data.as_input(),\n", 382 | " \"--test_data\", test_data.as_input(),\n", 383 | " \"--model_data\", model_data,\n", 384 | " \"--metrics_data\", metrics_data],\n", 385 | " outputs = [model_data,metrics_data],\n", 386 | " compute_target = compute_target, \n", 387 | " runconfig = aml_run_config, \n", 388 | " allow_reuse = True\n", 389 | " )" 390 
| ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "%%writefile BatchInference.py\n", 399 | "from azureml.core import Run\n", 400 | "from azureml.core.model import Model\n", 401 | "import joblib\n", 402 | "\n", 403 | "import pandas as pd \n", 404 | "import numpy as np \n", 405 | "import argparse\n", 406 | "\n", 407 | "from sklearn.preprocessing import OneHotEncoder\n", 408 | "from sklearn.impute import SimpleImputer\n", 409 | "\n", 410 | "def creditrisk_onehot_encoder(df_data):\n", 411 | " catColumns = df_data.select_dtypes(['object']).columns\n", 412 | " df_data[catColumns] = df_data[catColumns].fillna(value='Unknown')\n", 413 | " \n", 414 | " df_data = df_data.fillna(df_data.mean())\n", 415 | " OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)\n", 416 | " OH_cols= pd.DataFrame(OH_encoder.fit_transform(df_data[catColumns]),columns = list(OH_encoder.get_feature_names(catColumns)))\n", 417 | " \n", 418 | " # Remove categorical columns (will replace with one-hot encoding)\n", 419 | " numeric_cols = df_data.drop(catColumns, axis=1)\n", 420 | " \n", 421 | " # Add one-hot encoded columns to numerical features\n", 422 | " df_result = pd.concat([numeric_cols, OH_cols], axis=1)\n", 423 | " \n", 424 | " # impute missing numeric values with mean\n", 425 | " fill_NaN = SimpleImputer(missing_values=np.nan, strategy='mean')\n", 426 | " imputed_df = pd.DataFrame(fill_NaN.fit_transform(df_result))\n", 427 | " imputed_df.columns = df_result.columns\n", 428 | " imputed_df.index = df_result.index\n", 429 | " df_result = imputed_df\n", 430 | "\n", 431 | " return(df_result)\n", 432 | "\n", 433 | "parser = argparse.ArgumentParser()\n", 434 | "parser.add_argument('--test_data', dest=\"test_data\", type=str, required=True)\n", 435 | "parser.add_argument('--model_data', dest=\"model_data\", type=str, required=True)\n", 436 | "parser.add_argument('--batchinfer_data', dest='batchinfer_data', required=True)\n", 437 | "\n", 438 | "args = parser.parse_args()\n", 439 | "\n", 440 | "# Get the experiment run context\n", 441 | "run = Run.get_context()\n", 442 | "\n", 443 | "df_model = pd.read_csv(args.model_data + '/model_data.csv')\n", 444 | "# model_path = Model.get_model_path(model_name = 'best_model_data')\n", 445 | "model_name = df_model['model_name'][0]\n", 446 | "\n", 447 | "model_path = Model.get_model_path(model_name=model_name, _workspace=run.experiment.workspace)\n", 448 | "model = joblib.load(model_path)\n", 449 | "\n", 450 | "df_test = pd.read_csv(args.test_data + '/test_data.csv')\n", 451 | "df_test = creditrisk_onehot_encoder(df_test)\n", 452 | "\n", 453 | "x_test = df_test.drop(['loanStatus'], axis=1)\n", 454 | "\n", 455 | "y_predict = model.predict(x_test)\n", 456 | "\n", 457 | "df_test['Prediction'] = y_predict\n", 458 | "\n", 459 | "df_test.to_csv(os.path.join(args.batchinfer_data,\"batchinfer_data.csv\"),index=False)\n", 460 | "\n", 461 | "print(f\"Wrote prediction data with to {args.batchinfer_data}/batchinfer_data.csv\")" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "from azureml.data import OutputFileDatasetConfig\n", 471 | "from azureml.pipeline.steps import PythonScriptStep\n", 472 | "\n", 473 | "batchinfer_data = OutputFileDatasetConfig(name=\"batchinfer_data\").register_on_complete(name=\"CreditRiskBatchInferenceData\",description = 'Batch Inference Data Output')\n", 474 | "\n", 475 
| "batchinfer_step = PythonScriptStep(\n", 476 | " name=\"RunBatchInference\", \n", 477 | " script_name=\"BatchInference.py\", \n", 478 | " compute_target=compute_target, \n", 479 | " runconfig=aml_run_config,\n", 480 | " arguments=[\"--test_data\", test_data.as_input(),\"--model_data\", model_data.as_input(),\"--batchinfer_data\", batchinfer_data],\n", 481 | " outputs = [batchinfer_data],\n", 482 | " allow_reuse=True\n", 483 | ")" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "from azureml.pipeline.core import Pipeline\n", 493 | "from azureml.core import Experiment\n", 494 | "\n", 495 | "pipeline = Pipeline(ws, [dataprep_step, test_train_step, train_step,batchinfer_step])\n", 496 | "\n", 497 | "experiment = Experiment(workspace=ws, name='CreditRiskPipeline')\n", 498 | "\n", 499 | "run = experiment.submit(pipeline, show_output=True)\n", 500 | "run.wait_for_completion()" 501 | ] 502 | } 503 | ], 504 | "metadata": { 505 | "kernelspec": { 506 | "display_name": "Python 3", 507 | "language": "python", 508 | "name": "python3" 509 | }, 510 | "language_info": { 511 | "codemirror_mode": { 512 | "name": "ipython", 513 | "version": 3 514 | }, 515 | "file_extension": ".py", 516 | "mimetype": "text/x-python", 517 | "name": "python", 518 | "nbconvert_exporter": "python", 519 | "pygments_lexer": "ipython3", 520 | "version": "3.7.4" 521 | } 522 | }, 523 | "nbformat": 4, 524 | "nbformat_minor": 5 525 | } 526 | -------------------------------------------------------------------------------- /AMLNotebooks/02_Create_CreditRisk_AML_Pipeline_Lineage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "pip install pyapacheatlas" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#!pip install pyapacheatlas #run this cell if the above cell runs into any issues" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from Authenticate_to_Purview_AML import *\n", 28 | "ws,guid,client = authentitae_to_purview_AML()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from Create_ML_Lineage_Types import *\n", 38 | "create_ml_lineage_types(client)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "%run Create_ML_Lineage_Functions" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "create_workspace_entities(ws)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 5, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "create_datastore_entities(ws)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "create_dataset_entities(ws)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "create_experiment_entities(ws)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 
90 | "outputs": [], 91 | "source": [ 92 | "## uncomment below code to link PowerBI Dataset and Report in lineage if you have access to a PBI workspace \n", 93 | "# #The PowerBI entities will populate with more details if you set up a scan for PBI workspaces in Purview\n", 94 | "# #We are just creating a placeholders and links for lineage below\n", 95 | "\n", 96 | "# #get batch inference data entity name and exprimentname\n", 97 | "# batchpred_data_ent_name = 'batchinfer_data.csv_CreditRiskPipeline'\n", 98 | "# experimentname = \"CreditRiskPipeline\"\n", 99 | "\n", 100 | "# #create PowerBI dataset entity and lineage \n", 101 | "# pbi_workspace = '' #'https://xxx.powerbi.com/groups/7c555287-f9b8-45ff-be6c-9909afe9df40'\n", 102 | "# pbi_datasetid = '' #'c4a30c22-466d-4a30-a1ac-8736ed6567cc' \n", 103 | "\n", 104 | "# pbidata_ent_name = 'creditriskpbidataset' \n", 105 | "# create_powerbi_dataset_and_lineage(experimentname,pbi_workspace,pbi_datasetid,pbidata_ent_name,batchpred_data_ent_name,'custom_ml_dataset')\n", 106 | "\n", 107 | "\n", 108 | "# #create PowerBI report entity and lineage\n", 109 | "# pbi_reportid = '' #'e495453d-6c0c-4fb9-bdc4-556319f6a57b'\n", 110 | "# pbi_ent_name = 'creditriskpbireport'\n", 111 | " \n", 112 | "# create_powerbi_report_and_lineage(experimentname,pbi_workspace,pbi_reportid,pbi_ent_name,pbi_datasetid)" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.7.4" 133 | }, 134 | "save_output": true, 135 | "synapse_widget": { 136 | "state": {}, 137 | "version": "0.1" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 2 142 | } 143 | -------------------------------------------------------------------------------- /AMLNotebooks/Authenticate_to_Purview_AML.py: -------------------------------------------------------------------------------- 1 | def authentitae_to_purview_AML(): 2 | from pyapacheatlas.auth import ServicePrincipalAuthentication 3 | from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess 4 | from pyapacheatlas.readers import ExcelConfiguration, ExcelReader 5 | from pyapacheatlas.core.util import GuidTracker 6 | from pyapacheatlas.core import AtlasAttributeDef, AtlasEntity, PurviewClient 7 | from pyapacheatlas.core.typedef import EntityTypeDef 8 | 9 | # get SPN details you created in step 2.1 of solution accelerator setup 10 | tenant_id = "" 11 | client_id = "" 12 | client_secret = "" 13 | 14 | # get Purview account name from azure portal 15 | purview_name = "" 16 | 17 | # get AML workspace details from azure portal 18 | subscription_id = "" 19 | resource_group = "" 20 | workspace_name = "" 21 | workspace_region = "" 22 | 23 | from pyapacheatlas.auth import ServicePrincipalAuthentication 24 | from pyapacheatlas.core import PurviewClient 25 | from pyapacheatlas.core.util import GuidTracker 26 | 27 | # Authenticate to your Atlas server using a Service Principal 28 | oauth = ServicePrincipalAuthentication( 29 | tenant_id= tenant_id, 30 | client_id= client_id, 31 | client_secret= client_secret 32 | ) 33 | client = PurviewClient( 34 | account_name = purview_name, 35 | authentication=oauth 36 | ) 37 | guid = GuidTracker() 38 | 
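# GuidTracker hands out temporary negative placeholder GUIDs for new entities; Purview assigns the real GUIDs when a batch is uploaded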
39 | 40 | # get SPN details you created in step 3.1 of solution accelerator setup 41 | aml_client_id = "" 42 | aml_client_secret = "" 43 | 44 | 45 | from azureml.core.authentication import ServicePrincipalAuthentication 46 | 47 | sp = ServicePrincipalAuthentication(tenant_id=tenant_id, 48 | service_principal_id=aml_client_id, 49 | service_principal_password=aml_client_secret) 50 | 51 | from azureml.core import Workspace 52 | 53 | ws = Workspace.get(name=workspace_name, 54 | resource_group = resource_group, 55 | auth=sp, 56 | subscription_id=subscription_id) 57 | return ws,guid,client 58 | -------------------------------------------------------------------------------- /AMLNotebooks/Create_ML_Lineage_Functions.py: -------------------------------------------------------------------------------- 1 | # + 2 | from pyapacheatlas.auth import ServicePrincipalAuthentication 3 | from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess 4 | from pyapacheatlas.readers import ExcelConfiguration, ExcelReader 5 | from pyapacheatlas.core.util import GuidTracker 6 | from pyapacheatlas.core import AtlasAttributeDef, AtlasEntity, PurviewClient 7 | from pyapacheatlas.core.typedef import EntityTypeDef 8 | 9 | from Authenticate_to_Purview_AML import * 10 | ws,guid,client = authentitae_to_purview_AML() 11 | 12 | def get_entity_details(qualifiedName,typeName): 13 | entities = client.get_entity( 14 | qualifiedName=[qualifiedName], 15 | typeName=typeName 16 | ) 17 | for entity in entities.get("entities"): 18 | entity = entity 19 | break 20 | return entity 21 | #get_entity_details('https://sampledataadls.dfs.core.windows.net/masterdata/employees.csv','azure_datalake_gen2_path') 22 | 23 | def get_entity_guid(qualifiedName,typeName): 24 | entities = client.get_entity( 25 | qualifiedName=[qualifiedName], 26 | typeName=typeName 27 | ) 28 | for entity in entities.get("entities"): 29 | entity_guid = entity.get("guid") 30 | break 31 | return entity_guid 32 | #get_entity_guid('https://sampledataadls.dfs.core.windows.net/creditriskdata/borrower.csv','azure_datalake_gen2_path') 33 | 34 | def get_entity_schema(guid): 35 | columns = [] 36 | results = client.get_entity(guid) 37 | for entity in results["entities"]: 38 | if "tabular_schema" in entity["relationshipAttributes"]: 39 | ts = entity["relationshipAttributes"]["tabular_schema"] 40 | ts_entity = client.get_entity(ts["guid"]) 41 | for schema in ts_entity["entities"]: 42 | for col in schema["relationshipAttributes"]["columns"]: 43 | if col['displayText'] != ':csv': 44 | columns.append(col['displayText']) 45 | return(columns) 46 | 47 | # ent_guid = 'a8698a33-9174-43cb-8835-26968862e2bf' 48 | # get_entity_schema(ent_guid) 49 | 50 | def create_data_entity_with_schema_and_parent(df_data,entityname,entitytype='custom_ml_dataset',parent_entityname=None,parent_entitytype='custom_ml_datastore'): 51 | # Create an asset for the output data schema. 
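# A dataset is modeled as three linked pieces: a tabular_schema entity, one column entity per DataFrame column (attached through the composeSchema relationship), and the dataset entity itself, which points at the schema via its tabular_schema relationship attribute.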
52 | output_schema_entity = AtlasEntity( 53 | name="schema-" + entityname, 54 | qualified_name = "pyapacheatlas://"+"schema-" + entityname, 55 | typeName="tabular_schema", 56 | guid=guid.get_guid() 57 | ) 58 | 59 | df_data_schema = pd.DataFrame(list(zip(list(df_data.columns), list(df_data.dtypes))),columns=['column','dtype']) 60 | 61 | #Iterate over the out data frame's columns and create entities 62 | output_entity_schema_columns = [] 63 | #for column in df.schema: 64 | for index, row in df_data_schema.iterrows(): 65 | temp_column = AtlasEntity( 66 | name = row.column, 67 | typeName = "column", 68 | qualified_name = "pyapacheatlas://schema-" + entityname + "#" + row.column, 69 | guid=guid.get_guid(), 70 | attributes = {"type":str(row.dtype),"description": row.column}, 71 | relationshipAttributes = {"composeSchema":output_schema_entity.to_json(minimum=True)} 72 | ) 73 | output_entity_schema_columns.append(temp_column) 74 | 75 | 76 | if parent_entityname: 77 | dstore_entity = get_entity_details("pyapacheatlas://"+parent_entityname, parent_entitytype) 78 | # Create a entity for dataset 79 | dataset_output_entity = AtlasEntity( 80 | name=entityname, 81 | typeName=entitytype, 82 | qualified_name="pyapacheatlas://" + entityname, 83 | guid = guid.get_guid(), 84 | relationshipAttributes = { 85 | "tabular_schema": output_schema_entity.to_json(minimum=True), 86 | "datastore":dstore_entity 87 | } 88 | ) 89 | else: 90 | # Create a entity for dataset 91 | dataset_output_entity = AtlasEntity( 92 | name=entityname, 93 | typeName=entitytype, 94 | qualified_name="pyapacheatlas://" + entityname, 95 | guid = guid.get_guid(), 96 | relationshipAttributes = { 97 | "tabular_schema": output_schema_entity.to_json(minimum=True) 98 | } 99 | ) 100 | 101 | # Prepare all the entities as a batch to be uploaded. 102 | batch = [dataset_output_entity, output_schema_entity] + output_entity_schema_columns 103 | batch 104 | 105 | # Upload all entities! 106 | client.upload_entities(batch=batch) 107 | 108 | def create_data_entity_with_schema(df_data,entityname,entitytype='custom_ml_dataset'): 109 | # Create an asset for the output data schema. 110 | output_schema_entity = AtlasEntity( 111 | name="schema-" + entityname, 112 | qualified_name = "pyapacheatlas://"+"schema-" + entityname, 113 | typeName="tabular_schema", 114 | guid=guid.get_guid() 115 | ) 116 | 117 | df_data_schema = pd.DataFrame(list(zip(list(df_data.columns), list(df_data.dtypes))),columns=['column','dtype']) 118 | 119 | #Iterate over the out data frame's columns and create entities 120 | output_entity_schema_columns = [] 121 | #for column in df.schema: 122 | for index, row in df_data_schema.iterrows(): 123 | temp_column = AtlasEntity( 124 | name = row.column, 125 | typeName = "column", 126 | qualified_name = "pyapacheatlas://schema-" + entityname + "#" + row.column, 127 | guid=guid.get_guid(), 128 | attributes = {"type":str(row.dtype),"description": row.column}, 129 | relationshipAttributes = {"composeSchema":output_schema_entity.to_json(minimum=True)} 130 | ) 131 | output_entity_schema_columns.append(temp_column) 132 | 133 | # Create a entity for dataset 134 | dataset_output_entity = AtlasEntity( 135 | name=entityname, 136 | typeName=entitytype, 137 | qualified_name="pyapacheatlas://" + entityname, 138 | guid = guid.get_guid(), 139 | relationshipAttributes = { 140 | "tabular_schema": output_schema_entity.to_json(minimum=True) 141 | } 142 | ) 143 | 144 | # Prepare all the entities as a batch to be uploaded. 
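# Sending the dataset, schema and column entities as one batch lets their placeholder GUIDs resolve against each other in a single upload.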
145 | batch = [dataset_output_entity, output_schema_entity] + output_entity_schema_columns 146 | batch 147 | 148 | # Upload all entities! 149 | client.upload_entities(batch=batch) 150 | 151 | def create_lineage_for_entities(experimentname,processname,in_ent_qns,out_ent_qns,process_type_name='Process',ColumnMapping=False): 152 | # create a process 153 | # inputs: list of (entity,type) tuples 154 | # outputs: list of (entity,type) tuples 155 | 156 | from pyapacheatlas.core import AtlasProcess 157 | 158 | in_ent_guids = [] 159 | for in_ent_qn in in_ent_qns: 160 | #print(in_ent_qn,in_ent_qns[in_ent_qn]) 161 | in_ent_guid = get_entity_guid(in_ent_qn,in_ent_qns[in_ent_qn]) 162 | in_ent_guids.append({'guid':in_ent_guid}) 163 | 164 | out_ent_guids = [] 165 | for out_ent_qn in out_ent_qns: 166 | #print(in_ent_qn,in_ent_qns[in_ent_qn]) 167 | out_ent_guid = get_entity_guid(out_ent_qn,out_ent_qns[out_ent_qn]) 168 | out_ent_guids.append({'guid':out_ent_guid}) 169 | 170 | process_name = experimentname + processname 171 | process_qn = "pyapacheatlas://" + process_name 172 | 173 | if ColumnMapping == False: 174 | process_type_name = process_type_name 175 | 176 | process = AtlasProcess( 177 | name=process_name, 178 | typeName=process_type_name, 179 | qualified_name=process_qn, 180 | inputs = in_ent_guids, 181 | outputs = out_ent_guids, 182 | guid=guid.get_guid() 183 | ) 184 | else: 185 | process_type_name = "ProcessWithColumnMapping" 186 | 187 | column_mapping_attributes = [] 188 | for in_ent_qn in in_ent_qns: 189 | cl_mapping = [] 190 | in_ent_columns = get_entity_schema(get_entity_guid(in_ent_qn,in_ent_qns[in_ent_qn])) 191 | for in_col in in_ent_columns: 192 | cl_mapping.append({"Source":in_col,"Sink":in_col}) 193 | #break 194 | mapping = { 195 | 'DatasetMapping': {'Source':in_ent_qn,'Sink':list(out_ent_qns.keys())[0]}, 196 | 'ColumnMapping': cl_mapping 197 | } 198 | column_mapping_attributes.append(mapping) 199 | 200 | process = AtlasProcess( 201 | name=process_name, 202 | typeName=process_type_name, 203 | qualified_name=process_qn, 204 | inputs = in_ent_guids, 205 | outputs = out_ent_guids, 206 | guid=guid.get_guid(), 207 | attributes={"columnMapping":json.dumps(column_mapping_attributes)} 208 | ) 209 | 210 | # Prepare all the entities as a batch to be uploaded. 211 | batch = [process] 212 | batch 213 | 214 | # Upload all entities! 215 | client.upload_entities(batch=batch) 216 | 217 | def create_entity(name,typeName,config_attibutes): 218 | # Create an entity 219 | name = name 220 | qn = "pyapacheatlas://" + name 221 | 222 | exp_config_entity = AtlasEntity( 223 | name=name, 224 | typeName=typeName, 225 | qualified_name=qn, 226 | guid = guid.get_guid(), 227 | attributes = config_attibutes 228 | ) 229 | 230 | # Upload all entities! 
231 | client.upload_entities(batch=[exp_config_entity.to_json()]) 232 | 233 | 234 | def get_dataset_details(indataset,experiment_name=''): 235 | result = [] 236 | #print(indataset) 237 | if 'FileDataset' in str(type((indataset))): 238 | dssource = eval(json.loads(str(indataset).replace('FileDataset',''))['source'][0]) 239 | sourcestore = dssource[0] 240 | sourcepath = dssource[1] 241 | sourcepathfiles = indataset.to_path() 242 | for sourcepathfile in sourcepathfiles: 243 | entityname = sourcepath.split('/')[-1] + sourcepathfile.replace('/','_') #.replace('.parquet','').replace('.csv','') 244 | #print('\nFileDataset:',entityname) 245 | 246 | dsdatastore = Datastore.get(ws, sourcestore) 247 | datastore_path = [DataPath(dsdatastore, sourcepath+sourcepathfile.replace('/',''))] 248 | 249 | if '.parquet' in sourcepathfile: 250 | tabular_dataset = Dataset.Tabular.from_parquet_files(path=datastore_path) 251 | df_data = tabular_dataset.take(10).to_pandas_dataframe() 252 | 253 | elif '.csv' in sourcepathfile: 254 | tabular_dataset = Dataset.Tabular.from_delimited_files(path=datastore_path,encoding ='iso88591') 255 | #'utf8', 'iso88591', 'latin1', 'ascii', 'utf16', 'utf32', 'utf8bom' and 'windows1252' 256 | df_data = tabular_dataset.take(10).to_pandas_dataframe() 257 | 258 | if experiment_name != '': 259 | result.append((entityname + '_' + experiment_name,df_data)) 260 | else: 261 | result.append((entityname,df_data)) 262 | 263 | elif 'TabularDataset' in str(type((indataset))): 264 | tabular_dataset = indataset 265 | entityname = json.loads(str(indataset).replace('TabularDataset',''))['registration']['name'] 266 | 267 | # dataset = Dataset.get_by_name(ws, name=entityname) 268 | # try: 269 | # sourcestore = json.loads(dataset._definition)['blocks'][0]['arguments']['datastore']['datastoreName'] 270 | # except: 271 | # sourcestore = json.loads(dataset._definition)['blocks'][0]['arguments']['datastores'][0]['datastoreName'] 272 | df_data = tabular_dataset.take(10).to_pandas_dataframe() 273 | #print('TabularDataset:', entityname) 274 | result.append((entityname,df_data)) 275 | return result 276 | 277 | 278 | from azureml.core import Experiment 279 | from azureml.pipeline.core import PipelineRun 280 | 281 | from azureml.core import Workspace, Datastore, Dataset 282 | from azureml.data.datapath import DataPath 283 | import json 284 | import pandas as pd 285 | 286 | def create_aml_experiment_steps(ws,experiment_name): 287 | experiments_lst = Experiment.list(ws) 288 | for experiment in experiments_lst: 289 | if experiment.name == experiment_name: 290 | print(experiment) 291 | exp = Experiment(ws,experiment.name) 292 | for run in exp.get_runs(): 293 | rundetails = run.get_details() 294 | 295 | if rundetails['status'] != 'Completed': #continue until we find a completed run 296 | continue 297 | pipeline_run = PipelineRun(exp, rundetails['runId']) 298 | 299 | steps = pipeline_run.get_steps() 300 | for step_run in steps: 301 | step_run_details = step_run.get_details_with_logs() 302 | 303 | #print(step_run_details['runDefinition']['script']) 304 | 305 | purview_basepath = 'pyapacheatlas://' 306 | in_ent_qns = {} 307 | out_ent_qns = {} 308 | 309 | step_name = step_run.name #step_run_details['runDefinition']['script'] 310 | 311 | #print('\n Input Datasets:\n') 312 | for indataset in step_run_details['inputDatasets']: 313 | in_result = get_dataset_details(indataset['dataset'],experiment_name) 314 | #print(in_result) 315 | #create entities 316 | for in_res in in_result: 317 | data_ent_name = in_res[0].strip('_') 318 | 
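# create (or refresh) a custom_ml_dataset entity for this input, then record its qualified name so it is wired up as a lineage input of the step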
create_data_entity_with_schema(in_res[1],data_ent_name,'custom_ml_dataset') 319 | in_ent_qns[purview_basepath + data_ent_name] = 'custom_ml_dataset' 320 | #break 321 | #print('\n Output Datasets:\n') 322 | for outdataset in step_run_details['outputDatasets']: 323 | out_result = get_dataset_details(outdataset['dataset'],experiment_name) 324 | #print(out_result) 325 | #create entities 326 | for out_res in out_result: 327 | data_ent_name = out_res[0].strip('_') 328 | create_data_entity_with_schema(out_res[1],data_ent_name,'custom_ml_dataset') 329 | out_ent_qns[purview_basepath + data_ent_name] = 'custom_ml_dataset' 330 | #break 331 | #print(in_ent_qns,out_ent_qns) 332 | create_lineage_for_entities(experiment_name + '_',step_name, in_ent_qns,out_ent_qns,process_type_name='custom_ml_experiment_step',ColumnMapping=False) 333 | #break 334 | 335 | break # break after processing one completed run 336 | break #after finding the experiment 337 | 338 | 339 | #create workspace entity 340 | def create_workspace_entities(ws): 341 | 342 | config_attibutes={} 343 | temp_column={} 344 | 345 | temp_column['name'] = ws.name 346 | config_attibutes.update(temp_column) 347 | temp_column['subscription_id'] = ws.subscription_id 348 | config_attibutes.update(temp_column) 349 | temp_column['resource_group'] = ws.resource_group 350 | config_attibutes.update(temp_column) 351 | 352 | create_entity(ws.name,'custom_ml_workspace',config_attibutes) 353 | #break 354 | 355 | #create all datastore entities 356 | def create_datastore_entities(ws): 357 | for datastore in ws.datastores.values(): 358 | config_attibutes={} 359 | temp_column={} 360 | 361 | temp_column['name'] = datastore.name 362 | config_attibutes.update(temp_column) 363 | 364 | if ('AzureDataLakeGen2Datastore' in str(type(datastore))) or ('AzureBlobDatastore' in str(type(datastore))): 365 | temp_column['container_name'] = datastore.container_name 366 | config_attibutes.update(temp_column) 367 | temp_column['account_name'] = datastore.account_name 368 | config_attibutes.update(temp_column) 369 | temp_column['protocol'] = datastore.protocol 370 | config_attibutes.update(temp_column) 371 | temp_column['endpoint'] = datastore.endpoint 372 | config_attibutes.update(temp_column) 373 | elif 'AzureSqlDatabaseDatastore' in str(type(datastore)): 374 | #print('sql',datastore.server_name) 375 | temp_column['server_name'] = datastore.server_name 376 | config_attibutes.update(temp_column) 377 | temp_column['database_name'] = datastore.database_name 378 | config_attibutes.update(temp_column) 379 | elif 'AzureBlobDatastore' in str(type(datastore)): 380 | pass 381 | 382 | create_entity(datastore.name,'custom_ml_datastore',config_attibutes) 383 | #break 384 | 385 | #create workspace and datastore relationship 386 | purview_basepath = 'pyapacheatlas://' 387 | for datastore in ws.datastores.values(): 388 | relationshiptype = 'custom_ml_workspace_datastore' 389 | end1type = 'custom_ml_workspace' 390 | end2type = 'custom_ml_datastore' 391 | end1_qn = purview_basepath + ws.name 392 | end2_qn = purview_basepath + datastore.name 393 | try: 394 | create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn) 395 | except: 396 | pass # ignore if relationship exists 397 | 398 | #create all dataset entities (with datastore as parent) 399 | from azureml.core import Workspace, Datastore, Dataset 400 | import pandas as pd 401 | def create_dataset_entities(ws,parent_flag=True): 402 | purview_basepath = 'pyapacheatlas://' 403 | for dsname in ws.datasets: 404 | dataset = 
ws.datasets[dsname] 405 | try: 406 | if 'FileDataset' in str(type((dataset))): 407 | datasetsource = eval(json.loads(str(dataset).replace('FileDataset',''))['source'][0])[0] 408 | elif 'TabularDataset' in str(type((dataset))): 409 | datasetsource = eval(json.loads(str(dataset).replace('TabularDataset',''))['source'][0])[0] 410 | 411 | dsdetails = get_dataset_details(dataset) 412 | #print(dsdetails) 413 | for ds in dsdetails: 414 | if parent_flag == False: 415 | 416 | create_data_entity_with_schema(ds[1],dsname,'custom_ml_dataset') 417 | create_lineage_for_entities('',('register_' + dsname), {(purview_basepath+datasetsource):'custom_ml_datastore'}, 418 | {(purview_basepath+ds[0]):'custom_ml_dataset'},ColumnMapping=False) 419 | else: 420 | create_data_entity_with_schema_and_parent(ds[1],dsname,entitytype='custom_ml_dataset', 421 | parent_entityname=datasetsource,parent_entitytype='custom_ml_datastore') 422 | except: 423 | print('Error:',dsname) 424 | #break 425 | 426 | 427 | #create experiment entity 428 | from azureml.core import Experiment 429 | 430 | def create_experiment_entities(ws): 431 | for experiment in Experiment.list(ws): 432 | #create experiment entity 433 | config_attibutes={} 434 | temp_column={} 435 | 436 | temp_column['name'] = experiment.name 437 | config_attibutes.update(temp_column) 438 | 439 | create_entity(experiment.name,'custom_ml_experiment',config_attibutes) 440 | #break 441 | 442 | purview_basepath = 'pyapacheatlas://' 443 | 444 | #create experiment relationship to workspace 445 | relationshiptype = 'custom_ml_workspace_experiment' 446 | end1type = 'custom_ml_workspace' 447 | end2type = 'custom_ml_experiment' 448 | end1_qn = purview_basepath + ws.name 449 | end2_qn = purview_basepath + experiment.name 450 | try: 451 | create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn) 452 | except: 453 | pass # ignore if relationship exists 454 | 455 | for run in experiment.get_runs(): 456 | rundetails = run.get_details() 457 | #print(rundetails) 458 | if rundetails['status'] != 'Completed': #continue until we find a completed run 459 | continue 460 | #create experiment steps 461 | if rundetails['properties']['azureml.runsource'] == 'azureml.PipelineRun': 462 | print(experiment.name) 463 | create_aml_experiment_steps(ws,experiment.name) 464 | 465 | pipeline_run = PipelineRun(experiment, rundetails['runId']) 466 | 467 | steps = pipeline_run.get_steps() 468 | for step_run in steps: 469 | #create experiment relationship to workspace 470 | relationshiptype = 'custom_ml_experiment_to_experimentstep' 471 | end1type = 'custom_ml_experiment' 472 | end2type = 'custom_ml_experiment_step' 473 | end1_qn = purview_basepath + experiment.name 474 | end2_qn = purview_basepath + experiment.name + '_' + step_run.name 475 | try: 476 | create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn) 477 | except: 478 | pass # ignore if relationship exists 479 | 480 | break # break after processing one completed run 481 | #break 482 | 483 | def create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn): 484 | relationship = {} 485 | end1 = {} 486 | end2 = {} 487 | 488 | end1["guid"] = get_entity_guid(end1_qn,end1type) 489 | end1["typeName"] = end1type 490 | end1["uniqueAttributes"] = {"qualifiedName": end1_qn} 491 | 492 | end2["guid"] = get_entity_guid(end2_qn,end2type) 493 | end2["typeName"] = end2type 494 | end2["uniqueAttributes"] = {"qualifiedName": end2_qn} 495 | 496 | relationship["typeName"] = relationshiptype 497 | 
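# assemble the raw Atlas relationship payload: each end carries the entity guid, type name and qualified name, and upload_relationship creates the link in Purview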
relationship["attributes"] = {} 498 | relationship["guid"] = guid.get_guid() 499 | relationship["provenanceType"] = 0 500 | relationship["end1"] = end1 501 | relationship["end2"] = end2 502 | relationship 503 | 504 | client.upload_relationship(relationship) 505 | 506 | def create_package_entities(experimentname,packageslist): 507 | packages_name = experimentname + '-packages' 508 | packages_qn = "pyapacheatlas://" + packages_name 509 | 510 | # Create an asset for the packages. 511 | packages_entity = AtlasEntity( 512 | name = packages_name, 513 | qualified_name = packages_qn, 514 | typeName="custom_ml_packages", 515 | attributes = {"notes":"test note"}, 516 | guid=guid.get_guid() 517 | ) 518 | 519 | packages_entity.to_json(minimum=True) 520 | 521 | atlas_packages = [] 522 | relationships = [] 523 | for package in packageslist: 524 | package_attibutes={} 525 | temp_column={} 526 | temp_column['programming_language'] = str(package[0]) 527 | package_attibutes.update(temp_column) 528 | temp_column['package_name'] = str(package[1]) 529 | package_attibutes.update(temp_column) 530 | temp_column['version'] = str(package[2]) 531 | package_attibutes.update(temp_column) 532 | temp_column['notes'] = str(package[3]) 533 | package_attibutes.update(temp_column) 534 | 535 | # Create an entity for each package 536 | name = str(package[1]) #experimentname + '-package-' + package[1] 537 | qn = packages_qn + '#' + str(package[1]) #"pyapacheatlas://" + name 538 | 539 | package_entity = AtlasEntity( 540 | name= name, 541 | typeName="custom_ml_package", 542 | qualified_name=qn, 543 | guid = guid.get_guid(), 544 | attributes = package_attibutes, 545 | relationshipAttributes = {"packages":packages_entity.to_json(minimum=True)} 546 | ) 547 | atlas_packages.append(package_entity) 548 | 549 | atlas_packages 550 | 551 | # Prepare all the entities as a batch to be uploaded. 552 | batch = [packages_entity] + atlas_packages 553 | client.upload_entities(batch=batch) 554 | 555 | def create_experiment_config_entity(ws,experiment_name,automl_run): 556 | # Get experiment config from AML run 557 | import json 558 | import pandas as pd 559 | run_properties = automl_run.get_properties() 560 | run_properties 561 | 562 | AMLSettingsJsonString = run_properties['AMLSettingsJsonString'] 563 | AMLSettings = json.loads(AMLSettingsJsonString) 564 | 565 | df_config = pd.DataFrame(list(AMLSettings.items()),columns = ['key','value']) 566 | 567 | keys = ['task_type','enable_early_stopping','experiment_timeout_minutes','primary_metric','compute_target','label_column_name','n_cross_validations','model_explainability'] 568 | 569 | df_config = df_config[df_config['key'].isin(keys)] 570 | 571 | dict_config = df_config.to_dict(orient = 'records') 572 | dict_config 573 | 574 | config_attibutes={} 575 | for attibutes in dict_config: 576 | temp_column={} 577 | temp_column[attibutes['key']] = attibutes['value'] 578 | config_attibutes.update(temp_column) 579 | config_attibutes 580 | 581 | # Create a entity for exp config 582 | name = experiment_name + "-config" 583 | qn = "pyapacheatlas://" + name 584 | 585 | exp_config_entity = AtlasEntity( 586 | name=name, 587 | typeName="custom_ml_exp_config", 588 | qualified_name=qn, 589 | guid = guid.get_guid(), 590 | attributes = config_attibutes 591 | ) 592 | 593 | # Upload all entities! 
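# upload_entities accepts AtlasEntity objects as well as their to_json() dicts, so a single entity can be sent as a one-item batch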
594 | client.upload_entities(batch=[exp_config_entity.to_json()]) 595 | 596 | def create_model_entity(ws,experiment_name,modelname): 597 | # get deployed model 598 | from azureml.core.model import Model 599 | model = Model(ws, modelname) 600 | 601 | config_attibutes={} 602 | temp_column={} 603 | temp_column['workspace_name'] = model.workspace.name 604 | config_attibutes.update(temp_column) 605 | temp_column['workspace_subscription_id'] = model.workspace.subscription_id 606 | config_attibutes.update(temp_column) 607 | temp_column['workspace_subscription_id'] = model.workspace.subscription_id 608 | config_attibutes.update(temp_column) 609 | temp_column['workspace_resource_group'] = model.workspace.resource_group 610 | config_attibutes.update(temp_column) 611 | temp_column['name'] = model.name 612 | config_attibutes.update(temp_column) 613 | temp_column['id'] = model.id 614 | config_attibutes.update(temp_column) 615 | temp_column['version'] = model.version 616 | config_attibutes.update(temp_column) 617 | temp_column['tags'] = model.tags 618 | config_attibutes.update(temp_column) 619 | temp_column['properties'] = model.properties 620 | config_attibutes.update(temp_column) 621 | 622 | # Create a entity for Model 623 | name = modelname 624 | qn = "pyapacheatlas://" + name 625 | 626 | exp_config_entity = AtlasEntity( 627 | name=name, 628 | typeName="custom_ml_model", 629 | qualified_name=qn, 630 | guid = guid.get_guid(), 631 | attributes = config_attibutes 632 | ) 633 | 634 | # Upload all entities! 635 | client.upload_entities(batch=[exp_config_entity.to_json()]) 636 | 637 | def create_model_metrics_entity(experiment_name,best_run): 638 | metrics = best_run.get_metrics() 639 | 640 | # select relevant metrics 641 | auc = metrics.get('AUC_weighted') 642 | accuracy = metrics.get('accuracy') 643 | precision = metrics.get('precision_score_weighted') 644 | recall = metrics.get('recall_score_weighted') 645 | f1 = metrics.get('f1_score_weighted') 646 | 647 | # # combine into single dataframe 648 | # metrics_df = sc.parallelize([['AUC', auc], ['Accuracy', accuracy], ['Precision', precision], ['Recall', recall], ['F1', f1]]).toDF(('Metric', 'Value')) 649 | metrics = ['AUC','Accuracy','Precision','Recall','F1'] 650 | metricslist= [auc,accuracy,precision,recall,f1] 651 | columns = ['Metric','Value'] 652 | metrics_df = pd.DataFrame(zip(metrics, metricslist),columns=columns) 653 | 654 | 655 | dict_metrics = metrics_df.to_dict(orient = 'records') 656 | dict_metrics 657 | 658 | config_attibutes={} 659 | for attibutes in dict_metrics: 660 | temp_column={} 661 | temp_column[attibutes['Metric']] = attibutes['Value'] 662 | config_attibutes.update(temp_column) 663 | config_attibutes 664 | 665 | name = experiment_name + "-modelmetrics" 666 | qn = "pyapacheatlas://" + name 667 | 668 | # Create a entity for model metrics 669 | exp_config_entity = AtlasEntity( 670 | name=name, 671 | typeName="custom_ml_model_metrics", 672 | qualified_name=qn, 673 | guid = guid.get_guid(), 674 | attributes = config_attibutes 675 | ) 676 | 677 | # Upload all entities! 
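# Usage sketch (illustrative; variable names are placeholders): once an AutoML run
# completes, record the best child run's metrics alongside the registered model.
#
#   best_run, fitted_model = automl_run.get_output()
#   create_model_metrics_entity("creditrisk-experiment", best_run)
#
# The metric keys read above (AUC_weighted, accuracy, precision_score_weighted,
# recall_score_weighted, f1_score_weighted) are the names AutoML logs for
# classification runs; metrics.get() returns None for anything the run did not log,
# so some attributes may be empty. The renamed columns match the
# custom_ml_model_metrics type, which declares AUC/Accuracy/Precision/Recall/F1 as
# float attributes.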
678 | client.upload_entities(batch=[exp_config_entity.to_json()]) 679 | 680 | def create_experiment_lineage(experimentname,exp_data_qn,exp_config_qn,model_metrics_qn,model_qn): 681 | # create experiment process 682 | # inputs: prepareddata, modelconfig 683 | # outputs: model metrics and registered model 684 | 685 | from pyapacheatlas.core import AtlasProcess 686 | 687 | in_data_ent_guid = get_entity_guid(exp_data_qn,'custom_dataset') 688 | in_exp_config_guid = get_entity_guid(exp_config_qn,'custom_ml_exp_config') 689 | out_model_metrics_guid = get_entity_guid(model_metrics_qn,'custom_ml_model_metrics') 690 | out_model_guid = get_entity_guid(model_qn,'custom_ml_model') 691 | 692 | process_name = experimentname + '-train' 693 | process_qn = "pyapacheatlas://" + process_name 694 | process_type_name = "Process" 695 | 696 | process = AtlasProcess( 697 | name=process_name, 698 | typeName=process_type_name, 699 | qualified_name=process_qn, 700 | inputs = [{"guid":in_data_ent_guid},{"guid":in_exp_config_guid}], 701 | outputs = [{"guid":out_model_metrics_guid},{"guid":out_model_guid}], 702 | guid=guid.get_guid() 703 | ) 704 | 705 | # Prepare all the entities as a batch to be uploaded. 706 | batch = [process] 707 | batch 708 | 709 | # Upload all entities! 710 | client.upload_entities(batch=batch) 711 | 712 | def create_model_service_entity(ws,experimentname,aci_service_name,samplejson): 713 | # get deployed ACI Web Service 714 | from azureml.core.webservice import AciWebservice 715 | aciws = AciWebservice(ws, aci_service_name) 716 | 717 | config_attibutes={} 718 | temp_column={} 719 | temp_column['workspace_name'] = aciws.workspace.name 720 | config_attibutes.update(temp_column) 721 | temp_column['workspace_subscription_id'] = aciws.workspace.subscription_id 722 | config_attibutes.update(temp_column) 723 | temp_column['workspace_resource_group'] = aciws.workspace.resource_group 724 | config_attibutes.update(temp_column) 725 | temp_column['name'] = aciws.name 726 | config_attibutes.update(temp_column) 727 | temp_column['image_id'] = aciws.image_id 728 | config_attibutes.update(temp_column) 729 | temp_column['compute_type'] = aciws.compute_type 730 | config_attibutes.update(temp_column) 731 | temp_column['state'] = aciws.state 732 | config_attibutes.update(temp_column) 733 | temp_column['scoring_uri'] = aciws.scoring_uri 734 | config_attibutes.update(temp_column) 735 | temp_column['tags'] = aciws.tags 736 | config_attibutes.update(temp_column) 737 | temp_column['state'] = aciws.state 738 | config_attibutes.update(temp_column) 739 | temp_column['properties'] = aciws.properties 740 | config_attibutes.update(temp_column) 741 | temp_column['created_by'] = aciws.created_by 742 | config_attibutes.update(temp_column) 743 | temp_column['sample_json'] = samplejson 744 | config_attibutes.update(temp_column) 745 | 746 | name = experimentname + "-model_endpoint" 747 | qn = "pyapacheatlas://" + name 748 | 749 | # Create a entity for ACI Web Service 750 | endpoint_entity = AtlasEntity( 751 | name=name, 752 | typeName="custom_ml_model_endpoint", 753 | qualified_name=qn, 754 | guid = guid.get_guid(), 755 | attributes = config_attibutes 756 | ) 757 | 758 | # Upload all entities! 
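# Usage sketch (illustrative; the service name and payload are placeholders): after
# deploying the registered model to Azure Container Instances, record the endpoint
# together with a sample scoring request.
#
#   sample_json = '{"data": [[35, 60000, 0.4, 12000]]}'   # hypothetical input row
#   create_model_service_entity(ws, "creditrisk-experiment",
#                               "creditrisk-aci-service", sample_json)
#
# AciWebservice(ws, aci_service_name) fails if no service with that name exists in
# the workspace, so call this helper only after the deployment has completed.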
759 | client.upload_entities(batch=[endpoint_entity.to_json()]) 760 | 761 | def create_powerbi_dataset_and_lineage(experiment_name,pbi_workspace,pbi_datasetid,pbidata_ent_name,ml_dataset_ent_name,ml_dataset_ent_type): 762 | 763 | pbidata_entity_type = 'powerbi_dataset' 764 | pbidata_ent_qn = pbi_workspace + '/datasets/' + pbi_datasetid 765 | purview_basepath = 'pyapacheatlas://' 766 | #"https://msit.powerbi.com/groups/7d666287-f9b8-45ff-be6c-9909afe9df40/datasets/e5a30c22-466d-4a30-a1ac-8736ed6567cc" 767 | 768 | pbidata_ent = AtlasEntity( 769 | name=pbidata_ent_name, 770 | typeName=pbidata_entity_type, 771 | qualified_name= pbidata_ent_qn, 772 | workspace = pbi_workspace, 773 | guid = guid.get_guid() 774 | ) 775 | 776 | # Prepare all the entities as a batch to be uploaded. 777 | batch = [pbidata_ent] 778 | batch 779 | 780 | # Upload all entities! 781 | client.upload_entities(batch=batch) 782 | 783 | #cretae powerbi_dataset_process lineage 784 | in_ent_guids = [] 785 | in_ent_guid = get_entity_guid(purview_basepath + ml_dataset_ent_name,ml_dataset_ent_type) 786 | in_ent_guids.append({'guid':in_ent_guid}) 787 | 788 | out_ent_guids = [] 789 | out_ent_guid = get_entity_guid(pbidata_ent_qn,pbidata_entity_type) 790 | out_ent_guids.append({'guid':out_ent_guid}) 791 | 792 | process_name = 'createpowerbidataset' + pbidata_ent_name + experiment_name 793 | process_qn = "pyapacheatlas://" + process_name 794 | process_type_name = "powerbi_dataset_process" 795 | 796 | process = AtlasProcess( 797 | name=process_name, 798 | typeName=process_type_name, 799 | qualified_name=process_qn, 800 | inputs = in_ent_guids, 801 | outputs = out_ent_guids, 802 | guid=guid.get_guid() 803 | ) 804 | 805 | # Prepare all the entities as a batch to be uploaded. 806 | batch = [process] 807 | batch 808 | 809 | # Upload all entities! 810 | client.upload_entities(batch=batch) 811 | 812 | def create_powerbi_report_and_lineage(experiment_name,pbi_workspace,pbi_reportid,pbi_ent_name,pbi_datasetid): 813 | 814 | #create powerbi report 815 | pbi_entity_type = 'powerbi_report' 816 | pbi_ent_qn = pbi_workspace + '/reports/' + pbi_reportid 817 | purview_basepath = 'pyapacheatlas://' 818 | 819 | pbi_ent = AtlasEntity( 820 | name=pbi_ent_name, 821 | typeName=pbi_entity_type, 822 | qualified_name= pbi_ent_qn, 823 | workspace = pbi_workspace, 824 | guid = guid.get_guid() 825 | ) 826 | 827 | # Prepare all the entities as a batch to be uploaded. 828 | batch = [pbi_ent] 829 | batch 830 | 831 | # Upload all entities! 832 | client.upload_entities(batch=batch) 833 | 834 | #create powerbi dashboard process lineage 835 | pbidata_ent_qn = pbi_workspace + '/datasets/' + pbi_datasetid 836 | in_ent_guids = [] 837 | in_ent_guid = get_entity_guid(pbidata_ent_qn,'powerbi_dataset') 838 | in_ent_guids.append({'guid':in_ent_guid}) 839 | 840 | out_ent_guids = [] 841 | out_ent_guid = get_entity_guid(pbi_ent_qn,'powerbi_report') 842 | out_ent_guids.append({'guid':out_ent_guid}) 843 | 844 | process_name = 'createpowerbireport' + pbi_ent_name + experiment_name 845 | process_qn = "pyapacheatlas://" + process_name 846 | process_type_name = "powerbi_report_process" 847 | 848 | process = AtlasProcess( 849 | name=process_name, 850 | typeName=process_type_name, 851 | qualified_name=process_qn, 852 | inputs = in_ent_guids, 853 | outputs = out_ent_guids, 854 | guid=guid.get_guid() 855 | ) 856 | 857 | # Prepare all the entities as a batch to be uploaded. 858 | batch = [process] 859 | batch 860 | 861 | # Upload all entities! 
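# Usage sketch (illustrative; the workspace URL, GUIDs and entity names are
# placeholders): connect a scored dataset to the Power BI dataset and report built
# on top of it.
#
#   pbi_ws = "https://msit.powerbi.com/groups/<workspace-guid>"
#   create_powerbi_dataset_and_lineage("creditrisk-experiment", pbi_ws,
#       "<dataset-guid>", "creditrisk-predictions",
#       "creditrisk_scored_data", "custom_dataset")
#   create_powerbi_report_and_lineage("creditrisk-experiment", pbi_ws,
#       "<report-guid>", "CreditRisk Report", "<dataset-guid>")
#
# Qualified names are built as <workspace>/datasets/<id> and <workspace>/reports/<id>,
# mirroring the example URL shown earlier in create_powerbi_dataset_and_lineage, so
# this custom lineage can point at the same assets a Purview Power BI scan would
# register.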
862 | client.upload_entities(batch=batch) 863 | 864 | # clean up datasets 865 | def cleanup_entities(typename, entitytype): 866 | filter_setup = {"typeName": typename, "includeSubTypes": True} 867 | search = client.search_entities("*", search_filter=filter_setup) 868 | for entity in search: 869 | #print(entity) 870 | if entity.get("entityType") == entitytype: 871 | print(entity.get("id"),entity.get("qualifiedName"),entity.get("entityType")) 872 | guid = entity.get("id") 873 | client.delete_entity(guid=guid) 874 | 875 | -------------------------------------------------------------------------------- /AMLNotebooks/Create_ML_Lineage_Types.py: -------------------------------------------------------------------------------- 1 | def create_ml_lineage_types(client): 2 | from pyapacheatlas.core.typedef import AtlasAttributeDef, EntityTypeDef, RelationshipTypeDef 3 | try: 4 | #-----------------------------------------------------------------------------------# 5 | #create custom dataset type 6 | type_df = EntityTypeDef( 7 | name="custom_dataset", 8 | attributeDefs=[ 9 | AtlasAttributeDef(name="format") 10 | ], 11 | superTypes = ["DataSet"] 12 | ) 13 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 14 | 15 | #-----------------------------------------------------------------------------------# 16 | #create process with column mapping type 17 | type_df = EntityTypeDef( 18 | name="ProcessWithColumnMapping", 19 | attributeDefs=[ 20 | AtlasAttributeDef(name="columnMapping") 21 | ], 22 | superTypes = ["Process"] 23 | ) 24 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 25 | 26 | #-----------------------------------------------------------------------------------# 27 | #create AML workspace type 28 | type_df = EntityTypeDef( 29 | name="custom_ml_workspace", 30 | attributeDefs=[ 31 | AtlasAttributeDef(name='name',typename='string'), 32 | AtlasAttributeDef(name='description',typename='string'), 33 | AtlasAttributeDef(name='subscription_id',typename='string'), 34 | AtlasAttributeDef(name='resource_group',typename='string') 35 | ], 36 | superTypes = ["DataSet"] 37 | ) 38 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 39 | #-----------------------------------------------------------------------------------# 40 | #create types for datastore and dataset 41 | 42 | #create AML datastore type 43 | datastore_type_df = EntityTypeDef( 44 | name="custom_ml_datastore", 45 | attributeDefs=[ 46 | AtlasAttributeDef(name="name",typename='string'), 47 | AtlasAttributeDef(name='container_name',typename='string'), 48 | AtlasAttributeDef(name='account_name',typename='string'), 49 | AtlasAttributeDef(name='protocol',typename='string'), 50 | AtlasAttributeDef(name='endpoint',typename='string'), 51 | AtlasAttributeDef(name='server_name',typename='string'), 52 | AtlasAttributeDef(name='database_name',typename='string'), 53 | AtlasAttributeDef(name="createdby",typename='string') 54 | ], 55 | superTypes = ["DataSet"], 56 | options = {"schemaElementAttribute":"dataset"} 57 | ) 58 | 59 | #create AML dataset type 60 | dataset_type_df = EntityTypeDef( 61 | name="custom_ml_dataset", 62 | attributeDefs=[ 63 | AtlasAttributeDef(name="name",typename='string'), 64 | AtlasAttributeDef(name="description",typename='string'), 65 | AtlasAttributeDef(name="createdby",typename='string'), 66 | AtlasAttributeDef(name="createdtime",typename='string') 67 | ], 68 | superTypes = ["DataSet"] 69 | ) 70 | 71 | # create relationsip between 
datastore and dataset 72 | dataset_to_datastore_relationship = RelationshipTypeDef( 73 | name="custom_ml_datastore_to_dataset", 74 | relationshipCategory="COMPOSITION", 75 | endDef1={ 76 | "type": "custom_ml_datastore", 77 | "name": "dataset", 78 | "isContainer": True, 79 | "cardinality": "SET", 80 | "isLegacyAttribute": False 81 | }, 82 | endDef2={ 83 | "type": "custom_ml_dataset", 84 | "name": "datastore", 85 | "isContainer": False, 86 | "cardinality": "SINGLE", 87 | "isLegacyAttribute": False 88 | } 89 | ) 90 | 91 | typedef_results = client.upload_typedefs( 92 | entityDefs = [datastore_type_df, dataset_type_df], 93 | relationshipDefs = [dataset_to_datastore_relationship], 94 | force_update=True 95 | ) 96 | #-----------------------------------------------------------------------------------# 97 | #create types for experiment and experimentstep 98 | 99 | #create process for Ml Experiment Step 100 | exp_type_df = EntityTypeDef( 101 | name="custom_ml_experiment", 102 | attributeDefs=[ 103 | AtlasAttributeDef(name='name',typename='string'), 104 | AtlasAttributeDef(name='notes',typename='string'), 105 | AtlasAttributeDef(name="createdby",typename='string'), 106 | AtlasAttributeDef(name="createdtime",typename='string') 107 | ], 108 | superTypes = ["Process"] 109 | ) 110 | 111 | #create process for Ml Experiment Step 112 | exp_step_type_df = EntityTypeDef( 113 | name="custom_ml_experiment_step", 114 | attributeDefs=[ 115 | AtlasAttributeDef(name='notes',typename='string') 116 | ], 117 | superTypes = ["Process"] 118 | ) 119 | 120 | # create relationsip between experiment and experimentstep 121 | step_to_exp_relationship = RelationshipTypeDef( 122 | name="custom_ml_experiment_to_experimentstep", 123 | relationshipCategory="COMPOSITION", 124 | endDef1={ 125 | "type": "custom_ml_experiment", 126 | "name": "experimentstep", 127 | "isContainer": True, 128 | "cardinality": "SET", 129 | "isLegacyAttribute": False 130 | }, 131 | endDef2={ 132 | "type": "custom_ml_experiment_step", 133 | "name": "experiment", 134 | "isContainer": False, 135 | "cardinality": "SINGLE", 136 | "isLegacyAttribute": False 137 | } 138 | ) 139 | 140 | typedef_results = client.upload_typedefs( 141 | entityDefs = [exp_type_df, exp_step_type_df], 142 | relationshipDefs = [step_to_exp_relationship], 143 | force_update=True 144 | ) 145 | #-----------------------------------------------------------------------------------# 146 | 147 | rd = RelationshipTypeDef( 148 | name="custom_ml_workspace_datastore", 149 | attributeDefs=[], 150 | relationshipCategory="COMPOSITION", # Means the child can't exist without the parent 151 | endDef1={ # endDef1 decribes what the parent will have as an attribute 152 | "type":"custom_ml_workspace", # Type of the parent 153 | "name":"datastores", # What the parent will have 154 | "isContainer": True, 155 | "cardinality":"SET", # This is related to the cardinality, in this case the parent Server will have a SET of Models. 
156 | "isLegacyAttribute":False 157 | }, 158 | endDef2={ # endDef2 decribes what the child will have as an attribute 159 | "type":"custom_ml_datastore", # Type of the child 160 | "name":"workspace", # What the child will have 161 | "isContainer":False, 162 | "cardinality":"SINGLE", 163 | "isLegacyAttribute":False 164 | } 165 | ) 166 | client.upload_typedefs(relationshipDefs=[rd]) 167 | 168 | #-----------------------------------------------------------------------------------# 169 | rd = RelationshipTypeDef( 170 | name="custom_ml_workspace_experiment", 171 | attributeDefs=[], 172 | relationshipCategory="COMPOSITION", # Means the child can't exist without the parent 173 | endDef1={ # endDef1 decribes what the parent will have as an attribute 174 | "type":"custom_ml_workspace", # Type of the parent 175 | "name":"experiments", # What the parent will have 176 | "isContainer": True, 177 | "cardinality":"SET", # This is related to the cardinality, in this case the parent Server will have a SET of Models. 178 | "isLegacyAttribute":False 179 | }, 180 | endDef2={ # endDef2 decribes what the child will have as an attribute 181 | "type":"custom_ml_experiment", # Type of the child 182 | "name":"workspace", # What the child will have 183 | "isContainer":False, 184 | "cardinality":"SINGLE", 185 | "isLegacyAttribute":False 186 | } 187 | ) 188 | client.upload_typedefs(relationshipDefs=[rd]) 189 | 190 | #-----------------------------------------------------------------------------------# 191 | #create types for packages and package 192 | 193 | #create packages type 194 | packages_type_df = EntityTypeDef( 195 | name="custom_ml_packages", 196 | attributeDefs=[ 197 | AtlasAttributeDef(name='notes',typename='string') 198 | ], 199 | superTypes = ["DataSet"], 200 | options = {"schemaElementAttribute":"package"} 201 | ) 202 | 203 | package_type_df = EntityTypeDef( 204 | name="custom_ml_package", 205 | attributeDefs=[ 206 | AtlasAttributeDef(name='programming_language',typename='string'), 207 | AtlasAttributeDef(name='package_name',typename='string'), 208 | AtlasAttributeDef(name='version',typename='string'), 209 | AtlasAttributeDef(name='notes',typename='string') 210 | ], 211 | superTypes = ["DataSet"] 212 | ) 213 | 214 | # create relationsip between packages and package 215 | package_to_packages_relationship = RelationshipTypeDef( 216 | name="custom_ml_packages_to_package", 217 | relationshipCategory="COMPOSITION", 218 | endDef1={ 219 | "type": "custom_ml_packages", 220 | "name": "package", 221 | "isContainer": True, 222 | "cardinality": "SET", 223 | "isLegacyAttribute": False 224 | }, 225 | endDef2={ 226 | "type": "custom_ml_package", 227 | "name": "packages", 228 | "isContainer": False, 229 | "cardinality": "SINGLE", 230 | "isLegacyAttribute": False 231 | } 232 | ) 233 | 234 | typedef_results = client.upload_typedefs( 235 | entityDefs = [packages_type_df, package_type_df], 236 | relationshipDefs = [package_to_packages_relationship], 237 | force_update=True 238 | ) 239 | #-----------------------------------------------------------------------------------# 240 | 241 | #create experiemnt config type 242 | type_df = EntityTypeDef( 243 | name="custom_ml_exp_config", 244 | attributeDefs=[ 245 | AtlasAttributeDef(name='task_type',typename='string'), 246 | AtlasAttributeDef(name='enable_early_stopping',typename='bool'), 247 | AtlasAttributeDef(name='experiment_timeout_minutes',typename='int'), 248 | AtlasAttributeDef(name='primary_metric',typename='string'), 249 | 
AtlasAttributeDef(name='compute_target',typename='string'), 250 | AtlasAttributeDef(name='label_column_name',typename='string'), 251 | AtlasAttributeDef(name='n_cross_validations',typename='int'), 252 | AtlasAttributeDef(name='model_explainability',typename='bool') 253 | ], 254 | superTypes = ["DataSet"] 255 | ) 256 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 257 | 258 | #-----------------------------------------------------------------------------------# 259 | 260 | #create model metrics type 261 | type_df = EntityTypeDef( 262 | name="custom_ml_model_metrics", 263 | attributeDefs=[ 264 | AtlasAttributeDef(name='AUC',typename='float'), 265 | AtlasAttributeDef(name='Accuracy',typename='float'), 266 | AtlasAttributeDef(name='Precision',typename='float'), 267 | AtlasAttributeDef(name='Recall',typename='float'), 268 | AtlasAttributeDef(name='F1',typename='float') 269 | ], 270 | superTypes = ["DataSet"] 271 | ) 272 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 273 | 274 | #-----------------------------------------------------------------------------------# 275 | 276 | #create model type 277 | type_df = EntityTypeDef( 278 | name="custom_ml_model", 279 | attributeDefs=[ 280 | AtlasAttributeDef(name='workspace_name',typename='string'), 281 | AtlasAttributeDef(name='workspace_subscription_id',typename='string'), 282 | AtlasAttributeDef(name='workspace_resource_group',typename='string'), 283 | AtlasAttributeDef(name='name',typename='string'), 284 | AtlasAttributeDef(name='id',typename='string'), 285 | AtlasAttributeDef(name='version',typename='string'), 286 | AtlasAttributeDef(name='tags',typename='string'), 287 | AtlasAttributeDef(name='properties',typename='string') 288 | ], 289 | superTypes = ["DataSet"] 290 | ) 291 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 292 | 293 | #-----------------------------------------------------------------------------------# 294 | 295 | #create endpoint type 296 | type_df = EntityTypeDef( 297 | name="custom_ml_model_endpoint", 298 | attributeDefs=[ 299 | AtlasAttributeDef(name='workspace_name',typename='string'), 300 | AtlasAttributeDef(name='workspace_subscription_id',typename='string'), 301 | AtlasAttributeDef(name='workspace_resource_group',typename='string'), 302 | AtlasAttributeDef(name='name',typename='string'), 303 | AtlasAttributeDef(name='image_id',typename='string'), 304 | AtlasAttributeDef(name='compute_type',typename='string'), 305 | AtlasAttributeDef(name='state',typename='string'), 306 | AtlasAttributeDef(name='scoring_uri',typename='string'), 307 | AtlasAttributeDef(name='tags',typename='string'), 308 | AtlasAttributeDef(name='state',typename='string'), 309 | AtlasAttributeDef(name='properties',typename='string'), 310 | AtlasAttributeDef(name='created_by',typename='string'), 311 | AtlasAttributeDef(name='sample_json',typename='string') 312 | ], 313 | superTypes = ["DataSet"] 314 | ) 315 | typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True) 316 | 317 | #-----------------------------------------------------------------------------------# 318 | except: 319 | print('types already created') 320 | 321 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of 
Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Welcome, and thank you for your interest in contributing. There are many ways to contribute: 4 | * [Submit issues](https://github.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/issues) to report bugs and make suggestions. 5 | * Review the [source code changes](https://github.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/pulls). 6 | * Contribute features and fixes by forking the repository and creating a [pull request](https://github.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/compare). 7 | 8 | ## Contributor License Agreement 9 | This project welcomes contributors and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit [https://cla.opensource.microsoft.com](https://cla.opensource.microsoft.com). 10 | 11 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status checks, comments). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. 12 | 13 | ## Microsoft Open Source Code of Conduct 14 | This project has adopted the [Microsoft Open Source Code](https://opensource.microsoft.com/codeofconduct/) of Conduct. For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 
15 | -------------------------------------------------------------------------------- /Deployment/deploy.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | 5 | "parameters": { 6 | "prefixName": { 7 | "type": "string", 8 | "defaultValue": "zzmlpr", 9 | "minLength": 3, 10 | "maxLength": 10, 11 | "metadata": { 12 | "description": "Name prefix between 3-6 characters with only characters and numbers" 13 | } 14 | }, 15 | "AllowAll": { 16 | "type": "string", 17 | "allowedValues": [ 18 | "true", 19 | "false" 20 | ], 21 | "defaultValue": "true" 22 | } 23 | }, 24 | 25 | "variables": { 26 | "subscriptionId": "[subscription().subscriptionId]", 27 | "location": "[resourceGroup().location]", 28 | "rgId": "[resourceGroup().id]", 29 | "rgName": "[resourceGroup().name]", 30 | 31 | "tenantId": "[subscription().tenantId]", 32 | "paramName": "[parameters('prefixName')]", 33 | "storageContainer": "data", 34 | 35 | "uniqueName": "[substring(uniqueString(variables('rgId')),0,4)]", 36 | 37 | "synapseWorkspaceName": "[concat('synapse-ws-',variables('paramName'))]", 38 | "storageName": "[replace(replace(toLower(concat(concat('synapsestrg',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", 39 | 40 | "machinelearningName": "[concat('ml-', variables('paramName'))]", 41 | "storageMLname": "[replace(replace(toLower(concat(concat('mlstrg',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", 42 | 43 | "appinsightsname": "[concat(variables('machinelearningName'), 'ai')]", 44 | "keyvaultname": "[replace(replace(toLower(concat('keyvault',variables('paramName'))),'-',''),'_','')]", 45 | "keyvaultname": "[replace(replace(toLower(concat(concat('keyvault',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", 46 | "purviewName": "[concat('purview-', variables('paramName'))]", 47 | 48 | "StorageBlobDataContributor": "ba92f5b4-2d11-453d-a403-e96b0029c9fe" 49 | }, 50 | 51 | "resources": [ 52 | { 53 | "name": "[concat(variables('purviewname'), 'dev')]", 54 | "type": "Microsoft.Purview/accounts", 55 | "apiVersion": "2020-12-01-preview", 56 | "location": "[variables('location')]", 57 | "identity": { 58 | "type": "SystemAssigned" 59 | }, 60 | "properties": { 61 | "networkAcls": { 62 | "defaultAction": "Allow" 63 | } 64 | }, 65 | "dependsOn": [], 66 | "sku": { 67 | "name": "Standard", 68 | "capacity": "4" 69 | }, 70 | "tags": {} 71 | }, 72 | { 73 | "type": "Microsoft.Storage/storageAccounts", 74 | "apiVersion": "2019-06-01", 75 | "name": "[variables('storageName')]", 76 | "location": "[variables('location')]", 77 | "sku": { 78 | "name": "Standard_LRS", 79 | "tier": "Standard" 80 | }, 81 | "kind": "StorageV2", 82 | "properties": { 83 | "isHnsEnabled": true, 84 | "networkAcls": { 85 | "bypass": "AzureServices", 86 | "virtualNetworkRules": [], 87 | "ipRules": [], 88 | "defaultAction": "Allow" 89 | }, 90 | "supportsHttpsTrafficOnly": true, 91 | "encryption": { 92 | "services": { 93 | "file": { 94 | "enabled": true 95 | }, 96 | "blob": { 97 | "enabled": true 98 | } 99 | }, 100 | "keySource": "Microsoft.Storage" 101 | }, 102 | "accessTier": "Hot" 103 | } 104 | }, 105 | { 106 | "type": "Microsoft.Storage/storageAccounts/blobServices", 107 | "apiVersion": "2019-06-01", 108 | "name": "[concat(variables('storageName'), '/default')]", 109 | "dependsOn": [ 110 | "[resourceId('Microsoft.Storage/storageAccounts', 
variables('storageName'))]" 111 | ], 112 | "properties": { 113 | "cors": { 114 | "corsRules": [] 115 | }, 116 | "deleteRetentionPolicy": { 117 | "enabled": false 118 | } 119 | } 120 | }, 121 | { 122 | "type": "Microsoft.Storage/storageAccounts/blobServices/containers", 123 | "apiVersion": "2019-06-01", 124 | "name": "[concat(variables('storageName'), '/default/', variables('storageContainer'))]", 125 | "dependsOn": [ 126 | "[resourceId('Microsoft.Storage/storageAccounts/blobServices', variables('storageName'), 'default')]", 127 | "[resourceId('Microsoft.Storage/storageAccounts', variables('storageName'))]" 128 | ], 129 | "properties": { 130 | "publicAccess": "None" 131 | } 132 | }, 133 | { 134 | "type": "Microsoft.Synapse/workspaces", 135 | "apiVersion": "2020-12-01", 136 | "name": "[variables('synapseWorkspaceName')]", 137 | "location": "[variables('location')]", 138 | "identity": { 139 | "type": "SystemAssigned" 140 | }, 141 | "properties": { 142 | "defaultDataLakeStorage": { 143 | "accountUrl": "[concat('https://', variables('storageName') , '.dfs.core.windows.net')]", 144 | "filesystem": "[variables('storageContainer')]" 145 | }, 146 | "virtualNetworkProfile": { 147 | "computeSubnetId": "" 148 | }, 149 | "sqlAdministratorLogin": "sqladminuser" 150 | }, 151 | "resources": [ 152 | { 153 | "condition": "[equals(parameters('AllowAll'),'true')]", 154 | "type": "firewallrules", 155 | "apiVersion": "2019-06-01-preview", 156 | "name": "allowAll", 157 | "location": "[variables('location')]", 158 | "dependsOn": [ "[variables('synapseWorkspaceName')]" ], 159 | "properties": { 160 | "startIpAddress": "0.0.0.0", 161 | "endIpAddress": "255.255.255.255" 162 | } 163 | } 164 | ] 165 | }, 166 | { 167 | "type": "Microsoft.Synapse/workspaces/bigDataPools", 168 | "apiVersion": "2020-12-01", 169 | "name": "[concat(variables('synapseWorkspaceName'), '/spark1')]", 170 | "location": "[variables('location')]", 171 | "dependsOn": [ 172 | "[resourceId('Microsoft.Synapse/workspaces', variables('synapseWorkspaceName'))]" 173 | ], 174 | "properties": { 175 | "sparkVersion": "2.4", 176 | "nodeCount": 3, 177 | "nodeSize": "Medium", 178 | "nodeSizeFamily": "MemoryOptimized", 179 | "autoScale": { 180 | "enabled": true, 181 | "minNodeCount": 3, 182 | "maxNodeCount": 6 183 | }, 184 | "autoPause": { 185 | "enabled": true, 186 | "delayInMinutes": 15 187 | }, 188 | "isComputeIsolationEnabled": false, 189 | "sessionLevelPackagesEnabled": false, 190 | "cacheSize": 0, 191 | "dynamicExecutorAllocation": { 192 | "enabled": true 193 | }, 194 | "provisioningState": "Succeeded" 195 | } 196 | }, 197 | 198 | { 199 | "type": "microsoft.insights/components", 200 | "apiVersion": "2020-02-02-preview", 201 | "name": "[variables('appinsightsName')]", 202 | "location": "[variables('location')]", 203 | "kind": "web", 204 | "properties": { 205 | "Application_Type": "web", 206 | "IngestionMode": "ApplicationInsights", 207 | "publicNetworkAccessForIngestion": "Enabled", 208 | "publicNetworkAccessForQuery": "Enabled" 209 | } 210 | }, 211 | { 212 | "type": "Microsoft.KeyVault/vaults", 213 | "apiVersion": "2020-04-01-preview", 214 | "name": "[variables('keyvaultName')]", 215 | "location": "[variables('location')]", 216 | "properties": { 217 | "sku": { 218 | "family": "A", 219 | "name": "standard" 220 | }, 221 | "tenantId": "[variables('tenantId')]", 222 | "accessPolicies": [ 223 | ], 224 | "enabledForDeployment": false, 225 | "enableSoftDelete": true, 226 | "enablePurgeProtection": true, 227 | "vaultUri": "[concat('https://', 
variables('keyvaultName'), '.vault.azure.net/')]", 228 | "provisioningState": "Succeeded" 229 | } 230 | }, 231 | { 232 | "type": "Microsoft.Storage/storageAccounts", 233 | "apiVersion": "2021-01-01", 234 | "name": "[variables('storageMLName')]", 235 | "location": "[variables('location')]", 236 | "sku": { 237 | "name": "Standard_LRS", 238 | "tier": "Standard" 239 | }, 240 | "kind": "StorageV2", 241 | "properties": { 242 | "networkAcls": { 243 | "bypass": "AzureServices", 244 | "virtualNetworkRules": [], 245 | "ipRules": [], 246 | "defaultAction": "Allow" 247 | }, 248 | "supportsHttpsTrafficOnly": true, 249 | "encryption": { 250 | "services": { 251 | "file": { 252 | "keyType": "Account", 253 | "enabled": true 254 | }, 255 | "blob": { 256 | "keyType": "Account", 257 | "enabled": true 258 | } 259 | }, 260 | "keySource": "Microsoft.Storage" 261 | }, 262 | "accessTier": "Hot" 263 | } 264 | }, 265 | { 266 | "type": "Microsoft.MachineLearningServices/workspaces", 267 | "apiVersion": "2021-01-01", 268 | "name": "[variables('machinelearningName')]", 269 | "location": "[variables('location')]", 270 | "dependsOn": [ 271 | "[resourceId('Microsoft.Storage/storageAccounts', variables('storageMLname'))]", 272 | "[resourceId('Microsoft.KeyVault/vaults', variables('keyvaultname'))]", 273 | "[resourceId('microsoft.insights/components', variables('appinsightsname'))]" 274 | ], 275 | "sku": { 276 | "name": "Basic", 277 | "tier": "Basic" 278 | }, 279 | "identity": { 280 | "type": "SystemAssigned" 281 | }, 282 | "properties": { 283 | "friendlyName": "[variables('machinelearningName')]", 284 | "storageAccount": "[resourceId('Microsoft.Storage/storageAccounts', variables('storageMLname'))]", 285 | "keyVault": "[resourceId('Microsoft.KeyVault/vaults', variables('keyvaultname'))]", 286 | "applicationInsights": "[resourceId('microsoft.insights/components', variables('appinsightsname'))]", 287 | "hbiWorkspace": false, 288 | "allowPublicAccessWhenBehindVnet": false 289 | } 290 | }, 291 | { 292 | "scope": "[concat('Microsoft.Storage/storageAccounts/', variables('storageName'))]", 293 | "type": "Microsoft.Authorization/roleAssignments", 294 | "apiVersion": "2020-04-01-preview", 295 | "name": "[guid(uniqueString(variables('storageName')))]", 296 | "location": "[variables('location')]", 297 | "dependsOn": [ 298 | "[variables('synapseWorkspaceName')]" 299 | ], 300 | "properties": { 301 | "roleDefinitionId": "[resourceId('Microsoft.Authorization/roleDefinitions', variables('StorageBlobDataContributor'))]", 302 | "principalId": "[reference(resourceId('Microsoft.Synapse/workspaces', variables('synapseWorkspaceName')), '2019-06-01-preview', 'Full').identity.principalId]", 303 | "principalType": "ServicePrincipal" 304 | } 305 | }, 306 | { 307 | "apiVersion": "2020-10-01", 308 | "name": "pid-2da55a03-dd52-561e-8690-cae328ce0200", 309 | "type": "Microsoft.Resources/deployments", 310 | "properties": { 311 | "mode": "Incremental", 312 | "template": { 313 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", 314 | "contentVersion": "1.0.0.0", 315 | "resources": [] 316 | } 317 | } 318 | } 319 | ] 320 | } -------------------------------------------------------------------------------- /Deployment/img/ADLSGen2Scanning.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/ADLSGen2Scanning.PNG 
-------------------------------------------------------------------------------- /Deployment/img/AMLPipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/AMLPipeline.PNG -------------------------------------------------------------------------------- /Deployment/img/AMLPipelineLineage.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/AMLPipelineLineage.PNG -------------------------------------------------------------------------------- /Deployment/img/Architecture.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/Architecture.PNG -------------------------------------------------------------------------------- /Deployment/img/MLLineageScreenshot.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/MLLineageScreenshot.PNG -------------------------------------------------------------------------------- /Deployment/img/ManageSparkPool.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/ManageSparkPool.png -------------------------------------------------------------------------------- /Deployment/img/PurviewMLLineageIntroduction.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/PurviewMLLineageIntroduction.PNG -------------------------------------------------------------------------------- /Deployment/img/PurviewMLLineageSolutionAccelerator.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/PurviewMLLineageSolutionAccelerator.PNG -------------------------------------------------------------------------------- /Deployment/img/PurviewScreenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/PurviewScreenshot.png -------------------------------------------------------------------------------- /Deployment/img/Requirements.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/Requirements.png -------------------------------------------------------------------------------- /Deployment/img/add-role-assignment-page.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/add-role-assignment-page.png -------------------------------------------------------------------------------- /Deployment/img/deploy-firewall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/Purview-Machine-Learning-Lineage-Solution-Accelerator/171e292aa682764aef4c9e2bfa2b90297c7724e4/Deployment/img/deploy-firewall.png -------------------------------------------------------------------------------- /Deployment/requirements.txt: -------------------------------------------------------------------------------- 1 | pyapacheatlas 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | 23 | ## Note about Libraries with MPL-2.0 and LGPL-2.1 Licenses 24 | The following libraries are not **explicitly included** in this repository, but users who use this Solution Accelerator may need to install them locally and in Azure Synapse and Azure Machine Learning to fully utilize this Solution Accelerator. However, the actual binaries and files associated with the libraries **are not included** as part of this repository, but they are available for installation via the PyPI library using the pip installation tool. 25 | 26 | Libraries: chardet, certifi 27 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | NOTICES AND INFORMATION 2 | Do Not Translate or Localize 3 | 4 | This software incorporates material from third parties. 
5 | Microsoft makes certain open source code available at https://3rdpartysource.microsoft.com, 6 | or you may send a check or money order for US $5.00, including the product name, 7 | the open source component name, platform, and version number, to: 8 | 9 | Source Code Compliance Team 10 | Microsoft Corporation 11 | One Microsoft Way 12 | Redmond, WA 98052 13 | USA 14 | 15 | Notwithstanding any other terms, you may reverse engineer this software to the extent 16 | required to debug changes to any libraries licensed under the GNU Lesser General Public License. 17 | 18 | --------------------------------------------------------- 19 | -------------------------------------------------------------------------------- /PRIVACY.md: -------------------------------------------------------------------------------- 1 | # Privacy 2 | 3 | When you deploy this template, Microsoft is able to identify the installation of the software with the Azure resources that are deployed. Microsoft is able to correlate the Azure resources that are used to support the software. Microsoft collects this information to provide the best experiences with their products and to operate their business. The data is collected and governed by Microsoft's privacy policies, which can be found at [Microsoft Privacy Statement](https://go.microsoft.com/fwlink/?LinkID=824704). 4 | 5 | To disable this, simply remove the following section from [deploy.json](./Deployment/deploy.json) before deploying the resources to Azure: 6 | 7 | ```json 8 | { 9 | "apiVersion": "2018-02-01", 10 | "name": "pid-2da55a03-dd52-561e-8690-cae328ce0200", 11 | "type": "Microsoft.Resources/deployments", 12 | "properties": { 13 | "mode": "Incremental", 14 | "template": { 15 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 16 | "contentVersion": "1.0.0.0", 17 | "resources": [] 18 | } 19 | } 20 | } 21 | ``` 22 | 23 | You can see more information on this at https://docs.microsoft.com/en-us/azure/marketplace/azure-partner-customer-usage-attribution. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | page_type: sample 3 | languages: 4 | - python 5 | - bash 6 | products: 7 | - microsoft-purview 8 | - azure-synapse-analytics 9 | - azure-machine-learning 10 | --- 11 | ![Purview Machine Learning Lineage Solution Accelerator](./Deployment/img/PurviewMLLineageSolutionAccelerator.PNG) 12 | 13 | # Purview Machine Learning Lineage Solution Accelerator 14 | 15 | Microsoft Purview is a unified data governance service that helps you manage and govern data across different sources. 16 | 17 | Machine Learning project life cycle involves many steps to transform raw data into insights. This process usually requires individuals with different roles/skillsets across multiple teams to collaborate effectively. Microsoft Purview helps simplify this complex process by providing an end-to-end lineage of ML entities and processes to enable better collaboration, auditing and debugging capabilities. 18 | 19 | This solution accelerator helps developers with the resources needed to build an end-to-end lineage in Purview for Machine Learning scenarios. 
20 | 21 | ## Sample Credit Risk Prediction ML Process Flow 22 | ![Purview Machine Learning Lineage Introduction](./Deployment/img/PurviewMLLineageIntroduction.PNG) 23 | 24 | ## Purview ML Process Lineage 25 | ![ML Lineage](./Deployment/img/MLLineageScreenshot.PNG) 26 | 27 | ## Prerequisites 28 | To use this solution accelerator, you will need access to an [Azure subscription](https://azure.microsoft.com/free/). While not required, a prior understanding of Microsoft Purview, Azure Synapse Analytics and Machine Learning will be helpful. 29 | 30 | For additional training and support, please see: 31 | 1. [Microsoft Purview](https://azure.microsoft.com/en-us/services/purview/) 32 | 2. [Azure Synapse Analytics](https://azure.microsoft.com/en-us/services/synapse-analytics/) 33 | 3. [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning/) 34 | 35 | ## Getting Started 36 | Start by deploying the required resources to Azure. The button below will deploy Microsoft Purview, Azure Synapse Analytics, Azure Machine Learning and its related resources: 37 | 38 | [![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fmicrosoft%2FPurview-Machine-Learning-Lineage-Solution-Accelerator%2Fmain%2FDeployment%2Fdeploy.json) 39 | 40 | If you prefer to setup manually, you need to deploy Microsoft Purview, Azure Synapse Analytics, Azure Machine Learning. 41 | 42 | Note: To minimize Azure costs, consider deleting the Purview instance at the end of this exercise if you do not plan to use this instance actively. 43 | 44 | ### Step 1. Download Files 45 | Clone or download this repository and navigate to the project's root directory. 46 | 47 | ### Step 2. Purview Security Access 48 | 49 | #### Step 2.1 Create a Service Principal for Purview Rest API access 50 | [Create a service principal](https://docs.microsoft.com/en-us/azure/purview/tutorial-using-rest-apis#create-a-service-principal-application) 51 | 52 | #### Step 2.2 Configure your Purview catalog to trust the service principal 53 | [Configure your Purview catalog to trust the service principal](https://docs.microsoft.com/en-us/azure/purview/tutorial-using-rest-apis#configure-your-catalog-to-trust-the-service-principal-application) 54 | 55 | ### Step 3. Azure Machine Learning Security Access 56 | 57 | #### Step 3.1 Create a Service Principal for AML access 58 | [Create a service principal](https://docs.microsoft.com/en-us/azure/purview/tutorial-using-rest-apis#create-a-service-principal-application) 59 | 60 | #### Step 3.2 Configure your Azure Machine Learning to trust the service principal 61 | 1. From the [Azure portal](https://portal.azure.com/), select your AML workspace 62 | 2. select Access Control (IAM) 63 | 3. Select Add, Add Role Assignment to open the Add role assignment page 64 | 65 | 3.1 For the `Role` type in `Contributor` 66 | 67 | 3.2 For `Assign access to` leave the default, `User, group, or service principal` 68 | 69 | 3.2 For `Select` enter the name of the previosly created service principal in step 3.1 and then click on their name in the results pane 70 | 71 | 3.3 Click on Save 72 | You've now configured the service principal as a contributor on Azure Machine Learning resource. 73 | 74 | ### Step 4. Synapse Security Access 75 | 76 | #### Step 4.1 Add your IP address to Synapse firewall 77 | Before you can upload assests to the Synapse Workspace you will need to add your IP address: 78 | 1. 
Go to the Synapse resouce you created in the previous step 79 | 2. Navigate to `Firewalls` under `Security` on the left hand side of the page 80 | 3. At the top of the screen click `+ Add client IP` 81 | ![Update Firewalls](./Deployment/img/deploy-firewall.png) 82 | 4. Your IP address should now be visable in the IP list 83 | 84 | #### Step 4.2: Update storage account permisions 85 | In order to perform the necessary actions in Synapse workspace, you will need to grant more access. 86 | 1. Go to the Azure Data Lake Storage Account created above 87 | 2. Go to the `Access Control (IAM) > + Add > Add role assignment` 88 | 3. Now click the Role dropdown and select `Storage Blob Data Contributor` 89 | - Search for your username and add 90 | 4. Click `Save` at the bottom 91 | 92 | [Learn more](https://docs.microsoft.com/azure/synapse-analytics/security/how-to-set-up-access-control) 93 | 94 | ### Step 5. Upload CreditRisk Sample Dataset 95 | 1. Launch the Synapse workspace [Synapse Workspace](https://ms.web.azuresynapse.net/) 96 | 2. Select the `subscription` and `workspace` name you are using for this solution accelerator 97 | 3. In Synapse Studio, navigate to the `Data` Hub 98 | 4. Select `Linked` 99 | 5. Under the category `Azure Data Lake Storage Gen2` you'll see an item with a name like `xxxxx(xxxxx- Primary)` 100 | 6. Select the container named `data (Primary)` 101 | 7. Create a new folder `creditriskdata` 102 | 8. Select `Upload` and select `loan.csv` and `borrower.csv` files downloaded from [Data](./Data/) folder 103 | 104 | ### Step 6. Register and scan uploaded data in Purview 105 | 106 | 1. [Setting up authentication for a scan](https://docs.microsoft.com/en-us/azure/purview/register-scan-adls-gen2#managed-identity-recommended) 107 | 108 | 2. [Register and scan adls gen2](https://docs.microsoft.com/en-us/azure/purview/register-scan-adls-gen2#register-azure-data-lake-storage-gen2-data-source) 109 | 110 | select only the `creditriskdata` folder while creating the scan. 111 | 112 | ![ADLSGen2 Scanning folder selection](./Deployment/img/ADLSGen2Scanning.PNG) 113 | 114 | Wait for scan run status to change to `Completed` before running next step. 115 | 116 | ### Step 7. Upload Assets and Run Noteboks 117 | 1. Launch the Synapse workspace [Synapse Workspace](https://ms.web.azuresynapse.net/) 118 | 2. Select the `subscription` and `workspace` name you are using for this solution accelerator 119 | 3. Go to the `Manage` tab in the Synapse workspace and click on the `Apache Spark pools` 120 | 121 | - ![Spark Pool](./Deployment/img/ManageSparkPool.png) 122 | 4. Click `...` on the deployed Spark Pool and select `Packages` 123 | 5. Click `Upload` and select [requirements.txt](/Deployment/requirements.txt) from the cloned repo and click `Apply` 124 | 125 | - ![Requirements File](./Deployment/img/Requirements.png) 126 | 127 | 6. Go to `Develop`, click the `+`, and click `Import` to select all notebooks from the repository's `/SynapseNotebooks/` folder 128 | 7. For each of the notebooks, select `Attach to > spark1` in the top dropdown 129 | 8. Update Purview Tenant, Client Id and Secret from step `2.1` in `01_Authenticate_to_Purview_AML.ipynb` 130 | 9. Update Azure Machine Learning Tenant, Client Id and Secret from step `3.1` in `01_Authenticate_to_Purview_AML.ipynb` 131 | 10. Update `account_name` variable to your ADLS in `04_Create_CreditRisk_Experiment.ipynb` 132 | 11. Click `Publish all` to publish the notebook changes 133 | 12. 
Run the following notebook: 134 | - `04_Create_CreditRisk_Experiment.ipynb` (This notebook runs other notebooks you imported) 135 | 136 | ### Step 8. Check Machine Learning Lineage in Purview Studio 137 | 1. Launch [Purview Studio](https://ms.web.purview.azure.com/) 138 | 2. Click on `Browse Assets` 139 | 3. Click on `Custom Model` and select the model we created from running notebooks in `Step 7` 140 | 4. Click on `Lineage` to see Machine Learning process Lineage 141 | ![ML Lineage](./Deployment/img/PurviewScreenshot.png) 142 | 143 | ### Step 9. Upload Assets and Run Azure Machine Learning Noteboks (Optional) 144 | 1. Launch the Azure Machine Learning studio [AML Studio](https://ml.azure.com/) 145 | 2. Select the `subscription` and `workspace` name you are using for this solution accelerator 146 | 3. Go to the `Notebooks` tab in the AML Studio and upload the notebooks and scripts in `AML Notebooks` folder including `Data` folder 147 | 4. Go to the `Compute` tab in the AML Studio and click on the `Compute Instances` 148 | 5. Click `New` and create a new compute instance 149 | 6. Click `Jupyter` and launch the compute instance 150 | 7. In the browser window that opens, click the folders to see the notebooks you uploaded in step `9.3` 151 | 7. Update Purview Tenant, Client Id and Secret from step `2.1` in `Authenticate_to_Purview_AML.py` 152 | 8. Update Azure Machine Learning Tenant, Client Id and Secret from step `3.1` in `Authenticate_to_Purview_AML.py` 153 | 9. Run the following notebooks in order: 154 | - `01_Create_CreditRisk_AML_Pipeline.ipynb` ( Pipeline run might take few minutes so please wait for completion before running the next notebook) 155 | - `02_Create_CreditRisk_AML_Pipeline_Lineage.ipynb` 156 | 157 | ![ML Pipeline](./Deployment/img/AMLPipeline.PNG) 158 | 159 | ### Step 10. Check Machine Learning pipeline Lineage in Purview Studio (Optional) 160 | 1. Launch [Purview Studio](https://ms.web.purview.azure.com/) 161 | 2. Click on `Browse Assets` 162 | 3. Click on `Custom ML Experiment Step` and select any step we created from running notebooks in `Step 9` 163 | 4. Click on `Lineage` to see Machine Learning pipeline Lineage 164 | 165 | ![ML Pipeline Lineage](./Deployment/img/AMLPipelineLineage.PNG) 166 | 167 | ## Architecture 168 | The architecture diagram below details what you will be building for this Solution Accelerator. 169 | ![Architecture](./Deployment/img/Architecture.PNG) 170 | 171 | 172 | ## License 173 | MIT License 174 | 175 | Copyright (c) Microsoft Corporation. 176 | 177 | Permission is hereby granted, free of charge, to any person obtaining a copy 178 | of this software and associated documentation files (the "Software"), to deal 179 | in the Software without restriction, including without limitation the rights 180 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 181 | copies of the Software, and to permit persons to whom the Software is 182 | furnished to do so, subject to the following conditions: 183 | 184 | The above copyright notice and this permission notice shall be included in all 185 | copies or substantial portions of the Software. 186 | 187 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 188 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 189 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 190 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 191 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 192 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 193 | SOFTWARE 194 | 195 | ## Note about Libraries with MPL-2.0 and LGPL-2.1 Licenses 196 | The following libraries are not **explicitly included** in this repository, but users who use this Solution Accelerator may need to install them locally and in Azure Synapse and Azure Machine Learning to fully utilize this Solution Accelerator. However, the actual binaries and files associated with the libraries **are not included** as part of this repository, but they are available for installation via the PyPI library using the pip installation tool. 197 | 198 | Libraries: chardet, certifi 199 | 200 | ## Contributing 201 | This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 202 | 203 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. 204 | 205 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 206 | 207 | ## Trademarks 208 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies. 209 | 210 | ## Data Collection 211 | The software may collect information about you and your use of the software and send it to Microsoft. Microsoft may use this information to provide services and improve our products and services. You may turn off the telemetry as described in the repository. There are also some features in the software that may enable you and Microsoft to collect data from users of your applications. If you use these features, you must comply with applicable law, including providing appropriate notices to users of your applications together with a copy of Microsoft's privacy statement. Our privacy statement is located at https://go.microsoft.com/fwlink/?LinkID=824704. You can learn more about data collection and use in the help documentation and our privacy statement. Your use of the software operates as your consent to these practices. 
212 | 213 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new Issue. 
8 | 9 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 10 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 11 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 12 | 13 | ## Microsoft Support Policy 14 | 15 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 16 | -------------------------------------------------------------------------------- /SynapseNotebooks/01_Authenticate_to_Purview_AML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyapacheatlas.auth import ServicePrincipalAuthentication\n", 10 | "from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess \n", 11 | "from pyapacheatlas.readers import ExcelConfiguration, ExcelReader\n", 12 | "from pyapacheatlas.core.util import GuidTracker\n", 13 | "from pyapacheatlas.core import AtlasAttributeDef, AtlasEntity, PurviewClient\n", 14 | "from pyapacheatlas.core.typedef import EntityTypeDef" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# get SPN details you created in step 2.1 of solution accelerator setup\n", 24 | "tenant_id = \"\"\n", 25 | "client_id = \"\"\n", 26 | "client_secret = \"\"\n", 27 | "\n", 28 | "# get Purview account name from azure portal\n", 29 | "purview_name = \"\"\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# get AML workspace details from azure portal\n", 39 | "subscription_id = \"\" \n", 40 | "resource_group = \"\"\n", 41 | "workspace_name = \"\"\n", 42 | "workspace_region = \"\"\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "from pyapacheatlas.auth import ServicePrincipalAuthentication\n", 52 | "from pyapacheatlas.core import PurviewClient\n", 53 | "from pyapacheatlas.core.util import GuidTracker\n", 54 | "\n", 55 | "# Authenticate to your Atlas server using a Service Principal\n", 56 | "oauth = ServicePrincipalAuthentication(\n", 57 | " tenant_id= tenant_id,\n", 58 | " client_id= client_id,\n", 59 | " client_secret= client_secret\n", 60 | ")\n", 61 | "client = PurviewClient(\n", 62 | " account_name = purview_name,\n", 63 | " authentication=oauth\n", 64 | ")\n", 65 | "guid = GuidTracker()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# get SPN details you created in step 3.1 of solution accelerator setup\n", 75 | "aml_client_id = \"\"\n", 76 | "aml_client_secret = \"\"" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "from azureml.core.authentication import ServicePrincipalAuthentication\n", 86 | "\n", 87 | "sp = ServicePrincipalAuthentication(tenant_id=tenant_id, \n", 88 | " service_principal_id=aml_client_id, \n", 89 | " service_principal_password=aml_client_secret)\n", 90 | "\n", 91 | "from azureml.core import Workspace\n", 92 | "\n", 93 | "ws = Workspace.get(name=workspace_name,\n", 94 | " resource_group = resource_group,\n", 95 | " auth=sp,\n", 96 | " subscription_id=subscription_id)" 97 | ] 98 | }, 
99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# We recommend you add your service principal secrets in KeyVault instead of hardcoded values above\n", 113 | "# Create a linked service for key vault in Synapse Studio \n", 114 | "# See below code snippet how to access secrets from KeyVault " 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# linked_service = \"AzureKeyVault1\" # Azure Key Vault Linked Service name \n", 124 | "# akv_name = \"\" # Azure Key Vault name\n", 125 | "# secret_name = \"\" # Azure Key Vault Secret name\n", 126 | "\n", 127 | "# # Fetch the key from Azure Key Vault\n", 128 | "# aml_spn = mssparkutils.credentials.getSecret(\n", 129 | "# linkedService=linked_service,\n", 130 | "# akvName=akv_name, \n", 131 | "# secret=secret_name)\n", 132 | "\n", 133 | "# linked_service = \"AzureKeyVault1\" # Azure Key Vault Linked Service name \n", 134 | "# akv_name = \"\" # Azure Key Vault name\n", 135 | "# secret_name = \"\" # Azure Key Vault Secret name\n", 136 | "\n", 137 | "# # Fetch the key from Azure Key Vault\n", 138 | "# purview_spn = mssparkutils.credentials.getSecret(\n", 139 | "# linkedService=linked_service,\n", 140 | "# akvName=akv_name, \n", 141 | "# # secret=secret_name)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# # get SPN details you created in step 2.1 of solution accelerator installation\n", 151 | "# tenant_id = \"\"\n", 152 | "# purview_client_id = \"\"\n", 153 | "# purview_name = \"\"\n", 154 | "# purview_client_secret = purview_spn" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "# import json\n", 164 | "# import os\n", 165 | "# import sys\n", 166 | "\n", 167 | "# from pyapacheatlas.auth import ServicePrincipalAuthentication\n", 168 | "# from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess \n", 169 | "# from pyapacheatlas.readers import ExcelConfiguration, ExcelReader\n", 170 | "# from pyapacheatlas.core.util import GuidTracker\n", 171 | "# from pyapacheatlas.core import AtlasAttributeDef, AtlasEntity, PurviewClient\n", 172 | "# from pyapacheatlas.core.typedef import EntityTypeDef\n", 173 | "\n", 174 | "# # Authenticate to your Atlas server using a Service Principal\n", 175 | "# oauth = ServicePrincipalAuthentication(\n", 176 | "# tenant_id= tenant_id,\n", 177 | "# client_id= purview_client_id,\n", 178 | "# client_secret= purview_client_secret\n", 179 | "# )\n", 180 | "# client = PurviewClient(\n", 181 | "# account_name = purview_name,\n", 182 | "# authentication=oauth\n", 183 | "# )\n", 184 | "# guid = GuidTracker()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "# # get AML workspace details from azure portal\n", 194 | "# subscription_id = \"\" \n", 195 | "# resource_group = \"\"\n", 196 | "# workspace_name = \"\"" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "# # get SPN details you 
created in step 2.1 of solution accelerator installation\n", 206 | "# tenant_id = \"\"\n", 207 | "# aml_client_id = \"\"\n", 208 | "# aml_client_secret = aml_spn" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "# # Authentiate to AML\n", 218 | "\n", 219 | "# from azureml.core.authentication import ServicePrincipalAuthentication\n", 220 | "\n", 221 | "# sp = ServicePrincipalAuthentication(tenant_id=tenant_id, \n", 222 | "# service_principal_id=aml_client_id, \n", 223 | "# service_principal_password=aml_client_secret) \n", 224 | "\n", 225 | "# from azureml.core import Workspace\n", 226 | "\n", 227 | "# ws = Workspace.get(name=workspace_name,\n", 228 | "# resource_group = resource_group,\n", 229 | "# auth=sp,\n", 230 | "# subscription_id=subscription_id)" 231 | ] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 3", 237 | "language": "python", 238 | "name": "python3" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 3 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython3", 250 | "version": "3.7.4" 251 | }, 252 | "save_output": true, 253 | "synapse_widget": { 254 | "state": {}, 255 | "version": "0.1" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 2 260 | } 261 | -------------------------------------------------------------------------------- /SynapseNotebooks/02_Create_ML_Lineage_Types.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true, 8 | "jupyter": { 9 | "outputs_hidden": false, 10 | "source_hidden": false 11 | }, 12 | "nteract": { 13 | "transient": { 14 | "deleting": false 15 | } 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "from pyapacheatlas.core.typedef import AtlasAttributeDef, EntityTypeDef, RelationshipTypeDef\n", 21 | "\n", 22 | "try:\n", 23 | " #-----------------------------------------------------------------------------------# \n", 24 | " #create custom dataset type\n", 25 | " type_df = EntityTypeDef(\n", 26 | " name=\"custom_dataset\",\n", 27 | " attributeDefs=[\n", 28 | " AtlasAttributeDef(name=\"format\")\n", 29 | " ],\n", 30 | " superTypes = [\"DataSet\"]\n", 31 | " )\n", 32 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 33 | "\n", 34 | " #-----------------------------------------------------------------------------------# \n", 35 | " #create process with column mapping type\n", 36 | " type_df = EntityTypeDef(\n", 37 | " name=\"ProcessWithColumnMapping\",\n", 38 | " attributeDefs=[\n", 39 | " AtlasAttributeDef(name=\"columnMapping\")\n", 40 | " ],\n", 41 | " superTypes = [\"Process\"]\n", 42 | " )\n", 43 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 44 | "\n", 45 | " #-----------------------------------------------------------------------------------# \n", 46 | " #create AML workspace type\n", 47 | " type_df = EntityTypeDef(\n", 48 | " name=\"custom_ml_workspace\",\n", 49 | " attributeDefs=[\n", 50 | " AtlasAttributeDef(name='name',typename='string'),\n", 51 | " AtlasAttributeDef(name='description',typename='string'),\n", 52 | " AtlasAttributeDef(name='subscription_id',typename='string'),\n", 53 | " 
AtlasAttributeDef(name='resource_group',typename='string')\n", 54 | " ],\n", 55 | " superTypes = [\"DataSet\"]\n", 56 | " )\n", 57 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 58 | " #-----------------------------------------------------------------------------------# \n", 59 | " #create types for datastore and dataset\n", 60 | "\n", 61 | " #create AML datastore type\n", 62 | " datastore_type_df = EntityTypeDef(\n", 63 | " name=\"custom_ml_datastore\",\n", 64 | " attributeDefs=[\n", 65 | " AtlasAttributeDef(name=\"name\",typename='string'),\n", 66 | " AtlasAttributeDef(name='container_name',typename='string'),\n", 67 | " AtlasAttributeDef(name='account_name',typename='string'),\n", 68 | " AtlasAttributeDef(name='protocol',typename='string'),\n", 69 | " AtlasAttributeDef(name='endpoint',typename='string'),\n", 70 | " AtlasAttributeDef(name='server_name',typename='string'),\n", 71 | " AtlasAttributeDef(name='database_name',typename='string'),\n", 72 | " AtlasAttributeDef(name=\"createdby\",typename='string')\n", 73 | " ],\n", 74 | " superTypes = [\"DataSet\"],\n", 75 | " options = {\"schemaElementAttribute\":\"dataset\"}\n", 76 | " )\n", 77 | "\n", 78 | " #create AML dataset type\n", 79 | " dataset_type_df = EntityTypeDef(\n", 80 | " name=\"custom_ml_dataset\",\n", 81 | " attributeDefs=[\n", 82 | " AtlasAttributeDef(name=\"name\",typename='string'),\n", 83 | " AtlasAttributeDef(name=\"description\",typename='string'),\n", 84 | " AtlasAttributeDef(name=\"createdby\",typename='string'),\n", 85 | " AtlasAttributeDef(name=\"createdtime\",typename='string')\n", 86 | " ],\n", 87 | " superTypes = [\"DataSet\"]\n", 88 | " )\n", 89 | "\n", 90 | " # create relationsip between datastore and dataset\n", 91 | " dataset_to_datastore_relationship = RelationshipTypeDef(\n", 92 | " name=\"custom_ml_datastore_to_dataset\",\n", 93 | " relationshipCategory=\"COMPOSITION\",\n", 94 | " endDef1={\n", 95 | " \"type\": \"custom_ml_datastore\",\n", 96 | " \"name\": \"dataset\",\n", 97 | " \"isContainer\": True,\n", 98 | " \"cardinality\": \"SET\",\n", 99 | " \"isLegacyAttribute\": False\n", 100 | " },\n", 101 | " endDef2={\n", 102 | " \"type\": \"custom_ml_dataset\",\n", 103 | " \"name\": \"datastore\",\n", 104 | " \"isContainer\": False,\n", 105 | " \"cardinality\": \"SINGLE\",\n", 106 | " \"isLegacyAttribute\": False\n", 107 | " }\n", 108 | " )\n", 109 | "\n", 110 | " typedef_results = client.upload_typedefs(\n", 111 | " entityDefs = [datastore_type_df, dataset_type_df],\n", 112 | " relationshipDefs = [dataset_to_datastore_relationship],\n", 113 | " force_update=True\n", 114 | " )\n", 115 | " #-----------------------------------------------------------------------------------# \n", 116 | " #create types for experiment and experimentstep\n", 117 | " \n", 118 | " #create process for Ml Experiment Step\n", 119 | " exp_type_df = EntityTypeDef(\n", 120 | " name=\"custom_ml_experiment\",\n", 121 | " attributeDefs=[\n", 122 | " AtlasAttributeDef(name='name',typename='string'),\n", 123 | " AtlasAttributeDef(name='notes',typename='string'),\n", 124 | " AtlasAttributeDef(name=\"createdby\",typename='string'),\n", 125 | " AtlasAttributeDef(name=\"createdtime\",typename='string')\n", 126 | " ],\n", 127 | " superTypes = [\"Process\"]\n", 128 | " )\n", 129 | "\n", 130 | " #create process for Ml Experiment Step\n", 131 | " exp_step_type_df = EntityTypeDef(\n", 132 | " name=\"custom_ml_experiment_step\",\n", 133 | " attributeDefs=[\n", 134 | " 
AtlasAttributeDef(name='notes',typename='string')\n", 135 | " ],\n", 136 | " superTypes = [\"Process\"]\n", 137 | " )\n", 138 | "\n", 139 | " # create relationsip between experiment and experimentstep\n", 140 | " step_to_exp_relationship = RelationshipTypeDef(\n", 141 | " name=\"custom_ml_experiment_to_experimentstep\",\n", 142 | " relationshipCategory=\"COMPOSITION\",\n", 143 | " endDef1={\n", 144 | " \"type\": \"custom_ml_experiment\",\n", 145 | " \"name\": \"experimentstep\",\n", 146 | " \"isContainer\": True,\n", 147 | " \"cardinality\": \"SET\",\n", 148 | " \"isLegacyAttribute\": False\n", 149 | " },\n", 150 | " endDef2={\n", 151 | " \"type\": \"custom_ml_experiment_step\",\n", 152 | " \"name\": \"experiment\",\n", 153 | " \"isContainer\": False,\n", 154 | " \"cardinality\": \"SINGLE\",\n", 155 | " \"isLegacyAttribute\": False\n", 156 | " }\n", 157 | " )\n", 158 | "\n", 159 | " typedef_results = client.upload_typedefs(\n", 160 | " entityDefs = [exp_type_df, exp_step_type_df],\n", 161 | " relationshipDefs = [step_to_exp_relationship],\n", 162 | " force_update=True\n", 163 | " )\n", 164 | " #-----------------------------------------------------------------------------------# \n", 165 | " \n", 166 | " rd = RelationshipTypeDef(\n", 167 | " name=\"custom_ml_workspace_datastore\",\n", 168 | " attributeDefs=[],\n", 169 | " relationshipCategory=\"COMPOSITION\", # Means the child can't exist without the parent\n", 170 | " endDef1={ # endDef1 decribes what the parent will have as an attribute\n", 171 | " \"type\":\"custom_ml_workspace\", # Type of the parent\n", 172 | " \"name\":\"datastores\", # What the parent will have\n", 173 | " \"isContainer\": True,\n", 174 | " \"cardinality\":\"SET\", # This is related to the cardinality, in this case the parent Server will have a SET of Models.\n", 175 | " \"isLegacyAttribute\":False\n", 176 | " },\n", 177 | " endDef2={ # endDef2 decribes what the child will have as an attribute\n", 178 | " \"type\":\"custom_ml_datastore\", # Type of the child\n", 179 | " \"name\":\"workspace\", # What the child will have\n", 180 | " \"isContainer\":False,\n", 181 | " \"cardinality\":\"SINGLE\",\n", 182 | " \"isLegacyAttribute\":False\n", 183 | " }\n", 184 | " )\n", 185 | " client.upload_typedefs(relationshipDefs=[rd])\n", 186 | " \n", 187 | " #-----------------------------------------------------------------------------------# \n", 188 | " rd = RelationshipTypeDef(\n", 189 | " name=\"custom_ml_workspace_experiment\",\n", 190 | " attributeDefs=[],\n", 191 | " relationshipCategory=\"COMPOSITION\", # Means the child can't exist without the parent\n", 192 | " endDef1={ # endDef1 decribes what the parent will have as an attribute\n", 193 | " \"type\":\"custom_ml_workspace\", # Type of the parent\n", 194 | " \"name\":\"experiments\", # What the parent will have\n", 195 | " \"isContainer\": True,\n", 196 | " \"cardinality\":\"SET\", # This is related to the cardinality, in this case the parent Server will have a SET of Models.\n", 197 | " \"isLegacyAttribute\":False\n", 198 | " },\n", 199 | " endDef2={ # endDef2 decribes what the child will have as an attribute\n", 200 | " \"type\":\"custom_ml_experiment\", # Type of the child\n", 201 | " \"name\":\"workspace\", # What the child will have\n", 202 | " \"isContainer\":False,\n", 203 | " \"cardinality\":\"SINGLE\",\n", 204 | " \"isLegacyAttribute\":False\n", 205 | " }\n", 206 | " )\n", 207 | " client.upload_typedefs(relationshipDefs=[rd])\n", 208 | "\n", 209 | " 
#-----------------------------------------------------------------------------------# \n", 210 | " #create types for packages and package\n", 211 | "\n", 212 | " #create packages type\n", 213 | " packages_type_df = EntityTypeDef(\n", 214 | " name=\"custom_ml_packages\",\n", 215 | " attributeDefs=[\n", 216 | " AtlasAttributeDef(name='notes',typename='string')\n", 217 | " ],\n", 218 | " superTypes = [\"DataSet\"],\n", 219 | " options = {\"schemaElementAttribute\":\"package\"}\n", 220 | " )\n", 221 | "\n", 222 | " package_type_df = EntityTypeDef(\n", 223 | " name=\"custom_ml_package\",\n", 224 | " attributeDefs=[\n", 225 | " AtlasAttributeDef(name='programming_language',typename='string'),\n", 226 | " AtlasAttributeDef(name='package_name',typename='string'),\n", 227 | " AtlasAttributeDef(name='version',typename='string'),\n", 228 | " AtlasAttributeDef(name='notes',typename='string')\n", 229 | " ],\n", 230 | " superTypes = [\"DataSet\"]\n", 231 | " )\n", 232 | "\n", 233 | " # create relationsip between packages and package\n", 234 | " package_to_packages_relationship = RelationshipTypeDef(\n", 235 | " name=\"custom_ml_packages_to_package\",\n", 236 | " relationshipCategory=\"COMPOSITION\",\n", 237 | " endDef1={\n", 238 | " \"type\": \"custom_ml_packages\",\n", 239 | " \"name\": \"package\",\n", 240 | " \"isContainer\": True,\n", 241 | " \"cardinality\": \"SET\",\n", 242 | " \"isLegacyAttribute\": False\n", 243 | " },\n", 244 | " endDef2={\n", 245 | " \"type\": \"custom_ml_package\",\n", 246 | " \"name\": \"packages\",\n", 247 | " \"isContainer\": False,\n", 248 | " \"cardinality\": \"SINGLE\",\n", 249 | " \"isLegacyAttribute\": False\n", 250 | " }\n", 251 | " )\n", 252 | "\n", 253 | " typedef_results = client.upload_typedefs(\n", 254 | " entityDefs = [packages_type_df, package_type_df],\n", 255 | " relationshipDefs = [package_to_packages_relationship],\n", 256 | " force_update=True\n", 257 | " )\n", 258 | " #-----------------------------------------------------------------------------------# \n", 259 | " \n", 260 | " #create experiemnt config type\n", 261 | " type_df = EntityTypeDef(\n", 262 | " name=\"custom_ml_exp_config\",\n", 263 | " attributeDefs=[\n", 264 | " AtlasAttributeDef(name='task_type',typename='string'),\n", 265 | " AtlasAttributeDef(name='enable_early_stopping',typename='bool'),\n", 266 | " AtlasAttributeDef(name='experiment_timeout_minutes',typename='int'),\n", 267 | " AtlasAttributeDef(name='primary_metric',typename='string'),\n", 268 | " AtlasAttributeDef(name='compute_target',typename='string'),\n", 269 | " AtlasAttributeDef(name='label_column_name',typename='string'),\n", 270 | " AtlasAttributeDef(name='n_cross_validations',typename='int'),\n", 271 | " AtlasAttributeDef(name='model_explainability',typename='bool')\n", 272 | " ],\n", 273 | " superTypes = [\"DataSet\"]\n", 274 | " )\n", 275 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 276 | " \n", 277 | " #-----------------------------------------------------------------------------------# \n", 278 | "\n", 279 | " #create model metrics type\n", 280 | " type_df = EntityTypeDef(\n", 281 | " name=\"custom_ml_model_metrics\",\n", 282 | " attributeDefs=[\n", 283 | " AtlasAttributeDef(name='AUC',typename='float'),\n", 284 | " AtlasAttributeDef(name='Accuracy',typename='float'),\n", 285 | " AtlasAttributeDef(name='Precision',typename='float'),\n", 286 | " AtlasAttributeDef(name='Recall',typename='float'),\n", 287 | " AtlasAttributeDef(name='F1',typename='float')\n", 288 | " ],\n", 289 
| " superTypes = [\"DataSet\"]\n", 290 | " )\n", 291 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 292 | " \n", 293 | " #-----------------------------------------------------------------------------------# \n", 294 | "\n", 295 | " #create model type\n", 296 | " type_df = EntityTypeDef(\n", 297 | " name=\"custom_ml_model\",\n", 298 | " attributeDefs=[\n", 299 | " AtlasAttributeDef(name='workspace_name',typename='string'),\n", 300 | " AtlasAttributeDef(name='workspace_subscription_id',typename='string'),\n", 301 | " AtlasAttributeDef(name='workspace_resource_group',typename='string'),\n", 302 | " AtlasAttributeDef(name='name',typename='string'),\n", 303 | " AtlasAttributeDef(name='id',typename='string'),\n", 304 | " AtlasAttributeDef(name='version',typename='string'),\n", 305 | " AtlasAttributeDef(name='tags',typename='string'),\n", 306 | " AtlasAttributeDef(name='properties',typename='string')\n", 307 | " ],\n", 308 | " superTypes = [\"DataSet\"]\n", 309 | " )\n", 310 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 311 | " \n", 312 | " #-----------------------------------------------------------------------------------# \n", 313 | "\n", 314 | " #create endpoint type\n", 315 | " type_df = EntityTypeDef(\n", 316 | " name=\"custom_ml_model_endpoint\",\n", 317 | " attributeDefs=[\n", 318 | " AtlasAttributeDef(name='workspace_name',typename='string'),\n", 319 | " AtlasAttributeDef(name='workspace_subscription_id',typename='string'),\n", 320 | " AtlasAttributeDef(name='workspace_resource_group',typename='string'),\n", 321 | " AtlasAttributeDef(name='name',typename='string'),\n", 322 | " AtlasAttributeDef(name='image_id',typename='string'),\n", 323 | " AtlasAttributeDef(name='compute_type',typename='string'),\n", 324 | " AtlasAttributeDef(name='state',typename='string'),\n", 325 | " AtlasAttributeDef(name='scoring_uri',typename='string'),\n", 326 | " AtlasAttributeDef(name='tags',typename='string'),\n", 327 | " AtlasAttributeDef(name='state',typename='string'),\n", 328 | " AtlasAttributeDef(name='properties',typename='string'),\n", 329 | " AtlasAttributeDef(name='created_by',typename='string'),\n", 330 | " AtlasAttributeDef(name='sample_json',typename='string')\n", 331 | " ],\n", 332 | " superTypes = [\"DataSet\"]\n", 333 | " )\n", 334 | " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", 335 | " \n", 336 | " #-----------------------------------------------------------------------------------# \n", 337 | "except:\n", 338 | " print('types already created') " 339 | ] 340 | } 341 | ], 342 | "metadata": { 343 | "kernelspec": { 344 | "display_name": "Python 3", 345 | "language": "python", 346 | "name": "python3" 347 | }, 348 | "language_info": { 349 | "codemirror_mode": { 350 | "name": "ipython", 351 | "version": 3 352 | }, 353 | "file_extension": ".py", 354 | "mimetype": "text/x-python", 355 | "name": "python", 356 | "nbconvert_exporter": "python", 357 | "pygments_lexer": "ipython3", 358 | "version": "3.7.4" 359 | }, 360 | "save_output": true, 361 | "synapse_widget": { 362 | "state": {}, 363 | "version": "0.1" 364 | } 365 | }, 366 | "nbformat": 4, 367 | "nbformat_minor": 2 368 | } 369 | -------------------------------------------------------------------------------- /SynapseNotebooks/03_Create_ML_Lineage_Functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 
null, 6 | "metadata": { 7 | "collapsed": true, 8 | "jupyter": { 9 | "outputs_hidden": false, 10 | "source_hidden": false 11 | }, 12 | "nteract": { 13 | "transient": { 14 | "deleting": false 15 | } 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "def get_entity_details(qualifiedName,typeName):\n", 21 | " entities = client.get_entity(\n", 22 | " qualifiedName=[qualifiedName],\n", 23 | " typeName=typeName\n", 24 | " )\n", 25 | " for entity in entities.get(\"entities\"):\n", 26 | " entity = entity\n", 27 | " break\n", 28 | " return entity\n", 29 | "#get_entity_details('https://sampledataadls.dfs.core.windows.net/masterdata/employees.csv','azure_datalake_gen2_path')\n", 30 | "\n", 31 | "def get_entity_guid(qualifiedName,typeName):\n", 32 | " entities = client.get_entity(\n", 33 | " qualifiedName=[qualifiedName],\n", 34 | " typeName=typeName\n", 35 | " )\n", 36 | " for entity in entities.get(\"entities\"):\n", 37 | " entity_guid = entity.get(\"guid\")\n", 38 | " break\n", 39 | " return entity_guid\n", 40 | "#get_entity_guid('https://sampledataadls.dfs.core.windows.net/creditriskdata/borrower.csv','azure_datalake_gen2_path')\n", 41 | "\n", 42 | "def get_entity_schema(guid):\n", 43 | " columns = []\n", 44 | " results = client.get_entity(guid)\n", 45 | " for entity in results[\"entities\"]:\n", 46 | " if \"tabular_schema\" in entity[\"relationshipAttributes\"]:\n", 47 | " ts = entity[\"relationshipAttributes\"][\"tabular_schema\"]\n", 48 | " ts_entity = client.get_entity(ts[\"guid\"])\n", 49 | " for schema in ts_entity[\"entities\"]:\n", 50 | " for col in schema[\"relationshipAttributes\"][\"columns\"]:\n", 51 | " if col['displayText'] != ':csv':\n", 52 | " columns.append(col['displayText'])\n", 53 | " return(columns)\n", 54 | " \n", 55 | "# ent_guid = 'a8698a33-9174-43cb-8835-26968862e2bf'\n", 56 | "# get_entity_schema(ent_guid)\n", 57 | "\n", 58 | "def create_data_entity_with_schema_and_parent(df_data,entityname,entitytype='custom_ml_dataset',parent_entityname=None,parent_entitytype='custom_ml_datastore'):\n", 59 | " # Create an asset for the output data schema.\n", 60 | " output_schema_entity = AtlasEntity(\n", 61 | " name=\"schema-\" + entityname,\n", 62 | " qualified_name = \"pyapacheatlas://\"+\"schema-\" + entityname,\n", 63 | " typeName=\"tabular_schema\",\n", 64 | " guid=guid.get_guid()\n", 65 | " )\n", 66 | "\n", 67 | " df_data_schema = pd.DataFrame(list(zip(list(df_data.columns), list(df_data.dtypes))),columns=['column','dtype'])\n", 68 | "\n", 69 | " #Iterate over the out data frame's columns and create entities\n", 70 | " output_entity_schema_columns = []\n", 71 | " #for column in df.schema:\n", 72 | " for index, row in df_data_schema.iterrows(): \n", 73 | " temp_column = AtlasEntity(\n", 74 | " name = row.column,\n", 75 | " typeName = \"column\",\n", 76 | " qualified_name = \"pyapacheatlas://schema-\" + entityname + \"#\" + row.column,\n", 77 | " guid=guid.get_guid(),\n", 78 | " attributes = {\"type\":str(row.dtype),\"description\": row.column},\n", 79 | " relationshipAttributes = {\"composeSchema\":output_schema_entity.to_json(minimum=True)}\n", 80 | " )\n", 81 | " output_entity_schema_columns.append(temp_column)\n", 82 | "\n", 83 | "\n", 84 | " if parent_entityname:\n", 85 | " dstore_entity = get_entity_details(\"pyapacheatlas://\"+parent_entityname, parent_entitytype)\n", 86 | " # Create a entity for dataset \n", 87 | " dataset_output_entity = AtlasEntity(\n", 88 | " name=entityname,\n", 89 | " typeName=entitytype,\n", 90 | " qualified_name=\"pyapacheatlas://\" + 
entityname,\n", 91 | " guid = guid.get_guid(),\n", 92 | " relationshipAttributes = {\n", 93 | " \"tabular_schema\": output_schema_entity.to_json(minimum=True),\n", 94 | " \"datastore\":dstore_entity\n", 95 | " }\n", 96 | " )\n", 97 | " else:\n", 98 | " # Create a entity for dataset \n", 99 | " dataset_output_entity = AtlasEntity(\n", 100 | " name=entityname,\n", 101 | " typeName=entitytype,\n", 102 | " qualified_name=\"pyapacheatlas://\" + entityname,\n", 103 | " guid = guid.get_guid(),\n", 104 | " relationshipAttributes = {\n", 105 | " \"tabular_schema\": output_schema_entity.to_json(minimum=True)\n", 106 | " }\n", 107 | " )\n", 108 | "\n", 109 | " # Prepare all the entities as a batch to be uploaded.\n", 110 | " batch = [dataset_output_entity, output_schema_entity] + output_entity_schema_columns\n", 111 | " batch\n", 112 | "\n", 113 | " # Upload all entities!\n", 114 | " client.upload_entities(batch=batch)\n", 115 | " \n", 116 | "def create_data_entity_with_schema(df_data,entityname,entitytype='custom_ml_dataset'):\n", 117 | " # Create an asset for the output data schema.\n", 118 | " output_schema_entity = AtlasEntity(\n", 119 | " name=\"schema-\" + entityname,\n", 120 | " qualified_name = \"pyapacheatlas://\"+\"schema-\" + entityname,\n", 121 | " typeName=\"tabular_schema\",\n", 122 | " guid=guid.get_guid()\n", 123 | " )\n", 124 | "\n", 125 | " df_data_schema = pd.DataFrame(list(zip(list(df_data.columns), list(df_data.dtypes))),columns=['column','dtype'])\n", 126 | "\n", 127 | " #Iterate over the out data frame's columns and create entities\n", 128 | " output_entity_schema_columns = []\n", 129 | " #for column in df.schema:\n", 130 | " for index, row in df_data_schema.iterrows(): \n", 131 | " temp_column = AtlasEntity(\n", 132 | " name = row.column,\n", 133 | " typeName = \"column\",\n", 134 | " qualified_name = \"pyapacheatlas://schema-\" + entityname + \"#\" + row.column,\n", 135 | " guid=guid.get_guid(),\n", 136 | " attributes = {\"type\":str(row.dtype),\"description\": row.column},\n", 137 | " relationshipAttributes = {\"composeSchema\":output_schema_entity.to_json(minimum=True)}\n", 138 | " )\n", 139 | " output_entity_schema_columns.append(temp_column)\n", 140 | "\n", 141 | " # Create a entity for dataset \n", 142 | " dataset_output_entity = AtlasEntity(\n", 143 | " name=entityname,\n", 144 | " typeName=entitytype,\n", 145 | " qualified_name=\"pyapacheatlas://\" + entityname,\n", 146 | " guid = guid.get_guid(),\n", 147 | " relationshipAttributes = {\n", 148 | " \"tabular_schema\": output_schema_entity.to_json(minimum=True)\n", 149 | " }\n", 150 | " )\n", 151 | "\n", 152 | " # Prepare all the entities as a batch to be uploaded.\n", 153 | " batch = [dataset_output_entity, output_schema_entity] + output_entity_schema_columns\n", 154 | " batch\n", 155 | "\n", 156 | " # Upload all entities!\n", 157 | " client.upload_entities(batch=batch)\n", 158 | " \n", 159 | "def create_lineage_for_entities(experimentname,processname,in_ent_qns,out_ent_qns,process_type_name='Process',ColumnMapping=False):\n", 160 | " # create a process \n", 161 | " # inputs: list of (entity,type) tuples\n", 162 | " # outputs: list of (entity,type) tuples\n", 163 | "\n", 164 | " from pyapacheatlas.core import AtlasProcess\n", 165 | "\n", 166 | " in_ent_guids = []\n", 167 | " for in_ent_qn in in_ent_qns:\n", 168 | " #print(in_ent_qn,in_ent_qns[in_ent_qn])\n", 169 | " in_ent_guid = get_entity_guid(in_ent_qn,in_ent_qns[in_ent_qn])\n", 170 | " in_ent_guids.append({'guid':in_ent_guid})\n", 171 | " \n", 172 | " out_ent_guids 
= []\n", 173 | " for out_ent_qn in out_ent_qns:\n", 174 | " #print(in_ent_qn,in_ent_qns[in_ent_qn])\n", 175 | " out_ent_guid = get_entity_guid(out_ent_qn,out_ent_qns[out_ent_qn])\n", 176 | " out_ent_guids.append({'guid':out_ent_guid})\n", 177 | "\n", 178 | " process_name = experimentname + processname\n", 179 | " process_qn = \"pyapacheatlas://\" + process_name\n", 180 | "\n", 181 | " if ColumnMapping == False:\n", 182 | " process_type_name = process_type_name\n", 183 | "\n", 184 | " process = AtlasProcess(\n", 185 | " name=process_name,\n", 186 | " typeName=process_type_name,\n", 187 | " qualified_name=process_qn,\n", 188 | " inputs = in_ent_guids,\n", 189 | " outputs = out_ent_guids,\n", 190 | " guid=guid.get_guid()\n", 191 | " )\n", 192 | " else:\n", 193 | " process_type_name = \"ProcessWithColumnMapping\"\n", 194 | "\n", 195 | " column_mapping_attributes = []\n", 196 | " for in_ent_qn in in_ent_qns:\n", 197 | " cl_mapping = []\n", 198 | " in_ent_columns = get_entity_schema(get_entity_guid(in_ent_qn,in_ent_qns[in_ent_qn]))\n", 199 | " for in_col in in_ent_columns:\n", 200 | " cl_mapping.append({\"Source\":in_col,\"Sink\":in_col})\n", 201 | " #break\n", 202 | " mapping = {\n", 203 | " 'DatasetMapping': {'Source':in_ent_qn,'Sink':list(out_ent_qns.keys())[0]},\n", 204 | " 'ColumnMapping': cl_mapping\n", 205 | " }\n", 206 | " column_mapping_attributes.append(mapping)\n", 207 | "\n", 208 | " process = AtlasProcess(\n", 209 | " name=process_name,\n", 210 | " typeName=process_type_name,\n", 211 | " qualified_name=process_qn,\n", 212 | " inputs = in_ent_guids,\n", 213 | " outputs = out_ent_guids,\n", 214 | " guid=guid.get_guid(),\n", 215 | " attributes={\"columnMapping\":json.dumps(column_mapping_attributes)}\n", 216 | " )\n", 217 | "\n", 218 | " # Prepare all the entities as a batch to be uploaded.\n", 219 | " batch = [process]\n", 220 | " batch\n", 221 | "\n", 222 | " # Upload all entities!\n", 223 | " client.upload_entities(batch=batch)\n", 224 | " \n", 225 | "def create_entity(name,typeName,config_attibutes):\n", 226 | " # Create an entity\n", 227 | " name = name \n", 228 | " qn = \"pyapacheatlas://\" + name\n", 229 | "\n", 230 | " exp_config_entity = AtlasEntity(\n", 231 | " name=name,\n", 232 | " typeName=typeName,\n", 233 | " qualified_name=qn,\n", 234 | " guid = guid.get_guid(),\n", 235 | " attributes = config_attibutes\n", 236 | " )\n", 237 | "\n", 238 | " # Upload all entities!\n", 239 | " client.upload_entities(batch=[exp_config_entity.to_json()])\n", 240 | "\n", 241 | " \n", 242 | "def get_dataset_details(indataset,experiment_name=''):\n", 243 | " result = []\n", 244 | " #print(indataset)\n", 245 | " if 'FileDataset' in str(type((indataset))):\n", 246 | " dssource = eval(json.loads(str(indataset).replace('FileDataset',''))['source'][0])\n", 247 | " sourcestore = dssource[0]\n", 248 | " sourcepath = dssource[1]\n", 249 | " sourcepathfiles = indataset.to_path()\n", 250 | " for sourcepathfile in sourcepathfiles:\n", 251 | " entityname = sourcepath.split('/')[-1] + sourcepathfile.replace('/','_') #.replace('.parquet','').replace('.csv','')\n", 252 | " #print('\\nFileDataset:',entityname)\n", 253 | "\n", 254 | " dsdatastore = Datastore.get(ws, sourcestore)\n", 255 | " datastore_path = [DataPath(dsdatastore, sourcepath+sourcepathfile.replace('/',''))]\n", 256 | " \n", 257 | " if '.parquet' in sourcepathfile:\n", 258 | " tabular_dataset = Dataset.Tabular.from_parquet_files(path=datastore_path)\n", 259 | " df_data = tabular_dataset.take(10).to_pandas_dataframe()\n", 260 | " \n", 261 | " 
elif '.csv' in sourcepathfile:\n", 262 | " tabular_dataset = Dataset.Tabular.from_delimited_files(path=datastore_path,encoding ='iso88591') \n", 263 | " #'utf8', 'iso88591', 'latin1', 'ascii', 'utf16', 'utf32', 'utf8bom' and 'windows1252'\n", 264 | " df_data = tabular_dataset.take(10).to_pandas_dataframe()\n", 265 | " \n", 266 | " if experiment_name != '':\n", 267 | " result.append((entityname + '_' + experiment_name,df_data))\n", 268 | " else:\n", 269 | " result.append((entityname,df_data))\n", 270 | "\n", 271 | " elif 'TabularDataset' in str(type((indataset))):\n", 272 | " tabular_dataset = indataset\n", 273 | " entityname = json.loads(str(indataset).replace('TabularDataset',''))['registration']['name']\n", 274 | " \n", 275 | " # dataset = Dataset.get_by_name(ws, name=entityname)\n", 276 | " # try:\n", 277 | " # sourcestore = json.loads(dataset._definition)['blocks'][0]['arguments']['datastore']['datastoreName']\n", 278 | " # except:\n", 279 | " # sourcestore = json.loads(dataset._definition)['blocks'][0]['arguments']['datastores'][0]['datastoreName']\n", 280 | " df_data = tabular_dataset.take(10).to_pandas_dataframe()\n", 281 | " #print('TabularDataset:', entityname)\n", 282 | " result.append((entityname,df_data))\n", 283 | " return result\n", 284 | "\n", 285 | "\n", 286 | "from azureml.core import Experiment\n", 287 | "from azureml.pipeline.core import PipelineRun\n", 288 | "\n", 289 | "from azureml.core import Workspace, Datastore, Dataset\n", 290 | "from azureml.data.datapath import DataPath\n", 291 | "import json \n", 292 | "import pandas as pd\n", 293 | "\n", 294 | "def create_aml_experiment_steps(ws,experiment_name):\n", 295 | " experiments_lst = Experiment.list(ws)\n", 296 | " for experiment in experiments_lst:\n", 297 | " if experiment.name == experiment_name:\n", 298 | " #print(experiment)\n", 299 | " exp = Experiment(ws,experiment.name)\n", 300 | " for run in exp.get_runs(): \n", 301 | " rundetails = run.get_details()\n", 302 | " #print(rundetails)\n", 303 | " if rundetails['status'] != 'Completed': #continue until we find a completed run \n", 304 | " continue\n", 305 | " pipeline_run = PipelineRun(exp, rundetails['runId'])\n", 306 | "\n", 307 | " steps = pipeline_run.get_steps()\n", 308 | " for step_run in steps:\n", 309 | " step_run_details = step_run.get_details_with_logs()\n", 310 | " #print(step_run_details)\n", 311 | " #print(step_run_details['runDefinition']['script'])\n", 312 | "\n", 313 | " purview_basepath = 'pyapacheatlas://'\n", 314 | " in_ent_qns = {}\n", 315 | " out_ent_qns = {}\n", 316 | " #print(step_run_details)\n", 317 | " step_name = step_run.name #step_run_details['runDefinition']['script']\n", 318 | " #print(step_name)\n", 319 | " \n", 320 | " #print('\\n Input Datasets:\\n')\n", 321 | " for indataset in step_run_details['inputDatasets']:\n", 322 | " in_result = get_dataset_details(indataset['dataset'],experiment_name)\n", 323 | " #print(in_result)\n", 324 | " #create entities \n", 325 | " for in_res in in_result:\n", 326 | " data_ent_name = in_res[0].strip('_')\n", 327 | " create_data_entity_with_schema(in_res[1],data_ent_name,'custom_ml_dataset')\n", 328 | " in_ent_qns[purview_basepath + data_ent_name] = 'custom_ml_dataset'\n", 329 | " #break\n", 330 | " #print('\\n Output Datasets:\\n')\n", 331 | " for outdataset in step_run_details['outputDatasets']:\n", 332 | " out_result = get_dataset_details(outdataset['dataset'],experiment_name)\n", 333 | " #print(out_result)\n", 334 | " #create entities\n", 335 | " for out_res in out_result:\n", 336 | " 
data_ent_name = out_res[0].strip('_')\n", 337 | " create_data_entity_with_schema(out_res[1],data_ent_name,'custom_ml_dataset')\n", 338 | " out_ent_qns[purview_basepath + data_ent_name] = 'custom_ml_dataset'\n", 339 | " #break\n", 340 | " #print(in_ent_qns,out_ent_qns)\n", 341 | " create_lineage_for_entities(experiment_name + '_',step_name, in_ent_qns,out_ent_qns,process_type_name='custom_ml_experiment_step',ColumnMapping=False)\n", 342 | " #break \n", 343 | " \n", 344 | " break # break after processing one completed run\n", 345 | " break #after finding the experiment\n", 346 | "\n", 347 | "\n", 348 | "#create workspace entity\n", 349 | "def create_workspace_entities(ws):\n", 350 | "\n", 351 | " config_attibutes={}\n", 352 | " temp_column={}\n", 353 | "\n", 354 | " temp_column['name'] = ws.name\n", 355 | " config_attibutes.update(temp_column)\n", 356 | " temp_column['subscription_id'] = ws.subscription_id\n", 357 | " config_attibutes.update(temp_column)\n", 358 | " temp_column['resource_group'] = ws.resource_group\n", 359 | " config_attibutes.update(temp_column)\n", 360 | "\n", 361 | " create_entity(ws.name,'custom_ml_workspace',config_attibutes)\n", 362 | " #break\n", 363 | "\n", 364 | "\n", 365 | "#create all datastore entities\n", 366 | "def create_datastore_entities(ws):\n", 367 | " for datastore in ws.datastores.values():\n", 368 | " config_attibutes={}\n", 369 | " temp_column={}\n", 370 | " \n", 371 | " temp_column['name'] = datastore.name\n", 372 | " config_attibutes.update(temp_column)\n", 373 | "\n", 374 | " if ('AzureDataLakeGen2Datastore' in str(type(datastore))) or ('AzureBlobDatastore' in str(type(datastore))):\n", 375 | " temp_column['container_name'] = datastore.container_name\n", 376 | " config_attibutes.update(temp_column)\n", 377 | " temp_column['account_name'] = datastore.account_name\n", 378 | " config_attibutes.update(temp_column)\n", 379 | " temp_column['protocol'] = datastore.protocol\n", 380 | " config_attibutes.update(temp_column)\n", 381 | " temp_column['endpoint'] = datastore.endpoint\n", 382 | " config_attibutes.update(temp_column)\n", 383 | " elif 'AzureSqlDatabaseDatastore' in str(type(datastore)):\n", 384 | " #print('sql',datastore.server_name)\n", 385 | " temp_column['server_name'] = datastore.server_name\n", 386 | " config_attibutes.update(temp_column)\n", 387 | " temp_column['database_name'] = datastore.database_name\n", 388 | " config_attibutes.update(temp_column)\n", 389 | " elif 'AzureBlobDatastore' in str(type(datastore)): \n", 390 | " pass\n", 391 | "\n", 392 | " create_entity(datastore.name,'custom_ml_datastore',config_attibutes)\n", 393 | " #break\n", 394 | "\n", 395 | " #create workspace and datastore relationship\n", 396 | " purview_basepath = 'pyapacheatlas://'\n", 397 | " for datastore in ws.datastores.values():\n", 398 | " relationshiptype = 'custom_ml_workspace_datastore'\n", 399 | " end1type = 'custom_ml_workspace'\n", 400 | " end2type = 'custom_ml_datastore'\n", 401 | " end1_qn = purview_basepath + ws.name\n", 402 | " end2_qn = purview_basepath + datastore.name\n", 403 | " try:\n", 404 | " create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn)\n", 405 | " except:\n", 406 | " pass # ignore if relationship exists\n", 407 | "\n", 408 | "#create all dataset entities (with datastore as parent)\n", 409 | "from azureml.core import Workspace, Datastore, Dataset\n", 410 | "import pandas as pd\n", 411 | "def create_dataset_entities(ws,parent_flag=True):\n", 412 | " purview_basepath = 'pyapacheatlas://'\n", 413 | " for dsname 
in ws.datasets:\n", 414 | " dataset = ws.datasets[dsname]\n", 415 | " try:\n", 416 | " if 'FileDataset' in str(type((dataset))):\n", 417 | " datasetsource = eval(json.loads(str(dataset).replace('FileDataset',''))['source'][0])[0]\n", 418 | " elif 'TabularDataset' in str(type((dataset))):\n", 419 | " datasetsource = eval(json.loads(str(dataset).replace('TabularDataset',''))['source'][0])[0]\n", 420 | " dsdetails = get_dataset_details(dataset)\n", 421 | " #print(dsdetails)\n", 422 | " for ds in dsdetails:\n", 423 | " if parent_flag == False:\n", 424 | " create_data_entity_with_schema(ds[1],dsname,'custom_ml_dataset')\n", 425 | " create_lineage_for_entities('',('register_' + dsname), {(purview_basepath+datasetsource):'custom_ml_datastore'},\n", 426 | " {(purview_basepath+ds[0]):'custom_ml_dataset'},ColumnMapping=False)\n", 427 | " else: \n", 428 | " create_data_entity_with_schema_and_parent(ds[1],dsname,entitytype='custom_ml_dataset',\n", 429 | " parent_entityname=datasetsource,parent_entitytype='custom_ml_datastore') \n", 430 | " except:\n", 431 | " print('Error:',dsname) \n", 432 | " #break\n", 433 | " \n", 434 | " \n", 435 | "#create experiment entity\n", 436 | "from azureml.core import Experiment\n", 437 | "\n", 438 | "def create_experiment_entities(ws):\n", 439 | " for experiment in Experiment.list(ws):\n", 440 | " #create experiment entity\n", 441 | " config_attibutes={}\n", 442 | " temp_column={}\n", 443 | "\n", 444 | " temp_column['name'] = experiment.name\n", 445 | " config_attibutes.update(temp_column)\n", 446 | "\n", 447 | " create_entity(experiment.name,'custom_ml_experiment',config_attibutes)\n", 448 | " #break\n", 449 | " \n", 450 | " purview_basepath = 'pyapacheatlas://'\n", 451 | "\n", 452 | " #create experiment relationship to workspace\n", 453 | " relationshiptype = 'custom_ml_workspace_experiment'\n", 454 | " end1type = 'custom_ml_workspace'\n", 455 | " end2type = 'custom_ml_experiment'\n", 456 | " end1_qn = purview_basepath + ws.name\n", 457 | " end2_qn = purview_basepath + experiment.name\n", 458 | " try:\n", 459 | " create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn)\n", 460 | " except:\n", 461 | " pass # ignore if relationship exists\n", 462 | " \n", 463 | " for run in experiment.get_runs(): \n", 464 | " rundetails = run.get_details()\n", 465 | " #print(rundetails)\n", 466 | " if rundetails['status'] != 'Completed': #continue until we find a completed run \n", 467 | " continue\n", 468 | " # print(rundetails['properties']['azureml.runsource'])\n", 469 | " #create experiment steps\n", 470 | " if rundetails['properties']['azureml.runsource'] == 'azureml.PipelineRun':\n", 471 | " print(experiment.name)\n", 472 | " create_aml_experiment_steps(ws,experiment.name)\n", 473 | "\n", 474 | " pipeline_run = PipelineRun(experiment, rundetails['runId'])\n", 475 | "\n", 476 | " steps = pipeline_run.get_steps()\n", 477 | " for step_run in steps:\n", 478 | " #print(experiment.name + '_' + step_run.name)\n", 479 | " \n", 480 | " #create experiment relationship to workspace\n", 481 | " relationshiptype = 'custom_ml_experiment_to_experimentstep'\n", 482 | " end1type = 'custom_ml_experiment'\n", 483 | " end2type = 'custom_ml_experiment_step'\n", 484 | " end1_qn = purview_basepath + experiment.name\n", 485 | " end2_qn = purview_basepath + experiment.name + '_' + step_run.name\n", 486 | " try:\n", 487 | " create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn)\n", 488 | " except:\n", 489 | " pass # ignore if relationship exists\n", 490 | 
"\n", 491 | " break # break after processing one completed run\n", 492 | " #break\n", 493 | "\n", 494 | "def create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn):\n", 495 | " relationship = {}\n", 496 | " end1 = {}\n", 497 | " end2 = {}\n", 498 | "\n", 499 | " end1[\"guid\"] = get_entity_guid(end1_qn,end1type)\n", 500 | " end1[\"typeName\"] = end1type\n", 501 | " end1[\"uniqueAttributes\"] = {\"qualifiedName\": end1_qn}\n", 502 | "\n", 503 | " end2[\"guid\"] = get_entity_guid(end2_qn,end2type)\n", 504 | " end2[\"typeName\"] = end2type\n", 505 | " end2[\"uniqueAttributes\"] = {\"qualifiedName\": end2_qn}\n", 506 | "\n", 507 | " relationship[\"typeName\"] = relationshiptype\n", 508 | " relationship[\"attributes\"] = {}\n", 509 | " relationship[\"guid\"] = guid.get_guid()\n", 510 | " relationship[\"provenanceType\"] = 0\n", 511 | " relationship[\"end1\"] = end1\n", 512 | " relationship[\"end2\"] = end2\n", 513 | " relationship\n", 514 | " \n", 515 | " client.upload_relationship(relationship) \n", 516 | " \n", 517 | "def create_package_entities(experimentname,packageslist):\n", 518 | " packages_name = experimentname + '-packages' \n", 519 | " packages_qn = \"pyapacheatlas://\" + packages_name\n", 520 | "\n", 521 | " # Create an asset for the packages.\n", 522 | " packages_entity = AtlasEntity(\n", 523 | " name = packages_name,\n", 524 | " qualified_name = packages_qn,\n", 525 | " typeName=\"custom_ml_packages\",\n", 526 | " attributes = {\"notes\":\"test note\"},\n", 527 | " guid=guid.get_guid()\n", 528 | " )\n", 529 | "\n", 530 | " packages_entity.to_json(minimum=True)\n", 531 | "\n", 532 | " atlas_packages = []\n", 533 | " relationships = []\n", 534 | " for package in packageslist:\n", 535 | " package_attibutes={}\n", 536 | " temp_column={}\n", 537 | " temp_column['programming_language'] = str(package[0])\n", 538 | " package_attibutes.update(temp_column)\n", 539 | " temp_column['package_name'] = str(package[1])\n", 540 | " package_attibutes.update(temp_column)\n", 541 | " temp_column['version'] = str(package[2])\n", 542 | " package_attibutes.update(temp_column)\n", 543 | " temp_column['notes'] = str(package[3])\n", 544 | " package_attibutes.update(temp_column)\n", 545 | "\n", 546 | " # Create an entity for each package\n", 547 | " name = str(package[1]) #experimentname + '-package-' + package[1] \n", 548 | " qn = packages_qn + '#' + str(package[1]) #\"pyapacheatlas://\" + name\n", 549 | "\n", 550 | " package_entity = AtlasEntity(\n", 551 | " name= name,\n", 552 | " typeName=\"custom_ml_package\",\n", 553 | " qualified_name=qn,\n", 554 | " guid = guid.get_guid(),\n", 555 | " attributes = package_attibutes,\n", 556 | " relationshipAttributes = {\"packages\":packages_entity.to_json(minimum=True)}\n", 557 | " )\n", 558 | " atlas_packages.append(package_entity)\n", 559 | "\n", 560 | " atlas_packages\n", 561 | "\n", 562 | " # Prepare all the entities as a batch to be uploaded.\n", 563 | " batch = [packages_entity] + atlas_packages\n", 564 | " client.upload_entities(batch=batch) \n", 565 | " \n", 566 | "def create_experiment_config_entity(ws,experiment_name,automl_run):\n", 567 | " # Get experiment config from AML run\n", 568 | " import json\n", 569 | " import pandas as pd\n", 570 | " run_properties = automl_run.get_properties()\n", 571 | " run_properties\n", 572 | "\n", 573 | " AMLSettingsJsonString = run_properties['AMLSettingsJsonString']\n", 574 | " AMLSettings = json.loads(AMLSettingsJsonString)\n", 575 | "\n", 576 | " df_config = 
pd.DataFrame(list(AMLSettings.items()),columns = ['key','value']) \n", 577 | "\n", 578 | " keys = ['task_type','enable_early_stopping','experiment_timeout_minutes','primary_metric','compute_target','label_column_name','n_cross_validations','model_explainability']\n", 579 | "\n", 580 | " df_config = df_config[df_config['key'].isin(keys)]\n", 581 | "\n", 582 | " dict_config = df_config.to_dict(orient = 'records')\n", 583 | " dict_config\n", 584 | "\n", 585 | " config_attibutes={}\n", 586 | " for attibutes in dict_config:\n", 587 | " temp_column={}\n", 588 | " temp_column[attibutes['key']] = attibutes['value']\n", 589 | " config_attibutes.update(temp_column)\n", 590 | " config_attibutes\n", 591 | "\n", 592 | " # Create a entity for exp config \n", 593 | " name = experiment_name + \"-config\"\n", 594 | " qn = \"pyapacheatlas://\" + name\n", 595 | "\n", 596 | " exp_config_entity = AtlasEntity(\n", 597 | " name=name,\n", 598 | " typeName=\"custom_ml_exp_config\",\n", 599 | " qualified_name=qn,\n", 600 | " guid = guid.get_guid(),\n", 601 | " attributes = config_attibutes\n", 602 | " )\n", 603 | "\n", 604 | " # Upload all entities!\n", 605 | " client.upload_entities(batch=[exp_config_entity.to_json()])\n", 606 | " \n", 607 | "def create_model_entity(ws,experiment_name,modelname):\n", 608 | " # get deployed model\n", 609 | " from azureml.core.model import Model\n", 610 | " model = Model(ws, modelname)\n", 611 | "\n", 612 | " config_attibutes={}\n", 613 | " temp_column={}\n", 614 | " temp_column['workspace_name'] = model.workspace.name\n", 615 | " config_attibutes.update(temp_column)\n", 616 | " temp_column['workspace_subscription_id'] = model.workspace.subscription_id\n", 617 | " config_attibutes.update(temp_column)\n", 618 | " temp_column['workspace_subscription_id'] = model.workspace.subscription_id\n", 619 | " config_attibutes.update(temp_column)\n", 620 | " temp_column['workspace_resource_group'] = model.workspace.resource_group\n", 621 | " config_attibutes.update(temp_column)\n", 622 | " temp_column['name'] = model.name\n", 623 | " config_attibutes.update(temp_column)\n", 624 | " temp_column['id'] = model.id\n", 625 | " config_attibutes.update(temp_column)\n", 626 | " temp_column['version'] = model.version\n", 627 | " config_attibutes.update(temp_column)\n", 628 | " temp_column['tags'] = model.tags\n", 629 | " config_attibutes.update(temp_column)\n", 630 | " temp_column['properties'] = model.properties\n", 631 | " config_attibutes.update(temp_column)\n", 632 | "\n", 633 | " # Create a entity for Model\n", 634 | " name = modelname \n", 635 | " qn = \"pyapacheatlas://\" + name\n", 636 | "\n", 637 | " exp_config_entity = AtlasEntity(\n", 638 | " name=name,\n", 639 | " typeName=\"custom_ml_model\",\n", 640 | " qualified_name=qn,\n", 641 | " guid = guid.get_guid(),\n", 642 | " attributes = config_attibutes\n", 643 | " )\n", 644 | "\n", 645 | " # Upload all entities!\n", 646 | " client.upload_entities(batch=[exp_config_entity.to_json()]) \n", 647 | " \n", 648 | "def create_model_metrics_entity(experiment_name,best_run):\n", 649 | " metrics = best_run.get_metrics()\n", 650 | "\n", 651 | " # select relevant metrics\n", 652 | " auc = metrics.get('AUC_weighted')\n", 653 | " accuracy = metrics.get('accuracy')\n", 654 | " precision = metrics.get('precision_score_weighted')\n", 655 | " recall = metrics.get('recall_score_weighted')\n", 656 | " f1 = metrics.get('f1_score_weighted')\n", 657 | "\n", 658 | " # # combine into single dataframe\n", 659 | " # metrics_df = sc.parallelize([['AUC', auc], ['Accuracy', 
accuracy], ['Precision', precision], ['Recall', recall], ['F1', f1]]).toDF(('Metric', 'Value'))\n", 660 | " metrics = ['AUC','Accuracy','Precision','Recall','F1']\n", 661 | " metricslist= [auc,accuracy,precision,recall,f1]\n", 662 | " columns = ['Metric','Value']\n", 663 | " metrics_df = pd.DataFrame(zip(metrics, metricslist),columns=columns)\n", 664 | "\n", 665 | "\n", 666 | " dict_metrics = metrics_df.to_dict(orient = 'records')\n", 667 | " dict_metrics\n", 668 | "\n", 669 | " config_attibutes={}\n", 670 | " for attibutes in dict_metrics:\n", 671 | " temp_column={}\n", 672 | " temp_column[attibutes['Metric']] = attibutes['Value']\n", 673 | " config_attibutes.update(temp_column)\n", 674 | " config_attibutes\n", 675 | "\n", 676 | " name = experiment_name + \"-modelmetrics\"\n", 677 | " qn = \"pyapacheatlas://\" + name\n", 678 | "\n", 679 | " # Create a entity for model metrics\n", 680 | " exp_config_entity = AtlasEntity(\n", 681 | " name=name,\n", 682 | " typeName=\"custom_ml_model_metrics\",\n", 683 | " qualified_name=qn,\n", 684 | " guid = guid.get_guid(),\n", 685 | " attributes = config_attibutes\n", 686 | " )\n", 687 | "\n", 688 | " # Upload all entities!\n", 689 | " client.upload_entities(batch=[exp_config_entity.to_json()])\n", 690 | " \n", 691 | "def create_experiment_lineage(experimentname,exp_data_qn,exp_config_qn,model_metrics_qn,model_qn): \n", 692 | " # create experiment process \n", 693 | " # inputs: prepareddata, modelconfig \n", 694 | " # outputs: model metrics and registered model\n", 695 | "\n", 696 | " from pyapacheatlas.core import AtlasProcess\n", 697 | "\n", 698 | " in_data_ent_guid = get_entity_guid(exp_data_qn,'custom_dataset')\n", 699 | " in_exp_config_guid = get_entity_guid(exp_config_qn,'custom_ml_exp_config')\n", 700 | " out_model_metrics_guid = get_entity_guid(model_metrics_qn,'custom_ml_model_metrics')\n", 701 | " out_model_guid = get_entity_guid(model_qn,'custom_ml_model')\n", 702 | "\n", 703 | " process_name = experimentname + '-train'\n", 704 | " process_qn = \"pyapacheatlas://\" + process_name\n", 705 | " process_type_name = \"Process\"\n", 706 | "\n", 707 | " process = AtlasProcess(\n", 708 | " name=process_name,\n", 709 | " typeName=process_type_name,\n", 710 | " qualified_name=process_qn,\n", 711 | " inputs = [{\"guid\":in_data_ent_guid},{\"guid\":in_exp_config_guid}],\n", 712 | " outputs = [{\"guid\":out_model_metrics_guid},{\"guid\":out_model_guid}],\n", 713 | " guid=guid.get_guid()\n", 714 | " )\n", 715 | "\n", 716 | " # Prepare all the entities as a batch to be uploaded.\n", 717 | " batch = [process]\n", 718 | " batch\n", 719 | "\n", 720 | " # Upload all entities!\n", 721 | " client.upload_entities(batch=batch) \n", 722 | " \n", 723 | "def create_model_service_entity(ws,experimentname,aci_service_name,samplejson):\n", 724 | " # get deployed ACI Web Service\n", 725 | " from azureml.core.webservice import AciWebservice\n", 726 | " aciws = AciWebservice(ws, aci_service_name)\n", 727 | "\n", 728 | " config_attibutes={}\n", 729 | " temp_column={}\n", 730 | " temp_column['workspace_name'] = aciws.workspace.name\n", 731 | " config_attibutes.update(temp_column)\n", 732 | " temp_column['workspace_subscription_id'] = aciws.workspace.subscription_id\n", 733 | " config_attibutes.update(temp_column)\n", 734 | " temp_column['workspace_resource_group'] = aciws.workspace.resource_group\n", 735 | " config_attibutes.update(temp_column)\n", 736 | " temp_column['name'] = aciws.name\n", 737 | " config_attibutes.update(temp_column)\n", 738 | " temp_column['image_id'] = 
aciws.image_id\n", 739 | " config_attibutes.update(temp_column)\n", 740 | " temp_column['compute_type'] = aciws.compute_type\n", 741 | " config_attibutes.update(temp_column)\n", 742 | " temp_column['state'] = aciws.state\n", 743 | " config_attibutes.update(temp_column)\n", 744 | " temp_column['scoring_uri'] = aciws.scoring_uri\n", 745 | " config_attibutes.update(temp_column)\n", 746 | " temp_column['tags'] = aciws.tags\n", 747 | " config_attibutes.update(temp_column)\n", 748 | " temp_column['state'] = aciws.state\n", 749 | " config_attibutes.update(temp_column)\n", 750 | " temp_column['properties'] = aciws.properties\n", 751 | " config_attibutes.update(temp_column)\n", 752 | " temp_column['created_by'] = aciws.created_by\n", 753 | " config_attibutes.update(temp_column)\n", 754 | " temp_column['sample_json'] = samplejson\n", 755 | " config_attibutes.update(temp_column)\n", 756 | "\n", 757 | " name = experimentname + \"-model_endpoint\"\n", 758 | " qn = \"pyapacheatlas://\" + name\n", 759 | "\n", 760 | " # Create a entity for ACI Web Service\n", 761 | " endpoint_entity = AtlasEntity(\n", 762 | " name=name,\n", 763 | " typeName=\"custom_ml_model_endpoint\",\n", 764 | " qualified_name=qn,\n", 765 | " guid = guid.get_guid(),\n", 766 | " attributes = config_attibutes\n", 767 | " )\n", 768 | "\n", 769 | " # Upload all entities!\n", 770 | " client.upload_entities(batch=[endpoint_entity.to_json()]) \n", 771 | " \n", 772 | "def create_powerbi_dataset_and_lineage(experiment_name,pbi_workspace,pbi_datasetid,pbidata_ent_name,ml_dataset_ent_name,ml_dataset_ent_type):\n", 773 | " \n", 774 | " pbidata_entity_type = 'powerbi_dataset'\n", 775 | " pbidata_ent_qn = pbi_workspace + '/datasets/' + pbi_datasetid \n", 776 | " purview_basepath = 'pyapacheatlas://'\n", 777 | " #\"https://msit.powerbi.com/groups/7d666287-f9b8-45ff-be6c-9909afe9df40/datasets/e5a30c22-466d-4a30-a1ac-8736ed6567cc\"\n", 778 | "\n", 779 | " pbidata_ent = AtlasEntity(\n", 780 | " name=pbidata_ent_name,\n", 781 | " typeName=pbidata_entity_type,\n", 782 | " qualified_name= pbidata_ent_qn,\n", 783 | " workspace = pbi_workspace,\n", 784 | " guid = guid.get_guid()\n", 785 | " )\n", 786 | "\n", 787 | " # Prepare all the entities as a batch to be uploaded.\n", 788 | " batch = [pbidata_ent]\n", 789 | " batch\n", 790 | "\n", 791 | " # Upload all entities!\n", 792 | " client.upload_entities(batch=batch)\n", 793 | "\n", 794 | " #cretae powerbi_dataset_process lineage\n", 795 | " in_ent_guids = []\n", 796 | " in_ent_guid = get_entity_guid(purview_basepath + ml_dataset_ent_name,ml_dataset_ent_type)\n", 797 | " in_ent_guids.append({'guid':in_ent_guid})\n", 798 | "\n", 799 | " out_ent_guids = []\n", 800 | " out_ent_guid = get_entity_guid(pbidata_ent_qn,pbidata_entity_type)\n", 801 | " out_ent_guids.append({'guid':out_ent_guid})\n", 802 | "\n", 803 | " process_name = 'createpowerbidataset' + pbidata_ent_name + experiment_name\n", 804 | " process_qn = \"pyapacheatlas://\" + process_name\n", 805 | " process_type_name = \"powerbi_dataset_process\"\n", 806 | "\n", 807 | " process = AtlasProcess(\n", 808 | " name=process_name,\n", 809 | " typeName=process_type_name,\n", 810 | " qualified_name=process_qn,\n", 811 | " inputs = in_ent_guids,\n", 812 | " outputs = out_ent_guids,\n", 813 | " guid=guid.get_guid()\n", 814 | " )\n", 815 | "\n", 816 | " # Prepare all the entities as a batch to be uploaded.\n", 817 | " batch = [process]\n", 818 | " batch\n", 819 | "\n", 820 | " # Upload all entities!\n", 821 | " client.upload_entities(batch=batch)\n", 822 | " \n", 823 
| "def create_powerbi_report_and_lineage(experiment_name,pbi_workspace,pbi_reportid,pbi_ent_name,pbi_datasetid):\n", 824 | "\n", 825 | " #create powerbi report\n", 826 | " pbi_entity_type = 'powerbi_report'\n", 827 | " pbi_ent_qn = pbi_workspace + '/reports/' + pbi_reportid \n", 828 | " purview_basepath = 'pyapacheatlas://'\n", 829 | " \n", 830 | " pbi_ent = AtlasEntity(\n", 831 | " name=pbi_ent_name,\n", 832 | " typeName=pbi_entity_type,\n", 833 | " qualified_name= pbi_ent_qn, \n", 834 | " workspace = pbi_workspace,\n", 835 | " guid = guid.get_guid()\n", 836 | " )\n", 837 | "\n", 838 | " # Prepare all the entities as a batch to be uploaded.\n", 839 | " batch = [pbi_ent]\n", 840 | " batch\n", 841 | "\n", 842 | " # Upload all entities!\n", 843 | " client.upload_entities(batch=batch)\n", 844 | "\n", 845 | " #create powerbi dashboard process lineage\n", 846 | " pbidata_ent_qn = pbi_workspace + '/datasets/' + pbi_datasetid \n", 847 | " in_ent_guids = []\n", 848 | " in_ent_guid = get_entity_guid(pbidata_ent_qn,'powerbi_dataset')\n", 849 | " in_ent_guids.append({'guid':in_ent_guid})\n", 850 | "\n", 851 | " out_ent_guids = []\n", 852 | " out_ent_guid = get_entity_guid(pbi_ent_qn,'powerbi_report')\n", 853 | " out_ent_guids.append({'guid':out_ent_guid})\n", 854 | "\n", 855 | " process_name = 'createpowerbireport' + pbi_ent_name + experiment_name\n", 856 | " process_qn = \"pyapacheatlas://\" + process_name\n", 857 | " process_type_name = \"powerbi_report_process\"\n", 858 | "\n", 859 | " process = AtlasProcess(\n", 860 | " name=process_name,\n", 861 | " typeName=process_type_name,\n", 862 | " qualified_name=process_qn,\n", 863 | " inputs = in_ent_guids,\n", 864 | " outputs = out_ent_guids,\n", 865 | " guid=guid.get_guid()\n", 866 | " )\n", 867 | "\n", 868 | " # Prepare all the entities as a batch to be uploaded.\n", 869 | " batch = [process]\n", 870 | " batch\n", 871 | "\n", 872 | " # Upload all entities!\n", 873 | " client.upload_entities(batch=batch)\n", 874 | " \n", 875 | "# clean up datasets\n", 876 | "def cleanup_entities(typename, entitytype):\n", 877 | " filter_setup = {\"typeName\": typename, \"includeSubTypes\": True}\n", 878 | " search = client.search_entities(\"*\", search_filter=filter_setup)\n", 879 | " for entity in search:\n", 880 | " #print(entity)\n", 881 | " if entity.get(\"entityType\") == entitytype:\n", 882 | " print(entity.get(\"id\"),entity.get(\"qualifiedName\"),entity.get(\"entityType\"))\n", 883 | " guid = entity.get(\"id\")\n", 884 | " client.delete_entity(guid=guid)\n", 885 | "\n" 886 | ] 887 | } 888 | ], 889 | "metadata": { 890 | "kernelspec": { 891 | "display_name": "Python 3", 892 | "language": "python", 893 | "name": "python3" 894 | }, 895 | "language_info": { 896 | "codemirror_mode": { 897 | "name": "ipython", 898 | "version": 3 899 | }, 900 | "file_extension": ".py", 901 | "mimetype": "text/x-python", 902 | "name": "python", 903 | "nbconvert_exporter": "python", 904 | "pygments_lexer": "ipython3", 905 | "version": "3.7.4" 906 | }, 907 | "save_output": true, 908 | "synapse_widget": { 909 | "state": {}, 910 | "version": "0.1" 911 | } 912 | }, 913 | "nbformat": 4, 914 | "nbformat_minor": 2 915 | } 916 | -------------------------------------------------------------------------------- /SynapseNotebooks/04_Create_CreditRisk_Experiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "jupyter": { 9 | 
"outputs_hidden": false, 10 | "source_hidden": false 11 | }, 12 | "nteract": { 13 | "transient": { 14 | "deleting": false 15 | } 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "%run /01_Authenticate_to_Purview_AML" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": { 27 | "collapsed": true, 28 | "jupyter": { 29 | "outputs_hidden": false, 30 | "source_hidden": false 31 | }, 32 | "nteract": { 33 | "transient": { 34 | "deleting": false 35 | } 36 | } 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "%run /02_Create_ML_Lineage_Types" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": { 47 | "collapsed": true, 48 | "jupyter": { 49 | "outputs_hidden": false, 50 | "source_hidden": false 51 | }, 52 | "nteract": { 53 | "transient": { 54 | "deleting": false 55 | } 56 | } 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "%run /03_Create_ML_Lineage_Functions" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 2, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "import json" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "#update below variables with synapse adls name and container/filesystem name\n", 79 | "data_lake_account_name = \"\"\n", 80 | "file_system_name = \"data\"" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "metadata": { 87 | "jupyter": { 88 | "outputs_hidden": false, 89 | "source_hidden": false 90 | }, 91 | "nteract": { 92 | "transient": { 93 | "deleting": false 94 | } 95 | } 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "import pandas as pd\n", 100 | "import numpy as np\n", 101 | "\n", 102 | "synapse_base_path = 'abfss://' + file_system_name + '@' + data_lake_account_name + '.dfs.core.windows.net'\n", 103 | "df_borrower = spark.read.load(synapse_base_path+ '/creditriskdata/borrower.csv', format='csv', header=True).toPandas()\n", 104 | "#display(df_borrower.head(10))\n", 105 | "\n", 106 | "df_loan = spark.read.load(synapse_base_path + '/creditriskdata/loan.csv', format='csv', header=True).toPandas()\n", 107 | "#display(df_loan.head(1))\n", 108 | "\n", 109 | "# Join data and do some transformations\n", 110 | "df_data = df_borrower.merge(df_loan,on='memberId',how='inner')\n", 111 | "df_data.shape\n", 112 | "\n", 113 | "df_sp = spark.createDataFrame(df_data)\n", 114 | "df_sp = df_sp.drop('loanStatus')\n", 115 | "\n", 116 | "df_sp.write.option('header', 'true').mode('overwrite').csv(synapse_base_path + '/creditriskdata/testdata/')\n", 117 | "\n", 118 | "df_data['homeOwnership'] = df_data['homeOwnership'].replace('nan', np.nan).fillna(0)\n", 119 | "df_data['isJointApplication'] = df_data['isJointApplication'].replace('nan', np.nan).fillna(0)\n", 120 | "\n", 121 | "drop_cols = ['memberId', 'loanId', 'date','grade']\n", 122 | "df_data = df_data.drop(drop_cols, axis=1)\n", 123 | "#df_data.dtypes" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 6, 129 | "metadata": { 130 | "collapsed": true, 131 | "jupyter": { 132 | "outputs_hidden": false, 133 | "source_hidden": false 134 | }, 135 | "nteract": { 136 | "transient": { 137 | "deleting": false 138 | } 139 | } 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "experimentname = \"CreditRiskExperiment\"" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": { 150 | "collapsed": true, 151 | "jupyter": { 152 | "outputs_hidden": 
false, 153 | "source_hidden": false 154 | }, 155 | "nteract": { 156 | "transient": { 157 | "deleting": false 158 | } 159 | } 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "#create an entity for prepated data\n", 164 | "data_ent_name = 'creditriskdata'\n", 165 | "create_data_entity_with_schema(df_data,data_ent_name,'custom_dataset')\n", 166 | "\n", 167 | "#create preprocess lineage \n", 168 | "\n", 169 | "syn_basepath = 'https://' + data_lake_account_name + '.dfs.core.windows.net/' + file_system_name + '/creditriskdata'\n", 170 | "purview_basepath = 'pyapacheatlas://'\n", 171 | "\n", 172 | "in_ent_qns = {syn_basepath + '/borrower.csv':'azure_datalake_gen2_path',syn_basepath + '/loan.csv':'azure_datalake_gen2_path'}\n", 173 | "out_ent_qns = {purview_basepath + data_ent_name:'custom_dataset'}\n", 174 | "\n", 175 | "processname = '-preprocess'\n", 176 | "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=True)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 8, 182 | "metadata": { 183 | "collapsed": true, 184 | "jupyter": { 185 | "outputs_hidden": false, 186 | "source_hidden": false 187 | }, 188 | "nteract": { 189 | "transient": { 190 | "deleting": false 191 | } 192 | } 193 | }, 194 | "outputs": [], 195 | "source": [ 196 | "from azureml.core.experiment import Experiment\n", 197 | "from azureml.train.automl.run import AutoMLRun\n", 198 | "from azureml.train.automl import AutoMLConfig\n", 199 | "\n", 200 | "##run only once\n", 201 | "experiment = Experiment(ws, experimentname)\n", 202 | "\n", 203 | "automl_classifier_config = AutoMLConfig(\n", 204 | " task='classification', \n", 205 | " enable_early_stopping = True, \n", 206 | " iterations = 2, \n", 207 | " experiment_timeout_minutes=15,\n", 208 | " primary_metric='AUC_weighted',\n", 209 | " training_data= df_data,\n", 210 | " #compute = 'local',\n", 211 | " label_column_name='loanStatus',\n", 212 | " n_cross_validations=5,\n", 213 | " model_explainability=True,\n", 214 | " enable_onnx_compatible_models=True,\n", 215 | " enable_voting_ensemble=False,\n", 216 | " enable_stack_ensemble=False\n", 217 | " )\n", 218 | "local_run = experiment.submit(automl_classifier_config, show_output=True)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 10, 224 | "metadata": { 225 | "collapsed": true, 226 | "jupyter": { 227 | "outputs_hidden": false, 228 | "source_hidden": false 229 | }, 230 | "nteract": { 231 | "transient": { 232 | "deleting": false 233 | } 234 | } 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "# get experiment run, get the best model and register\n", 239 | "\n", 240 | "from azureml.core.experiment import Experiment\n", 241 | "from azureml.core.workspace import Workspace\n", 242 | "from azureml.train.automl.run import AutoMLRun\n", 243 | "from azureml.train.automl import AutoMLConfig\n", 244 | "from azureml.core.model import Model\n", 245 | "import joblib\n", 246 | "\n", 247 | "# get experiment run, get the best model and register\n", 248 | "experimentname = \"CreditRiskExperiment\"\n", 249 | "\n", 250 | "for automl_run in ws.experiments[experimentname].get_runs():\n", 251 | " best_run, fitted_model = automl_run.get_output() # We are taking the first run. 
You can update this if you'd like to take a different run\n", 252 | "    break\n", 253 | "\n", 254 | "#save the model to a local file\n", 255 | "model_path = 'creditrisk_model'\n", 256 | "joblib.dump(fitted_model, model_path)\n", 257 | "\n", 258 | "model_name = \"creditrisk_model\"\n", 259 | "registered_model = Model.register(model_path = model_path, # this points to a local file\n", 260 | "                       model_name = model_name, # name the model is registered as\n", 261 | "                       tags = {'type': \"classification\"}, \n", 262 | "                       description = \"Credit Risk Classifier\", \n", 263 | "                       workspace = ws)\n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 11, 269 | "metadata": { 270 | "collapsed": true, 271 | "jupyter": { 272 | "outputs_hidden": false, 273 | "source_hidden": false 274 | }, 275 | "nteract": { 276 | "transient": { 277 | "deleting": false 278 | } 279 | } 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "#create packages entities\n", 284 | "#[programming_language,package_name,version,notes]\n", 285 | "packageslist = [['python','mmlspark','v0.0.11','older versions before 0.0.10 give error'],\n", 286 | "                ['python','scikit-learn','0.22rc2.post1','latest version 0.24.x gives error if you call the model from Azure Function']]\n", 287 | "create_package_entities(experimentname,packageslist)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 12, 293 | "metadata": { 294 | "collapsed": true, 295 | "jupyter": { 296 | "outputs_hidden": false, 297 | "source_hidden": false 298 | }, 299 | "nteract": { 300 | "transient": { 301 | "deleting": false 302 | } 303 | } 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "#create experiment train lineage\n", 308 | "create_experiment_config_entity(ws,experimentname,automl_run)\n", 309 | "create_model_entity(ws,experimentname,model_name)\n", 310 | "create_model_metrics_entity(experimentname,best_run)\n", 311 | "\n", 312 | "pbasepath = 'pyapacheatlas://'\n", 313 | "\n", 314 | "in_ent_qns = {pbasepath + data_ent_name:'custom_dataset',pbasepath + experimentname + \"-config\":'custom_ml_exp_config',pbasepath + experimentname + '-packages':'custom_ml_packages'}\n", 315 | "out_ent_qns = {pbasepath + model_name:'custom_ml_model',pbasepath + experimentname + \"-modelmetrics\":'custom_ml_model_metrics'}\n", 316 | "\n", 317 | "processname = '-train'\n", 318 | "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=False)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 13, 324 | "metadata": { 325 | "collapsed": true, 326 | "jupyter": { 327 | "outputs_hidden": false, 328 | "source_hidden": false 329 | }, 330 | "nteract": { 331 | "transient": { 332 | "deleting": false 333 | } 334 | } 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "scoring_script = \"\"\"\n", 339 | "import json\n", 340 | "import pickle\n", 341 | "import numpy as np\n", 342 | "import pandas as pd\n", 343 | "import azureml.train.automl\n", 344 | "from sklearn.externals import joblib\n", 345 | "from azureml.core.model import Model\n", 346 | "\n", 347 | "def init():\n", 348 | "    global model\n", 349 | "    # This name is the model.id of the model we want to deploy; deserialize the model file back\n", 350 | "    model_path = Model.get_model_path(model_name = 'creditrisk_model')\n", 351 | "    model = joblib.load(model_path)\n", 352 | "\n", 353 | "def run(input_json): \n", 354 | "    try:\n", 355 | "        data_df = pd.read_json(input_json) \n", 356 | "        # Get the predictions...\n", 357 | "        prediction = 
model.predict(data_df)\n", 358 | " prediction = json.dumps(prediction.tolist())\n", 359 | " except Exception as e:\n", 360 | " prediction = str(e)\n", 361 | " return prediction\n", 362 | "\"\"\"\n", 363 | "exec(scoring_script)\n", 364 | "with open(\"scoring_script.py\", \"w\") as file:\n", 365 | " file.write(scoring_script)\n", 366 | " \n", 367 | "scoring_script_file_name = 'scoring_script.py'\n", 368 | "\n", 369 | "#test locally\n", 370 | "import numpy as np\n", 371 | "# X_test = spark.sql('select * from default.creditrisk_data limit 20').toPandas()\n", 372 | "drop_cols = ['loanStatus']\n", 373 | "X_test = df_data.drop(drop_cols, axis=1)\n", 374 | "X_test = X_test.head(1)\n", 375 | "json_test_data = X_test.to_json(orient='records')\n", 376 | "print(json_test_data)\n", 377 | "init()\n", 378 | "run(json_test_data)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 14, 384 | "metadata": { 385 | "collapsed": true, 386 | "jupyter": { 387 | "outputs_hidden": false, 388 | "source_hidden": false 389 | }, 390 | "nteract": { 391 | "transient": { 392 | "deleting": false 393 | } 394 | } 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "# obtain conda dependencies from the automl run and save the file locally\n", 399 | "from azureml.core import Environment\n", 400 | "environment_config_file = 'creditrisk_conda_env.yml'\n", 401 | "best_run.download_file('outputs/conda_env_v_1_0_0.yml', environment_config_file)\n", 402 | "# with open('creditrisk_conda_env.yml', 'r') as f:\n", 403 | "# print(f.read())\n", 404 | "\n", 405 | "# create the environment based on the saved conda dependencies file\n", 406 | "myenv = Environment.from_conda_specification(name=\"creditriskenv\", file_path=environment_config_file)\n", 407 | "myenv.register(workspace=ws)\n", 408 | "\n", 409 | "from azureml.core.model import InferenceConfig\n", 410 | "from azureml.core.webservice import AciWebservice\n", 411 | "from azureml.core.webservice import Webservice\n", 412 | "\n", 413 | "# Configure and deploy the web service to Azure Container Instances\n", 414 | "inference_config = InferenceConfig(environment=myenv, entry_script=scoring_script_file_name)\n", 415 | "aci_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb= 2, tags = { 'type' : 'automl-classification'}, description='AutoML Credit Risk Classifier Service')\n", 416 | "aci_service_name = 'creditrisk-automl-service'\n", 417 | "aci_service = Model.deploy(ws, aci_service_name, [registered_model], inference_config, aci_config)\n", 418 | "aci_service.wait_for_deployment(show_output = True)\n", 419 | "print(aci_service.state)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 16, 425 | "metadata": { 426 | "collapsed": true, 427 | "jupyter": { 428 | "outputs_hidden": false, 429 | "source_hidden": false 430 | }, 431 | "nteract": { 432 | "transient": { 433 | "deleting": false 434 | } 435 | } 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "aci_service_name = 'creditrisk-automl-service'\n", 440 | "create_model_service_entity(ws,experimentname,aci_service_name,json_test_data)\n", 441 | "\n", 442 | "pbasepath = 'pyapacheatlas://'\n", 443 | "\n", 444 | "in_ent_qns = {pbasepath + model_name:'custom_ml_model'}\n", 445 | "out_ent_qns = {pbasepath + experimentname + \"-model_endpoint\":'custom_ml_model_endpoint'}\n", 446 | "\n", 447 | "processname = '-deploymodel'\n", 448 | "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=False)" 449 | ] 450 | }, 451 | { 452 | 
"cell_type": "code", 453 | "execution_count": 17, 454 | "metadata": { 455 | "jupyter": { 456 | "outputs_hidden": false, 457 | "source_hidden": false 458 | }, 459 | "nteract": { 460 | "transient": { 461 | "deleting": false 462 | } 463 | } 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "#batch inferencing\n", 468 | "df_test = spark.read.load(synapse_base_path +'/creditriskdata/testdata', format='csv', header=True).toPandas()\n", 469 | "\n", 470 | "drop_cols = ['memberId', 'loanId', 'date','grade']\n", 471 | "df_test1 = df_test.drop(drop_cols, axis=1)\n", 472 | "\n", 473 | "model_path = Model.get_model_path(model_name = 'creditrisk_model')\n", 474 | "model = joblib.load(model_path)\n", 475 | "\n", 476 | "prediction = model.predict(df_test1)\n", 477 | "prediction\n", 478 | "\n", 479 | "df_result = df_test \n", 480 | "df_result['prediction'] = prediction\n", 481 | "df_result\n", 482 | "\n", 483 | "data_lake_account_name = 'purviewaccdl'\n", 484 | "file_system_name = 'purviewaccfs'\n", 485 | "df_sp = spark.createDataFrame(df_result)\n", 486 | "df_sp.write.option('header', 'true').mode('overwrite').csv(synapse_base_path + '/creditriskdata/batchpredictions/')\n", 487 | "\n", 488 | "df_sp.write.mode(\"overwrite\").saveAsTable(\"default.creditrisk_predictions\")" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 22, 494 | "metadata": { 495 | "collapsed": true, 496 | "jupyter": { 497 | "outputs_hidden": false, 498 | "source_hidden": false 499 | }, 500 | "nteract": { 501 | "transient": { 502 | "deleting": false 503 | } 504 | } 505 | }, 506 | "outputs": [], 507 | "source": [ 508 | "#create an entity for test data\n", 509 | "test_data_ent_name = 'creditrisktestdata'\n", 510 | "create_data_entity_with_schema(df_test,test_data_ent_name,entitytype='custom_dataset')\n", 511 | "\n", 512 | "#create an entity for batch inference data\n", 513 | "batchpred_data_ent_name = 'creditriskbatchpredictions'\n", 514 | "create_data_entity_with_schema(df_result,batchpred_data_ent_name,entitytype='custom_dataset')\n", 515 | "\n", 516 | "#create batch inference lineage \n", 517 | "syn_basepath = 'https://' + data_lake_account_name + 'dfs.core.windows.net' + file_system_name + '/'\n", 518 | "pbasepath = 'pyapacheatlas://'\n", 519 | "\n", 520 | "in_ent_qns = {pbasepath + test_data_ent_name:'custom_dataset',pbasepath + model_name:'custom_ml_model'}\n", 521 | "out_ent_qns = {pbasepath + batchpred_data_ent_name:'custom_dataset'}\n", 522 | "\n", 523 | "processname = '-batchinference'\n", 524 | "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=True)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "## uncomment below code to link PowerBI Dataset and Report in lineage if you have access to a PBI workspace " 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 23, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "# #The PowerBI entities will populate with more details if you set up a scan for PBI workspaces in Purview\n", 543 | "# #We are only creating placeholders and links for lineage below\n", 544 | "\n", 545 | "# #create PowerBI dataset entity and lineage \n", 546 | "# pbi_workspace = '' #'https://xxx.powerbi.com/groups/7c555287-f9b8-45ff-be6c-9909afe9df40'\n", 547 | "# pbi_datasetid = '' #'c4a30c22-466d-4a30-a1ac-8736ed6567cc' \n", 548 | "\n", 549 | "# pbidata_ent_name = 'creditriskpbidataset' \n", 550 | "\n", 
551 | "# create_powerbi_dataset_and_lineage(experimentname,pbi_workspace,pbi_datasetid,pbidata_ent_name,batchpred_data_ent_name,'custom_dataset')" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 24, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "# #create PowerBI report entity and lineage\n", 561 | "# pbi_reportid = '' #'e495453d-6c0c-4fb9-bdc4-556319f6a57b'\n", 562 | "# pbi_ent_name = 'creditriskpbireport'\n", 563 | " \n", 564 | "# create_powerbi_report_and_lineage(experimentname,pbi_workspace,pbi_reportid,pbi_ent_name,pbi_datasetid)" 565 | ] 566 | } 567 | ], 568 | "metadata": { 569 | "kernelspec": { 570 | "display_name": "Python 3", 571 | "language": "python", 572 | "name": "python3" 573 | }, 574 | "language_info": { 575 | "codemirror_mode": { 576 | "name": "ipython", 577 | "version": 3 578 | }, 579 | "file_extension": ".py", 580 | "mimetype": "text/x-python", 581 | "name": "python", 582 | "nbconvert_exporter": "python", 583 | "pygments_lexer": "ipython3", 584 | "version": "3.7.4" 585 | }, 586 | "save_output": true 587 | }, 588 | "nbformat": 4, 589 | "nbformat_minor": 2 590 | } 591 | --------------------------------------------------------------------------------