├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── sparklin ├── BlobTriggerFuncApp │ ├── .funcignore │ ├── .gitignore │ ├── BlobTriggerFunction │ │ ├── .funcignore │ │ ├── Data.py │ │ ├── Synapse_JsonParser.py │ │ ├── __init__.py │ │ ├── column_parser.py │ │ ├── function.json │ │ ├── host.json │ │ ├── join_parser.py │ │ ├── readme.md │ │ └── sample.dat │ ├── host.json │ └── requirements.txt ├── HttpTriggerFuncApp │ ├── .funcignore │ ├── .gitignore │ ├── HttpTriggerFunction │ │ ├── __init__.py │ │ ├── event.py │ │ ├── function.json │ │ ├── sample.dat │ │ └── tablestorage.py │ ├── Reqtest.http │ ├── host.json │ └── requirements.txt ├── Onboarding.md ├── OpenLineage │ ├── OpenLineage.sln │ └── OpenLineage │ │ ├── CaptureLineage.cs │ │ ├── Constant.cs │ │ ├── EventMetadata.cs │ │ ├── OpenLineage.csproj │ │ ├── OpenLineage.csproj.user │ │ ├── TableStorage.cs │ │ ├── Utility.cs │ │ └── host.json ├── README.md └── openlineage-spark-0.4.0.jar └── tompo ├── Onboarding.md ├── README.md ├── TOMPo.pbix ├── TOMPo_ModelMetada.ipynb ├── TOMPo_OnboardingSteps.docx └── TOMPo_ReportParser.ipynb /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | page_type: sample 3 | languages: 4 | - python 5 | products: 6 | - azure 7 | - power bi 8 | - synapse 9 | - databricks 10 | - purview 11 | - analysis services 12 | - datalake 13 | 14 | name: Data Lineage 15 | description: End to end data lineage from source to visualizations. 
16 | --- 17 | 18 | # Onboarding Documents 19 | 20 | - [Onboarding Sparklin](https://github.com/microsoft/DataLineage/blob/main/sparklin/Onboarding.md) 21 | - [Onboarding TOMPo](https://github.com/microsoft/DataLineage/blob/main/tompo/Onboarding.md) 22 | 23 | ![image](https://user-images.githubusercontent.com/118733500/227436747-f527883b-92da-4482-8aec-b34c03003cf7.png) 24 | 25 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/.funcignore: -------------------------------------------------------------------------------- 1 | .git* 2 | .vscode 3 | local.settings.json 4 | test 5 | .venv -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # pipenv 86 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
87 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 88 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 89 | # install all needed dependencies. 90 | #Pipfile.lock 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .env 100 | .venv 101 | env/ 102 | venv/ 103 | ENV/ 104 | env.bak/ 105 | venv.bak/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # mkdocs documentation 115 | /site 116 | 117 | # mypy 118 | .mypy_cache/ 119 | .dmypy.json 120 | dmypy.json 121 | 122 | # Pyre type checker 123 | .pyre/ 124 | 125 | # Azure Functions artifacts 126 | bin 127 | obj 128 | appsettings.json 129 | local.settings.json 130 | 131 | # Azurite artifacts 132 | __blobstorage__ 133 | __queuestorage__ 134 | __azurite_db*__.json 135 | .python_packages -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/.funcignore: -------------------------------------------------------------------------------- 1 | venv -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/Data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from azure.data.tables import TableServiceClient, UpdateMode 3 | # import os 4 | 5 | class AZTableStorage: 6 | 7 | table_name = "" 8 | conn_str = "" 9 | table_client = "" 10 | 11 | def createClient(self, table_name=None, conn_str=None): 12 | self.table_name = table_name 13 | self.conn_str = conn_str 14 | print(">>>>>>>>table_name>>>>>>>>"+self.table_name) 15 | print(">>>>>>>>conn_str>>>>>>>>"+self.conn_str) 16 | self.table_service = TableServiceClient.from_connection_string(self.conn_str) 17 | 18 | # Create the table if it does not already exist 19 | self.table_service.create_table_if_not_exists(self.table_name) 20 | 21 | self.table_client = self.table_service.get_table_client(self.table_name) 22 | # logging.info(self.table_name) 23 | return self.table_client 24 | 25 | def insert_entity(self, tbl_client, entity): 26 | #entity = self.deserialize() 27 | return tbl_client.create_entity(entity) 28 | 29 | def azure_upsert_entity(self, tbl_client, entity): 30 | # print(entity) 31 | #entity = self.deserialize() 32 | return tbl_client.upsert_entity(mode=UpdateMode.REPLACE, entity=entity) 33 | 34 | def azure_query_entities(self,tbl_client, queryfilter): 35 | return tbl_client.query_entities(queryfilter) 36 | 37 | # @staticmethod 38 | # def deserialize(): 39 | # params = {key: request.form.get(key) for key in request.form.keys()} 40 | # params["PartitionKey"] = "Chicago" 41 | # params["RowKey"] = "2021-07-01 12:00 AM" 42 | # return params 43 | 44 | def create_event_entity(self, PartitionKey, RowKey, Status, RetryCount, FilePath, isArchived, Message): 45 | my_entity = {"PartitionKey" : str(PartitionKey), 46 | "RowKey" : str(RowKey), 47 | "Status" : str(Status), 48 | "RetryCount" : RetryCount, 49 | "FilePath" : str(FilePath), 50 | "isArchived" : isArchived, 51 | "Message" : str(Message) 52 | } 53 | return my_entity 54 | 55 | def create_lineage_entity(self, PartitionKey, RowKey, input_tables, output_table, input_columns, output_columns, isdelta, isintermediate, isglobal, derived_columns, joinconditions): 56 | my_entity = {"PartitionKey" : 
str(PartitionKey), 57 | "RowKey" : str(RowKey), 58 | "input_tables" : str(input_tables), 59 | "output_table" : str(output_table), 60 | "input_columns" : str(input_columns), 61 | "output_columns" : str(output_columns), 62 | "isdelta" : str(isdelta), 63 | "isintermediate" : str(isintermediate), 64 | "isglobal" : str(isglobal), 65 | "derived_columns" : str(derived_columns), 66 | "joinconditions" : str(joinconditions) 67 | } 68 | return my_entity 69 | # def main(): 70 | # name = "e4b54cdb-e6ca-432d-8edf-e584fec37611_sql_test_data_lineage_pool_1650000026883_20220527140329.json" 71 | # fileName = name[0:name.index(".")] 72 | # print(">>>>>>>>1>>>>>>>>"+fileName) 73 | # azStorage = AZTableStorage() 74 | # azStorage.createClient() 75 | # metadata = azStorage.create_entity("Learning",fileName,"UnProcessed","Success",3,name,True) 76 | # azStorage.upsert_entity(metadata) 77 | 78 | # if __name__ == "__main__": 79 | # main() -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/Synapse_JsonParser.py: -------------------------------------------------------------------------------- 1 | import json, os 2 | from pyapacheatlas.core.util import GuidTracker, AtlasException 3 | from pyapacheatlas.core import PurviewClient, AtlasEntity, TypeCategory, AtlasProcess 4 | from pyapacheatlas.core.typedef import (EntityTypeDef, AtlasAttributeDef) 5 | import logging, re 6 | 7 | from .column_parser import get_column_transformations 8 | from .join_parser import get_join_conditions 9 | 10 | 11 | class PurviewTransform: 12 | def __init__(self, client, in_data): 13 | self.client = client 14 | self.in_data = in_data 15 | self._inputs = [] 16 | self._relationship_inputs = [] 17 | self._raw_inputs = [] 18 | self._outputs = [] 19 | self._output_fields = [] 20 | self._raw_outputs = [] 21 | self._relationship_outputs = [] 22 | self._raw_outputs = [] 23 | self._aliases = [] 24 | self._tables = [] 25 | self._alias_tablenames = {} 26 | self._column_mapping = [] 27 | self._field_mapping = None 28 | self._col_alias_map = {} 29 | self._input_cols = [] 30 | self._output_cols = [] 31 | self.hardcodecol = [] 32 | self.input_tables = [] 33 | self.output_table = "" 34 | self.output_table_schm = "" 35 | self.joinList = [] 36 | self._table_and_columns = {} 37 | self.inp_qualified_Name = "" 38 | self.out_qualified_Name = "" 39 | self.setproject = 0 40 | self.table_and_schema_count = {} 41 | self.sqlcommand = "" 42 | self._input_table_qualifiedName = {} 43 | 44 | self.inp_name = "" 45 | self.inp_guid = "" 46 | self.inp_entitytype = "" 47 | 48 | self.output_name = "" 49 | self.output_guid = "" 50 | self.output_entitytype = "" 51 | 52 | self.a = [] 53 | self.tbl = [] 54 | self.ds_create_i = 0 55 | 56 | self.intermediate_tbl_views = [] 57 | self.deltatable = [] 58 | self.globaltempviews = [] 59 | 60 | self.nb_name = "" 61 | self.rowkey = "" 62 | self.cluster_name = "" 63 | self.dataframe = "N" 64 | 65 | self.match = 0 66 | self.unmatch = 0 67 | 68 | logging.info("logger started") 69 | self.gt = GuidTracker() 70 | 71 | logging.info("finished init") 72 | 73 | def get_tbl_nm(self, inp_tblname, type='out'): 74 | if inp_tblname.count(",") > 0: 75 | outtblnm = inp_tblname.split(",")[1].strip(" ").replace('[', '').replace(']', '') 76 | schnm = inp_tblname.split(",")[0].strip(" ").replace('[', '').replace(']', '') 77 | 78 | if type == "inp": 79 | self.input_tables.append(schnm + "." + outtblnm) 80 | else: 81 | self.output_table_schm = schnm + "." 
+ outtblnm 82 | 83 | if outtblnm not in self.deltatable and schnm.lower() != "global_temp": 84 | # self.deltatable.update({outtblnm: schnm}) 85 | self.deltatable.append(schnm + "." + outtblnm) 86 | if outtblnm not in self.deltatable and schnm.lower() == "global_temp": 87 | self.globaltempviews.append(schnm + "." + outtblnm) 88 | # tblnm = str(child['multipartIdentifier']).replace('[', '').replace(']', '') 89 | else: 90 | outtblnm = inp_tblname.replace('[', '').replace(']', '') 91 | self.intermediate_tbl_views.append(outtblnm) 92 | if type == "inp": 93 | self.input_tables.append(outtblnm) 94 | else: 95 | self.output_table_schm = outtblnm 96 | # schnm = "" 97 | 98 | if outtblnm not in self.table_and_schema_count: 99 | self.table_and_schema_count.update({outtblnm: inp_tblname.count(",")}) 100 | 101 | return outtblnm 102 | 103 | def subquery_alias(self, plan): 104 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias': 105 | if plan['identifier']['name'] != '__auto_generated_subquery_name': 106 | self._aliases.append(plan['identifier']['name']) 107 | 108 | def unresolved_relation(self, plan): 109 | if plan['class'] == 'org.apache.spark.sql.catalyst.analysis.UnresolvedRelation': 110 | tblnm = self.get_tbl_nm(str(plan['multipartIdentifier']), "inp") 111 | self._tables.append(tblnm) 112 | 113 | def get_fields_pattern(self, sqlcmd, plan): 114 | 115 | if sqlcmd == "INSERT": 116 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.Project': 117 | if self.setproject == 0: 118 | for project in plan['projectList']: 119 | # print("Project Entered") 120 | _fields = "" 121 | for field in project: 122 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Alias": 123 | _fields += field['name'] + "|" 124 | if field['class'] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute": 125 | _fields += field['nameParts'] + "|" 126 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Literal": 127 | if field['value'] is not None: 128 | _fields += "lit(" + field['value'] + ")" + "|" 129 | else: 130 | _fields += "lit(null)" + "|" 131 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.WindowExpression": 132 | _fields += "Row_number()" + "|" + "Partition" + "|" + "Group" + "|" 133 | self._output_fields.append(_fields) 134 | self.hardcodecol = get_column_transformations(plan['projectList']) 135 | self.setproject = 1 136 | 137 | self.subquery_alias(plan) 138 | self.unresolved_relation(plan) 139 | 140 | if sqlcmd == "CreateViewCommand": 141 | for child in plan['child']: 142 | if child['class'] == 'org.apache.spark.sql.catalyst.plans.logical.Project': 143 | if self.setproject == 0: 144 | for project in child['projectList']: 145 | # print(project) 146 | _fields = "" 147 | for field in project: 148 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Alias": 149 | _fields += field['name'] + "|" 150 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.AttributeReference": 151 | _fields += field['name'] + "|" 152 | self._output_fields.append(_fields) 153 | self.hardcodecol = get_column_transformations(child['projectList']) 154 | self.setproject = 1 155 | self.subquery_alias(child) 156 | self.unresolved_relation(child) 157 | if not self._tables: 158 | self._tables.append(self.nb_name + "_dataframe") 159 | self.dataframe = "Y" 160 | 161 | if sqlcmd == "CREATEVIEW": 162 | for child in plan['child']: 163 | if child['class'] == 'org.apache.spark.sql.catalyst.plans.logical.Project': 164 | if self.setproject == 0: 165 | 
for project in child['projectList']: 166 | _fields = "" 167 | for field in project: 168 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Alias": 169 | _fields += field['name'] + "|" 170 | if field['class'] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute": 171 | _fields += field['nameParts'] + "|" 172 | if field['class'] == "org.apache.spark.sql.catalyst.analysis.UnresolvedFunction": 173 | if field['name']['funcName'] == "DENSE_RANK": 174 | _fields += "DENSE_RANK||||" + "|" 175 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Literal": 176 | if field['value'] is not None: 177 | _fields += "lit(" + field['value'] + ")" + "|" 178 | else: 179 | _fields += "lit(null)" + "|" 180 | if field['class'] == 'org.apache.spark.sql.catalyst.analysis.UnresolvedStar': 181 | print("*") 182 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.WindowExpression": 183 | _fields += "Row_number()" + "|" + "Partition" + "|" + "Group" + "|" 184 | self._output_fields.append(_fields) 185 | self.hardcodecol = get_column_transformations(child['projectList']) 186 | self.setproject = 1 187 | 188 | self.subquery_alias(child) 189 | self.unresolved_relation(child) 190 | 191 | if sqlcmd == "DS_CREATETABLE": 192 | _output_proj = [] 193 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias': 194 | self.ds_create_i += 1 195 | self.a.append(self.ds_create_i) 196 | # print(plan['identifier']['name']) 197 | self.tbl.append(plan['identifier']['name']) 198 | 199 | if plan['class'] == 'org.apache.spark.sql.execution.datasources.LogicalRelation': 200 | self.ds_create_i += 1 201 | 202 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.Project': 203 | self.ds_create_i += 1 204 | if self.setproject == 0: 205 | for project in plan['projectList']: 206 | _fields = "" 207 | for field in project: 208 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Alias": 209 | _fields += field['name'] + "|" 210 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.AttributeReference": 211 | if field['qualifier']: 212 | qualifier = field['qualifier'].replace('[', '').replace(']', '') 213 | _fields += "[" + qualifier + "," + field['name'] + "]" + "|" 214 | else: 215 | _fields += field['name'] + "|" 216 | self._output_fields.append(_fields) 217 | self.hardcodecol = get_column_transformations(child['projectList']) 218 | self.setproject = 1 219 | 220 | if sqlcmd == "Project": 221 | if self.setproject == 0: 222 | for project in plan['projectList']: 223 | _fields = "" 224 | for field in project: 225 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Alias": 226 | _fields += field['name'] + "|" 227 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.AttributeReference": 228 | _fields += field['name'] + "|" 229 | self._output_fields.append(_fields) 230 | self.setproject = 1 231 | self._tables.append(self.nb_name + "_dataframe") 232 | 233 | if sqlcmd == "MERGE": 234 | 235 | for key, values in plan.items(): 236 | if key == "matchedActions": 237 | if values: 238 | if 'InsertAction' in values[0][0]['class']: 239 | for field in values[0]: 240 | if field['class'] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute": 241 | self._output_fields.append(field['nameParts']) 242 | else: 243 | self.match = 1 244 | else: 245 | self.match = 1 246 | if key == "notMatchedActions": 247 | if values: 248 | if 'InsertAction' in values[0][0]['class']: 249 | for field in values[0]: 250 | if field['class'] == 
"org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute": 251 | self._output_fields.append(field['nameParts']) 252 | else: 253 | self.unmatch = 1 254 | else: 255 | self.unmatch = 1 256 | 257 | if self.match == 1 and self.unmatch == 1: 258 | os._exit(0) 259 | 260 | self.subquery_alias(plan) 261 | self.unresolved_relation(plan) 262 | 263 | def get_parse_plan(self, output_plan): 264 | 265 | for plan in output_plan: 266 | _alias_tables = "" 267 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.InsertIntoStatement': 268 | for tbl in plan['table']: 269 | self.output_table = self.get_tbl_nm(tbl['multipartIdentifier']) 270 | self.sqlcommand = "INSERT" 271 | 272 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.CreateViewStatement': 273 | # print("hi") 274 | self.output_table = self.get_tbl_nm(plan['viewName']) 275 | self.sqlcommand = "CREATEVIEW" 276 | 277 | if plan['class'] == "org.apache.spark.sql.catalyst.plans.logical.CreateTableAsSelectStatement": 278 | self.output_table = self.get_tbl_nm(plan['tableName']) 279 | self.sqlcommand = "INSERT" 280 | 281 | if plan['class'] == 'org.apache.spark.sql.execution.datasources.CreateTable': 282 | self.output_table = self.get_tbl_nm(plan['tableDesc']['identifier']['table']) 283 | self.sqlcommand = "DS_CREATETABLE" 284 | 285 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.MergeIntoTable': 286 | self.sqlcommand = "MERGE" 287 | 288 | if plan['class'] == 'org.apache.spark.sql.execution.command.CreateViewCommand': 289 | self.output_table = self.get_tbl_nm(plan['name']['table']) 290 | # print(plan['name']['table']) 291 | self.sqlcommand = "CreateViewCommand" 292 | 293 | self.get_fields_pattern(self.sqlcommand, plan) 294 | 295 | return True 296 | 297 | def _column_alias_map(self, val_list, alias=""): 298 | if alias == 'NoAlias': 299 | if 'NoAlias' in self._col_alias_map.keys(): 300 | self._col_alias_map['NoAlias'].append(val_list[0].strip()) 301 | else: 302 | # create a new array in this slot 303 | self._col_alias_map['NoAlias'] = [val_list[0].strip()] 304 | else: 305 | if val_list[0] in self._col_alias_map: 306 | self._col_alias_map[val_list[0]].append(val_list[1].strip()) 307 | else: 308 | # create a new array in this slot 309 | self._col_alias_map[val_list[0]] = [val_list[1].strip()] 310 | 311 | @staticmethod 312 | def _column_clean(val): 313 | return val.replace('[', '').replace(']', '').split(",") 314 | 315 | def get_inp_out_fields(self, _output_fields, sqlcommand): 316 | print("Raw Source Fields ") 317 | print(_output_fields) 318 | if sqlcommand == "MERGE": 319 | # abc = {} 320 | # for i in range(len(_output_fields)): 321 | # if _output_fields.count(_output_fields[i]) > 1: 322 | # if _output_fields[i] not in abc: 323 | # abc.update({_output_fields[i]: i}) 324 | # print(abc) 325 | # for key, value in abc.items(): 326 | # _output_fields.pop(value) 327 | 328 | odd_i = [] 329 | even_i = [] 330 | for i in range(0, len(_output_fields)): 331 | if i % 2: 332 | even_i.append(_output_fields[i]) 333 | else: 334 | odd_i.append(_output_fields[i]) 335 | 336 | for i in odd_i: 337 | val_list = self._column_clean(i) 338 | if len(val_list) > 1: 339 | self._output_cols.append(val_list[1].strip()) 340 | else: 341 | self._output_cols.append(val_list[0].strip()) 342 | # self._output_cols.append(i.replace('[', '').replace(']', '')) 343 | for i in even_i: 344 | val_list = self._column_clean(i) 345 | if len(val_list) > 1: 346 | self._input_cols.append(val_list[1].strip()) 347 | self._column_alias_map(val_list) 348 | else: 349 
| self._input_cols.append(val_list[0].strip()) 350 | self._column_alias_map(val_list, "NoAlias") 351 | 352 | self.output_table = self._tables[0] 353 | self._tables.pop(0) 354 | self.output_table_schm = self.input_tables[0] 355 | self.input_tables.pop(0) 356 | else: 357 | for val in _output_fields: 358 | if val.count("|") == 1: 359 | val_list = self._column_clean(val.replace('|', '')) 360 | if len(val_list) > 1: 361 | self._input_cols.append(val_list[1].strip()) 362 | self._output_cols.append(val_list[1].strip()) 363 | self._column_alias_map(val_list) 364 | else: 365 | # self._col_alias_map.update({'NoAlias':val_list[0].strip()}) 366 | self._input_cols.append(val_list[0]) 367 | self._output_cols.append(val_list[0]) 368 | self._column_alias_map(val_list, 'NoAlias') 369 | 370 | if val.count("|") == 2: 371 | val_split = val.split("|")[1] 372 | if val_split.__contains__("lit"): 373 | self._input_cols.append(val_split) 374 | # self.hardcodecol.append(val_split + " as " + val.split("|")[0]) 375 | else: 376 | # print(val_split) 377 | val_list = self._column_clean(val_split) 378 | # print(val_list) 379 | if len(val_list) > 1: 380 | self._input_cols.append(val_list[1].strip()) 381 | self._column_alias_map(val_list) 382 | else: 383 | self._input_cols.append(val_list[0].strip()) 384 | self._column_alias_map(val_list, 'NoAlias') 385 | 386 | self._output_cols.append(val.split("|")[0]) 387 | 388 | if val.count("|") == 3: 389 | val_split = val.split("|")[1] 390 | val_list = self._column_clean(val_split) 391 | if len(val_list) > 1: 392 | self._input_cols.append(val_list[1].strip()) 393 | else: 394 | self._input_cols.append(val_list[0].strip()) 395 | self._output_cols.append(val.split("|")[0].strip()) 396 | # self._column_alias_map(val_list) 397 | 398 | # if val.count("|") > 3: 399 | # self.hardcodecol.append("Derived Logic like CASE/CONCAT/DENSE_RANK " + val.split("|")[0]) 400 | 401 | print("Input and Output and HardCode and Column_Alias Mapping") 402 | print(self._input_cols) 403 | print(self._output_cols) 404 | print(self.hardcodecol) 405 | print(self._col_alias_map) 406 | 407 | return True 408 | 409 | # def get_join_conditions(self, output_plan): 410 | # 411 | # for plan in output_plan: 412 | # if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.Join': 413 | # join = (plan['joinType']['object'])[:-1] 414 | # joinname = join.split('.')[-1] 415 | # tablealias1 = ((plan['condition'][1])['nameParts']).split(',')[0][1:] 416 | # column1 = ((plan['condition'][1])['nameParts']).split(',')[1][1:-1] 417 | # tablealias2 = ((plan['condition'][2])['nameParts']).split(',')[0][1:] 418 | # column2 = ((plan['condition'][2])['nameParts']).split(',')[1][1:-1] 419 | # row = self._alias_tablenames[tablealias1] + " " + joinname + "Join" + " " + self._alias_tablenames[ 420 | # tablealias2] + " on " + column1 + "=" + column2 421 | # self.joinList.append(row) 422 | # return True 423 | 424 | def get_alias_table_cols(self): 425 | 426 | for key, value in self._col_alias_map.items(): 427 | if key in self._alias_tablenames: 428 | if type(value) is list: 429 | for col in value: 430 | colname = col 431 | tablename = self._alias_tablenames.get(key) 432 | self._table_and_columns.update({colname: tablename}) 433 | else: 434 | colname = value[0] 435 | tablename = self._alias_tablenames.get(key) 436 | self._table_and_columns.update({colname: tablename}) 437 | else: 438 | if type(value) is list: 439 | for col in value: 440 | colname = col 441 | tablename = list(self._alias_tablenames.values())[0] 442 | 
self._table_and_columns.update({colname: tablename}) 443 | else: 444 | colname = value[0] 445 | tablename = list(self._alias_tablenames.values())[0] 446 | self._table_and_columns.update({colname: tablename}) 447 | 448 | print("Alias Tables and Columns Mapping") 449 | print(self._alias_tablenames) 450 | print(self._table_and_columns) 451 | 452 | return True 453 | 454 | def search_purview_entity(self, entityname): 455 | 456 | print(entityname) 457 | search = self.client.discovery.search_entities(entityname) 458 | qname, name, guid, entitytype = "", "", "", "" 459 | for entity in search: 460 | # re.search(r"^cooked*\W?", entity['qualifiedName']) 461 | # print(entity) 462 | try: 463 | if (entity["entityType"] == "azure_datalake_gen2_path" or str( 464 | entity["entityType"]).lower() == "dataset") and \ 465 | str(entity['name']).lower().strip() == entityname.lower() and re.search( 466 | r"^" + self.cluster_name + "://", entity['qualifiedName']): 467 | qname = entity['qualifiedName'] 468 | name = entity['name'] 469 | guid = entity['id'] 470 | entitytype = entity['entityType'] 471 | 472 | # print(entity['qualifiedName']) 473 | # print("ABC "+qname) 474 | except: 475 | print("Search Scan results are Coming Differently") 476 | 477 | return qname, name, guid, entitytype 478 | 479 | def get_ds_create_tables(self): 480 | for j in range(len(self.a) - 1): 481 | k = self.a[j] 482 | if self.a[j + 1] == k + 1: 483 | self._alias_tablenames.update({self.tbl[j]: self.tbl[j + 1]}) 484 | 485 | def purview_plan_push(self, qualifiedName, runid): 486 | 487 | print(self._tables) 488 | print(self.table_and_schema_count) 489 | 490 | purv_inpqname, purv_inpname, purv_inpguid, purv_inpentitytype = "", "", "", "" 491 | purv_outqname, purv_outname, purv_outguid, purv_outentitytype = "", "", "", "" 492 | 493 | for inp in self._tables: 494 | # Need to convert it into a Function 495 | if inp in self.table_and_schema_count and self.table_and_schema_count[inp] > 0: 496 | print(inp + " hiii " + str(self.table_and_schema_count[inp])) 497 | # purv_inpqname, purv_inpname, purv_inpguid, purv_inpentitytype = self.search_purview_entity(inp.lower()) 498 | 499 | # print(purv_inpqname) 500 | if purv_inpqname: 501 | # print("Here" + " " + purv_qname) 502 | self.inp_qualified_Name = purv_inpqname 503 | self.inp_name = purv_inpname 504 | self.inp_guid = purv_inpguid 505 | self.inp_entitytype = purv_inpentitytype 506 | else: 507 | self.inp_qualified_Name = (qualifiedName + "://" + inp).lower() 508 | self.inp_name = inp 509 | self.inp_guid = self.gt.get_guid() 510 | self.inp_entitytype = "DataSet" 511 | 512 | self._input_table_qualifiedName.update({inp: self.inp_qualified_Name}) 513 | self._inputs.append(AtlasEntity(name=self.inp_name, 514 | typeName=self.inp_entitytype, 515 | qualified_name=self.inp_qualified_Name, 516 | guid=self.inp_guid) 517 | ) 518 | 519 | print("Input Tables and Its Qualified Names") 520 | print(self._input_table_qualifiedName) 521 | 522 | if self.sqlcommand == "INSERT" or self.sqlcommand == "MERGE": 523 | print("INSERT Occured Hence Output Table will be Searched") 524 | # purv_outqname, purv_outname, purv_outguid, purv_outentitytype = self.search_purview_entity(self.output_table) 525 | 526 | if purv_outqname: 527 | self.out_qualified_Name = purv_outqname 528 | self.output_name = purv_outname 529 | self.output_guid = purv_outguid 530 | self.output_entitytype = purv_outentitytype 531 | else: 532 | self.out_qualified_Name = (qualifiedName + "://" + self.output_table).lower() 533 | self.output_name = self.output_table 534 
| self.output_guid = self.gt.get_guid() 535 | self.output_entitytype = "DataSet" 536 | 537 | print(self.output_table + " " + self.out_qualified_Name) 538 | 539 | OutputTable = AtlasEntity( 540 | name=self.output_name, 541 | typeName=self.output_entitytype, 542 | qualified_name=self.out_qualified_Name, 543 | guid=self.output_guid 544 | ) 545 | if self._input_cols and self._output_cols: 546 | for item in zip(self._input_cols, self._output_cols): 547 | if item[0] in self._table_and_columns: 548 | inp = self._input_table_qualifiedName.get(self._table_and_columns.get(item[0])) 549 | self._column_mapping.append( 550 | {"ColumnMapping": [ 551 | {"Source": item[0], "Sink": item[1]}], 552 | "DatasetMapping": {"Source": inp, 553 | "Sink": OutputTable.qualifiedName}}) 554 | 555 | print("ColumnMapping") 556 | print(self._column_mapping) 557 | # print(self.hardcodecol) 558 | # print(self.joinList) 559 | process = AtlasProcess( 560 | name=qualifiedName + "_" + self.output_table + "_process", 561 | typeName="HRServicesInsights_OneHRSI", 562 | qualified_name="hrdi://synapse_notebook/" + self.nb_name + "/" + qualifiedName + "_" + self.output_table, 563 | inputs=self._inputs, 564 | outputs=[OutputTable], 565 | guid=self.gt.get_guid(), 566 | attributes={"columnMapping": json.dumps(self._column_mapping), 567 | "hardCoded_Columns": self.hardcodecol, 568 | "Delta_Tables": self.deltatable, 569 | "Global_Temp_Views_or_Tables": self.globaltempviews, 570 | "Intermediate_Views_or_Tables": self.intermediate_tbl_views, 571 | "JoinConditions": self.joinList} 572 | ) 573 | if self._inputs and OutputTable: 574 | try: 575 | results = self.client.upload_entities([process, OutputTable] + self._inputs) 576 | except: 577 | print("No ColumnMapping or Input or Output Tables Available") 578 | 579 | def purview_dataset_push(self, inp_qname, inp_name, out_qname, out_name, process_qname, name): 580 | print("Came") 581 | a = AtlasEntity( 582 | name=inp_name, 583 | typeName="DataSet", 584 | qualified_name=inp_qname, 585 | guid=self.gt.get_guid() 586 | ) 587 | b = AtlasEntity( 588 | name=out_name, 589 | typeName="DataSet", 590 | qualified_name=out_qname, 591 | guid=self.gt.get_guid() 592 | ) 593 | 594 | process = AtlasProcess( 595 | name=name, 596 | typeName="Process", 597 | qualified_name="Process" + process_qname, 598 | inputs=[a], 599 | outputs=[b], 600 | guid=self.gt.get_guid() 601 | ) 602 | 603 | results = self.client.upload_entities(batch=[a, b, process]) 604 | 605 | def get_project_details(self, output_plan): 606 | for plan in output_plan: 607 | _alias_tables = "" 608 | if plan['class'] == "org.apache.spark.sql.catalyst.plans.logical.Project": 609 | if plan['projectList'][0][0]['qualifier']: 610 | # print(plan['projectList'][0][0]['qualifier']) 611 | self.output_table = self.get_tbl_nm(plan['projectList'][0][0]['qualifier']) 612 | self.sqlcommand = "Project" 613 | else: 614 | os._exit(0) 615 | 616 | self.get_fields_pattern(self.sqlcommand, plan) 617 | return True 618 | 619 | def transform_to_purview(self): 620 | 621 | inputs_array = self.in_data['inputs'] 622 | outputs_array = self.in_data['outputs'] 623 | runid = self.in_data['run']['runId'] 624 | 625 | _name = self.in_data['job']['name'].split(".")[0].split("_")[:-2] 626 | self.rowkey = "_".join(_name) + "_" + runid 627 | self.nb_name = "_".join(_name) 628 | 629 | if re.search("hrsi", self.in_data['job']['name'].lower()): 630 | self.cluster_name = "hrservicesinsights" 631 | elif re.search("ultp", self.in_data['job']['name'].lower()): 632 | self.cluster_name = "ultp_services" 
633 | elif re.search("gtabi", self.in_data['job']['name'].lower()): 634 | self.cluster_name = "gtabi_services" 635 | elif re.search("learning", self.in_data['job']['name'].lower()): 636 | self.cluster_name = "learninginsights" 637 | elif re.search("hcm", self.in_data['job']['name'].lower()): 638 | self.cluster_name = "headcountmanagement" 639 | else: 640 | self.cluster_name = "external" 641 | 642 | # qualifiedName = self.cluster_name + "://" + self.nb_name 643 | qualifiedName = self.cluster_name 644 | print(qualifiedName) 645 | # print(len(self.in_data['run']['facets'])) 646 | 647 | if len(self.in_data['run']['facets']) > 1: 648 | 649 | _plan = self.in_data['run']['facets']['spark.logicalPlan']['plan'] 650 | print(runid + " " + self.nb_name) 651 | 652 | classname = self.in_data['run']['facets']['spark.logicalPlan']['plan'][0]['class'] 653 | 654 | if classname == "org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand": 655 | for inp in self.in_data['inputs']: 656 | table = inp['name'].split("/")[-1] 657 | self.input_tables.append(table) 658 | for col in inp['facets']['schema']['fields']: 659 | column = col['name'] 660 | self._input_cols.append(column) 661 | self._table_and_columns.update({column: table}) 662 | self.output_table_schm = self.in_data['outputs'][0]['name'].split("/")[-1] 663 | self.input_tables = list(dict.fromkeys(self.input_tables)) 664 | self._input_cols = list(dict.fromkeys(self._input_cols)) 665 | self.output_table = self.output_table_schm 666 | self._output_cols = self._input_cols 667 | self.deltatable = self.output_table 668 | self._tables = self.input_tables 669 | 670 | print(self.input_tables) 671 | print(self.output_table_schm) 672 | print(self._input_cols) 673 | print(self._output_cols) 674 | print(self._table_and_columns) 675 | 676 | else: 677 | if classname == "org.apache.spark.sql.catalyst.plans.logical.Project": 678 | self.get_project_details(_plan) 679 | else: 680 | self.get_parse_plan(_plan) 681 | 682 | if self.sqlcommand == "DS_CREATETABLE": 683 | self.get_inp_out_fields(self._output_fields, self.sqlcommand) 684 | self.get_ds_create_tables() 685 | else: 686 | self.get_inp_out_fields(self._output_fields, self.sqlcommand) 687 | 688 | if self._aliases and self._tables: 689 | if len(self._aliases) == len(self._tables): 690 | for item in zip(self._aliases, self._tables): 691 | self._alias_tablenames.update({item[0]: item[1]}) 692 | else: 693 | _aliases_new = self._aliases[1:] 694 | for item in zip(_aliases_new, self._tables): 695 | self._alias_tablenames.update({item[0]: item[1]}) 696 | elif self._tables: 697 | self._alias_tablenames.update({'NoAlias': self._tables[0]}) 698 | 699 | if self._col_alias_map and self._alias_tablenames: 700 | self.get_alias_table_cols() 701 | 702 | if self.sqlcommand in ("CreateViewCommand", "CREATEVIEW"): 703 | self.joinList = get_join_conditions(_plan[0]['child'], self._alias_tablenames) 704 | elif self.sqlcommand == "INSERT": 705 | self.joinList = get_join_conditions(_plan, self._alias_tablenames) 706 | 707 | if self.dataframe == "Y": 708 | inp_qname = self.out_qualified_Name 709 | inp_name = self.output_name 710 | out_name = self.nb_name + "_dataframe" 711 | out_qname = (self.cluster_name + "://" + self.nb_name + "://" + out_name).lower() 712 | # out_qname = (self.cluster_name + "://" + out_name).lower() 713 | 714 | # print(inp_name) 715 | self.purview_dataset_push(inp_qname, inp_name, out_qname, out_name, inp_name, "TabletoDataframe") 716 | 717 | print("Output Table") 718 | print(self.output_table) 719 | 
print(self.output_table_schm) 720 | print(self.input_tables) 721 | print(self.deltatable) 722 | print(self.globaltempviews) 723 | print(self.intermediate_tbl_views) 724 | 725 | print(self._tables) 726 | self.purview_plan_push(qualifiedName, runid) 727 | 728 | else: 729 | print("Facets doesnt have any Plan") 730 | inp_qname, inp_name = "", "" 731 | if inputs_array: 732 | for inp in inputs_array: 733 | inp_qname = dict(inp).get('name') 734 | inp_name = str(dict(inp).get('name')).split("/")[-1].split('.')[0] 735 | out_name = self.nb_name + "_dataframe" 736 | out_qname = self.cluster_name + "://" + self.nb_name + "://" + out_name 737 | print(inp_name) 738 | self.purview_dataset_push(inp_qname.lower(), inp_name, out_qname.lower(), out_name, out_name, 739 | "FiletoDataframe") 740 | else: 741 | os._exit(0) 742 | 743 | return self.cluster_name, self.rowkey, self.input_tables, self.output_table_schm, self._input_cols, self._output_cols, self.deltatable, self.intermediate_tbl_views, self.globaltempviews, self.hardcodecol, self.joinList 744 | -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import azure.functions as func 4 | import json 5 | from .Data import AZTableStorage 6 | from azure.data.tables import TableClient 7 | import os, traceback, sys 8 | 9 | from .Synapse_JsonParser import PurviewTransform 10 | from pyapacheatlas.auth import ServicePrincipalAuthentication 11 | from pyapacheatlas.core import PurviewClient 12 | 13 | oauth = ServicePrincipalAuthentication( 14 | tenant_id=<>, 15 | client_id=<>, 16 | client_secret=<> 17 | ) 18 | 19 | client = PurviewClient( 20 | account_name=<>, 21 | authentication=oauth 22 | ) 23 | 24 | def main(myblob: func.InputStream): 25 | 26 | # logging.info(f"Python blob trigger function processed blob \n" 27 | # f"Name: {myblob.name}\n" 28 | # f"Blob Size: {myblob.length} bytes") 29 | 30 | fileName = myblob.name.split("/")[1].split(".")[0] 31 | # logging.info(f"****fileName******* {fileName} ") 32 | 33 | logging.info(f"****TableName******* {os.getenv('StorageTableName')}") 34 | 35 | # cluster_name, nb_name, input_tables,output_table, _input_cols, _output_cols,deltatable, intermediate_tbl_views,globaltempviews, hardcodecol, joinList = "","","","","","","","","","","" 36 | 37 | pt = PurviewTransform(client, json.load(myblob)) 38 | 39 | azStorage = AZTableStorage() 40 | 41 | event_client = azStorage.createClient(os.getenv('TableName'),os.getenv('datalineagesynapsestrpoc_STORAGE')) 42 | 43 | lineage_client = azStorage.createClient(os.getenv('StorageTableName'),os.getenv('datalineagesynapsestrpoc_STORAGE')) 44 | 45 | streamName = "HRSI" 46 | 47 | name_filter = "PartitionKey eq '%s' and RowKey eq '%s' and (Status eq 'Unprocessed' or Status eq 'Parsing Failed')" % (streamName, fileName) 48 | # # table_client = TableClient.from_connection_string(conn_str=os.getenv('datalineagesynapsestrpoc_STORAGE'), table_name=os.getenv('TableName')) 49 | 50 | entities = azStorage.azure_query_entities(event_client, name_filter) 51 | 52 | # lineage_metadata = azStorage.create_lineage_entity("HRSI",fileName,"","","","","","","","") 53 | # event_metadata = azStorage.create_event_entity("HRSI",fileName,"Unprocessed",3,myblob.name,False,"SUCCESS") 54 | 55 | # azStorage.insert_entity(event_client, event_metadata) 56 | # azStorage.insert_entity(lineage_client, lineage_metadata) 57 | 58 | 59 | # 
cluster_name, nb_name, input_tables,output_table, _input_cols, _output_cols,deltatable, intermediate_tbl_views,globaltempviews, hardcodecol, joinList = pt.transform_to_purview() 60 | # lineage_metadata = azStorage.create_lineage_entity(cluster_name, nb_name, input_tables, 61 | # output_table, _input_cols, _output_cols, 62 | # deltatable, intermediate_tbl_views, 63 | # globaltempviews, hardcodecol, joinList) 64 | # azStorage.azure_upsert_entity(lineage_client, lineage_metadata) 65 | 66 | for entity in entities: 67 | print('RowKey:' + entity['RowKey']) 68 | try: 69 | cluster_name, nb_name, input_tables,output_table, _input_cols, _output_cols,deltatable, intermediate_tbl_views,globaltempviews, hardcodecol, joinList = pt.transform_to_purview() 70 | if input_tables and output_table: 71 | lineage_metadata = azStorage.create_lineage_entity(cluster_name, nb_name, input_tables, 72 | output_table, _input_cols, _output_cols, 73 | deltatable, intermediate_tbl_views, 74 | globaltempviews, hardcodecol, joinList) 75 | azStorage.azure_upsert_entity(lineage_client, lineage_metadata) 76 | print("Execution Completed") 77 | metadata = azStorage.create_event_entity("HRSI",fileName,"Processed",3,myblob.name,False,"SUCCESS") 78 | except BaseException as e: 79 | print("Exception Caused in Parsing " + str(e)) 80 | exc_type, exc_value, exc_traceback = sys.exc_info() 81 | err_msg = traceback.format_exception(exc_type, exc_value,exc_traceback)[-2:] 82 | # for i in traceback.format_exception(exc_type, exc_value,exc_traceback): 83 | # print(i) 84 | metadata = azStorage.create_event_entity("HRSI",fileName,"Parsing Failed",3,myblob.name,False,err_msg) 85 | 86 | 87 | try: 88 | azStorage.azure_upsert_entity(event_client, metadata) 89 | except: 90 | logging.info("There is no ROWKEY (alias FileName) in the EventMetadata Table for Update") 91 | -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/column_parser.py: -------------------------------------------------------------------------------- 1 | # import json 2 | # from collections.abc import Mapping 3 | 4 | 5 | def unresolvedAttribute(string): 6 | a = string.strip('[]').split(',') 7 | return (".".join(i.strip() for i in a)) 8 | 9 | 10 | def literal(rec): 11 | elem = "" 12 | if rec["dataType"] == "string": 13 | elem = "'" + rec["value"] + "'" 14 | elif rec["dataType"] == "null": 15 | elem = "NULL" 16 | else: 17 | elem = rec["value"] 18 | return elem 19 | 20 | 21 | def attribute(rec): 22 | str = "" 23 | if rec["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute": 24 | str = unresolvedAttribute(rec["nameParts"]) 25 | elif rec["class"] == "org.apache.spark.sql.catalyst.analysis.Literal" or rec[ 26 | "class"] == "org.apache.spark.sql.catalyst.expressions.Literal": 27 | str = literal(rec) 28 | elif rec["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedStar": 29 | str = "*" 30 | return str 31 | 32 | 33 | def WindowSpecDefinition(row, index): 34 | str = "" 35 | # print(index) 36 | curr_index = index + 1 37 | if len(row[index]["partitionSpec"]) > 0: 38 | children = len(row[index]["partitionSpec"]) 39 | no_of_children = 0 40 | str += "PARTITION BY " 41 | while no_of_children < children and curr_index < len(row): 42 | temp = "" 43 | if row[curr_index]["num-children"] > 0: 44 | temp, curr_index = function(row, curr_index) 45 | else: 46 | temp = attribute(row[curr_index]) 47 | str += temp 48 | if no_of_children < children - 1: 49 | str += ", " 50 | curr_index += 1 51 | 
no_of_children += 1 52 | if len(row[index]["orderSpec"]) > 0: 53 | str += " ORDER BY " 54 | if row[curr_index]["num-children"] == 1: 55 | temp = "" 56 | if row[curr_index]["direction"]["object"] == "org.apache.spark.sql.catalyst.expressions.Descending$": 57 | temp = " DESC" 58 | else: 59 | temp = " ASC" 60 | curr_index += 1 61 | children = len(row[index]["orderSpec"]) 62 | no_of_children = 0 63 | while no_of_children < children and curr_index < len(row): 64 | temp2 = "" 65 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.SortOrder": 66 | if row[curr_index]["direction"]["object"] == "org.apache.spark.sql.catalyst.expressions.Descending$": 67 | temp = " DESC" 68 | else: 69 | temp = " ASC" 70 | curr_index += 1 71 | continue 72 | elif row[curr_index]["num-children"] > 0: 73 | temp2, curr_index = function(row, curr_index) 74 | else: 75 | temp2 = attribute(row[curr_index]) 76 | str += temp2 77 | str += temp 78 | if no_of_children < children - 1: 79 | str += ", " 80 | curr_index += 1 81 | no_of_children += 1 82 | # print(curr_index) 83 | 84 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.UnspecifiedFrame$": 85 | curr_index += 1 86 | pass 87 | return (str, curr_index - 1) 88 | 89 | 90 | def windowExpression(row, index): 91 | fstr = "" 92 | curr_index = index + 1 93 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedFunction": 94 | str, curr_index = function(row, curr_index) 95 | fstr += str 96 | curr_index += 1 97 | fstr += " OVER (" 98 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.WindowSpecDefinition": 99 | str, curr_index = WindowSpecDefinition(row, curr_index) 100 | fstr += str 101 | curr_index += 1 102 | fstr += ")" 103 | return (fstr, curr_index - 1) 104 | 105 | 106 | def recurse(row, index): 107 | children = row[index]["num-children"] 108 | no_of_children = 0 109 | curr_index = index + 1 110 | while no_of_children < children and curr_index < len(row): 111 | curr_index = recurse(row, curr_index) 112 | curr_index += 1 113 | no_of_children += 1 114 | return curr_index - 1 115 | 116 | 117 | def function(row, curr_index): 118 | funcName,funcStr, index = "", "", "" 119 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.CaseWhen": 120 | return ("CASE WHEN function", recurse(row, curr_index)) 121 | elif row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.Cast": 122 | funcName = "CAST" 123 | # elif row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.Add": 124 | # pass 125 | elif row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.UnspecifiedFrame": 126 | return (funcStr, index) 127 | elif row[curr_index]["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAlias": 128 | pass 129 | else: 130 | if "name" in row[curr_index]: 131 | funcName = row[curr_index]["name"]["funcName"] 132 | else: 133 | funcName = row[curr_index]["class"].split(".")[-1] 134 | children = row[curr_index]["num-children"] 135 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAlias": 136 | funcStr = "" 137 | else: 138 | funcStr = f"{funcName}(" 139 | index = curr_index + 1 140 | no_of_children = 0 141 | while no_of_children < children and index < len(row): 142 | elem = row[index] 143 | str = "" 144 | if elem["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedFunction": 145 | str, index = function(row, index) 146 | elif elem["class"] == "org.apache.spark.sql.catalyst.expressions.Cast": 
147 | str, index = function(row, index) 148 | elif elem["class"] == "org.apache.spark.sql.catalyst.expressions.WindowExpression": 149 | str, index = windowExpression(row, index) 150 | elif elem["num-children"] == 0: 151 | str = attribute(elem) 152 | else: 153 | str, index = function(row, index) 154 | funcStr += str 155 | if no_of_children < children - 1: 156 | funcStr += ", " 157 | index += 1 158 | no_of_children += 1 159 | if funcName == "CAST": 160 | dataType = row[curr_index]["dataType"].upper() 161 | funcStr += f" AS {dataType}" 162 | 163 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAlias": 164 | pass 165 | else: 166 | funcStr += ")" 167 | return (funcStr, index - 1) 168 | 169 | 170 | def get_column_transformations(parseJson): 171 | output = [] 172 | for line in parseJson: 173 | # print(len(line)>1) 174 | firstElem = line[0] 175 | alias = "" 176 | index = 0 177 | if firstElem["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAlias": 178 | pass 179 | if firstElem["class"] == "org.apache.spark.sql.catalyst.expressions.Alias": 180 | alias = firstElem["name"] 181 | index += 1 182 | elif firstElem["num-children"] == 0: 183 | alias = attribute(firstElem) 184 | index += 1 185 | # print("sdc") 186 | if len(line) > 1: 187 | # print("Dsc") 188 | funcStr = "" 189 | while index < len(line): 190 | if line[index]["class"] == "org.apache.spark.sql.catalyst.expressions.WindowExpression": 191 | funcStr, index = windowExpression(line, index) 192 | # elif line[index]["num-children"] == 0: 193 | # funcStr = attribute(line[index]) 194 | elif line[index]["num-children"] > 0: 195 | funcStr, index = function(line, index) 196 | index += 1 197 | if funcStr != "": 198 | 199 | # print(index) 200 | if firstElem["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAlias": 201 | output += [funcStr] 202 | # print(funcStr) 203 | else: 204 | output += [funcStr + " AS " + alias] 205 | # print(funcStr + " AS " + alias) 206 | # else: 207 | # print(alias) 208 | # elif 209 | if len(output) == 0: 210 | return None 211 | else: 212 | return output 213 | 214 | 215 | # def findProjectList(elem, count): 216 | # output = [] 217 | # if isinstance(elem, Mapping) or isinstance(elem, list): 218 | # if "projectList" in elem: 219 | # temp = parser(elem["projectList"]) 220 | # # print("") 221 | # # if temp is not None: 222 | # output += [temp] 223 | # else: 224 | # for item in elem: 225 | # if isinstance(elem, Mapping): 226 | # temp = findProjectList(elem[item], count + 1) 227 | # else: 228 | # temp = findProjectList(item, count + 1) 229 | # if temp is not None: 230 | # output += temp 231 | # if len(output) > 0: 232 | # print("Output List",output) 233 | # print("-------------------------------") 234 | # print("-------------------------------") 235 | # if len(output) > 0: 236 | # return output 237 | # else: 238 | # return None 239 | 240 | 241 | # cnt=0 242 | # data = json.load(open('dimperson/dimperson_create6.json')) 243 | # # data= json.load(open('Yogesh (1)/dimcrm1.json'), object_pairs_hook=OrderedDict) 244 | # 245 | # tablePlan = data["run"]["facets"]["spark.logicalPlan"]["plan"] 246 | # for i in tablePlan: 247 | # # cnt+=1 248 | # # print(cnt) 249 | # result = findProjectList(i, 0) 250 | # if result is not None: 251 | # print() 252 | # print("Result=", result) 253 | # print("-------------------------------") 254 | # break 255 | # if len(result) > 0: 256 | # print(result) 257 | # print(result) 258 | 
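# ---------------------------------------------------------------------------
# Hedged usage sketch (illustrative only, not part of the original module):
# get_column_transformations() expects the "projectList" array of a Project
# node taken from the spark.logicalPlan facet of an OpenLineage event, i.e. a
# list of expression-node lists. The file name below is hypothetical.
#
# import json
# event = json.load(open("sample_openlineage_event.json"))
# for node in event["run"]["facets"]["spark.logicalPlan"]["plan"]:
#     if isinstance(node, dict) and "projectList" in node:
#         derived = get_column_transformations(node["projectList"])
#         # e.g. ['CAST(emp.salary AS INT) AS salary_int',
#         #       'row_number() OVER (PARTITION BY emp.dept ORDER BY emp.hiredate DESC) AS rn']
#         print(derived)
#         break
# ---------------------------------------------------------------------------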
-------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "name": "myblob", 6 | "type": "blobTrigger", 7 | "direction": "in", 8 | "path": "openlineage/{name}", 9 | "connection": "datalineagesynapsestrpoc_STORAGE" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "extensionBundle": { 4 | "id": "Microsoft.Azure.Functions.ExtensionBundle", 5 | "version": "[2.*, 3.0.0)" 6 | } 7 | } -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/join_parser.py: -------------------------------------------------------------------------------- 1 | 2 | def get_join_conditions(output_plan,_alias_tablenames): 3 | # print("get joins info") 4 | overall_conditions = [] 5 | mid_expressions = [] 6 | attr_expressions = [] 7 | attr_functions = [] 8 | attr_output = [] 9 | intermediate_output = [] 10 | aliases = [] 11 | joinList = [] 12 | count = 0 13 | 14 | for child in output_plan: 15 | 16 | finalout = "" 17 | if child['class'] == 'org.apache.spark.sql.catalyst.plans.logical.Join': 18 | join = (child['joinType']['object'])[:-1] 19 | joinname = join.split('.')[-1] 20 | 21 | if (joinname == "LeftOuter"): 22 | joinname = "LOJ" 23 | elif (joinname == "RightOuter"): 24 | joinname = "ROJ" 25 | elif (joinname == "Inner"): 26 | joinname = "IJ" 27 | elif (joinname == "FullOuter"): 28 | joinname = "FOJ" 29 | 30 | for i in range(len(child['condition'])): 31 | 32 | childclass = child['condition'][i]['class'] 33 | 34 | if childclass != 'org.apache.spark.sql.catalyst.expressions.And' and childclass != 'org.apache.spark.sql.catalyst.expressions.Or': 35 | if childclass != 'org.apache.spark.sql.catalyst.expressions.EqualTo': 36 | if ( 37 | childclass != 'org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute' and childclass != 'org.apache.spark.sql.catalyst.expressions.Literal'): 38 | 39 | if childclass == "org.apache.spark.sql.catalyst.analysis.UnresolvedFunction": 40 | attr_functions.append(child['condition'][i]['name']['funcName']) 41 | # this is for expressions on attribute like isnotnull OR CAST 42 | else: 43 | func = childclass.split('.')[-1] 44 | if func == 'Cast': 45 | datatype = child['condition'][i]['dataType'] 46 | func = "Cast AS " + datatype 47 | attr_expressions.append(func) 48 | 49 | else: 50 | if childclass == 'org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute': 51 | if (child['condition'][i])['nameParts'].find(",") > 0: 52 | column = ((child['condition'][i])['nameParts']).split(',')[1][1:-1] 53 | tablealias = ((child['condition'][i])['nameParts']).split(',')[0][1:] 54 | table = tablealias + "." 
+ column 55 | else: 56 | column = ((child['condition'][i])['nameParts'])[0] 57 | tablealias = "" 58 | table = column 59 | 60 | 61 | # apply functions and expressions on attribute 62 | if len(attr_functions) > 0: 63 | for func in attr_functions: 64 | table = func + "(" + table + ")" 65 | attr_functions.clear() 66 | 67 | if len(attr_expressions) > 0: 68 | for exp in attr_expressions: 69 | table = table + " " + exp 70 | attr_expressions.clear() 71 | 72 | attr_output.append(table) 73 | if tablealias: 74 | aliases.append(tablealias) 75 | else: 76 | literaldatatype = child['condition'][i]['dataType'] 77 | value = child['condition'][i]['value'] 78 | attr_output.append(value) 79 | 80 | # if we have mid expression that means we will get two unresoved attributes class 81 | if (len(mid_expressions) >= 1): 82 | count = count + 1 83 | if count == 2: 84 | intermediate_output.append( 85 | attr_output[0] + " " + mid_expressions[0] + " " + attr_output[1]) 86 | 87 | attr_output.clear() 88 | mid_expressions.clear() 89 | count = 0 90 | else: 91 | intermediate_output.append(attr_output[0]) 92 | attr_output.clear() 93 | else: 94 | if childclass.split('.')[-1] == "EqualTo": 95 | mid_expressions.append("=") 96 | else: 97 | mid_expressions.append(childclass.split('.')[-1]) 98 | else: 99 | overall_conditions.append(childclass.split('.')[-1]) 100 | 101 | if i + 1 == len(child['condition']): 102 | 103 | finalaliases = list(set(aliases)) 104 | 105 | _alias_tablenames_new = dict((k.lower(), v) for k, v in _alias_tablenames.items()) 106 | 107 | if finalaliases[0].lower() in _alias_tablenames_new.keys(): 108 | alias1 = _alias_tablenames_new[finalaliases[0].lower()] 109 | else: 110 | alias1 = "InlineQuery" 111 | if finalaliases[1].lower() in _alias_tablenames_new.keys(): 112 | alias11 = _alias_tablenames_new[finalaliases[1].lower()] 113 | else: 114 | alias11 = "InlineQuery" 115 | if len(overall_conditions) > 0 and len(intermediate_output) > 1: 116 | finalout = alias1 + " " + finalaliases[0] + " " + joinname + " " + alias11 + " " + \ 117 | finalaliases[1] + " ON " + " ".join( 118 | [x for y in zip(intermediate_output, overall_conditions + [0]) for x in y][:-1]) 119 | else: 120 | finalout = alias1 + " " + finalaliases[0] + " " + joinname + " " + alias11 + " " + \ 121 | finalaliases[1] + " ON " + intermediate_output[0] 122 | 123 | # print(finalout) 124 | joinList.append(finalout) 125 | 126 | overall_conditions.clear() 127 | intermediate_output.clear() 128 | aliases.clear() 129 | return joinList 130 | # print("exit joins info") -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/readme.md: -------------------------------------------------------------------------------- 1 | # BlobTrigger - Python 2 | 3 | The `BlobTrigger` makes it incredibly easy to react to new Blobs inside of Azure Blob Storage. This sample demonstrates a simple use case of processing data from a given Blob using Python. 4 | 5 | ## How it works 6 | 7 | For a `BlobTrigger` to work, you provide a path which dictates where the blobs are located inside your container, and can also help restrict the types of blobs you wish to return. For instance, you can set the path to `samples/{name}.png` to restrict the trigger to only the samples path and only blobs with ".png" at the end of their name. 
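For illustration only (a minimal sketch, not this repository's BlobTriggerFunction), a Python blob-trigger body paired with a `function.json` path such as `samples/{name}.png` fires only for `.png` blobs under `samples/`, and the matched blob is handed to the function as an `InputStream`:

```python
import logging

import azure.functions as func


def main(myblob: func.InputStream):
    # "myblob" must match the binding "name" in function.json; the trigger
    # only fires for blobs that match the configured path pattern.
    logging.info("Processing blob %s (%s bytes)", myblob.name, myblob.length)
```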
8 | 9 | ## Learn more 10 | 11 | Documentation -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/sample.dat: -------------------------------------------------------------------------------- 1 | samples-workitems/workitem.txt -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "logging": { 4 | "applicationInsights": { 5 | "samplingSettings": { 6 | "isEnabled": true, 7 | "excludedTypes": "Request" 8 | } 9 | } 10 | }, 11 | "extensionBundle": { 12 | "id": "Microsoft.Azure.Functions.ExtensionBundle", 13 | "version": "[2.*, 3.0.0)" 14 | }, 15 | "functionTimeout": "00:10:00" 16 | } 17 | -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/requirements.txt: -------------------------------------------------------------------------------- 1 | # DO NOT include azure-functions-worker in this file 2 | # The Python Worker is managed by Azure Functions platform 3 | # Manually managing azure-functions-worker may cause unexpected issues 4 | 5 | azure-data-tables 6 | azure-functions==1.7.2 7 | azure-identity==1.6.1 8 | pyapacheatlas==0.12 9 | -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/.funcignore: -------------------------------------------------------------------------------- 1 | .git* 2 | .vscode 3 | local.settings.json 4 | test 5 | .venv -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # pipenv 86 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
87 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 88 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 89 | # install all needed dependencies. 90 | #Pipfile.lock 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .env 100 | .venv 101 | env/ 102 | venv/ 103 | ENV/ 104 | env.bak/ 105 | venv.bak/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # mkdocs documentation 115 | /site 116 | 117 | # mypy 118 | .mypy_cache/ 119 | .dmypy.json 120 | dmypy.json 121 | 122 | # Pyre type checker 123 | .pyre/ 124 | 125 | # Azure Functions artifacts 126 | bin 127 | obj 128 | appsettings.json 129 | local.settings.json 130 | 131 | # Azurite artifacts 132 | __blobstorage__ 133 | __queuestorage__ 134 | __azurite_db*__.json 135 | .python_packages -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/HttpTriggerFunction/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import datetime as dt 4 | import json 5 | import azure.functions as func 6 | from azure.storage.blob import BlobClient, ContainerClient, ContentSettings 7 | from .tablestorage import tablestorage 8 | from .event import event 9 | 10 | 11 | def uploadblob(json_in, blobname, conn_str, lin_container): 12 | 13 | container_client = ContainerClient.from_connection_string(conn_str, container_name=lin_container) 14 | blob = BlobClient.from_connection_string(conn_str, container_name= lin_container, blob_name=blobname) 15 | BlobClient 16 | blob.upload_blob(json_in, overwrite=True) 17 | 18 | 19 | def main(req: func.HttpRequest) -> func.HttpResponse: 20 | 21 | logging.info("http trigger function kicked off") 22 | 23 | lineageContainerStr = os.environ["LINEAGE_STORAGE_CONN_STR"] 24 | lineageContainer = os.environ["LINEAGE_CONTAINER"] 25 | 26 | data = req.get_json() 27 | 28 | currenttimestamp = dt.datetime.utcnow().strftime("%Y%m%d%H%M%S") 29 | eventType = data["eventType"] 30 | className = data["run"]["facets"]["spark.logicalPlan"]["plan"][0]["class"] 31 | runId = data["run"]["runId"] 32 | notebookName = data["job"]["name"] 33 | notebookName = notebookName[0 : notebookName.index('.')] 34 | 35 | fileName = runId + '_' + notebookName + '_' + currenttimestamp + '.json' 36 | filePath = lineageContainer + '/' + fileName 37 | 38 | predefined_class_list = ["org.apache.spark.sql.execution.datasources.CreateTable", 39 | "org.apache.spark.sql.catalyst.plans.logical.CreateViewStatement", 40 | "org.apache.spark.sql.catalyst.plans.logical.CreateTableAsSelectStatement", 41 | "org.apache.spark.sql.catalyst.plans.logical.InsertIntoStatement", 42 | "org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand"] 43 | 44 | if eventType is not None and eventType == "COMPLETE" and className in predefined_class_list : 45 | # upload file as json into blob storage 46 | uploadblob(json.dumps(data), fileName, lineageContainerStr, lineageContainer) 47 | 48 | #code to add new unprocessed row for same uploaded json into azure table 49 | eventrow = event('HRSI', fileName) 50 | eventrow.Status = 'Unprocessed' 51 | eventrow.RetryCount = 3 52 | eventrow.FilepPath = filePath 53 | eventrow.isArchived = False 54 | eventrow.Message = '' 55 | 56 | tableStorage = tablestorage() 57 | 
tableStorage.insertEventMetadata(eventrow.__dict__) 58 | 59 | return func.HttpResponse(f"Func App successfully processed http request") 60 | 61 | else: 62 | 63 | return func.HttpResponse(f"Event Type is not COMPLETE Or ClassName Not Matched") -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/HttpTriggerFunction/event.py: -------------------------------------------------------------------------------- 1 | import azure.data.tables 2 | 3 | class event(): 4 | 5 | Status = 'Unprocessed' 6 | Message = '' 7 | RetryCount = 3 8 | FilepPath = '/openlineage/' 9 | isArchived = 0 10 | 11 | def __init__(self, teamname: str, filename: str) -> None: 12 | self.PartitionKey = teamname 13 | self.RowKey = filename 14 | -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/HttpTriggerFunction/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "authLevel": "function", 6 | "type": "httpTrigger", 7 | "direction": "in", 8 | "name": "req", 9 | "Route": "1/lineage", 10 | "methods": [ 11 | "get", 12 | "post" 13 | ] 14 | }, 15 | { 16 | "type": "http", 17 | "direction": "out", 18 | "name": "$return" 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/HttpTriggerFunction/sample.dat: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Azure" 3 | } -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/HttpTriggerFunction/tablestorage.py: -------------------------------------------------------------------------------- 1 | import os 2 | from azure.data.tables import TableServiceClient 3 | 4 | class tablestorage: 5 | 6 | def __init__(self) -> None: 7 | # connstr = os.environ["LINEAGE_STORAGE_CONN_STR"] 8 | self.connstr = "DefaultEndpointsProtocol=https;AccountName=datalineagesynapsestrpoc;AccountKey=<>;EndpointSuffix=core.windows.net" 9 | # tablename = os.environ["LINEAGE_EVENT_TABLE"] 10 | self.tablename = "EventMetadataYJ" 11 | 12 | self.table_service_client = TableServiceClient.from_connection_string(self.connstr) 13 | self.table_service_client.create_table_if_not_exists(self.tablename) 14 | 15 | 16 | def insertEventMetadata(self, eventrow) -> None: 17 | table_client = self.table_service_client.get_table_client(self.tablename) 18 | table_client.create_entity(eventrow) 19 | -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/Reqtest.http: -------------------------------------------------------------------------------- 1 | POST http://localhost:7071/api/sparkopenlineage 2 | Connect-Type: application/json 3 | 4 | { 5 | "name" : "Yogesh", 6 | "age" : 28 7 | } -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "logging": { 4 | "applicationInsights": { 5 | "samplingSettings": { 6 | "isEnabled": true, 7 | "excludedTypes": "Request" 8 | } 9 | } 10 | }, 11 | "extensionBundle": { 12 | "id": "Microsoft.Azure.Functions.ExtensionBundle", 13 | "version": "[3.*, 4.0.0)" 14 | } 15 | } -------------------------------------------------------------------------------- 
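The `event` and `tablestorage` helpers above write one row per captured event into the EventMetadata table; the blob-trigger app later reads those rows back. As a rough sketch of that read side (a hypothetical snippet, assuming the connection string is exposed as an app setting — the setting names differ between the two apps), unprocessed events can be listed with `azure-data-tables` like this:

```python
import os

from azure.data.tables import TableClient

# Hypothetical sketch: list events that are still waiting to be parsed.
# LINEAGE_STORAGE_CONN_STR is assumed to hold the storage connection string.
table = TableClient.from_connection_string(
    os.environ["LINEAGE_STORAGE_CONN_STR"], table_name="EventMetadata"
)

for entity in table.query_entities("Status eq 'Unprocessed'"):
    print(entity["PartitionKey"], entity["RowKey"], entity["Status"])
```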
/sparklin/HttpTriggerFuncApp/requirements.txt: -------------------------------------------------------------------------------- 1 | # DO NOT include azure-functions-worker in this file 2 | # The Python Worker is managed by Azure Functions platform 3 | # Manually managing azure-functions-worker may cause unexpected issues 4 | 5 | azure-functions -------------------------------------------------------------------------------- /sparklin/Onboarding.md: -------------------------------------------------------------------------------- 1 | Onboarding is easy: it only requires a few configurations in the Synapse Spark pool environment and the code scripts from the SparkLin branch. 2 | The overall steps look like: 3 | 4 | 1. Upload the Jar “openlineage-spark:.jar” into the Synapse Spark pool packages. 5 | 6 | 2. Add the Spark configurations related to OpenLineage in the Synapse Spark pool. 7 | 8 | 3. Create a new storage account with a blob container named openlineage to hold all uploaded JSON files, plus two Azure storage tables (EventMetadata and LineageDetails). 9 | 10 | 4. Create the Azure Function Apps and functions related to SparkLin. 11 | 12 | 5. Create the Purview collection where all lineage assets will reside. 13 | 14 | **Cluster Setup** 15 | 16 | OpenLineage integrates with Spark by implementing the SparkListener interface (SparkListenerSQLExecution, SparkListenerEvent) and collecting information about jobs that are executed inside a Spark application. 17 | 18 | To activate the listener, add the following property to your Spark configuration: 19 | 20 | • spark.extraListeners io.openlineage.spark.agent.OpenLineageSparkListener 21 | 22 | Once the listener is activated, it needs to know where to report lineage events, as well as the namespace of your jobs. Add the following additional configuration lines to your Spark configuration in the Spark pool (a local sanity-check sketch of these settings appears at the end of this document). 23 | 24 | • spark.openlineage.host {your.openlineage.host, i.e. the func app endpoint URL} 25 | 26 | • spark.openlineage.namespace {your workspace name} 27 | 28 | • spark.openlineage.url.param.code {your func app host key (default key)} 29 | 30 | • spark.openlineage.version {1 or v1, depending on the jar} 31 | 32 | 33 | **Storage Account Setup** 34 | 35 | Create a new storage account in the Azure portal using the portal wizard. 36 | 37 | • Create a new container and name it **openlineage**. 38 | 39 | 40 | **Azure Table Storage Setup** 41 | 42 | We work with two Azure storage tables: one stores all events and their processing status, and the other stores all lineage details. 43 | 44 | **Creation steps:** 45 | 46 | Open your storage account, go to **Tables**, and create two new blank tables named EventMetadata and LineageDetails. 47 | 48 | image 49 | 50 | 51 | The EventMetadata table stores information about every event emitted by OpenLineage and tracks the parsing status of each event. 52 | Once rows have been generated, the structure of the EventMetadata table looks like: 53 | 1. PartitionKey 54 | 2. RowKey 55 | 3. Timestamp 56 | 4. Status 57 | 5. RetryCount 58 | 6. FilePath 59 | 7. isArchived 60 | 8. Message 61 | 62 | The LineageDetails table stores all lineage information obtained after JSON parsing. 63 | The structure of the LineageDetails table looks like: 64 | 1. PartitionKey 65 | 2. RowKey 66 | 3. Timestamp 67 | 4. derived_columns 68 | 5. input_columns 69 | 6. input_tables 70 | 7. isdelta 71 | 8. isglobal 72 | 9. isintermediate 73 | 10. joinconditions 74 | 11. output_columns 75 | 12.
output_table 76 | 77 | Once both Azure storage tables exist, we use HTTP-triggered and blob-triggered Azure Functions to process all of the JSON files produced by OpenLineage. 78 | 79 | **Function App Setup Steps:** 80 | 81 | HTTP Trigger Function App: 82 | 83 | Create a new function app in the Azure portal with Application Insights enabled. It provides the HTTP endpoint that the Spark cluster uses to push JSON payloads. This is a C#-based function app. 84 | 85 | Deployment: 86 | 87 | Deploy the function from: https://github.com/microsoft/DataLineage/tree/main/sparklin/OpenLineage 88 | 89 | Add the following configuration settings to the function app: 90 | 91 | ConnectionString : < your new storage account connection string > 92 | 93 | ContainerName : openlineage 94 | 95 | TableName : EventMetadata 96 | 97 | **What the function does** 98 | 1. The app stores the incoming JSON payload as a file in blob storage. 99 | 2. The app inserts an entry for that JSON file into the EventMetadata table with status Unprocessed. 100 | 101 | Blob Trigger Function App: 102 | 103 | Create a new function app in the Azure portal with Application Insights enabled. It is triggered whenever the HTTP trigger function app uploads a new blob. This is a Python-based function app. 104 | 105 | Deployment: 106 | 107 | Deploy the function from: https://github.com/microsoft/DataLineage/tree/main/sparklin/BlobTriggerFuncApp 108 | 109 | Add the following configuration settings to the function app: 110 | 111 | datalineagesynapsestrpoc_STORAGE : < your new storage account connection string > 112 | 113 | StorageTableName : LineageDetails 114 | 115 | TableName : EventMetadata 116 | 117 | **What the function does** 118 | 1. The app queries the EventMetadata table and takes all records with a status of 'Unprocessed'. 119 | 2. For every event, the app reads the JSON file from blob storage, parses it with the Python code, and pushes all lineage details to the 'LineageDetails' Azure table. 120 | 3. Finally, the app updates the status of the event to 'Processed' if the lineage is pushed, or to 'Failed' if something goes wrong, and writes the exception details to the 'Message' column. 121 | 122 | **Note: Limitations** 123 | The parser was implemented based on our use cases; if new edge cases turn up, the parser needs to be enhanced. 124 | It currently supports Spark 3.1; for newer Spark versions, parser enhancements are required.
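As referenced in the Cluster Setup section above, the listener properties can also be set on a plain SparkSession builder as a quick local sanity check outside Synapse. This is a hedged sketch with placeholder values (the endpoint, workspace name, host key, and jar path are all assumptions); in Synapse the same properties belong in the Spark pool configuration, not in notebook code.

```python
from pyspark.sql import SparkSession

# Placeholder values only; in Synapse these are set on the Spark pool itself.
spark = (
    SparkSession.builder
    .appName("openlineage-config-check")
    .config("spark.jars", "/path/to/openlineage-spark-0.4.0.jar")  # jar uploaded to the pool in Synapse
    .config("spark.extraListeners", "io.openlineage.spark.agent.OpenLineageSparkListener")
    .config("spark.openlineage.host", "https://<your-func-app>.azurewebsites.net")
    .config("spark.openlineage.namespace", "<your-workspace-name>")
    .config("spark.openlineage.url.param.code", "<your-func-app-host-key>")
    .config("spark.openlineage.version", "1")
    .getOrCreate()
)
```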
125 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.32413.69 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenLineage", "OpenLineage\OpenLineage.csproj", "{41D137BF-7F46-42BB-94D6-437A05CAADDA}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Release|Any CPU = Release|Any CPU 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {41D137BF-7F46-42BB-94D6-437A05CAADDA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 15 | {41D137BF-7F46-42BB-94D6-437A05CAADDA}.Debug|Any CPU.Build.0 = Debug|Any CPU 16 | {41D137BF-7F46-42BB-94D6-437A05CAADDA}.Release|Any CPU.ActiveCfg = Release|Any CPU 17 | {41D137BF-7F46-42BB-94D6-437A05CAADDA}.Release|Any CPU.Build.0 = Release|Any CPU 18 | EndGlobalSection 19 | GlobalSection(SolutionProperties) = preSolution 20 | HideSolutionNode = FALSE 21 | EndGlobalSection 22 | GlobalSection(ExtensibilityGlobals) = postSolution 23 | SolutionGuid = {917ADBC5-E454-47E0-9AE3-850361FEE5DB} 24 | EndGlobalSection 25 | EndGlobal 26 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/CaptureLineage.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Text; 3 | using System.IO; 4 | using System.Threading.Tasks; 5 | using Microsoft.AspNetCore.Mvc; 6 | using Microsoft.Azure.WebJobs; 7 | using Microsoft.Azure.WebJobs.Extensions.Http; 8 | using Microsoft.AspNetCore.Http; 9 | using Microsoft.Extensions.Logging; 10 | using Newtonsoft.Json; 11 | using Microsoft.Azure.Storage; 12 | using Microsoft.Azure.Storage.Blob; 13 | using System.Linq; 14 | 15 | namespace OpenLineage 16 | { 17 | public static class CaptureLineage 18 | { 19 | [FunctionName("CaptureLineage")] 20 | public static async Task Run( 21 | [HttpTrigger(AuthorizationLevel.Function, "get", "post", Route = "1/lineage")] HttpRequest req, 22 | ILogger log) 23 | { 24 | log.LogInformation("C# HTTP trigger function processed a request."); 25 | 26 | string eventType = req.Query["eventType"]; 27 | 28 | string[] predefined_class_list = {"org.apache.spark.sql.execution.datasources.CreateTable", 29 | "org.apache.spark.sql.catalyst.plans.logical.CreateViewStatement", 30 | "org.apache.spark.sql.catalyst.plans.logical.CreateTableAsSelectStatement", 31 | "org.apache.spark.sql.catalyst.plans.logical.InsertIntoStatement", 32 | "org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand", 33 | "org.apache.spark.sql.catalyst.plans.logical.MergeIntoTable"}; 34 | 35 | string requestBody = await new StreamReader(req.Body).ReadToEndAsync(); 36 | dynamic data = JsonConvert.DeserializeObject(requestBody); 37 | eventType = eventType ?? 
data?.eventType; 38 | string className = data["run"]["facets"]["spark.logicalPlan"]["plan"][0]["class"]; 39 | string runId = data["run"]["runId"]; 40 | string notebookName = data["job"]["name"]; 41 | notebookName = notebookName.Substring(0, notebookName.IndexOf(".")); 42 | 43 | if (eventType != null && eventType.Equals("COMPLETE") && predefined_class_list.Contains(className)) 44 | { 45 | string connectionString = Environment.GetEnvironmentVariable("ConnectionString"); 46 | string containerName = Environment.GetEnvironmentVariable("ContainerName"); 47 | TableStorage tableStorage = new TableStorage(); 48 | 49 | string currentTimestamp = DateTime.UtcNow.ToString("yyyyMMddHHmmss"); 50 | string fileName = $"{runId}_{notebookName}_{currentTimestamp}.json"; 51 | EventMetadata eventMetadata = new EventMetadata(Utility.getQualifierName(notebookName), $"{runId}_{notebookName}_{currentTimestamp}"); 52 | eventMetadata.Status = Constant.UN_PROCESSED; 53 | eventMetadata.RetryCount = Constant.RETRY_COUNT; 54 | eventMetadata.isArchived = Constant.IS_ARCHIVE; 55 | eventMetadata.FilePath = $"{containerName}/{fileName}"; 56 | 57 | CloudStorageAccount storageAccount = CloudStorageAccount.Parse(connectionString); 58 | CloudBlobClient client = storageAccount.CreateCloudBlobClient(); 59 | CloudBlobContainer container = client.GetContainerReference(containerName); 60 | 61 | CloudBlockBlob blob = container.GetBlockBlobReference(fileName); 62 | blob.Properties.ContentType = "application/json"; 63 | using (Stream stream = new MemoryStream(Encoding.UTF8.GetBytes(requestBody))) 64 | { 65 | await blob.UploadFromStreamAsync(stream); 66 | } 67 | tableStorage.insetEventMetadata(eventMetadata); 68 | return new OkObjectResult("file uploaded successfully"); 69 | } 70 | else 71 | { 72 | return new OkObjectResult("Event Type is not COMPLETE Or ClassName Not Matched"); 73 | } 74 | } 75 | } 76 | 77 | 78 | } 79 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/Constant.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace OpenLineage 6 | { 7 | static class Constant 8 | { 9 | public const String UN_PROCESSED = "Unprocessed"; 10 | public const int RETRY_COUNT = 3; 11 | public const Boolean IS_ARCHIVE = false; 12 | 13 | public const String LEARNING = "learning"; 14 | public const String GAT = "gta"; 15 | public const String HCM = "hcm"; 16 | public const String HRSI = "hrsi"; 17 | public const String ULTP = "ultp"; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/EventMetadata.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using Microsoft.WindowsAzure.Storage.Table; 5 | 6 | namespace OpenLineage 7 | { 8 | class EventMetadata : TableEntity 9 | { 10 | public EventMetadata(String teamName, String fileName) 11 | { 12 | PartitionKey = teamName; 13 | RowKey = fileName; 14 | } 15 | 16 | public EventMetadata() { } 17 | 18 | public string Status { get; set; } 19 | public string Message { get; set; } 20 | public int RetryCount { get; set; } 21 | public String FilePath { get; set; } 22 | 23 | public Boolean isArchived { get; set; } 24 | 25 | } 26 | } 27 | --------------------------------------------------------------------------------
/sparklin/OpenLineage/OpenLineage/OpenLineage.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | netcoreapp3.1 4 | v3 5 | c5590014-6f5a-416a-9f64-80d4ed0332a8 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | PreserveNewest 16 | 17 | 18 | PreserveNewest 19 | Never 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/OpenLineage.csproj.user: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | <_LastSelectedProfileId>C:\Users\yjain\Source\Repos\HR-HRDI-TABI-DL-HRDIDataLineage\OpenLineage\OpenLineage\Properties\PublishProfiles\sparkOpenLineageRead - Zip Deploy.pubxml 5 | 6 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/TableStorage.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using Microsoft.WindowsAzure.Storage.Table; 5 | using Microsoft.Azure; 6 | using Microsoft.WindowsAzure.Storage; 7 | using Microsoft.WindowsAzure.Storage.Auth; 8 | 9 | namespace OpenLineage 10 | { 11 | class TableStorage 12 | { 13 | string connectionString = Environment.GetEnvironmentVariable("ConnectionString"); 14 | string tableName = Environment.GetEnvironmentVariable("TableName"); 15 | CloudTable table = null; 16 | 17 | public TableStorage() 18 | { 19 | createTableIfNotExist(); 20 | } 21 | 22 | public async void createTableIfNotExist() 23 | { 24 | CloudStorageAccount storageAcc = CloudStorageAccount.Parse(connectionString); 25 | CloudTableClient tblclient = storageAcc.CreateCloudTableClient(); 26 | table = tblclient.GetTableReference(tableName); 27 | await table.CreateIfNotExistsAsync(); 28 | 29 | } 30 | 31 | public async void insetEventMetadata(EventMetadata eventMetadata) 32 | { 33 | if(table==null) createTableIfNotExist(); 34 | TableOperation insertOperation = TableOperation.Insert(eventMetadata); 35 | await table.ExecuteAsync(insertOperation); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/Utility.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace OpenLineage 6 | { 7 | class Utility 8 | { 9 | public static String getQualifierName(String notebookName) 10 | { 11 | if (notebookName != null && notebookName.IndexOf(Constant.GAT, StringComparison.OrdinalIgnoreCase) >= 0) 12 | { 13 | return Constant.GAT.ToUpper(); 14 | } 15 | else if (notebookName != null && notebookName.IndexOf(Constant.HRSI, StringComparison.OrdinalIgnoreCase) >= 0) 16 | { 17 | return Constant.HRSI.ToUpper(); 18 | } 19 | else if (notebookName != null && notebookName.IndexOf(Constant.LEARNING, StringComparison.OrdinalIgnoreCase) >= 0) 20 | { 21 | return Constant.LEARNING.ToUpper(); 22 | } 23 | else if (notebookName != null && notebookName.IndexOf(Constant.HCM, StringComparison.OrdinalIgnoreCase) >= 0) 24 | { 25 | return Constant.HCM.ToUpper(); 26 | } 27 | else if (notebookName != null && notebookName.IndexOf(Constant.ULTP, StringComparison.OrdinalIgnoreCase) >= 0) 28 | { 29 | return Constant.ULTP.ToUpper(); 30 | } 31 | else 32 | { 33 | return notebookName.Substring(0, notebookName.LastIndexOf("_")).ToUpper(); 34 | } 35 | 36 | } 37 | 38 | public static void main() 39 | { 
40 | Console.WriteLine(Utility.getQualifierName("sql_test_data_lineage_pool_1650268838")); 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "logging": { 4 | "applicationInsights": { 5 | "samplingSettings": { 6 | "isEnabled": true, 7 | "excludedTypes": "Request" 8 | } 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /sparklin/README.md: -------------------------------------------------------------------------------- 1 | SparkLin is a Custom Parser which parses the Spark Internal Logical Execution Plan and fetches the required Attributes, entities and the Transformations/functions applied on Columns and Join Conditions on Entities. 2 | 3 | SparkLin uses OpenLineage 0.4 Version Jar which is tightly coupled with Spark 3.1 Version and provides more detailed plan. 4 | 5 | This Project uses two Function Apps which are used for Capturing the Event Json Payloads from OpenLineage into Blob Storage and read or parse the Jsons automatically . 6 | 7 | We are constantly improving the SparkLin to tackle different usecases where the Synapse Notebooks contain Complete Spark Native Code. 8 | -------------------------------------------------------------------------------- /sparklin/openlineage-spark-0.4.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DataLineage/a4390286d334ad02dccd2e10d199487ca13640c2/sparklin/openlineage-spark-0.4.0.jar -------------------------------------------------------------------------------- /tompo/Onboarding.md: -------------------------------------------------------------------------------- 1 | Please refer Onboarding document present in tompo branch 2 | -------------------------------------------------------------------------------- /tompo/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tompo/TOMPo.pbix: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DataLineage/a4390286d334ad02dccd2e10d199487ca13640c2/tompo/TOMPo.pbix -------------------------------------------------------------------------------- /tompo/TOMPo_ModelMetada.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "source": [ 6 | "%%sql\r\n", 7 | "--This notebook should be run after TOMPo_ReportParser notebook is executed as the current notebook(TOMPo_ModelMetadata) utilizes the output of TOMPo_ReportParser\r\n", 8 | "--First Create the TOMPo Database (If not exisits)\r\n", 9 | "CREATE DATABASE IF NOT EXISTS tompo" 10 | ], 11 | "outputs": [], 12 | "execution_count": null, 13 | "metadata": { 14 | "jupyter": { 15 | "source_hidden": false, 16 | "outputs_hidden": false 17 | }, 18 | "nteract": { 19 | "transient": { 20 | "deleting": false 21 | } 22 | }, 23 | "microsoft": { 24 | "language": "sparksql" 25 | }, 26 | "collapsed": false 27 | } 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "%%sql\r\n", 33 | "--Create the tables for TOMPo\r\n", 34 | "use tompo;\r\n", 35 | "DROP TABLE IF EXISTS tompo_datasources;\r\n", 36 | "DROP TABLE IF EXISTS tompo_tables;\r\n", 37 | 
"DROP TABLE IF EXISTS tompo_tablepartitions;\r\n", 38 | "DROP TABLE IF EXISTS tompo_totables;\r\n", 39 | "DROP TABLE IF EXISTS tompo_columns;\r\n", 40 | "DROP TABLE IF EXISTS tompo_measures;\r\n", 41 | "DROP TABLE IF EXISTS tompo_calcdependency;\r\n", 42 | "DROP TABLE IF EXISTS tompo_relationships;\r\n", 43 | "DROP TABLE IF EXISTS tompo_rolememberships; \r\n", 44 | "DROP TABLE IF EXISTS tompo_roles;\r\n", 45 | "DROP TABLE IF EXISTS tompo_dbschema_catalogs;\r\n", 46 | "DROP TABLE IF EXISTS tompo.tompo_reportimpact;" 47 | ], 48 | "outputs": [], 49 | "execution_count": null, 50 | "metadata": { 51 | "jupyter": { 52 | "source_hidden": false, 53 | "outputs_hidden": false 54 | }, 55 | "nteract": { 56 | "transient": { 57 | "deleting": false 58 | } 59 | }, 60 | "microsoft": { 61 | "language": "sparksql" 62 | } 63 | } 64 | }, 65 | { 66 | "cell_type": "code", 67 | "source": [ 68 | "%%spark\r\n", 69 | "//Drop\\Remove the External tables\r\n", 70 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 71 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_datasources\"))) {\r\n", 72 | " mssparkutils.fs.rm(\"/data/tompo/tompo_datasources\", recurse=true)\r\n", 73 | "}\r\n", 74 | "\r\n", 75 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 76 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_tables\"))) {\r\n", 77 | " mssparkutils.fs.rm(\"/data/tompo/tompo_tables\", recurse=true)\r\n", 78 | "}\r\n", 79 | "\r\n", 80 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 81 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_tablepartitions\"))) {\r\n", 82 | " mssparkutils.fs.rm(\"/data/tompo/tompo_tablepartitions\", recurse=true)\r\n", 83 | "}\r\n", 84 | "\r\n", 85 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 86 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_totables\"))) {\r\n", 87 | " mssparkutils.fs.rm(\"/data/tompo/tompo_totables\", recurse=true)\r\n", 88 | "}\r\n", 89 | "\r\n", 90 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 91 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_columns\"))) {\r\n", 92 | " mssparkutils.fs.rm(\"/data/tompo/tompo_columns\", recurse=true)\r\n", 93 | "}\r\n", 94 | "\r\n", 95 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 96 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_measures\"))) {\r\n", 97 | " mssparkutils.fs.rm(\"/data/tompo/tompo_measures\", recurse=true)\r\n", 98 | "}\r\n", 99 | "\r\n", 100 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 101 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_calcdependency\"))) {\r\n", 102 | " mssparkutils.fs.rm(\"/data/tompo/tompo_calcdependency\", recurse=true)\r\n", 103 | "}\r\n", 104 | "\r\n", 105 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 106 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_relationships\"))) {\r\n", 107 | " mssparkutils.fs.rm(\"/data/tompo/tompo_relationships\", recurse=true)\r\n", 108 | "}\r\n", 109 | "\r\n", 110 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 111 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_rolememberships\"))) {\r\n", 112 | " mssparkutils.fs.rm(\"/data/tompo/tompo_rolememberships\", recurse=true)\r\n", 113 | "}\r\n", 114 | "\r\n", 115 | "var fs = 
org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 116 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_roles\"))) {\r\n", 117 | " mssparkutils.fs.rm(\"/data/tompo/tompo_roles\", recurse=true)\r\n", 118 | "}\r\n", 119 | "\r\n", 120 | "\r\n", 121 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 122 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_dbschema_catalogs\"))) {\r\n", 123 | " mssparkutils.fs.rm(\"/data/tompo/tompo_dbschema_catalogs\", recurse=true)\r\n", 124 | "}\r\n", 125 | "\r\n", 126 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 127 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_reportimpact\"))) {\r\n", 128 | " mssparkutils.fs.rm(\"/data/tompo/tompo_reportimpact\", recurse=true)\r\n", 129 | "}\r\n", 130 | "\r\n" 131 | ], 132 | "outputs": [], 133 | "execution_count": null, 134 | "metadata": { 135 | "jupyter": { 136 | "source_hidden": false, 137 | "outputs_hidden": false 138 | }, 139 | "nteract": { 140 | "transient": { 141 | "deleting": false 142 | } 143 | }, 144 | "microsoft": { 145 | "language": "scala" 146 | } 147 | } 148 | }, 149 | { 150 | "cell_type": "code", 151 | "source": [ 152 | "%%sql\r\n", 153 | "--Create TOMPo tables\r\n", 154 | "use tompo;\r\n", 155 | "\r\n", 156 | "CREATE TABLE IF NOT EXISTS tompo_datasources\r\n", 157 | "( \r\n", 158 | " ModelName STRING\r\n", 159 | " , ID BIGINT\r\n", 160 | " , ModelID BIGINT\r\n", 161 | " , Name STRING \r\n", 162 | " , Description STRING\r\n", 163 | " , Type BIGINT\r\n", 164 | " , ConnectionString STRING\r\n", 165 | " , ImpersonationMode BIGINT\r\n", 166 | " , Account STRING\r\n", 167 | " , ModifiedTime TIMESTAMP\r\n", 168 | ")\r\n", 169 | "USING DELTA\r\n", 170 | "LOCATION \"/data/tompo/tompo_datasources\";\r\n", 171 | "\r\n", 172 | "CREATE TABLE IF NOT EXISTS tompo_tables\r\n", 173 | "( \r\n", 174 | " ModelName STRING\r\n", 175 | " , ID BIGINT\r\n", 176 | " , ModelID BIGINT\r\n", 177 | " , Name STRING \r\n", 178 | " , DataCategory STRING\r\n", 179 | " , Description STRING\r\n", 180 | " , IsHidden BOOLEAN\r\n", 181 | ")\r\n", 182 | "USING DELTA\r\n", 183 | "LOCATION \"/data/tompo/tompo_tables\";\r\n", 184 | "\r\n", 185 | "\r\n", 186 | "CREATE TABLE IF NOT EXISTS tompo_tablepartitions\r\n", 187 | "( \r\n", 188 | " ModelName STRING\r\n", 189 | " , ID BIGINT\r\n", 190 | " , TableID BIGINT\r\n", 191 | " , Name STRING \r\n", 192 | " , Description STRING\r\n", 193 | " , DataSourceID INT\r\n", 194 | " , QueryDefinition STRING\r\n", 195 | " , Type BIGINT\r\n", 196 | " , Mode BIGINT\r\n", 197 | " , ModifiedTime TIMESTAMP\r\n", 198 | ")\r\n", 199 | "USING DELTA\r\n", 200 | "LOCATION \"/data/tompo/tompo_tablepartitions\";\r\n", 201 | "\r\n", 202 | "CREATE TABLE IF NOT EXISTS tompo_totables\r\n", 203 | "( \r\n", 204 | " ModelName STRING \r\n", 205 | " , ID BIGINT\r\n", 206 | " , ModelID BIGINT\r\n", 207 | " , Name STRING \r\n", 208 | " , DataCategory STRING\r\n", 209 | " , Description STRING\r\n", 210 | " , IsHidden BOOLEAN\r\n", 211 | ")\r\n", 212 | "USING DELTA\r\n", 213 | "LOCATION \"/data/tompo/tompo_totables\";\r\n", 214 | "\r\n", 215 | "\r\n", 216 | "CREATE TABLE IF NOT EXISTS tompo_columns\r\n", 217 | "( \r\n", 218 | " ModelName STRING\r\n", 219 | " , ID BIGINT\r\n", 220 | " , TableID BIGINT\r\n", 221 | " , Name STRING \r\n", 222 | " , ExplicitDataType STRING\r\n", 223 | " , DataCategory STRING\r\n", 224 | " , Description BOOLEAN\r\n", 225 | " , isHidden BOOLEAN\r\n", 226 | " , isUnique BOOLEAN\r\n", 227 | 
" , isKey BOOLEAN\r\n", 228 | " , SummarizeBy BIGINT\r\n", 229 | " , ColumnStorageID BIGINT\r\n", 230 | " , Type BIGINT\r\n", 231 | " , SourceColumn STRING\r\n", 232 | " , Expression STRING\r\n", 233 | " , FormatString STRING\r\n", 234 | " , SortByColumnID BIGINT\r\n", 235 | " , AttributeHierarchyID BIGINT\r\n", 236 | " , ModifiedTime TIMESTAMP\r\n", 237 | " , StructuredModifiedTime TIMESTAMP\r\n", 238 | " , DisplayFolder STRING\r\n", 239 | ")\r\n", 240 | "USING DELTA\r\n", 241 | "LOCATION \"/data/tompo/tompo_columns\";\r\n", 242 | "\r\n", 243 | "\r\n", 244 | "CREATE TABLE IF NOT EXISTS tompo_measures\r\n", 245 | "( \r\n", 246 | " ModelName STRING\r\n", 247 | " , ID BIGINT\r\n", 248 | " , TableID BIGINT\r\n", 249 | " , Name STRING\r\n", 250 | " , Description STRING \r\n", 251 | " , DataType BIGINT\r\n", 252 | " , Expression STRING\r\n", 253 | " , FormatString STRING\r\n", 254 | " , IsHidden BOOLEAN\r\n", 255 | " , ModifiedTime TIMESTAMP\r\n", 256 | " , StructuredModifiedTime TIMESTAMP\r\n", 257 | " , KPID BIGINT\r\n", 258 | " , IsSimpleMeasure BOOLEAN\r\n", 259 | " , DisplayFolder STRING\r\n", 260 | ")\r\n", 261 | "USING DELTA\r\n", 262 | "LOCATION \"/data/tompo/tompo_measures\";\r\n", 263 | "\r\n", 264 | "CREATE TABLE IF NOT EXISTS tompo_calcdependency\r\n", 265 | "( \r\n", 266 | " ModelName STRING\r\n", 267 | " , Database_Name STRING\r\n", 268 | " , Object_Type STRING\r\n", 269 | " , Table STRING\r\n", 270 | " , Object STRING \r\n", 271 | " , Expression STRING\r\n", 272 | " , ReferenceObjectType STRING\r\n", 273 | " , ReferencedTable STRING\r\n", 274 | " , ReferencedObject STRING\r\n", 275 | " , ReferencedExpression STRING\r\n", 276 | " , Query STRING\r\n", 277 | ")\r\n", 278 | "USING DELTA\r\n", 279 | "LOCATION \"/data/tompo/tompo_calcdependency\";\r\n", 280 | "\r\n", 281 | "\r\n", 282 | "CREATE TABLE IF NOT EXISTS tompo_relationships\r\n", 283 | "( \r\n", 284 | " ModelName STRING\r\n", 285 | " , ID BIGINT\r\n", 286 | " , ModelID BIGINT\r\n", 287 | " , IsActive BIGINT\r\n", 288 | " , Type BIGINT \r\n", 289 | " , CrossFilteringBehavior STRING\r\n", 290 | " , FromTableID STRING\r\n", 291 | " , FromColumnID STRING\r\n", 292 | " , FromCardinality STRING\r\n", 293 | " , ToTableID STRING\r\n", 294 | " , ToColumnID STRING\r\n", 295 | " , ToCardinality BIGINT\r\n", 296 | " , ModifiedTime TIMESTAMP\r\n", 297 | ")\r\n", 298 | "USING DELTA\r\n", 299 | "LOCATION \"/data/tompo/tompo_relationships\";\r\n", 300 | "\r\n", 301 | "\r\n", 302 | "CREATE TABLE IF NOT EXISTS tompo_rolememberships\r\n", 303 | "( \r\n", 304 | " ModelName STRING\r\n", 305 | " , ID BIGINT\r\n", 306 | " , RoleID BIGINT\r\n", 307 | " , MemberName STRING \r\n", 308 | " , IdentityProvider STRING\r\n", 309 | " , ModifiedTime TIMESTAMP\r\n", 310 | ")\r\n", 311 | "USING DELTA\r\n", 312 | "LOCATION \"/data/tompo/tompo_rolememberships\";\r\n", 313 | "\r\n", 314 | "\r\n", 315 | "CREATE TABLE IF NOT EXISTS tompo_roles\r\n", 316 | "( \r\n", 317 | " ModelName STRING\r\n", 318 | " , ID BIGINT\r\n", 319 | " , Name STRING\r\n", 320 | " , Description STRING \r\n", 321 | " , ModelPermission INT\r\n", 322 | " , ModifiedTime TIMESTAMP\r\n", 323 | ")\r\n", 324 | "USING DELTA\r\n", 325 | "LOCATION \"/data/tompo/tompo_roles\";\r\n", 326 | "\r\n", 327 | "CREATE TABLE IF NOT EXISTS tompo_dbschema_catalogs\r\n", 328 | "(\r\n", 329 | " ModelName STRING\r\n", 330 | " , Description STRING\r\n", 331 | " , ROLES STRING\r\n", 332 | " , DATE_MODIFIED TIMESTAMP\r\n", 333 | " , COMPATIBILITY_LEVEL BIGINT\r\n", 334 | " , TYPE STRING\r\n", 335 | ")\r\n", 336 | 
"USING DELTA\r\n", 337 | "LOCATION \"/data/tompo/tompo_dbschema_catalogs\";\r\n", 338 | "\r\n", 339 | "\r\n", 340 | "CREATE TABLE IF NOT EXISTS tompo_reportimpact\r\n", 341 | "( \r\n", 342 | " ModelName STRING\r\n", 343 | " , TableName STRING\r\n", 344 | " , ObjectName STRING\r\n", 345 | " , ReportName STRING\r\n", 346 | " , PageName STRING\r\n", 347 | " , VisualType STRING\r\n", 348 | " , ObjectType STRING\r\n", 349 | ")\r\n", 350 | "USING DELTA\r\n", 351 | "LOCATION \"/data/tompo/tompo_reportimpact\";\r\n" 352 | ], 353 | "outputs": [], 354 | "execution_count": null, 355 | "metadata": { 356 | "jupyter": { 357 | "source_hidden": false, 358 | "outputs_hidden": false 359 | }, 360 | "nteract": { 361 | "transient": { 362 | "deleting": false 363 | } 364 | }, 365 | "microsoft": { 366 | "language": "sparksql" 367 | }, 368 | "collapsed": false 369 | } 370 | }, 371 | { 372 | "cell_type": "code", 373 | "source": [ 374 | "#r \"nuget: Microsoft.AnalysisServices.AdomdClient.NetCore.retail.amd64, 19.51.0\"\r\n", 375 | "\r\n", 376 | "using System.Data;\r\n", 377 | "using Microsoft.Spark.Sql;\r\n", 378 | "using Microsoft.Spark.Sql.Types;\r\n", 379 | "using T=Microsoft.Spark.Sql.Types;\r\n", 380 | "using Microsoft.AnalysisServices.AdomdClient;\r\n", 381 | "\r\n", 382 | "DataFrame RunXmlaQuery(string constr, string query, int topRows=0)\r\n", 383 | "{\r\n", 384 | " using (var con = new AdomdConnection(constr))\r\n", 385 | " {\r\n", 386 | " con.Open();\r\n", 387 | " var cmd = con.CreateCommand();\r\n", 388 | " cmd.CommandText = query;\r\n", 389 | " using (var rdr = cmd.ExecuteReader())\r\n", 390 | " {\r\n", 391 | " return GetDataFrame(rdr,topRows);\r\n", 392 | " }\r\n", 393 | "\r\n", 394 | " }\r\n", 395 | "}\r\n", 396 | "\r\n", 397 | "DataFrame GetDataFrame( IDataReader rdr, int topRows = 0)\r\n", 398 | "{\r\n", 399 | " var fields= GetFields(rdr).ToList();\r\n", 400 | " var type = new StructType(fields);\r\n", 401 | "\r\n", 402 | " //Console.WriteLine(type.SerializeToJson());\r\n", 403 | "\r\n", 404 | " return spark.CreateDataFrame(GetRows(rdr, topRows), type);\r\n", 405 | "}\r\n", 406 | "\r\n", 407 | "IEnumerable GetRows(IDataReader rdr, int topRows = 0)\r\n", 408 | "{ \r\n", 409 | " int rows = 0;\r\n", 410 | " while (rdr.Read())\r\n", 411 | " {\r\n", 412 | " rows++;\r\n", 413 | " var values = new object[rdr.FieldCount];\r\n", 414 | " rdr.GetValues(values);\r\n", 415 | "\r\n", 416 | " for (int i=0;i 0 && rows >= topRows)\r\n", 440 | " break;\r\n", 441 | "\r\n", 442 | " }\r\n", 443 | "}\r\n", 444 | "\r\n", 445 | "DataType GetSparkType(Type t)\r\n", 446 | "{\r\n", 447 | " if (t == typeof(string)) \r\n", 448 | " return new T.StringType();\r\n", 449 | " if (t == typeof(int))\r\n", 450 | " return new T.IntegerType();\r\n", 451 | " if (t == typeof(long) || t == typeof(UInt64))\r\n", 452 | " return new T.LongType();\r\n", 453 | " if (t == typeof(float))\r\n", 454 | " return new T.FloatType();\r\n", 455 | " if (t == typeof(double))\r\n", 456 | " return new T.DoubleType();\r\n", 457 | " if (t == typeof(decimal))\r\n", 458 | " return new T.DecimalType();\r\n", 459 | " if (t == typeof(DateTime) || t == typeof(DateTimeOffset))\r\n", 460 | " //return new T.TimestampType(); not yet supported\r\n", 461 | " return new T.StringType();\r\n", 462 | " if (t == typeof(Date))\r\n", 463 | " //return new T.DateType();\r\n", 464 | " return new T.StringType();\r\n", 465 | " if (t == typeof(Guid))\r\n", 466 | " return new T.StringType();\r\n", 467 | " if (t == typeof(decimal))\r\n", 468 | " return new T.DecimalType();\r\n", 469 | " 
if (t == typeof(long))\r\n", 470 | " return new T.LongType();\r\n", 471 | " if (t == typeof(short))\r\n", 472 | " return new T.ShortType();\r\n", 473 | " if (t == typeof(bool))\r\n", 474 | " return new T.BooleanType();\r\n", 475 | " if (t == typeof(byte))\r\n", 476 | " return new T.ByteType();\r\n", 477 | "\r\n", 478 | "\r\n", 479 | " throw new InvalidOperationException($\"Unsupported Type for DataFrame conversion: {t.Name}\");\r\n", 480 | "}\r\n", 481 | "\r\n", 482 | "IEnumerable GetFields(IDataReader rdr)\r\n", 483 | "{\r\n", 484 | " for (int i = 0; i < rdr.FieldCount; i++)\r\n", 485 | " {\r\n", 486 | " var type = GetSparkType(rdr.GetFieldType(i)); \r\n", 487 | " var name = rdr.GetName(i);\r\n", 488 | "\r\n", 489 | " //Console.WriteLine($\"{name} {type}\");\r\n", 490 | "\r\n", 491 | " yield return new StructField(name, type, isNullable:true);\r\n", 492 | " }\r\n", 493 | "\r\n", 494 | "}\r\n" 495 | ], 496 | "outputs": [], 497 | "execution_count": null, 498 | "metadata": { 499 | "jupyter": { 500 | "source_hidden": false, 501 | "outputs_hidden": false 502 | }, 503 | "nteract": { 504 | "transient": { 505 | "deleting": false 506 | } 507 | } 508 | } 509 | }, 510 | { 511 | "cell_type": "code", 512 | "source": [ 513 | "DataFrame df_parser_output = spark\r\n", 514 | " .Read()\r\n", 515 | " .Option(\"header\", true)\r\n", 516 | " .Option(\"inferShchema\", true)\r\n", 517 | " .Option(\"ignoreLeadingWhiteSpace\", true)\r\n", 518 | " .Option(\"ignoreTrailingWhiteSpace\", true)\r\n", 519 | " .Csv(\"/data/tompo/tompo_parseroutput/tompo_output.csv\");\r\n", 520 | "df_parser_output.CreateOrReplaceTempView(\"vwParserOutput\")" 521 | ], 522 | "outputs": [], 523 | "execution_count": null, 524 | "metadata": { 525 | "jupyter": { 526 | "source_hidden": false, 527 | "outputs_hidden": false 528 | }, 529 | "nteract": { 530 | "transient": { 531 | "deleting": false 532 | } 533 | } 534 | } 535 | }, 536 | { 537 | "cell_type": "code", 538 | "source": [ 539 | "DataFrame source = spark\r\n", 540 | " .Read()\r\n", 541 | " .Option(\"header\", true)\r\n", 542 | " .Option(\"inferShchema\", true)\r\n", 543 | " .Option(\"ignoreLeadingWhiteSpace\", true)\r\n", 544 | " .Option(\"ignoreTrailingWhiteSpace\", true)\r\n", 545 | " .Csv(\"/data/tompo/tompo_model_metadata.csv\");" 546 | ], 547 | "outputs": [], 548 | "execution_count": null, 549 | "metadata": { 550 | "jupyter": { 551 | "source_hidden": false, 552 | "outputs_hidden": false 553 | }, 554 | "nteract": { 555 | "transient": { 556 | "deleting": false 557 | } 558 | }, 559 | "microsoft": {} 560 | } 561 | }, 562 | { 563 | "cell_type": "code", 564 | "source": [ 565 | "%%pyspark\r\n", 566 | "from pyspark.sql.types import StructType,StructField,StringType,DateType\r\n", 567 | "from pyspark.sql.functions import *\r\n", 568 | "import os\r\n", 569 | "import re\r\n", 570 | "from notebookutils import mssparkutils\r\n", 571 | "\r\n", 572 | "keyvaultName = os.getenv(\"keyvaultName\") # Create a spark.yarn.appMasterEnv.keyvaultName property in Apache Spark Configuration and store the keyvalut name\r\n", 573 | "applicationId = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\") \r\n", 574 | "applicationSecret = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\")\r\n", 575 | "storageConnString = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\")\r\n", 576 | "tenantId = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\")\r\n", 577 | "\r\n", 578 | 
"paramData = [(applicationId, applicationSecret, storageConnString, tenantId)]\r\n", 579 | "schema = StructType([ \\\r\n", 580 | " StructField(\"applicationId\", StringType(), True), \\\r\n", 581 | " StructField(\"applicationSecret\", StringType(), True), \\\r\n", 582 | " StructField(\"storageConnString\", StringType(), True), \\\r\n", 583 | " StructField(\"tenantId\", StringType(), True) \\\r\n", 584 | "])\r\n", 585 | "\r\n", 586 | "df = spark.createDataFrame(spark.sparkContext.parallelize(paramData), schema)\r\n", 587 | "df.createOrReplaceTempView(\"vw_tompo_params\")\r\n" 588 | ], 589 | "outputs": [], 590 | "execution_count": null, 591 | "metadata": { 592 | "jupyter": { 593 | "source_hidden": false, 594 | "outputs_hidden": false 595 | }, 596 | "nteract": { 597 | "transient": { 598 | "deleting": false 599 | } 600 | }, 601 | "microsoft": { 602 | "language": "python" 603 | } 604 | } 605 | }, 606 | { 607 | "cell_type": "code", 608 | "source": [ 609 | "public class TompoSecret\r\n", 610 | "{\r\n", 611 | " public string applicationId { get; set; }\r\n", 612 | " public string applicationSecret { get; set; }\r\n", 613 | " public string storageConnString { get; set; }\r\n", 614 | " public string tenantId { get; set; }\r\n", 615 | "}\r\n", 616 | "\r\n", 617 | "\r\n", 618 | "TompoSecret secretObj = new TompoSecret();\r\n", 619 | "var secretsdata = spark.Sql(\"SELECT * FROM vw_tompo_params\");\r\n", 620 | "\r\n", 621 | "secretsdata.Collect().ToList().ForEach(row => {\r\n", 622 | " secretObj.applicationId = row[0].ToString();\r\n", 623 | " secretObj.applicationSecret = row[1].ToString();\r\n", 624 | " secretObj.storageConnString = row[2].ToString();\r\n", 625 | " secretObj.tenantId = row[3].ToString();\r\n", 626 | " }\r\n", 627 | ");\r\n", 628 | "\r\n", 629 | "var applicationId = secretObj.applicationId;\r\n", 630 | "var applicationSecret = secretObj.applicationSecret;\r\n", 631 | "var tenantId= secretObj.tenantId;\r\n", 632 | "\r\n", 633 | "var clientId = applicationId;\r\n", 634 | "//var tenantId = tenantId;\r\n", 635 | "var userId = $\"app:{clientId}@{tenantId}\";\r\n", 636 | "var secret = applicationSecret ; \r\n", 637 | "var xmlaEndpoint=\"\";\r\n", 638 | "var dataset=\"\";\r\n", 639 | "var constr = \"\";\r\n", 640 | "var query=\"\";\r\n", 641 | "var modelname=\"\";\r\n", 642 | "\r\n", 643 | "foreach (var obj in source.Collect())\r\n", 644 | "{\r\n", 645 | " Console.WriteLine(obj[0]);\r\n", 646 | " Console.WriteLine(obj[1]);\r\n", 647 | " Console.WriteLine(obj[2]);\r\n", 648 | " xmlaEndpoint = \"powerbi://api.powerbi.com/v1.0/myorg/\" + obj[1] ;\r\n", 649 | " dataset = obj[2]+ \"\" ;\r\n", 650 | " \r\n", 651 | " //Populate Data Sources\r\n", 652 | " query = \"SELECT [ID], ModelID, [Name], [Description], [Type], ConnectionString, ImpersonationMode, Account, ModifiedTime from $SYSTEM.TMSCHEMA_DATA_SOURCES\";\r\n", 653 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 654 | " var df = RunXmlaQuery(constr,query);\r\n", 655 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\"); \r\n", 656 | " //Insert result into table here - UNION ALL\r\n", 657 | " query=\"INSERT INTO tompo.tompo_datasources SELECT '\" + obj[2] + \"',ID, ModelID, Name, Description, Type, ConnectionString, ImpersonationMode, Account, CAST(ModifiedTime AS DATE) FROM vwTompoMetadata \";\r\n", 658 | " var res = spark.Sql(query);\r\n", 659 | " \r\n", 660 | "\r\n", 661 | " //Populate Tables\r\n", 662 | " query = \"Select [ID], [ModelID], [Name], [DataCategory], [Description], 
[IsHidden] from $SYSTEM.TMSCHEMA_TABLES\";\r\n", 663 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 664 | " df = RunXmlaQuery(constr,query);\r\n", 665 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 666 | " //Insert result into table here - UNION ALL\r\n", 667 | " query=\"INSERT INTO tompo.tompo_tables SELECT '\" + obj[2] + \"', ID, ModelID, Name, DataCategory, Description, IsHidden FROM vwTompoMetadata\";\r\n", 668 | " res = spark.Sql(query);\r\n", 669 | " \r\n", 670 | "\r\n", 671 | " //Populate Table Partitions\r\n", 672 | " query = \"Select [ID], [TableID], [Name], [Description], [DataSourceID], [QueryDefinition], [Type], [Mode], ModifiedTime from $SYSTEM.TMSCHEMA_PARTITIONS\";\r\n", 673 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 674 | " df = RunXmlaQuery(constr,query);\r\n", 675 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 676 | " //Insert result into table here - UNION ALL\r\n", 677 | " query=\"INSERT INTO tompo.tompo_tablepartitions Select '\" + obj[2] + \"', ID, TableID, Name, Description, DataSourceID, QueryDefinition, Type, Mode, CAST(ModifiedTime AS DATE) FROM vwTompoMetadata\";\r\n", 678 | " res = spark.Sql(query);\r\n", 679 | "\r\n", 680 | "\r\n", 681 | " //Populate ToTables (This is to handle relationships)\r\n", 682 | " query = \"Select [ID], [ModelID], [Name], [DataCategory], [Description], [IsHidden] from $SYSTEM.TMSCHEMA_TABLES\";\r\n", 683 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 684 | " df = RunXmlaQuery(constr,query);\r\n", 685 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 686 | " //Insert result into table here - UNION ALL\r\n", 687 | " query=\"INSERT INTO tompo.tompo_totables SELECT '\" + obj[2] + \"' , ID, ModelID, Name, DataCategory, Description, IsHidden FROM vwTompoMetadata\";\r\n", 688 | " res = spark.Sql(query);\r\n", 689 | "\r\n", 690 | "\r\n", 691 | " //Populate Columns\r\n", 692 | " query = \"Select [ID], [TableID], [ExplicitName] , [ExplicitDataType], [DataCategory], [Description], [IsHidden], [IsUnique], [IsKey], [SummarizeBy], [ColumnStorageID], [Type], [SourceColumn], [Expression], [FormatString], [SortByColumnID], [AttributeHierarchyID], [ModifiedTime], [StructureModifiedTime], [DisplayFolder] from $SYSTEM.TMSCHEMA_COLUMNS\";\r\n", 693 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 694 | " df = RunXmlaQuery(constr,query);\r\n", 695 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 696 | " //Insert result into table here - UNION ALL\r\n", 697 | " query=\"INSERT INTO tompo.tompo_columns Select '\" + obj[2] + \"', ID, TableID, ExplicitName , ExplicitDataType, DataCategory, Description, IsHidden, IsUnique, IsKey, SummarizeBy, ColumnStorageID, Type, SourceColumn, Expression, FormatString, SortByColumnID, AttributeHierarchyID, CAST(ModifiedTime AS Date), CAST(StructureModifiedTime AS Date), CAST(DisplayFolder AS Date) FROM vwTompoMetadata\";\r\n", 698 | " res = spark.Sql(query);\r\n", 699 | "\r\n", 700 | "\r\n", 701 | " //Populate Measures\r\n", 702 | " query = \"Select [ID], [TableID], [Name], [Description] , [DataType], [Expression], [FormatString], [IsHidden], [ModifiedTime], [StructureModifiedTime], [KPIID], [IsSimpleMeasure], [DisplayFolder] from $SYSTEM.TMSCHEMA_MEASURES\";\r\n", 703 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user 
id={userId};password={secret}\";\r\n", 704 | " df = RunXmlaQuery(constr,query);\r\n", 705 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 706 | " //Insert result into table here - UNION ALL\r\n", 707 | " query=\"INSERT INTO tompo.tompo_measures Select '\" + obj[2] + \"', ID, TableID, Name, Description , DataType, Expression, FormatString, IsHidden, CAST(ModifiedTime AS Date), CAST(StructureModifiedTime AS Date), KPIID, IsSimpleMeasure, DisplayFolder FROM vwTompoMetadata\";\r\n", 708 | " res = spark.Sql(query);\r\n", 709 | "\r\n", 710 | "\r\n", 711 | " //Populate CALC Dependency\r\n", 712 | " query = \"SELECT DATABASE_NAME, OBJECT_TYPE, [TABLE], OBJECT, EXPRESSION, REFERENCED_OBJECT_TYPE, REFERENCED_TABLE, REFERENCED_OBJECT, REFERENCED_EXPRESSION, QUERY FROM $System.DISCOVER_CALC_DEPENDENCY where OBJECT_TYPE='MEASURE'\";\r\n", 713 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 714 | " df = RunXmlaQuery(constr,query);\r\n", 715 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 716 | " //Insert result into table here - UNION ALL\r\n", 717 | " query=\"INSERT INTO tompo.tompo_calcdependency SELECT '\" + obj[2] + \"', DATABASE_NAME, OBJECT_TYPE, TABLE, OBJECT, EXPRESSION, REFERENCED_OBJECT_TYPE, REFERENCED_TABLE, REFERENCED_OBJECT, REFERENCED_EXPRESSION, QUERY FROM vwTompoMetadata\";\r\n", 718 | " res = spark.Sql(query);\r\n", 719 | "\r\n", 720 | "\r\n", 721 | " //Populate Relationships\r\n", 722 | " query = \"Select [ID], [ModelID], [IsActive], [Type], [CrossfilteringBehavior], [FromTableID], [FromColumnID], [FromCardinality], [ToTableID], [ToColumnID], [ToCardinality], [ModifiedTime] from $SYSTEM.TMSCHEMA_RELATIONSHIPS\";\r\n", 723 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 724 | " df = RunXmlaQuery(constr,query);\r\n", 725 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 726 | " //Insert result into table here - UNION ALL\r\n", 727 | " query=\"INSERT INTO tompo.tompo_relationships Select '\" + obj[2] + \"', ID, ModelID, IsActive, Type, CrossfilteringBehavior, FromTableID, FromColumnID, FromCardinality, ToTableID, ToColumnID, ToCardinality, CAST(ModifiedTime as Date) FROM vwTompoMetadata\";\r\n", 728 | " res = spark.Sql(query);\r\n", 729 | "\r\n", 730 | "\r\n", 731 | " //Populate Rolememberships\r\n", 732 | " query = \"select [ID], RoleID, MemberName,IdentityProvider,ModifiedTime from $System.TMSCHEMA_ROLE_MEMBERSHIPS\";\r\n", 733 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 734 | " df = RunXmlaQuery(constr,query);\r\n", 735 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 736 | " //Insert result into table here - UNION ALL\r\n", 737 | " query=\"INSERT INTO tompo.tompo_rolememberships Select '\" + obj[2] + \"',ID, RoleID, MemberName,IdentityProvider,CAST(ModifiedTime as Date) FROM vwTompoMetadata\";\r\n", 738 | " res = spark.Sql(query);\r\n", 739 | "\r\n", 740 | "\r\n", 741 | " //Populate Roles\r\n", 742 | " query = \"select [ID], [Name], [Description], ModelPermission,ModifiedTime from $System.TMSCHEMA_ROLES\";\r\n", 743 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 744 | " df = RunXmlaQuery(constr,query);\r\n", 745 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 746 | " //Insert result into table here - UNION ALL\r\n", 747 | " query=\"INSERT INTO tompo.tompo_roles select '\" + obj[2] + \"', ID, Name, 
Description, ModelPermission,CAST(ModifiedTime as Date) FROM vwTompoMetadata\";\r\n", 748 | " res = spark.Sql(query);\r\n", 749 | "\r\n", 750 | "\r\n", 751 | " //Populate Roles\r\n", 752 | " query = \"select [CATALOG_NAME], [Description], [ROLES],[DATE_MODIFIED],[COMPATIBILITY_LEVEL],[TYPE] from $SYSTEM.DBSCHEMA_CATALOGS WHERE [CATALOG_NAME]='\" + obj[2] + \"'\";\r\n", 753 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 754 | " df = RunXmlaQuery(constr,query);\r\n", 755 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 756 | " //Insert result into table here - UNION ALL\r\n", 757 | " query=\"INSERT INTO tompo.tompo_dbschema_catalogs select CATALOG_NAME, Description, ROLES,CAST(DATE_MODIFIED as Date),COMPATIBILITY_LEVEL,TYPE FROM vwTompoMetadata\";\r\n", 758 | " res = spark.Sql(query);\r\n", 759 | " \r\n", 760 | "\r\n", 761 | " //Display(df);\r\n", 762 | " Console.WriteLine(\"New Line\");\r\n", 763 | "}" 764 | ], 765 | "outputs": [], 766 | "execution_count": null, 767 | "metadata": { 768 | "jupyter": { 769 | "source_hidden": false, 770 | "outputs_hidden": false 771 | }, 772 | "nteract": { 773 | "transient": { 774 | "deleting": false 775 | } 776 | }, 777 | "microsoft": {}, 778 | "collapsed": false 779 | } 780 | }, 781 | { 782 | "cell_type": "code", 783 | "source": [ 784 | "%%sql\r\n", 785 | "USE tompo;\r\n", 786 | "INSERT INTO tompo.tompo_reportimpact\r\n", 787 | "select distinct a.ModelName, b.Name as TableName, c.Object as MeasureName, po.ReportName,po.PageName, po.VisualType, 'Measure'\r\n", 788 | "from tompo.tompo_dbschema_catalogs a\r\n", 789 | "JOIN tompo.tompo_tables b\r\n", 790 | "on a.ModelName= b.ModelName\r\n", 791 | "JOIN tompo.tompo_calcdependency c\r\n", 792 | "on concat(b.ModelName,b.Name) = concat(c.ModelName,c.ReferencedTable)\r\n", 793 | "JOIN vwParserOutput po\r\n", 794 | "on concat(c.ModelName, c.Object) = concat(po.ModelName,po.Column)\r\n", 795 | "\r\n", 796 | "UNION ALL\r\n", 797 | "\r\n", 798 | "select DISTINCT a.ModelName, b.Name as TableName, c.Name as ColumnName,po.ReportName,po.PageName, po.VisualType, 'Column'\r\n", 799 | "FROM tompo.tompo_dbschema_catalogs a\r\n", 800 | "JOIN tompo.tompo_tables b\r\n", 801 | "ON a.ModelName= b.ModelName\r\n", 802 | "JOIN tompo.tompo_columns c\r\n", 803 | "ON concat(b.ModelName, b.ID) = concat(c.ModelName,c.TableID)\r\n", 804 | "JOIN vwParserOutput po\r\n", 805 | "on concat(c.ModelName, c.Name) = concat(po.ModelName,po.Column)\r\n" 806 | ], 807 | "outputs": [], 808 | "execution_count": null, 809 | "metadata": { 810 | "jupyter": { 811 | "source_hidden": false, 812 | "outputs_hidden": false 813 | }, 814 | "nteract": { 815 | "transient": { 816 | "deleting": false 817 | } 818 | }, 819 | "microsoft": { 820 | "language": "sparksql" 821 | }, 822 | "collapsed": false 823 | } 824 | } 825 | ], 826 | "metadata": { 827 | "kernelspec": { 828 | "name": "synapse_pyspark", 829 | "language": "Python", 830 | "display_name": "Synapse PySpark" 831 | }, 832 | "language_info": { 833 | "name": "csharp" 834 | }, 835 | "kernel_info": { 836 | "name": "synapse_pyspark" 837 | }, 838 | "description": null, 839 | "save_output": true, 840 | "synapse_widget": { 841 | "version": "0.1", 842 | "state": {} 843 | } 844 | }, 845 | "nbformat": 4, 846 | "nbformat_minor": 2 847 | } -------------------------------------------------------------------------------- /tompo/TOMPo_OnboardingSteps.docx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/DataLineage/a4390286d334ad02dccd2e10d199487ca13640c2/tompo/TOMPo_OnboardingSteps.docx -------------------------------------------------------------------------------- /tompo/TOMPo_ReportParser.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "source": [ 6 | "#r \"nuget: Microsoft.PowerBI.Api, 4.10.0\"\r\n", 7 | "#r \"nuget: Microsoft.IdentityModel.Clients.ActiveDirectory, 5.2.9\"\r\n", 8 | "#r \"nuget: Azure.Identity, 1.7.0\"\r\n", 9 | "#r \"nuget: Microsoft.Rest.ClientRuntime, 2.3.24\"\r\n", 10 | "#r \"nuget: Microsoft.Azure.Storage.Blob\"\r\n", 11 | "#r \"nuget: Microsoft.Azure.Storage.Common\"\r\n", 12 | "#r \"nuget: Azure.Security.KeyVault.Secrets, 4.4.0\"\r\n", 13 | "#r \"nuget: CsvHelper\"\r\n", 14 | "#r \"nuget: microsoft.aspnetcore.mvc.core\"" 15 | ], 16 | "outputs": [], 17 | "execution_count": null, 18 | "metadata": {} 19 | }, 20 | { 21 | "cell_type": "code", 22 | "source": [ 23 | "%%csharp\r\n", 24 | "using System;\r\n", 25 | "using System.Linq;\r\n", 26 | "using System.Configuration;\r\n", 27 | "using System.IO;\r\n", 28 | "using System.Security;\r\n", 29 | "using System.Reflection;\r\n", 30 | "using Microsoft.Rest;\r\n", 31 | "using Microsoft.Identity.Client;\r\n", 32 | "using Microsoft.PowerBI.Api;\r\n", 33 | "using System.Collections.Generic;\r\n", 34 | "using Microsoft.Spark.Sql;\r\n", 35 | "using System.Collections.Generic;\r\n", 36 | "using System.Net.Http;\r\n", 37 | "using System.Net.Http.Headers;\r\n", 38 | "using Newtonsoft.Json.Linq;\r\n", 39 | "using System.IO.Compression;\r\n", 40 | "using Microsoft.Azure.Storage.Blob;\r\n", 41 | "using System.Threading;\r\n", 42 | "using System.Threading.Tasks;\r\n", 43 | "using Microsoft.Spark.Sql;\r\n", 44 | "using Microsoft.Spark.Sql.Types;\r\n", 45 | "using Microsoft.Azure.Storage.Auth;\r\n", 46 | "using Microsoft.Azure.Storage;\r\n", 47 | "using Microsoft.Extensions.Logging;\r\n", 48 | "using System.Net;\r\n", 49 | "using Newtonsoft.Json;\r\n", 50 | "using System.Globalization;\r\n", 51 | "using Newtonsoft.Json.Serialization;\r\n", 52 | "using CsvHelper;\r\n", 53 | "using Microsoft.AspNetCore.Mvc;\r\n", 54 | "using Microsoft.PowerBI.Api;\r\n", 55 | "\r\n", 56 | "private static string spnAccessToken = string.Empty;\r\n", 57 | "\r\n", 58 | "static string GetSPNAccessToken(string applicationId, string applicationSecret, string tenantSpecificAuthority) {\r\n", 59 | " if (spnAccessToken.Equals(string.Empty)) {\r\n", 60 | " var appConfidential = ConfidentialClientApplicationBuilder.Create(applicationId)\r\n", 61 | " .WithClientSecret(applicationSecret)\r\n", 62 | " .WithAuthority(tenantSpecificAuthority)\r\n", 63 | " .Build();\r\n", 64 | "\r\n", 65 | " string[] scopesDefault = new string[] { \"https://analysis.windows.net/powerbi/api/.default\" };\r\n", 66 | " var authResult = appConfidential.AcquireTokenForClient(scopesDefault).ExecuteAsync().Result;\r\n", 67 | " spnAccessToken = authResult.AccessToken;\r\n", 68 | " }\r\n", 69 | " return spnAccessToken;\r\n", 70 | " }\r\n", 71 | " \r\n", 72 | "public static PowerBIClient GetPowerBiAppOnlyClient(string applicationId\r\n", 73 | " ,string applicationSecret\r\n", 74 | " ,string tenantSpecificAuthority\r\n", 75 | " ,string urlPowerBiServiceApiRoot) {\r\n", 76 | " var tokenCredentials = new TokenCredentials(GetSPNAccessToken(applicationId, applicationSecret, tenantSpecificAuthority), \"Bearer\");\r\n", 77 | " return new PowerBIClient(new 
Uri(urlPowerBiServiceApiRoot), tokenCredentials);\r\n", 78 | "}" 79 | ], 80 | "outputs": [], 81 | "execution_count": null, 82 | "metadata": { 83 | "jupyter": { 84 | "source_hidden": false, 85 | "outputs_hidden": false 86 | }, 87 | "nteract": { 88 | "transient": { 89 | "deleting": false 90 | } 91 | }, 92 | "microsoft": { 93 | "language": "csharp" 94 | } 95 | } 96 | }, 97 | { 98 | "cell_type": "code", 99 | "source": [ 100 | "%%pyspark\r\n", 101 | "from pyspark.sql.types import StructType,StructField,StringType,DateType\r\n", 102 | "from pyspark.sql.functions import *\r\n", 103 | "import os\r\n", 104 | "import re\r\n", 105 | "from notebookutils import mssparkutils\r\n", 106 | "\r\n", 107 | "keyvaultName = os.getenv(\"keyvaultName\") # Create a spark.yarn.appMasterEnv.keyvaultName property in Apache Spark Configuration and store the keyvalut name\r\n", 108 | "applicationId = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\") \r\n", 109 | "applicationSecret = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\")\r\n", 110 | "storageConnString = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\")\r\n", 111 | "tenantId = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\")\r\n", 112 | "\r\n", 113 | "\r\n", 114 | "paramData = [(applicationId, applicationSecret, storageConnString, tenantId)]\r\n", 115 | "schema = StructType([ \\\r\n", 116 | " StructField(\"applicationId\", StringType(), True), \\\r\n", 117 | " StructField(\"applicationSecret\", StringType(), True), \\\r\n", 118 | " StructField(\"storageConnString\", StringType(), True), \\\r\n", 119 | " StructField(\"tenantId\", StringType(), True) \\\r\n", 120 | "])\r\n", 121 | "\r\n", 122 | "df = spark.createDataFrame(spark.sparkContext.parallelize(paramData), schema)\r\n", 123 | "df.createOrReplaceTempView(\"vw_tompo_params\")\r\n", 124 | "\r\n", 125 | "metadatadf = spark.read.option(\"header\", True).csv(\"/data/tompo/tompo_report_metadata.csv\")\r\n", 126 | "metadatadf.createOrReplaceTempView(\"vw_report_metadata\")" 127 | ], 128 | "outputs": [], 129 | "execution_count": null, 130 | "metadata": { 131 | "jupyter": { 132 | "source_hidden": false, 133 | "outputs_hidden": false 134 | }, 135 | "nteract": { 136 | "transient": { 137 | "deleting": false 138 | } 139 | }, 140 | "microsoft": { 141 | "language": "python" 142 | } 143 | } 144 | }, 145 | { 146 | "cell_type": "code", 147 | "source": [ 148 | "%%csharp\r\n", 149 | "public class TompoSecret\r\n", 150 | "{\r\n", 151 | " public string applicationId { get; set; }\r\n", 152 | " public string applicationSecret { get; set; }\r\n", 153 | " public string storageConnString { get; set; }\r\n", 154 | " public string tenantId { get; set; }\r\n", 155 | "}" 156 | ], 157 | "outputs": [], 158 | "execution_count": null, 159 | "metadata": { 160 | "jupyter": { 161 | "source_hidden": false, 162 | "outputs_hidden": false 163 | }, 164 | "nteract": { 165 | "transient": { 166 | "deleting": false 167 | } 168 | }, 169 | "microsoft": { 170 | "language": "csharp" 171 | } 172 | } 173 | }, 174 | { 175 | "cell_type": "code", 176 | "source": [ 177 | "%%csharp\r\n", 178 | "using System.Collections.Generic;\r\n", 179 | "using System.IO;\r\n", 180 | "using System.Net.Http;\r\n", 181 | "using System.Net.Http.Headers;\r\n", 182 | "using Newtonsoft.Json.Linq;\r\n", 183 | "using System.IO.Compression;\r\n", 184 | "using Microsoft.Azure.Storage.Blob;\r\n", 185 | "using 
System.Threading;\r\n", 186 | "using System.Threading.Tasks;\r\n", 187 | "using Microsoft.Spark.Sql;\r\n", 188 | "using Microsoft.Spark.Sql.Types;\r\n", 189 | "using Microsoft.Azure.Storage.Auth;\r\n", 190 | "using Microsoft.Azure.Storage;\r\n", 191 | "using Microsoft.Extensions.Logging;\r\n", 192 | "using System.Net;\r\n", 193 | "using Newtonsoft.Json;\r\n", 194 | "using System.Globalization;\r\n", 195 | "using Newtonsoft.Json.Serialization;\r\n", 196 | "using CsvHelper;\r\n", 197 | "using Microsoft.AspNetCore.Mvc;\r\n", 198 | "using Microsoft.PowerBI.Api;\r\n", 199 | "\r\n", 200 | "public const string urlPowerBiServiceApiRoot = \"https://api.powerbi.com/\";\r\n", 201 | "private static string workspaceId = \"\";\r\n", 202 | "private static string reportId = \"\";\r\n", 203 | "private static string reportName = \"\";\r\n", 204 | "private static string applicationId = \"\";\r\n", 205 | "private static string applicationSecret = \"\";\r\n", 206 | "private static string tenantSpecificAuthority = \"\";\r\n", 207 | "private static string storageConnString = \"\";\r\n", 208 | "\r\n", 209 | "\r\n", 210 | "TompoSecret secretObj = new TompoSecret();\r\n", 211 | "var secretsdata = spark.Sql(\"SELECT * FROM vw_tompo_params\");\r\n", 212 | "\r\n", 213 | "secretsdata.Collect().ToList().ForEach(row => {\r\n", 214 | " secretObj.applicationId = row[0].ToString();\r\n", 215 | " secretObj.applicationSecret = row[1].ToString();\r\n", 216 | " secretObj.storageConnString = row[2].ToString();\r\n", 217 | " secretObj.tenantId = row[3].ToString();\r\n", 218 | " }\r\n", 219 | ");\r\n", 220 | "\r\n", 221 | "applicationId = secretObj.applicationId;\r\n", 222 | "applicationSecret = secretObj.applicationSecret;\r\n", 223 | "tenantSpecificAuthority = \"https://login.microsoftonline.com/\" + secretObj.tenantId;\r\n", 224 | "storageConnString = secretObj.storageConnString;\r\n", 225 | "\r\n", 226 | "PowerBIClient pbiClient = GetPowerBiAppOnlyClient(applicationId, applicationSecret, tenantSpecificAuthority, urlPowerBiServiceApiRoot);\r\n", 227 | "var storageAccount = CloudStorageAccount.Parse(storageConnString);\r\n", 228 | "var blobClient = storageAccount.CreateCloudBlobClient();\r\n", 229 | "var container = blobClient.GetContainerReference(\"hrsisynapsefs\");\r\n", 230 | "\r\n", 231 | "var metadata = spark.Sql(\"SELECT * FROM vw_report_metadata where isActive=1\");\r\n", 232 | "\r\n", 233 | "metadata.Collect().ToList().ForEach(row => {\r\n", 234 | " var workspaceName = row[0].ToString();\r\n", 235 | " var workspaceId = row[1].ToString();\r\n", 236 | " var reportName = row[2].ToString();\r\n", 237 | " var reportId = row[3].ToString();\r\n", 238 | " var modelName = row[4].ToString();\r\n", 239 | " \r\n", 240 | " var reportStream = pbiClient.Reports.ExportReport(new Guid(workspaceId), new Guid(reportId));\r\n", 241 | " var blob = container.GetBlockBlobReference(\"data/tompo/tompo_layout/\" + reportName + \".pbix\");\r\n", 242 | " blob.Properties.ContentType = \"multipart/form-data\";\r\n", 243 | " blob.UploadFromStream(reportStream);\r\n", 244 | " reportStream.Close();\r\n", 245 | " Console.WriteLine(reportName + \" Report has been downloaded successfully\");\r\n", 246 | "\r\n", 247 | " }\r\n", 248 | ");" 249 | ], 250 | "outputs": [], 251 | "execution_count": null, 252 | "metadata": { 253 | "jupyter": { 254 | "source_hidden": false, 255 | "outputs_hidden": false 256 | }, 257 | "nteract": { 258 | "transient": { 259 | "deleting": false 260 | } 261 | }, 262 | "microsoft": { 263 | "language": "csharp" 264 | } 265 | } 266 | },
267 | { 268 | "cell_type": "code", 269 | "source": [ 270 | "%%pyspark\r\n", 271 | "from zipfile import ZipFile\r\n", 272 | "import os\r\n", 273 | "import shutil\r\n", 274 | "import pandas as pd\r\n", 275 | "import json\r\n", 276 | "from os.path import exists\r\n", 277 | "\r\n", 278 | "hrsiBasePath = os.getenv(\"hrsiBasePath\")\r\n", 279 | "\r\n", 280 | " #uncomment below block if running for first time, need to ad exists code\r\n", 281 | "#mssparkutils.fs.unmount(\"/hrsisynapsefs_temp\") \r\n", 282 | "#mssparkutils.fs.mount(hrsiBasePath, \r\n", 283 | "#\t\t\t\t\t\"/hrsisynapsefs_temp\",\r\n", 284 | "#\t\t\t\t\t{\"linkedService\":\"HRBIADLS\"}\r\n", 285 | "#\t\t\t\t\t)\r\n", 286 | "\r\n", 287 | "synpasefspath = \"/synfs/\" + mssparkutils.env.getJobId() + \"/hrsisynapsefs_temp\"\r\n", 288 | "\r\n", 289 | "outfilepath = synpasefspath + \"/data/tompo/tompo_parseroutput/tompo_output.csv\"\r\n", 290 | "\r\n", 291 | "file_exists = exists(outfilepath)\r\n", 292 | "\r\n", 293 | "if file_exists:\r\n", 294 | " print(\"output file exists\")\r\n", 295 | "else:\r\n", 296 | " outfiledf = pd.DataFrame(columns = ['WorkspaceName', 'ReportName', 'PageName', 'VisualType', 'Column', 'ModelName', 'LastRefreshedOn'])\r\n", 297 | " outfiledf.to_csv(outfilepath, index=False)\r\n", 298 | " print(\"created empty one time output file\")\r\n", 299 | "\r\n", 300 | "\r\n", 301 | "metadatadfdata = spark.sql(\"SELECT * FROM vw_report_metadata where isActive=1\")\r\n", 302 | "\r\n", 303 | "df = pd.DataFrame(columns = ['WorkspaceName', 'ReportName', 'PageName', 'VisualType', 'Column', 'ModelName', 'LastRefreshedOn'])\r\n", 304 | "\r\n", 305 | "for row in metadatadfdata.collect():\r\n", 306 | " print(\"Working on file: \" + row[\"ReportName\"])\r\n", 307 | " \r\n", 308 | " f = ZipFile(synpasefspath + \"/data/tompo/tompo_layout/\" + row[\"ReportName\"] + \".pbix\", 'r')\r\n", 309 | " f.extractall(synpasefspath + \"/data/tompo/tompo_layout/_temp\" + row[\"ReportName\"])\r\n", 310 | " shutil.copyfile(synpasefspath + \"/data/tompo/tompo_layout/_temp\" + row[\"ReportName\"] + \"/Report/Layout\", synpasefspath + \"/data/tompo/tompo_layout/\" + row[\"ReportName\"] + \"_Layout\")\r\n", 311 | " shutil.rmtree(synpasefspath + \"/data/tompo/tompo_layout/_temp\" + row[\"ReportName\"])\r\n", 312 | " print(\"Retreived layout file for report \" + row[\"ReportName\"] + \".pbix\")\r\n", 313 | "\r\n", 314 | " # code from here is to parse layout file and append in csv for POWER BI reporting\r\n", 315 | " with open(synpasefspath + \"/data/tompo/tompo_layout/\" + row[2] + \"_Layout\", encoding=\"utf-16le\", errors=\"backslashreplace\") as file:\r\n", 316 | " data = file.read().strip()\r\n", 317 | "\r\n", 318 | " layoutdata = json.loads(data)\r\n", 319 | "\r\n", 320 | " outputlist = []\r\n", 321 | "\r\n", 322 | " for section in layoutdata['sections']:\r\n", 323 | "\r\n", 324 | " tabname = section['displayName']\r\n", 325 | "\r\n", 326 | " for container in section['visualContainers']:\r\n", 327 | " # if container['id'] != 0 :\r\n", 328 | " configdict = json.loads(container['config'])\r\n", 329 | "\r\n", 330 | " if 'singleVisual' in configdict:\r\n", 331 | "\r\n", 332 | " visualtype = configdict['singleVisual']['visualType']\r\n", 333 | "\r\n", 334 | " if 'projections' in configdict['singleVisual']:\r\n", 335 | "\r\n", 336 | " for key in configdict['singleVisual']['projections']:\r\n", 337 | " for query in configdict['singleVisual']['projections'][key]:\r\n", 338 | "\r\n", 339 | " data = query['queryRef'].split('.')[-1]\r\n", 340 | " 
outputlist.append(row[\"WorkspaceName\"])\r\n", 341 | " outputlist.append(row[\"ReportName\"])\r\n", 342 | " outputlist.append(tabname)\r\n", 343 | " outputlist.append(visualtype)\r\n", 344 | " outputlist.append(data)\r\n", 345 | " outputlist.append(row[\"ModelName\"])\r\n", 346 | " outputlist.append(str(pd.to_datetime('now').date()))\r\n", 347 | " \r\n", 348 | " df.loc[len(df)] = outputlist\r\n", 349 | " outputlist.clear()\r\n", 350 | "\r\n", 351 | " outputdf = pd.read_csv(outfilepath, header='infer')\r\n", 352 | "\r\n", 353 | " filterval = row[\"WorkspaceName\"].strip() + row[\"ReportName\"].strip()\r\n", 354 | " outfiltereddf = outputdf[ (outputdf[\"WorkspaceName\"]+outputdf[\"ReportName\"] != filterval ) ]\r\n", 355 | "\r\n", 356 | " finaldf = pd.concat([outfiltereddf, df])\r\n", 357 | " \r\n", 358 | " finaldf.to_csv(outfilepath, index=False)\r\n", 359 | " df = df.iloc[:0]\r\n", 360 | " print(\"Layout data is added to final csv for file: \" + row[\"ReportName\"])" 361 | ], 362 | "outputs": [], 363 | "execution_count": null, 364 | "metadata": { 365 | "jupyter": { 366 | "source_hidden": false, 367 | "outputs_hidden": false 368 | }, 369 | "nteract": { 370 | "transient": { 371 | "deleting": false 372 | } 373 | }, 374 | "microsoft": { 375 | "language": "python" 376 | }, 377 | "collapsed": false 378 | } 379 | } 380 | ], 381 | "metadata": { 382 | "kernelspec": { 383 | "name": "synapse_pyspark", 384 | "language": "Python", 385 | "display_name": "Synapse PySpark" 386 | }, 387 | "language_info": { 388 | "name": "csharp" 389 | }, 390 | "kernel_info": { 391 | "name": "synapse_pyspark" 392 | }, 393 | "description": null, 394 | "save_output": true, 395 | "synapse_widget": { 396 | "version": "0.1", 397 | "state": {} 398 | } 399 | }, 400 | "nbformat": 4, 401 | "nbformat_minor": 2 402 | } --------------------------------------------------------------------------------
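The layout-parsing cell in TOMPo_ReportParser.ipynb reads the `Report/Layout` entry of each exported `.pbix` (UTF-16 LE JSON) and collects one row per visual field by walking `sections → visualContainers → config → singleVisual → projections → queryRef`. The sketch below restates that traversal as a small standalone Python function for local testing; the function name `parse_pbix_layout` and the choice to read the Layout entry straight from the zip (rather than extracting and copying it first, as the notebook does) are illustrative assumptions, not code from the repository.

```python
import json
from zipfile import ZipFile


def parse_pbix_layout(pbix_path):
    """Illustrative sketch: yield (page, visual_type, field) tuples from a local .pbix.

    Mirrors the traversal in TOMPo_ReportParser.ipynb: Report/Layout is UTF-16 LE JSON,
    and each visual's config is itself a JSON string whose singleVisual.projections
    buckets carry queryRef values such as "Table.Column".
    """
    with ZipFile(pbix_path) as pbix:
        raw = pbix.read("Report/Layout").decode("utf-16-le", errors="backslashreplace")
    layout = json.loads(raw.lstrip("\ufeff").strip())

    for section in layout.get("sections", []):
        page_name = section.get("displayName")
        for container in section.get("visualContainers", []):
            config = json.loads(container.get("config", "{}"))
            visual = config.get("singleVisual")
            if not visual:
                continue
            visual_type = visual.get("visualType")
            for bucket in visual.get("projections", {}).values():
                for projection in bucket:
                    # Keep only the column/measure name after the last dot, as the notebook does.
                    yield page_name, visual_type, projection["queryRef"].split(".")[-1]
```

For example, `list(parse_pbix_layout("MyReport.pbix"))` returns the same page/visual/field triples the notebook appends to `tompo_output.csv`, minus the workspace, model, and refresh-date columns it adds from `vw_report_metadata`.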
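The export cell in TOMPo_ReportParser.ipynb acquires an app-only token with MSAL and downloads each report through the .NET `PowerBIClient` before uploading it to blob storage. For readers working outside Synapse/.NET, here is a minimal Python sketch of the same two steps using the `msal` and `requests` packages and the public "Export Report In Group" REST endpoint; the placeholder values and the helper names `get_spn_access_token` / `export_report_pbix` are assumptions for illustration, and the blob-upload step is omitted.

```python
import msal
import requests

# Placeholders for illustration only; the notebook reads these secrets from Key Vault
# via mssparkutils and the vw_tompo_params temp view instead of hard-coding them.
TENANT_ID = "<tenant-id>"
APPLICATION_ID = "<service-principal-client-id>"
APPLICATION_SECRET = "<service-principal-secret>"
PBI_SCOPE = ["https://analysis.windows.net/powerbi/api/.default"]
PBI_API_ROOT = "https://api.powerbi.com/v1.0/myorg"


def get_spn_access_token():
    """Client-credentials (app-only) token for the Power BI REST API."""
    app = msal.ConfidentialClientApplication(
        APPLICATION_ID,
        authority=f"https://login.microsoftonline.com/{TENANT_ID}",
        client_credential=APPLICATION_SECRET,
    )
    result = app.acquire_token_for_client(scopes=PBI_SCOPE)
    return result["access_token"]


def export_report_pbix(workspace_id, report_id, out_path):
    """Download one report as .pbix, analogous to pbiClient.Reports.ExportReport in the notebook."""
    url = f"{PBI_API_ROOT}/groups/{workspace_id}/reports/{report_id}/Export"
    response = requests.get(url, headers={"Authorization": f"Bearer {get_spn_access_token()}"})
    response.raise_for_status()
    with open(out_path, "wb") as f:
        f.write(response.content)
```

As with the .NET path, the service principal must be allowed to use Power BI APIs in the tenant settings and must have access to the target workspace for the export call to succeed.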