├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── sparklin ├── BlobTriggerFuncApp │ ├── .funcignore │ ├── .gitignore │ ├── BlobTriggerFunction │ │ ├── .funcignore │ │ ├── Data.py │ │ ├── Synapse_JsonParser.py │ │ ├── __init__.py │ │ ├── column_parser.py │ │ ├── function.json │ │ ├── host.json │ │ ├── join_parser.py │ │ ├── readme.md │ │ └── sample.dat │ ├── host.json │ └── requirements.txt ├── HttpTriggerFuncApp │ ├── .funcignore │ ├── .gitignore │ ├── HttpTriggerFunction │ │ ├── __init__.py │ │ ├── event.py │ │ ├── function.json │ │ ├── sample.dat │ │ └── tablestorage.py │ ├── Reqtest.http │ ├── host.json │ └── requirements.txt ├── Onboarding.md ├── OpenLineage │ ├── OpenLineage.sln │ └── OpenLineage │ │ ├── CaptureLineage.cs │ │ ├── Constant.cs │ │ ├── EventMetadata.cs │ │ ├── OpenLineage.csproj │ │ ├── OpenLineage.csproj.user │ │ ├── TableStorage.cs │ │ ├── Utility.cs │ │ └── host.json ├── README.md └── openlineage-spark-0.4.0.jar └── tompo ├── Onboarding.md ├── README.md ├── TOMPo.pbix ├── TOMPo_ModelMetada.ipynb ├── TOMPo_OnboardingSteps.docx └── TOMPo_ReportParser.ipynb /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | page_type: sample 3 | languages: 4 | - python 5 | products: 6 | - azure 7 | - power bi 8 | - synapse 9 | - databricks 10 | - purview 11 | - analysis services 12 | - datalake 13 | 14 | name: Data Lineage 15 | description: End to end data lineage from source to visualizations. 
16 | --- 17 | 18 | # Onboarding Documents 19 | 20 | - [Onboarding Sparklin](https://github.com/microsoft/DataLineage/blob/main/sparklin/Onboarding.md) 21 | - [Onboarding TOMPo](https://github.com/microsoft/DataLineage/blob/main/tompo/Onboarding.md) 22 | 23 | ![image](https://user-images.githubusercontent.com/118733500/227436747-f527883b-92da-4482-8aec-b34c03003cf7.png) 24 | 25 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/.funcignore: -------------------------------------------------------------------------------- 1 | .git* 2 | .vscode 3 | local.settings.json 4 | test 5 | .venv -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # pipenv 86 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
87 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 88 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 89 | # install all needed dependencies. 90 | #Pipfile.lock 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .env 100 | .venv 101 | env/ 102 | venv/ 103 | ENV/ 104 | env.bak/ 105 | venv.bak/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # mkdocs documentation 115 | /site 116 | 117 | # mypy 118 | .mypy_cache/ 119 | .dmypy.json 120 | dmypy.json 121 | 122 | # Pyre type checker 123 | .pyre/ 124 | 125 | # Azure Functions artifacts 126 | bin 127 | obj 128 | appsettings.json 129 | local.settings.json 130 | 131 | # Azurite artifacts 132 | __blobstorage__ 133 | __queuestorage__ 134 | __azurite_db*__.json 135 | .python_packages -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/.funcignore: -------------------------------------------------------------------------------- 1 | venv -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/Data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from azure.data.tables import TableServiceClient, UpdateMode 3 | # import os 4 | 5 | class AZTableStorage: 6 | 7 | table_name = "" 8 | conn_str = "" 9 | table_client = "" 10 | 11 | def createClient(self, table_name=None, conn_str=None): 12 | self.table_name = table_name 13 | self.conn_str = conn_str 14 | print(">>>>>>>>table_name>>>>>>>>"+self.table_name) 15 | print(">>>>>>>>conn_str>>>>>>>>"+self.conn_str) 16 | self.table_service = TableServiceClient.from_connection_string(self.conn_str) 17 | 18 | # Create the table if it does not already exist 19 | self.table_service.create_table_if_not_exists(self.table_name) 20 | 21 | self.table_client = self.table_service.get_table_client(self.table_name) 22 | # logging.info(self.table_name) 23 | return self.table_client 24 | 25 | def insert_entity(self, tbl_client, entity): 26 | #entity = self.deserialize() 27 | return tbl_client.create_entity(entity) 28 | 29 | def azure_upsert_entity(self, tbl_client, entity): 30 | # print(entity) 31 | #entity = self.deserialize() 32 | return tbl_client.upsert_entity(mode=UpdateMode.REPLACE, entity=entity) 33 | 34 | def azure_query_entities(self,tbl_client, queryfilter): 35 | return tbl_client.query_entities(queryfilter) 36 | 37 | # @staticmethod 38 | # def deserialize(): 39 | # params = {key: request.form.get(key) for key in request.form.keys()} 40 | # params["PartitionKey"] = "Chicago" 41 | # params["RowKey"] = "2021-07-01 12:00 AM" 42 | # return params 43 | 44 | def create_event_entity(self, PartitionKey, RowKey, Status, RetryCount, FilePath, isArchived, Message): 45 | my_entity = {"PartitionKey" : str(PartitionKey), 46 | "RowKey" : str(RowKey), 47 | "Status" : str(Status), 48 | "RetryCount" : RetryCount, 49 | "FilePath" : str(FilePath), 50 | "isArchived" : isArchived, 51 | "Message" : str(Message) 52 | } 53 | return my_entity 54 | 55 | def create_lineage_entity(self, PartitionKey, RowKey, input_tables, output_table, input_columns, output_columns, isdelta, isintermediate, isglobal, derived_columns, joinconditions): 56 | my_entity = {"PartitionKey" : 
str(PartitionKey), 57 | "RowKey" : str(RowKey), 58 | "input_tables" : str(input_tables), 59 | "output_table" : str(output_table), 60 | "input_columns" : str(input_columns), 61 | "output_columns" : str(output_columns), 62 | "isdelta" : str(isdelta), 63 | "isintermediate" : str(isintermediate), 64 | "isglobal" : str(isglobal), 65 | "derived_columns" : str(derived_columns), 66 | "joinconditions" : str(joinconditions) 67 | } 68 | return my_entity 69 | # def main(): 70 | # name = "e4b54cdb-e6ca-432d-8edf-e584fec37611_sql_test_data_lineage_pool_1650000026883_20220527140329.json" 71 | # fileName = name[0:name.index(".")] 72 | # print(">>>>>>>>1>>>>>>>>"+fileName) 73 | # azStorage = AZTableStorage() 74 | # azStorage.createClient() 75 | # metadata = azStorage.create_entity("Learning",fileName,"UnProcessed","Success",3,name,True) 76 | # azStorage.upsert_entity(metadata) 77 | 78 | # if __name__ == "__main__": 79 | # main() -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/Synapse_JsonParser.py: -------------------------------------------------------------------------------- 1 | import json, os 2 | from pyapacheatlas.core.util import GuidTracker, AtlasException 3 | from pyapacheatlas.core import PurviewClient, AtlasEntity, TypeCategory, AtlasProcess 4 | from pyapacheatlas.core.typedef import (EntityTypeDef, AtlasAttributeDef) 5 | import logging, re 6 | 7 | from .column_parser import get_column_transformations 8 | from .join_parser import get_join_conditions 9 | 10 | 11 | class PurviewTransform: 12 | def __init__(self, client, in_data): 13 | self.client = client 14 | self.in_data = in_data 15 | self._inputs = [] 16 | self._relationship_inputs = [] 17 | self._raw_inputs = [] 18 | self._outputs = [] 19 | self._output_fields = [] 20 | self._raw_outputs = [] 21 | self._relationship_outputs = [] 22 | self._raw_outputs = [] 23 | self._aliases = [] 24 | self._tables = [] 25 | self._alias_tablenames = {} 26 | self._column_mapping = [] 27 | self._field_mapping = None 28 | self._col_alias_map = {} 29 | self._input_cols = [] 30 | self._output_cols = [] 31 | self.hardcodecol = [] 32 | self.input_tables = [] 33 | self.output_table = "" 34 | self.output_table_schm = "" 35 | self.joinList = [] 36 | self._table_and_columns = {} 37 | self.inp_qualified_Name = "" 38 | self.out_qualified_Name = "" 39 | self.setproject = 0 40 | self.table_and_schema_count = {} 41 | self.sqlcommand = "" 42 | self._input_table_qualifiedName = {} 43 | 44 | self.inp_name = "" 45 | self.inp_guid = "" 46 | self.inp_entitytype = "" 47 | 48 | self.output_name = "" 49 | self.output_guid = "" 50 | self.output_entitytype = "" 51 | 52 | self.a = [] 53 | self.tbl = [] 54 | self.ds_create_i = 0 55 | 56 | self.intermediate_tbl_views = [] 57 | self.deltatable = [] 58 | self.globaltempviews = [] 59 | 60 | self.nb_name = "" 61 | self.rowkey = "" 62 | self.cluster_name = "" 63 | self.dataframe = "N" 64 | 65 | self.match = 0 66 | self.unmatch = 0 67 | 68 | logging.info("logger started") 69 | self.gt = GuidTracker() 70 | 71 | logging.info("finished init") 72 | 73 | def get_tbl_nm(self, inp_tblname, type='out'): 74 | if inp_tblname.count(",") > 0: 75 | outtblnm = inp_tblname.split(",")[1].strip(" ").replace('[', '').replace(']', '') 76 | schnm = inp_tblname.split(",")[0].strip(" ").replace('[', '').replace(']', '') 77 | 78 | if type == "inp": 79 | self.input_tables.append(schnm + "." + outtblnm) 80 | else: 81 | self.output_table_schm = schnm + "." 
+ outtblnm 82 | 83 | if outtblnm not in self.deltatable and schnm.lower() != "global_temp": 84 | # self.deltatable.update({outtblnm: schnm}) 85 | self.deltatable.append(schnm + "." + outtblnm) 86 | if outtblnm not in self.deltatable and schnm.lower() == "global_temp": 87 | self.globaltempviews.append(schnm + "." + outtblnm) 88 | # tblnm = str(child['multipartIdentifier']).replace('[', '').replace(']', '') 89 | else: 90 | outtblnm = inp_tblname.replace('[', '').replace(']', '') 91 | self.intermediate_tbl_views.append(outtblnm) 92 | if type == "inp": 93 | self.input_tables.append(outtblnm) 94 | else: 95 | self.output_table_schm = outtblnm 96 | # schnm = "" 97 | 98 | if outtblnm not in self.table_and_schema_count: 99 | self.table_and_schema_count.update({outtblnm: inp_tblname.count(",")}) 100 | 101 | return outtblnm 102 | 103 | def subquery_alias(self, plan): 104 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias': 105 | if plan['identifier']['name'] != '__auto_generated_subquery_name': 106 | self._aliases.append(plan['identifier']['name']) 107 | 108 | def unresolved_relation(self, plan): 109 | if plan['class'] == 'org.apache.spark.sql.catalyst.analysis.UnresolvedRelation': 110 | tblnm = self.get_tbl_nm(str(plan['multipartIdentifier']), "inp") 111 | self._tables.append(tblnm) 112 | 113 | def get_fields_pattern(self, sqlcmd, plan): 114 | 115 | if sqlcmd == "INSERT": 116 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.Project': 117 | if self.setproject == 0: 118 | for project in plan['projectList']: 119 | # print("Project Entered") 120 | _fields = "" 121 | for field in project: 122 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Alias": 123 | _fields += field['name'] + "|" 124 | if field['class'] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute": 125 | _fields += field['nameParts'] + "|" 126 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Literal": 127 | if field['value'] is not None: 128 | _fields += "lit(" + field['value'] + ")" + "|" 129 | else: 130 | _fields += "lit(null)" + "|" 131 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.WindowExpression": 132 | _fields += "Row_number()" + "|" + "Partition" + "|" + "Group" + "|" 133 | self._output_fields.append(_fields) 134 | self.hardcodecol = get_column_transformations(plan['projectList']) 135 | self.setproject = 1 136 | 137 | self.subquery_alias(plan) 138 | self.unresolved_relation(plan) 139 | 140 | if sqlcmd == "CreateViewCommand": 141 | for child in plan['child']: 142 | if child['class'] == 'org.apache.spark.sql.catalyst.plans.logical.Project': 143 | if self.setproject == 0: 144 | for project in child['projectList']: 145 | # print(project) 146 | _fields = "" 147 | for field in project: 148 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Alias": 149 | _fields += field['name'] + "|" 150 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.AttributeReference": 151 | _fields += field['name'] + "|" 152 | self._output_fields.append(_fields) 153 | self.hardcodecol = get_column_transformations(child['projectList']) 154 | self.setproject = 1 155 | self.subquery_alias(child) 156 | self.unresolved_relation(child) 157 | if not self._tables: 158 | self._tables.append(self.nb_name + "_dataframe") 159 | self.dataframe = "Y" 160 | 161 | if sqlcmd == "CREATEVIEW": 162 | for child in plan['child']: 163 | if child['class'] == 'org.apache.spark.sql.catalyst.plans.logical.Project': 164 | if self.setproject == 0: 165 | 
for project in child['projectList']: 166 | _fields = "" 167 | for field in project: 168 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Alias": 169 | _fields += field['name'] + "|" 170 | if field['class'] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute": 171 | _fields += field['nameParts'] + "|" 172 | if field['class'] == "org.apache.spark.sql.catalyst.analysis.UnresolvedFunction": 173 | if field['name']['funcName'] == "DENSE_RANK": 174 | _fields += "DENSE_RANK||||" + "|" 175 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Literal": 176 | if field['value'] is not None: 177 | _fields += "lit(" + field['value'] + ")" + "|" 178 | else: 179 | _fields += "lit(null)" + "|" 180 | if field['class'] == 'org.apache.spark.sql.catalyst.analysis.UnresolvedStar': 181 | print("*") 182 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.WindowExpression": 183 | _fields += "Row_number()" + "|" + "Partition" + "|" + "Group" + "|" 184 | self._output_fields.append(_fields) 185 | self.hardcodecol = get_column_transformations(child['projectList']) 186 | self.setproject = 1 187 | 188 | self.subquery_alias(child) 189 | self.unresolved_relation(child) 190 | 191 | if sqlcmd == "DS_CREATETABLE": 192 | _output_proj = [] 193 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias': 194 | self.ds_create_i += 1 195 | self.a.append(self.ds_create_i) 196 | # print(plan['identifier']['name']) 197 | self.tbl.append(plan['identifier']['name']) 198 | 199 | if plan['class'] == 'org.apache.spark.sql.execution.datasources.LogicalRelation': 200 | self.ds_create_i += 1 201 | 202 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.Project': 203 | self.ds_create_i += 1 204 | if self.setproject == 0: 205 | for project in plan['projectList']: 206 | _fields = "" 207 | for field in project: 208 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Alias": 209 | _fields += field['name'] + "|" 210 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.AttributeReference": 211 | if field['qualifier']: 212 | qualifier = field['qualifier'].replace('[', '').replace(']', '') 213 | _fields += "[" + qualifier + "," + field['name'] + "]" + "|" 214 | else: 215 | _fields += field['name'] + "|" 216 | self._output_fields.append(_fields) 217 | self.hardcodecol = get_column_transformations(child['projectList']) 218 | self.setproject = 1 219 | 220 | if sqlcmd == "Project": 221 | if self.setproject == 0: 222 | for project in plan['projectList']: 223 | _fields = "" 224 | for field in project: 225 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.Alias": 226 | _fields += field['name'] + "|" 227 | if field['class'] == "org.apache.spark.sql.catalyst.expressions.AttributeReference": 228 | _fields += field['name'] + "|" 229 | self._output_fields.append(_fields) 230 | self.setproject = 1 231 | self._tables.append(self.nb_name + "_dataframe") 232 | 233 | if sqlcmd == "MERGE": 234 | 235 | for key, values in plan.items(): 236 | if key == "matchedActions": 237 | if values: 238 | if 'InsertAction' in values[0][0]['class']: 239 | for field in values[0]: 240 | if field['class'] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute": 241 | self._output_fields.append(field['nameParts']) 242 | else: 243 | self.match = 1 244 | else: 245 | self.match = 1 246 | if key == "notMatchedActions": 247 | if values: 248 | if 'InsertAction' in values[0][0]['class']: 249 | for field in values[0]: 250 | if field['class'] == 
"org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute": 251 | self._output_fields.append(field['nameParts']) 252 | else: 253 | self.unmatch = 1 254 | else: 255 | self.unmatch = 1 256 | 257 | if self.match == 1 and self.unmatch == 1: 258 | os._exit(0) 259 | 260 | self.subquery_alias(plan) 261 | self.unresolved_relation(plan) 262 | 263 | def get_parse_plan(self, output_plan): 264 | 265 | for plan in output_plan: 266 | _alias_tables = "" 267 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.InsertIntoStatement': 268 | for tbl in plan['table']: 269 | self.output_table = self.get_tbl_nm(tbl['multipartIdentifier']) 270 | self.sqlcommand = "INSERT" 271 | 272 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.CreateViewStatement': 273 | # print("hi") 274 | self.output_table = self.get_tbl_nm(plan['viewName']) 275 | self.sqlcommand = "CREATEVIEW" 276 | 277 | if plan['class'] == "org.apache.spark.sql.catalyst.plans.logical.CreateTableAsSelectStatement": 278 | self.output_table = self.get_tbl_nm(plan['tableName']) 279 | self.sqlcommand = "INSERT" 280 | 281 | if plan['class'] == 'org.apache.spark.sql.execution.datasources.CreateTable': 282 | self.output_table = self.get_tbl_nm(plan['tableDesc']['identifier']['table']) 283 | self.sqlcommand = "DS_CREATETABLE" 284 | 285 | if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.MergeIntoTable': 286 | self.sqlcommand = "MERGE" 287 | 288 | if plan['class'] == 'org.apache.spark.sql.execution.command.CreateViewCommand': 289 | self.output_table = self.get_tbl_nm(plan['name']['table']) 290 | # print(plan['name']['table']) 291 | self.sqlcommand = "CreateViewCommand" 292 | 293 | self.get_fields_pattern(self.sqlcommand, plan) 294 | 295 | return True 296 | 297 | def _column_alias_map(self, val_list, alias=""): 298 | if alias == 'NoAlias': 299 | if 'NoAlias' in self._col_alias_map.keys(): 300 | self._col_alias_map['NoAlias'].append(val_list[0].strip()) 301 | else: 302 | # create a new array in this slot 303 | self._col_alias_map['NoAlias'] = [val_list[0].strip()] 304 | else: 305 | if val_list[0] in self._col_alias_map: 306 | self._col_alias_map[val_list[0]].append(val_list[1].strip()) 307 | else: 308 | # create a new array in this slot 309 | self._col_alias_map[val_list[0]] = [val_list[1].strip()] 310 | 311 | @staticmethod 312 | def _column_clean(val): 313 | return val.replace('[', '').replace(']', '').split(",") 314 | 315 | def get_inp_out_fields(self, _output_fields, sqlcommand): 316 | print("Raw Source Fields ") 317 | print(_output_fields) 318 | if sqlcommand == "MERGE": 319 | # abc = {} 320 | # for i in range(len(_output_fields)): 321 | # if _output_fields.count(_output_fields[i]) > 1: 322 | # if _output_fields[i] not in abc: 323 | # abc.update({_output_fields[i]: i}) 324 | # print(abc) 325 | # for key, value in abc.items(): 326 | # _output_fields.pop(value) 327 | 328 | odd_i = [] 329 | even_i = [] 330 | for i in range(0, len(_output_fields)): 331 | if i % 2: 332 | even_i.append(_output_fields[i]) 333 | else: 334 | odd_i.append(_output_fields[i]) 335 | 336 | for i in odd_i: 337 | val_list = self._column_clean(i) 338 | if len(val_list) > 1: 339 | self._output_cols.append(val_list[1].strip()) 340 | else: 341 | self._output_cols.append(val_list[0].strip()) 342 | # self._output_cols.append(i.replace('[', '').replace(']', '')) 343 | for i in even_i: 344 | val_list = self._column_clean(i) 345 | if len(val_list) > 1: 346 | self._input_cols.append(val_list[1].strip()) 347 | self._column_alias_map(val_list) 348 | else: 349 
| self._input_cols.append(val_list[0].strip()) 350 | self._column_alias_map(val_list, "NoAlias") 351 | 352 | self.output_table = self._tables[0] 353 | self._tables.pop(0) 354 | self.output_table_schm = self.input_tables[0] 355 | self.input_tables.pop(0) 356 | else: 357 | for val in _output_fields: 358 | if val.count("|") == 1: 359 | val_list = self._column_clean(val.replace('|', '')) 360 | if len(val_list) > 1: 361 | self._input_cols.append(val_list[1].strip()) 362 | self._output_cols.append(val_list[1].strip()) 363 | self._column_alias_map(val_list) 364 | else: 365 | # self._col_alias_map.update({'NoAlias':val_list[0].strip()}) 366 | self._input_cols.append(val_list[0]) 367 | self._output_cols.append(val_list[0]) 368 | self._column_alias_map(val_list, 'NoAlias') 369 | 370 | if val.count("|") == 2: 371 | val_split = val.split("|")[1] 372 | if val_split.__contains__("lit"): 373 | self._input_cols.append(val_split) 374 | # self.hardcodecol.append(val_split + " as " + val.split("|")[0]) 375 | else: 376 | # print(val_split) 377 | val_list = self._column_clean(val_split) 378 | # print(val_list) 379 | if len(val_list) > 1: 380 | self._input_cols.append(val_list[1].strip()) 381 | self._column_alias_map(val_list) 382 | else: 383 | self._input_cols.append(val_list[0].strip()) 384 | self._column_alias_map(val_list, 'NoAlias') 385 | 386 | self._output_cols.append(val.split("|")[0]) 387 | 388 | if val.count("|") == 3: 389 | val_split = val.split("|")[1] 390 | val_list = self._column_clean(val_split) 391 | if len(val_list) > 1: 392 | self._input_cols.append(val_list[1].strip()) 393 | else: 394 | self._input_cols.append(val_list[0].strip()) 395 | self._output_cols.append(val.split("|")[0].strip()) 396 | # self._column_alias_map(val_list) 397 | 398 | # if val.count("|") > 3: 399 | # self.hardcodecol.append("Derived Logic like CASE/CONCAT/DENSE_RANK " + val.split("|")[0]) 400 | 401 | print("Input and Output and HardCode and Column_Alias Mapping") 402 | print(self._input_cols) 403 | print(self._output_cols) 404 | print(self.hardcodecol) 405 | print(self._col_alias_map) 406 | 407 | return True 408 | 409 | # def get_join_conditions(self, output_plan): 410 | # 411 | # for plan in output_plan: 412 | # if plan['class'] == 'org.apache.spark.sql.catalyst.plans.logical.Join': 413 | # join = (plan['joinType']['object'])[:-1] 414 | # joinname = join.split('.')[-1] 415 | # tablealias1 = ((plan['condition'][1])['nameParts']).split(',')[0][1:] 416 | # column1 = ((plan['condition'][1])['nameParts']).split(',')[1][1:-1] 417 | # tablealias2 = ((plan['condition'][2])['nameParts']).split(',')[0][1:] 418 | # column2 = ((plan['condition'][2])['nameParts']).split(',')[1][1:-1] 419 | # row = self._alias_tablenames[tablealias1] + " " + joinname + "Join" + " " + self._alias_tablenames[ 420 | # tablealias2] + " on " + column1 + "=" + column2 421 | # self.joinList.append(row) 422 | # return True 423 | 424 | def get_alias_table_cols(self): 425 | 426 | for key, value in self._col_alias_map.items(): 427 | if key in self._alias_tablenames: 428 | if type(value) is list: 429 | for col in value: 430 | colname = col 431 | tablename = self._alias_tablenames.get(key) 432 | self._table_and_columns.update({colname: tablename}) 433 | else: 434 | colname = value[0] 435 | tablename = self._alias_tablenames.get(key) 436 | self._table_and_columns.update({colname: tablename}) 437 | else: 438 | if type(value) is list: 439 | for col in value: 440 | colname = col 441 | tablename = list(self._alias_tablenames.values())[0] 442 | 
self._table_and_columns.update({colname: tablename}) 443 | else: 444 | colname = value[0] 445 | tablename = list(self._alias_tablenames.values())[0] 446 | self._table_and_columns.update({colname: tablename}) 447 | 448 | print("Alias Tables and Columns Mapping") 449 | print(self._alias_tablenames) 450 | print(self._table_and_columns) 451 | 452 | return True 453 | 454 | def search_purview_entity(self, entityname): 455 | 456 | print(entityname) 457 | search = self.client.discovery.search_entities(entityname) 458 | qname, name, guid, entitytype = "", "", "", "" 459 | for entity in search: 460 | # re.search(r"^cooked*\W?", entity['qualifiedName']) 461 | # print(entity) 462 | try: 463 | if (entity["entityType"] == "azure_datalake_gen2_path" or str( 464 | entity["entityType"]).lower() == "dataset") and \ 465 | str(entity['name']).lower().strip() == entityname.lower() and re.search( 466 | r"^" + self.cluster_name + "://", entity['qualifiedName']): 467 | qname = entity['qualifiedName'] 468 | name = entity['name'] 469 | guid = entity['id'] 470 | entitytype = entity['entityType'] 471 | 472 | # print(entity['qualifiedName']) 473 | # print("ABC "+qname) 474 | except: 475 | print("Search Scan results are Coming Differently") 476 | 477 | return qname, name, guid, entitytype 478 | 479 | def get_ds_create_tables(self): 480 | for j in range(len(self.a) - 1): 481 | k = self.a[j] 482 | if self.a[j + 1] == k + 1: 483 | self._alias_tablenames.update({self.tbl[j]: self.tbl[j + 1]}) 484 | 485 | def purview_plan_push(self, qualifiedName, runid): 486 | 487 | print(self._tables) 488 | print(self.table_and_schema_count) 489 | 490 | purv_inpqname, purv_inpname, purv_inpguid, purv_inpentitytype = "", "", "", "" 491 | purv_outqname, purv_outname, purv_outguid, purv_outentitytype = "", "", "", "" 492 | 493 | for inp in self._tables: 494 | # Need to convert it into a Function 495 | if inp in self.table_and_schema_count and self.table_and_schema_count[inp] > 0: 496 | print(inp + " hiii " + str(self.table_and_schema_count[inp])) 497 | # purv_inpqname, purv_inpname, purv_inpguid, purv_inpentitytype = self.search_purview_entity(inp.lower()) 498 | 499 | # print(purv_inpqname) 500 | if purv_inpqname: 501 | # print("Here" + " " + purv_qname) 502 | self.inp_qualified_Name = purv_inpqname 503 | self.inp_name = purv_inpname 504 | self.inp_guid = purv_inpguid 505 | self.inp_entitytype = purv_inpentitytype 506 | else: 507 | self.inp_qualified_Name = (qualifiedName + "://" + inp).lower() 508 | self.inp_name = inp 509 | self.inp_guid = self.gt.get_guid() 510 | self.inp_entitytype = "DataSet" 511 | 512 | self._input_table_qualifiedName.update({inp: self.inp_qualified_Name}) 513 | self._inputs.append(AtlasEntity(name=self.inp_name, 514 | typeName=self.inp_entitytype, 515 | qualified_name=self.inp_qualified_Name, 516 | guid=self.inp_guid) 517 | ) 518 | 519 | print("Input Tables and Its Qualified Names") 520 | print(self._input_table_qualifiedName) 521 | 522 | if self.sqlcommand == "INSERT" or self.sqlcommand == "MERGE": 523 | print("INSERT Occured Hence Output Table will be Searched") 524 | # purv_outqname, purv_outname, purv_outguid, purv_outentitytype = self.search_purview_entity(self.output_table) 525 | 526 | if purv_outqname: 527 | self.out_qualified_Name = purv_outqname 528 | self.output_name = purv_outname 529 | self.output_guid = purv_outguid 530 | self.output_entitytype = purv_outentitytype 531 | else: 532 | self.out_qualified_Name = (qualifiedName + "://" + self.output_table).lower() 533 | self.output_name = self.output_table 534 
| self.output_guid = self.gt.get_guid() 535 | self.output_entitytype = "DataSet" 536 | 537 | print(self.output_table + " " + self.out_qualified_Name) 538 | 539 | OutputTable = AtlasEntity( 540 | name=self.output_name, 541 | typeName=self.output_entitytype, 542 | qualified_name=self.out_qualified_Name, 543 | guid=self.output_guid 544 | ) 545 | if self._input_cols and self._output_cols: 546 | for item in zip(self._input_cols, self._output_cols): 547 | if item[0] in self._table_and_columns: 548 | inp = self._input_table_qualifiedName.get(self._table_and_columns.get(item[0])) 549 | self._column_mapping.append( 550 | {"ColumnMapping": [ 551 | {"Source": item[0], "Sink": item[1]}], 552 | "DatasetMapping": {"Source": inp, 553 | "Sink": OutputTable.qualifiedName}}) 554 | 555 | print("ColumnMapping") 556 | print(self._column_mapping) 557 | # print(self.hardcodecol) 558 | # print(self.joinList) 559 | process = AtlasProcess( 560 | name=qualifiedName + "_" + self.output_table + "_process", 561 | typeName="HRServicesInsights_OneHRSI", 562 | qualified_name="hrdi://synapse_notebook/" + self.nb_name + "/" + qualifiedName + "_" + self.output_table, 563 | inputs=self._inputs, 564 | outputs=[OutputTable], 565 | guid=self.gt.get_guid(), 566 | attributes={"columnMapping": json.dumps(self._column_mapping), 567 | "hardCoded_Columns": self.hardcodecol, 568 | "Delta_Tables": self.deltatable, 569 | "Global_Temp_Views_or_Tables": self.globaltempviews, 570 | "Intermediate_Views_or_Tables": self.intermediate_tbl_views, 571 | "JoinConditions": self.joinList} 572 | ) 573 | if self._inputs and OutputTable: 574 | try: 575 | results = self.client.upload_entities([process, OutputTable] + self._inputs) 576 | except: 577 | print("No ColumnMapping or Input or Output Tables Available") 578 | 579 | def purview_dataset_push(self, inp_qname, inp_name, out_qname, out_name, process_qname, name): 580 | print("Came") 581 | a = AtlasEntity( 582 | name=inp_name, 583 | typeName="DataSet", 584 | qualified_name=inp_qname, 585 | guid=self.gt.get_guid() 586 | ) 587 | b = AtlasEntity( 588 | name=out_name, 589 | typeName="DataSet", 590 | qualified_name=out_qname, 591 | guid=self.gt.get_guid() 592 | ) 593 | 594 | process = AtlasProcess( 595 | name=name, 596 | typeName="Process", 597 | qualified_name="Process" + process_qname, 598 | inputs=[a], 599 | outputs=[b], 600 | guid=self.gt.get_guid() 601 | ) 602 | 603 | results = self.client.upload_entities(batch=[a, b, process]) 604 | 605 | def get_project_details(self, output_plan): 606 | for plan in output_plan: 607 | _alias_tables = "" 608 | if plan['class'] == "org.apache.spark.sql.catalyst.plans.logical.Project": 609 | if plan['projectList'][0][0]['qualifier']: 610 | # print(plan['projectList'][0][0]['qualifier']) 611 | self.output_table = self.get_tbl_nm(plan['projectList'][0][0]['qualifier']) 612 | self.sqlcommand = "Project" 613 | else: 614 | os._exit(0) 615 | 616 | self.get_fields_pattern(self.sqlcommand, plan) 617 | return True 618 | 619 | def transform_to_purview(self): 620 | 621 | inputs_array = self.in_data['inputs'] 622 | outputs_array = self.in_data['outputs'] 623 | runid = self.in_data['run']['runId'] 624 | 625 | _name = self.in_data['job']['name'].split(".")[0].split("_")[:-2] 626 | self.rowkey = "_".join(_name) + "_" + runid 627 | self.nb_name = "_".join(_name) 628 | 629 | if re.search("hrsi", self.in_data['job']['name'].lower()): 630 | self.cluster_name = "hrservicesinsights" 631 | elif re.search("ultp", self.in_data['job']['name'].lower()): 632 | self.cluster_name = "ultp_services" 
633 | elif re.search("gtabi", self.in_data['job']['name'].lower()): 634 | self.cluster_name = "gtabi_services" 635 | elif re.search("learning", self.in_data['job']['name'].lower()): 636 | self.cluster_name = "learninginsights" 637 | elif re.search("hcm", self.in_data['job']['name'].lower()): 638 | self.cluster_name = "headcountmanagement" 639 | else: 640 | self.cluster_name = "external" 641 | 642 | # qualifiedName = self.cluster_name + "://" + self.nb_name 643 | qualifiedName = self.cluster_name 644 | print(qualifiedName) 645 | # print(len(self.in_data['run']['facets'])) 646 | 647 | if len(self.in_data['run']['facets']) > 1: 648 | 649 | _plan = self.in_data['run']['facets']['spark.logicalPlan']['plan'] 650 | print(runid + " " + self.nb_name) 651 | 652 | classname = self.in_data['run']['facets']['spark.logicalPlan']['plan'][0]['class'] 653 | 654 | if classname == "org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand": 655 | for inp in self.in_data['inputs']: 656 | table = inp['name'].split("/")[-1] 657 | self.input_tables.append(table) 658 | for col in inp['facets']['schema']['fields']: 659 | column = col['name'] 660 | self._input_cols.append(column) 661 | self._table_and_columns.update({column: table}) 662 | self.output_table_schm = self.in_data['outputs'][0]['name'].split("/")[-1] 663 | self.input_tables = list(dict.fromkeys(self.input_tables)) 664 | self._input_cols = list(dict.fromkeys(self._input_cols)) 665 | self.output_table = self.output_table_schm 666 | self._output_cols = self._input_cols 667 | self.deltatable = self.output_table 668 | self._tables = self.input_tables 669 | 670 | print(self.input_tables) 671 | print(self.output_table_schm) 672 | print(self._input_cols) 673 | print(self._output_cols) 674 | print(self._table_and_columns) 675 | 676 | else: 677 | if classname == "org.apache.spark.sql.catalyst.plans.logical.Project": 678 | self.get_project_details(_plan) 679 | else: 680 | self.get_parse_plan(_plan) 681 | 682 | if self.sqlcommand == "DS_CREATETABLE": 683 | self.get_inp_out_fields(self._output_fields, self.sqlcommand) 684 | self.get_ds_create_tables() 685 | else: 686 | self.get_inp_out_fields(self._output_fields, self.sqlcommand) 687 | 688 | if self._aliases and self._tables: 689 | if len(self._aliases) == len(self._tables): 690 | for item in zip(self._aliases, self._tables): 691 | self._alias_tablenames.update({item[0]: item[1]}) 692 | else: 693 | _aliases_new = self._aliases[1:] 694 | for item in zip(_aliases_new, self._tables): 695 | self._alias_tablenames.update({item[0]: item[1]}) 696 | elif self._tables: 697 | self._alias_tablenames.update({'NoAlias': self._tables[0]}) 698 | 699 | if self._col_alias_map and self._alias_tablenames: 700 | self.get_alias_table_cols() 701 | 702 | if self.sqlcommand in ("CreateViewCommand", "CREATEVIEW"): 703 | self.joinList = get_join_conditions(_plan[0]['child'], self._alias_tablenames) 704 | elif self.sqlcommand == "INSERT": 705 | self.joinList = get_join_conditions(_plan, self._alias_tablenames) 706 | 707 | if self.dataframe == "Y": 708 | inp_qname = self.out_qualified_Name 709 | inp_name = self.output_name 710 | out_name = self.nb_name + "_dataframe" 711 | out_qname = (self.cluster_name + "://" + self.nb_name + "://" + out_name).lower() 712 | # out_qname = (self.cluster_name + "://" + out_name).lower() 713 | 714 | # print(inp_name) 715 | self.purview_dataset_push(inp_qname, inp_name, out_qname, out_name, inp_name, "TabletoDataframe") 716 | 717 | print("Output Table") 718 | print(self.output_table) 719 | 
print(self.output_table_schm) 720 | print(self.input_tables) 721 | print(self.deltatable) 722 | print(self.globaltempviews) 723 | print(self.intermediate_tbl_views) 724 | 725 | print(self._tables) 726 | self.purview_plan_push(qualifiedName, runid) 727 | 728 | else: 729 | print("Facets doesnt have any Plan") 730 | inp_qname, inp_name = "", "" 731 | if inputs_array: 732 | for inp in inputs_array: 733 | inp_qname = dict(inp).get('name') 734 | inp_name = str(dict(inp).get('name')).split("/")[-1].split('.')[0] 735 | out_name = self.nb_name + "_dataframe" 736 | out_qname = self.cluster_name + "://" + self.nb_name + "://" + out_name 737 | print(inp_name) 738 | self.purview_dataset_push(inp_qname.lower(), inp_name, out_qname.lower(), out_name, out_name, 739 | "FiletoDataframe") 740 | else: 741 | os._exit(0) 742 | 743 | return self.cluster_name, self.rowkey, self.input_tables, self.output_table_schm, self._input_cols, self._output_cols, self.deltatable, self.intermediate_tbl_views, self.globaltempviews, self.hardcodecol, self.joinList 744 | -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import azure.functions as func 4 | import json 5 | from .Data import AZTableStorage 6 | from azure.data.tables import TableClient 7 | import os, traceback, sys 8 | 9 | from .Synapse_JsonParser import PurviewTransform 10 | from pyapacheatlas.auth import ServicePrincipalAuthentication 11 | from pyapacheatlas.core import PurviewClient 12 | 13 | oauth = ServicePrincipalAuthentication( 14 | tenant_id=<>, 15 | client_id=<>, 16 | client_secret=<> 17 | ) 18 | 19 | client = PurviewClient( 20 | account_name=<>, 21 | authentication=oauth 22 | ) 23 | 24 | def main(myblob: func.InputStream): 25 | 26 | # logging.info(f"Python blob trigger function processed blob \n" 27 | # f"Name: {myblob.name}\n" 28 | # f"Blob Size: {myblob.length} bytes") 29 | 30 | fileName = myblob.name.split("/")[1].split(".")[0] 31 | # logging.info(f"****fileName******* {fileName} ") 32 | 33 | logging.info(f"****TableName******* {os.getenv('StorageTableName')}") 34 | 35 | # cluster_name, nb_name, input_tables,output_table, _input_cols, _output_cols,deltatable, intermediate_tbl_views,globaltempviews, hardcodecol, joinList = "","","","","","","","","","","" 36 | 37 | pt = PurviewTransform(client, json.load(myblob)) 38 | 39 | azStorage = AZTableStorage() 40 | 41 | event_client = azStorage.createClient(os.getenv('TableName'),os.getenv('datalineagesynapsestrpoc_STORAGE')) 42 | 43 | lineage_client = azStorage.createClient(os.getenv('StorageTableName'),os.getenv('datalineagesynapsestrpoc_STORAGE')) 44 | 45 | streamName = "HRSI" 46 | 47 | name_filter = "PartitionKey eq '%s' and RowKey eq '%s' and (Status eq 'Unprocessed' or Status eq 'Parsing Failed')" % (streamName, fileName) 48 | # # table_client = TableClient.from_connection_string(conn_str=os.getenv('datalineagesynapsestrpoc_STORAGE'), table_name=os.getenv('TableName')) 49 | 50 | entities = azStorage.azure_query_entities(event_client, name_filter) 51 | 52 | # lineage_metadata = azStorage.create_lineage_entity("HRSI",fileName,"","","","","","","","") 53 | # event_metadata = azStorage.create_event_entity("HRSI",fileName,"Unprocessed",3,myblob.name,False,"SUCCESS") 54 | 55 | # azStorage.insert_entity(event_client, event_metadata) 56 | # azStorage.insert_entity(lineage_client, lineage_metadata) 57 | 58 | 59 | # 
cluster_name, nb_name, input_tables,output_table, _input_cols, _output_cols,deltatable, intermediate_tbl_views,globaltempviews, hardcodecol, joinList = pt.transform_to_purview() 60 | # lineage_metadata = azStorage.create_lineage_entity(cluster_name, nb_name, input_tables, 61 | # output_table, _input_cols, _output_cols, 62 | # deltatable, intermediate_tbl_views, 63 | # globaltempviews, hardcodecol, joinList) 64 | # azStorage.azure_upsert_entity(lineage_client, lineage_metadata) 65 | 66 | for entity in entities: 67 | print('RowKey:' + entity['RowKey']) 68 | try: 69 | cluster_name, nb_name, input_tables,output_table, _input_cols, _output_cols,deltatable, intermediate_tbl_views,globaltempviews, hardcodecol, joinList = pt.transform_to_purview() 70 | if input_tables and output_table: 71 | lineage_metadata = azStorage.create_lineage_entity(cluster_name, nb_name, input_tables, 72 | output_table, _input_cols, _output_cols, 73 | deltatable, intermediate_tbl_views, 74 | globaltempviews, hardcodecol, joinList) 75 | azStorage.azure_upsert_entity(lineage_client, lineage_metadata) 76 | print("Execution Completed") 77 | metadata = azStorage.create_event_entity("HRSI",fileName,"Processed",3,myblob.name,False,"SUCCESS") 78 | except BaseException as e: 79 | print("Exception Caused in Parsing " + str(e)) 80 | exc_type, exc_value, exc_traceback = sys.exc_info() 81 | err_msg = traceback.format_exception(exc_type, exc_value,exc_traceback)[-2:] 82 | # for i in traceback.format_exception(exc_type, exc_value,exc_traceback): 83 | # print(i) 84 | metadata = azStorage.create_event_entity("HRSI",fileName,"Parsing Failed",3,myblob.name,False,err_msg) 85 | 86 | 87 | try: 88 | azStorage.azure_upsert_entity(event_client, metadata) 89 | except: 90 | logging.info("There is no ROWKEY (alias FileName) in the EventMetadata Table for Update") 91 | -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/column_parser.py: -------------------------------------------------------------------------------- 1 | # import json 2 | # from collections.abc import Mapping 3 | 4 | 5 | def unresolvedAttribute(string): 6 | a = string.strip('[]').split(',') 7 | return (".".join(i.strip() for i in a)) 8 | 9 | 10 | def literal(rec): 11 | elem = "" 12 | if rec["dataType"] == "string": 13 | elem = "'" + rec["value"] + "'" 14 | elif rec["dataType"] == "null": 15 | elem = "NULL" 16 | else: 17 | elem = rec["value"] 18 | return elem 19 | 20 | 21 | def attribute(rec): 22 | str = "" 23 | if rec["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute": 24 | str = unresolvedAttribute(rec["nameParts"]) 25 | elif rec["class"] == "org.apache.spark.sql.catalyst.analysis.Literal" or rec[ 26 | "class"] == "org.apache.spark.sql.catalyst.expressions.Literal": 27 | str = literal(rec) 28 | elif rec["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedStar": 29 | str = "*" 30 | return str 31 | 32 | 33 | def WindowSpecDefinition(row, index): 34 | str = "" 35 | # print(index) 36 | curr_index = index + 1 37 | if len(row[index]["partitionSpec"]) > 0: 38 | children = len(row[index]["partitionSpec"]) 39 | no_of_children = 0 40 | str += "PARTITION BY " 41 | while no_of_children < children and curr_index < len(row): 42 | temp = "" 43 | if row[curr_index]["num-children"] > 0: 44 | temp, curr_index = function(row, curr_index) 45 | else: 46 | temp = attribute(row[curr_index]) 47 | str += temp 48 | if no_of_children < children - 1: 49 | str += ", " 50 | curr_index += 1 51 | 
no_of_children += 1 52 | if len(row[index]["orderSpec"]) > 0: 53 | str += " ORDER BY " 54 | if row[curr_index]["num-children"] == 1: 55 | temp = "" 56 | if row[curr_index]["direction"]["object"] == "org.apache.spark.sql.catalyst.expressions.Descending$": 57 | temp = " DESC" 58 | else: 59 | temp = " ASC" 60 | curr_index += 1 61 | children = len(row[index]["orderSpec"]) 62 | no_of_children = 0 63 | while no_of_children < children and curr_index < len(row): 64 | temp2 = "" 65 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.SortOrder": 66 | if row[curr_index]["direction"]["object"] == "org.apache.spark.sql.catalyst.expressions.Descending$": 67 | temp = " DESC" 68 | else: 69 | temp = " ASC" 70 | curr_index += 1 71 | continue 72 | elif row[curr_index]["num-children"] > 0: 73 | temp2, curr_index = function(row, curr_index) 74 | else: 75 | temp2 = attribute(row[curr_index]) 76 | str += temp2 77 | str += temp 78 | if no_of_children < children - 1: 79 | str += ", " 80 | curr_index += 1 81 | no_of_children += 1 82 | # print(curr_index) 83 | 84 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.UnspecifiedFrame$": 85 | curr_index += 1 86 | pass 87 | return (str, curr_index - 1) 88 | 89 | 90 | def windowExpression(row, index): 91 | fstr = "" 92 | curr_index = index + 1 93 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedFunction": 94 | str, curr_index = function(row, curr_index) 95 | fstr += str 96 | curr_index += 1 97 | fstr += " OVER (" 98 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.WindowSpecDefinition": 99 | str, curr_index = WindowSpecDefinition(row, curr_index) 100 | fstr += str 101 | curr_index += 1 102 | fstr += ")" 103 | return (fstr, curr_index - 1) 104 | 105 | 106 | def recurse(row, index): 107 | children = row[index]["num-children"] 108 | no_of_children = 0 109 | curr_index = index + 1 110 | while no_of_children < children and curr_index < len(row): 111 | curr_index = recurse(row, curr_index) 112 | curr_index += 1 113 | no_of_children += 1 114 | return curr_index - 1 115 | 116 | 117 | def function(row, curr_index): 118 | funcName,funcStr, index = "", "", "" 119 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.CaseWhen": 120 | return ("CASE WHEN function", recurse(row, curr_index)) 121 | elif row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.Cast": 122 | funcName = "CAST" 123 | # elif row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.Add": 124 | # pass 125 | elif row[curr_index]["class"] == "org.apache.spark.sql.catalyst.expressions.UnspecifiedFrame": 126 | return (funcStr, index) 127 | elif row[curr_index]["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAlias": 128 | pass 129 | else: 130 | if "name" in row[curr_index]: 131 | funcName = row[curr_index]["name"]["funcName"] 132 | else: 133 | funcName = row[curr_index]["class"].split(".")[-1] 134 | children = row[curr_index]["num-children"] 135 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAlias": 136 | funcStr = "" 137 | else: 138 | funcStr = f"{funcName}(" 139 | index = curr_index + 1 140 | no_of_children = 0 141 | while no_of_children < children and index < len(row): 142 | elem = row[index] 143 | str = "" 144 | if elem["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedFunction": 145 | str, index = function(row, index) 146 | elif elem["class"] == "org.apache.spark.sql.catalyst.expressions.Cast": 
147 | str, index = function(row, index) 148 | elif elem["class"] == "org.apache.spark.sql.catalyst.expressions.WindowExpression": 149 | str, index = windowExpression(row, index) 150 | elif elem["num-children"] == 0: 151 | str = attribute(elem) 152 | else: 153 | str, index = function(row, index) 154 | funcStr += str 155 | if no_of_children < children - 1: 156 | funcStr += ", " 157 | index += 1 158 | no_of_children += 1 159 | if funcName == "CAST": 160 | dataType = row[curr_index]["dataType"].upper() 161 | funcStr += f" AS {dataType}" 162 | 163 | if row[curr_index]["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAlias": 164 | pass 165 | else: 166 | funcStr += ")" 167 | return (funcStr, index - 1) 168 | 169 | 170 | def get_column_transformations(parseJson): 171 | output = [] 172 | for line in parseJson: 173 | # print(len(line)>1) 174 | firstElem = line[0] 175 | alias = "" 176 | index = 0 177 | if firstElem["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAlias": 178 | pass 179 | if firstElem["class"] == "org.apache.spark.sql.catalyst.expressions.Alias": 180 | alias = firstElem["name"] 181 | index += 1 182 | elif firstElem["num-children"] == 0: 183 | alias = attribute(firstElem) 184 | index += 1 185 | # print("sdc") 186 | if len(line) > 1: 187 | # print("Dsc") 188 | funcStr = "" 189 | while index < len(line): 190 | if line[index]["class"] == "org.apache.spark.sql.catalyst.expressions.WindowExpression": 191 | funcStr, index = windowExpression(line, index) 192 | # elif line[index]["num-children"] == 0: 193 | # funcStr = attribute(line[index]) 194 | elif line[index]["num-children"] > 0: 195 | funcStr, index = function(line, index) 196 | index += 1 197 | if funcStr != "": 198 | 199 | # print(index) 200 | if firstElem["class"] == "org.apache.spark.sql.catalyst.analysis.UnresolvedAlias": 201 | output += [funcStr] 202 | # print(funcStr) 203 | else: 204 | output += [funcStr + " AS " + alias] 205 | # print(funcStr + " AS " + alias) 206 | # else: 207 | # print(alias) 208 | # elif 209 | if len(output) == 0: 210 | return None 211 | else: 212 | return output 213 | 214 | 215 | # def findProjectList(elem, count): 216 | # output = [] 217 | # if isinstance(elem, Mapping) or isinstance(elem, list): 218 | # if "projectList" in elem: 219 | # temp = parser(elem["projectList"]) 220 | # # print("") 221 | # # if temp is not None: 222 | # output += [temp] 223 | # else: 224 | # for item in elem: 225 | # if isinstance(elem, Mapping): 226 | # temp = findProjectList(elem[item], count + 1) 227 | # else: 228 | # temp = findProjectList(item, count + 1) 229 | # if temp is not None: 230 | # output += temp 231 | # if len(output) > 0: 232 | # print("Output List",output) 233 | # print("-------------------------------") 234 | # print("-------------------------------") 235 | # if len(output) > 0: 236 | # return output 237 | # else: 238 | # return None 239 | 240 | 241 | # cnt=0 242 | # data = json.load(open('dimperson/dimperson_create6.json')) 243 | # # data= json.load(open('Yogesh (1)/dimcrm1.json'), object_pairs_hook=OrderedDict) 244 | # 245 | # tablePlan = data["run"]["facets"]["spark.logicalPlan"]["plan"] 246 | # for i in tablePlan: 247 | # # cnt+=1 248 | # # print(cnt) 249 | # result = findProjectList(i, 0) 250 | # if result is not None: 251 | # print() 252 | # print("Result=", result) 253 | # print("-------------------------------") 254 | # break 255 | # if len(result) > 0: 256 | # print(result) 257 | # print(result) 258 | 
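# ---------------------------------------------------------------------------
# Hedged usage sketch (illustrative only, not part of the original module):
# get_column_transformations() expects the "projectList" array of a Project
# node taken from the spark.logicalPlan facet of an OpenLineage event, i.e. a
# list of expression-node lists. The file name below is hypothetical.
#
# import json
# event = json.load(open("sample_openlineage_event.json"))
# for node in event["run"]["facets"]["spark.logicalPlan"]["plan"]:
#     if isinstance(node, dict) and "projectList" in node:
#         derived = get_column_transformations(node["projectList"])
#         # e.g. ['CAST(emp.salary AS INT) AS salary_int',
#         #       'row_number() OVER (PARTITION BY emp.dept ORDER BY emp.hiredate DESC) AS rn']
#         print(derived)
#         break
# ---------------------------------------------------------------------------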
-------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "name": "myblob", 6 | "type": "blobTrigger", 7 | "direction": "in", 8 | "path": "openlineage/{name}", 9 | "connection": "datalineagesynapsestrpoc_STORAGE" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "extensionBundle": { 4 | "id": "Microsoft.Azure.Functions.ExtensionBundle", 5 | "version": "[2.*, 3.0.0)" 6 | } 7 | } -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/join_parser.py: -------------------------------------------------------------------------------- 1 | 2 | def get_join_conditions(output_plan,_alias_tablenames): 3 | # print("get joins info") 4 | overall_conditions = [] 5 | mid_expressions = [] 6 | attr_expressions = [] 7 | attr_functions = [] 8 | attr_output = [] 9 | intermediate_output = [] 10 | aliases = [] 11 | joinList = [] 12 | count = 0 13 | 14 | for child in output_plan: 15 | 16 | finalout = "" 17 | if child['class'] == 'org.apache.spark.sql.catalyst.plans.logical.Join': 18 | join = (child['joinType']['object'])[:-1] 19 | joinname = join.split('.')[-1] 20 | 21 | if (joinname == "LeftOuter"): 22 | joinname = "LOJ" 23 | elif (joinname == "RightOuter"): 24 | joinname = "ROJ" 25 | elif (joinname == "Inner"): 26 | joinname = "IJ" 27 | elif (joinname == "FullOuter"): 28 | joinname = "FOJ" 29 | 30 | for i in range(len(child['condition'])): 31 | 32 | childclass = child['condition'][i]['class'] 33 | 34 | if childclass != 'org.apache.spark.sql.catalyst.expressions.And' and childclass != 'org.apache.spark.sql.catalyst.expressions.Or': 35 | if childclass != 'org.apache.spark.sql.catalyst.expressions.EqualTo': 36 | if ( 37 | childclass != 'org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute' and childclass != 'org.apache.spark.sql.catalyst.expressions.Literal'): 38 | 39 | if childclass == "org.apache.spark.sql.catalyst.analysis.UnresolvedFunction": 40 | attr_functions.append(child['condition'][i]['name']['funcName']) 41 | # this is for expressions on attribute like isnotnull OR CAST 42 | else: 43 | func = childclass.split('.')[-1] 44 | if func == 'Cast': 45 | datatype = child['condition'][i]['dataType'] 46 | func = "Cast AS " + datatype 47 | attr_expressions.append(func) 48 | 49 | else: 50 | if childclass == 'org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute': 51 | if (child['condition'][i])['nameParts'].find(",") > 0: 52 | column = ((child['condition'][i])['nameParts']).split(',')[1][1:-1] 53 | tablealias = ((child['condition'][i])['nameParts']).split(',')[0][1:] 54 | table = tablealias + "." 
+ column 55 | else: 56 | column = ((child['condition'][i])['nameParts'])[0] 57 | tablealias = "" 58 | table = column 59 | 60 | 61 | # apply functions and expressions on attribute 62 | if len(attr_functions) > 0: 63 | for func in attr_functions: 64 | table = func + "(" + table + ")" 65 | attr_functions.clear() 66 | 67 | if len(attr_expressions) > 0: 68 | for exp in attr_expressions: 69 | table = table + " " + exp 70 | attr_expressions.clear() 71 | 72 | attr_output.append(table) 73 | if tablealias: 74 | aliases.append(tablealias) 75 | else: 76 | literaldatatype = child['condition'][i]['dataType'] 77 | value = child['condition'][i]['value'] 78 | attr_output.append(value) 79 | 80 | # if we have mid expression that means we will get two unresoved attributes class 81 | if (len(mid_expressions) >= 1): 82 | count = count + 1 83 | if count == 2: 84 | intermediate_output.append( 85 | attr_output[0] + " " + mid_expressions[0] + " " + attr_output[1]) 86 | 87 | attr_output.clear() 88 | mid_expressions.clear() 89 | count = 0 90 | else: 91 | intermediate_output.append(attr_output[0]) 92 | attr_output.clear() 93 | else: 94 | if childclass.split('.')[-1] == "EqualTo": 95 | mid_expressions.append("=") 96 | else: 97 | mid_expressions.append(childclass.split('.')[-1]) 98 | else: 99 | overall_conditions.append(childclass.split('.')[-1]) 100 | 101 | if i + 1 == len(child['condition']): 102 | 103 | finalaliases = list(set(aliases)) 104 | 105 | _alias_tablenames_new = dict((k.lower(), v) for k, v in _alias_tablenames.items()) 106 | 107 | if finalaliases[0].lower() in _alias_tablenames_new.keys(): 108 | alias1 = _alias_tablenames_new[finalaliases[0].lower()] 109 | else: 110 | alias1 = "InlineQuery" 111 | if finalaliases[1].lower() in _alias_tablenames_new.keys(): 112 | alias11 = _alias_tablenames_new[finalaliases[1].lower()] 113 | else: 114 | alias11 = "InlineQuery" 115 | if len(overall_conditions) > 0 and len(intermediate_output) > 1: 116 | finalout = alias1 + " " + finalaliases[0] + " " + joinname + " " + alias11 + " " + \ 117 | finalaliases[1] + " ON " + " ".join( 118 | [x for y in zip(intermediate_output, overall_conditions + [0]) for x in y][:-1]) 119 | else: 120 | finalout = alias1 + " " + finalaliases[0] + " " + joinname + " " + alias11 + " " + \ 121 | finalaliases[1] + " ON " + intermediate_output[0] 122 | 123 | # print(finalout) 124 | joinList.append(finalout) 125 | 126 | overall_conditions.clear() 127 | intermediate_output.clear() 128 | aliases.clear() 129 | return joinList 130 | # print("exit joins info") -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/readme.md: -------------------------------------------------------------------------------- 1 | # BlobTrigger - Python 2 | 3 | The `BlobTrigger` makes it incredibly easy to react to new Blobs inside of Azure Blob Storage. This sample demonstrates a simple use case of processing data from a given Blob using Python. 4 | 5 | ## How it works 6 | 7 | For a `BlobTrigger` to work, you provide a path which dictates where the blobs are located inside your container, and can also help restrict the types of blobs you wish to return. For instance, you can set the path to `samples/{name}.png` to restrict the trigger to only the samples path and only blobs with ".png" at the end of their name. 
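For illustration only (a minimal sketch, not this repository's BlobTriggerFunction), a Python blob-trigger body paired with a `function.json` path such as `samples/{name}.png` fires only for `.png` blobs under `samples/`, and the matched blob is handed to the function as an `InputStream`:

```python
import logging

import azure.functions as func


def main(myblob: func.InputStream):
    # "myblob" must match the binding "name" in function.json; the trigger
    # only fires for blobs that match the configured path pattern.
    logging.info("Processing blob %s (%s bytes)", myblob.name, myblob.length)
```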
8 | 9 | ## Learn more 10 | 11 | Documentation -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/BlobTriggerFunction/sample.dat: -------------------------------------------------------------------------------- 1 | samples-workitems/workitem.txt -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "logging": { 4 | "applicationInsights": { 5 | "samplingSettings": { 6 | "isEnabled": true, 7 | "excludedTypes": "Request" 8 | } 9 | } 10 | }, 11 | "extensionBundle": { 12 | "id": "Microsoft.Azure.Functions.ExtensionBundle", 13 | "version": "[2.*, 3.0.0)" 14 | }, 15 | "functionTimeout": "00:10:00" 16 | } 17 | -------------------------------------------------------------------------------- /sparklin/BlobTriggerFuncApp/requirements.txt: -------------------------------------------------------------------------------- 1 | # DO NOT include azure-functions-worker in this file 2 | # The Python Worker is managed by Azure Functions platform 3 | # Manually managing azure-functions-worker may cause unexpected issues 4 | 5 | azure-data-tables 6 | azure-functions==1.7.2 7 | azure-identity==1.6.1 8 | pyapacheatlas==0.12 9 | -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/.funcignore: -------------------------------------------------------------------------------- 1 | .git* 2 | .vscode 3 | local.settings.json 4 | test 5 | .venv -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # pipenv 86 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
87 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 88 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 89 | # install all needed dependencies. 90 | #Pipfile.lock 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .env 100 | .venv 101 | env/ 102 | venv/ 103 | ENV/ 104 | env.bak/ 105 | venv.bak/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # mkdocs documentation 115 | /site 116 | 117 | # mypy 118 | .mypy_cache/ 119 | .dmypy.json 120 | dmypy.json 121 | 122 | # Pyre type checker 123 | .pyre/ 124 | 125 | # Azure Functions artifacts 126 | bin 127 | obj 128 | appsettings.json 129 | local.settings.json 130 | 131 | # Azurite artifacts 132 | __blobstorage__ 133 | __queuestorage__ 134 | __azurite_db*__.json 135 | .python_packages -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/HttpTriggerFunction/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import datetime as dt 4 | import json 5 | import azure.functions as func 6 | from azure.storage.blob import BlobClient, ContainerClient, ContentSettings 7 | from .tablestorage import tablestorage 8 | from .event import event 9 | 10 | 11 | def uploadblob(json_in, blobname, conn_str, lin_container): 12 | 13 | container_client = ContainerClient.from_connection_string(conn_str, container_name=lin_container) 14 | blob = BlobClient.from_connection_string(conn_str, container_name= lin_container, blob_name=blobname) 15 | BlobClient 16 | blob.upload_blob(json_in, overwrite=True) 17 | 18 | 19 | def main(req: func.HttpRequest) -> func.HttpResponse: 20 | 21 | logging.info("http trigger function kicked off") 22 | 23 | lineageContainerStr = os.environ["LINEAGE_STORAGE_CONN_STR"] 24 | lineageContainer = os.environ["LINEAGE_CONTAINER"] 25 | 26 | data = req.get_json() 27 | 28 | currenttimestamp = dt.datetime.utcnow().strftime("%Y%m%d%H%M%S") 29 | eventType = data["eventType"] 30 | className = data["run"]["facets"]["spark.logicalPlan"]["plan"][0]["class"] 31 | runId = data["run"]["runId"] 32 | notebookName = data["job"]["name"] 33 | notebookName = notebookName[0 : notebookName.index('.')] 34 | 35 | fileName = runId + '_' + notebookName + '_' + currenttimestamp + '.json' 36 | filePath = lineageContainer + '/' + fileName 37 | 38 | predefined_class_list = ["org.apache.spark.sql.execution.datasources.CreateTable", 39 | "org.apache.spark.sql.catalyst.plans.logical.CreateViewStatement", 40 | "org.apache.spark.sql.catalyst.plans.logical.CreateTableAsSelectStatement", 41 | "org.apache.spark.sql.catalyst.plans.logical.InsertIntoStatement", 42 | "org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand"] 43 | 44 | if eventType is not None and eventType == "COMPLETE" and className in predefined_class_list : 45 | # upload file as json into blob storage 46 | uploadblob(json.dumps(data), fileName, lineageContainerStr, lineageContainer) 47 | 48 | #code to add new unprocessed row for same uploaded json into azure table 49 | eventrow = event('HRSI', fileName) 50 | eventrow.Status = 'Unprocessed' 51 | eventrow.RetryCount = 3 52 | eventrow.FilepPath = filePath 53 | eventrow.isArchived = False 54 | eventrow.Message = '' 55 | 56 | tableStorage = tablestorage() 57 | 
tableStorage.insertEventMetadata(eventrow.__dict__) 58 | 59 | return func.HttpResponse(f"Func App successfully processed http request") 60 | 61 | else: 62 | 63 | return func.HttpResponse(f"Event Type is not COMPLETE Or ClassName Not Matched") -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/HttpTriggerFunction/event.py: -------------------------------------------------------------------------------- 1 | import azure.data.tables 2 | 3 | class event(): 4 | 5 | Status = 'Unprocessed' 6 | Message = '' 7 | RetryCount = 3 8 | FilepPath = '/openlineage/' 9 | isArchived = 0 10 | 11 | def __init__(self, teamname: str, filename: str) -> None: 12 | self.PartitionKey = teamname 13 | self.RowKey = filename 14 | -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/HttpTriggerFunction/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "authLevel": "function", 6 | "type": "httpTrigger", 7 | "direction": "in", 8 | "name": "req", 9 | "Route": "1/lineage", 10 | "methods": [ 11 | "get", 12 | "post" 13 | ] 14 | }, 15 | { 16 | "type": "http", 17 | "direction": "out", 18 | "name": "$return" 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/HttpTriggerFunction/sample.dat: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Azure" 3 | } -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/HttpTriggerFunction/tablestorage.py: -------------------------------------------------------------------------------- 1 | import os 2 | from azure.data.tables import TableServiceClient 3 | 4 | class tablestorage: 5 | 6 | def __init__(self) -> None: 7 | # connstr = os.environ["LINEAGE_STORAGE_CONN_STR"] 8 | self.connstr = "DefaultEndpointsProtocol=https;AccountName=datalineagesynapsestrpoc;AccountKey=<>;EndpointSuffix=core.windows.net" 9 | # tablename = os.environ["LINEAGE_EVENT_TABLE"] 10 | self.tablename = "EventMetadataYJ" 11 | 12 | self.table_service_client = TableServiceClient.from_connection_string(self.connstr) 13 | self.table_service_client.create_table_if_not_exists(self.tablename) 14 | 15 | 16 | def insertEventMetadata(self, eventrow) -> None: 17 | table_client = self.table_service_client.get_table_client(self.tablename) 18 | table_client.create_entity(eventrow) 19 | -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/Reqtest.http: -------------------------------------------------------------------------------- 1 | POST http://localhost:7071/api/sparkopenlineage 2 | Connect-Type: application/json 3 | 4 | { 5 | "name" : "Yogesh", 6 | "age" : 28 7 | } -------------------------------------------------------------------------------- /sparklin/HttpTriggerFuncApp/host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "logging": { 4 | "applicationInsights": { 5 | "samplingSettings": { 6 | "isEnabled": true, 7 | "excludedTypes": "Request" 8 | } 9 | } 10 | }, 11 | "extensionBundle": { 12 | "id": "Microsoft.Azure.Functions.ExtensionBundle", 13 | "version": "[3.*, 4.0.0)" 14 | } 15 | } -------------------------------------------------------------------------------- 
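The `event` and `tablestorage` helpers above write one row per captured event into the EventMetadata table; the blob-trigger app later reads those rows back. As a rough sketch of that read side (a hypothetical snippet, assuming the connection string is exposed as an app setting — the setting names differ between the two apps), unprocessed events can be listed with `azure-data-tables` like this:

```python
import os

from azure.data.tables import TableClient

# Hypothetical sketch: list events that are still waiting to be parsed.
# LINEAGE_STORAGE_CONN_STR is assumed to hold the storage connection string.
table = TableClient.from_connection_string(
    os.environ["LINEAGE_STORAGE_CONN_STR"], table_name="EventMetadata"
)

for entity in table.query_entities("Status eq 'Unprocessed'"):
    print(entity["PartitionKey"], entity["RowKey"], entity["Status"])
```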
/sparklin/HttpTriggerFuncApp/requirements.txt: -------------------------------------------------------------------------------- 1 | # DO NOT include azure-functions-worker in this file 2 | # The Python Worker is managed by Azure Functions platform 3 | # Manually managing azure-functions-worker may cause unexpected issues 4 | 5 | azure-functions -------------------------------------------------------------------------------- /sparklin/Onboarding.md: -------------------------------------------------------------------------------- 1 | Onboarding is easy: it only requires a few configurations in the Synapse Spark pool environment and the code scripts from the SparkLin branch. 2 | The overall steps look like: 3 | 4 | 1. Upload the Jar “openlineage-spark:.jar” into the Synapse Spark pool packages. 5 | 6 | 2. Add the Spark configurations related to OpenLineage in the Synapse Spark pool. 7 | 8 | 3. Create a new storage account with a blob container named openlineage to hold all uploaded JSON files, plus two Azure storage tables (EventMetadata and LineageDetails). 9 | 10 | 4. Create the Azure Function Apps and functions related to SparkLin. 11 | 12 | 5. Create the Purview collection where all lineage assets will reside. 13 | 14 | **Cluster Setup** 15 | 16 | OpenLineage integrates with Spark by implementing the SparkListener interface (SparkListenerSQLExecution, SparkListenerEvent) and collecting information about jobs that are executed inside a Spark application. 17 | 18 | To activate the listener, add the following property to your Spark configuration: 19 | 20 | • spark.extraListeners io.openlineage.spark.agent.OpenLineageSparkListener 21 | 22 | Once the listener is activated, it needs to know where to report lineage events, as well as the namespace of your jobs. Add the following additional configuration lines to your Spark configuration in the Spark pool (a local sanity-check sketch of these settings appears at the end of this document). 23 | 24 | • spark.openlineage.host {your.openlineage.host, i.e. the func app endpoint URL} 25 | 26 | • spark.openlineage.namespace {your workspace name} 27 | 28 | • spark.openlineage.url.param.code {your func app host key (default key)} 29 | 30 | • spark.openlineage.version {1 or v1, depending on the jar} 31 | 32 | 33 | **Storage Account Setup** 34 | 35 | Create a new storage account in the Azure portal using the portal wizard. 36 | 37 | • Create a new container and name it **openlineage**. 38 | 39 | 40 | **Azure Table Storage Setup** 41 | 42 | We work with two Azure storage tables: one stores all events and their processing status, and the other stores all lineage details. 43 | 44 | **Creation steps:** 45 | 46 | Open your storage account, go to **Tables**, and create two new blank tables named EventMetadata and LineageDetails. 47 | 48 | image 49 | 50 | 51 | The EventMetadata table stores information about every event emitted by OpenLineage and tracks the parsing status of each event. 52 | Once rows have been generated, the structure of the EventMetadata table looks like: 53 | 1. PartitionKey 54 | 2. RowKey 55 | 3. Timestamp 56 | 4. Status 57 | 5. RetryCount 58 | 6. FilePath 59 | 7. isArchived 60 | 8. Message 61 | 62 | The LineageDetails table stores all lineage information obtained after JSON parsing. 63 | The structure of the LineageDetails table looks like: 64 | 1. PartitionKey 65 | 2. RowKey 66 | 3. Timestamp 67 | 4. derived_columns 68 | 5. input_columns 69 | 6. input_tables 70 | 7. isdelta 71 | 8. isglobal 72 | 9. isintermediate 73 | 10. joinconditions 74 | 11. output_columns 75 | 12.
output_table 76 | 77 | Once both Azure storage tables exist, we use HTTP-triggered and blob-triggered Azure Functions to process all of the JSON files produced by OpenLineage. 78 | 79 | **Function App Setup Steps:** 80 | 81 | HTTP Trigger Function App: 82 | 83 | Create a new function app in the Azure portal with Application Insights enabled. It provides the HTTP endpoint that the Spark cluster uses to push JSON payloads. This is a C#-based function app. 84 | 85 | Deployment: 86 | 87 | Deploy the function from: https://github.com/microsoft/DataLineage/tree/main/sparklin/OpenLineage 88 | 89 | Add the following configuration settings to the function app: 90 | 91 | ConnectionString : < your new storage account connection string > 92 | 93 | ContainerName : openlineage 94 | 95 | TableName : EventMetadata 96 | 97 | **What the function does** 98 | 1. The app stores the incoming JSON payload as a file in blob storage. 99 | 2. The app inserts an entry for that JSON file into the EventMetadata table with status Unprocessed. 100 | 101 | Blob Trigger Function App: 102 | 103 | Create a new function app in the Azure portal with Application Insights enabled. It is triggered whenever the HTTP trigger function app uploads a new blob. This is a Python-based function app. 104 | 105 | Deployment: 106 | 107 | Deploy the function from: https://github.com/microsoft/DataLineage/tree/main/sparklin/BlobTriggerFuncApp 108 | 109 | Add the following configuration settings to the function app: 110 | 111 | datalineagesynapsestrpoc_STORAGE : < your new storage account connection string > 112 | 113 | StorageTableName : LineageDetails 114 | 115 | TableName : EventMetadata 116 | 117 | **What the function does** 118 | 1. The app queries the EventMetadata table and takes all records with a status of 'Unprocessed'. 119 | 2. For every event, the app reads the JSON file from blob storage, parses it with the Python code, and pushes all lineage details to the 'LineageDetails' Azure table. 120 | 3. Finally, the app updates the status of the event to 'Processed' if the lineage is pushed, or to 'Failed' if something goes wrong, and writes the exception details to the 'Message' column. 121 | 122 | **Note: Limitations** 123 | The parser was implemented based on our use cases; if new edge cases turn up, the parser needs to be enhanced. 124 | It currently supports Spark 3.1; for newer Spark versions, parser enhancements are required.
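As referenced in the Cluster Setup section above, the listener properties can also be set on a plain SparkSession builder as a quick local sanity check outside Synapse. This is a hedged sketch with placeholder values (the endpoint, workspace name, host key, and jar path are all assumptions); in Synapse the same properties belong in the Spark pool configuration, not in notebook code.

```python
from pyspark.sql import SparkSession

# Placeholder values only; in Synapse these are set on the Spark pool itself.
spark = (
    SparkSession.builder
    .appName("openlineage-config-check")
    .config("spark.jars", "/path/to/openlineage-spark-0.4.0.jar")  # jar uploaded to the pool in Synapse
    .config("spark.extraListeners", "io.openlineage.spark.agent.OpenLineageSparkListener")
    .config("spark.openlineage.host", "https://<your-func-app>.azurewebsites.net")
    .config("spark.openlineage.namespace", "<your-workspace-name>")
    .config("spark.openlineage.url.param.code", "<your-func-app-host-key>")
    .config("spark.openlineage.version", "1")
    .getOrCreate()
)
```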
125 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.32413.69 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenLineage", "OpenLineage\OpenLineage.csproj", "{41D137BF-7F46-42BB-94D6-437A05CAADDA}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Release|Any CPU = Release|Any CPU 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {41D137BF-7F46-42BB-94D6-437A05CAADDA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 15 | {41D137BF-7F46-42BB-94D6-437A05CAADDA}.Debug|Any CPU.Build.0 = Debug|Any CPU 16 | {41D137BF-7F46-42BB-94D6-437A05CAADDA}.Release|Any CPU.ActiveCfg = Release|Any CPU 17 | {41D137BF-7F46-42BB-94D6-437A05CAADDA}.Release|Any CPU.Build.0 = Release|Any CPU 18 | EndGlobalSection 19 | GlobalSection(SolutionProperties) = preSolution 20 | HideSolutionNode = FALSE 21 | EndGlobalSection 22 | GlobalSection(ExtensibilityGlobals) = postSolution 23 | SolutionGuid = {917ADBC5-E454-47E0-9AE3-850361FEE5DB} 24 | EndGlobalSection 25 | EndGlobal 26 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/CaptureLineage.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Text; 3 | using System.IO; 4 | using System.Threading.Tasks; 5 | using Microsoft.AspNetCore.Mvc; 6 | using Microsoft.Azure.WebJobs; 7 | using Microsoft.Azure.WebJobs.Extensions.Http; 8 | using Microsoft.AspNetCore.Http; 9 | using Microsoft.Extensions.Logging; 10 | using Newtonsoft.Json; 11 | using Microsoft.Azure.Storage; 12 | using Microsoft.Azure.Storage.Blob; 13 | using System.Linq; 14 | 15 | namespace OpenLineage 16 | { 17 | public static class CaptureLineage 18 | { 19 | [FunctionName("CaptureLineage")] 20 | public static async Task Run( 21 | [HttpTrigger(AuthorizationLevel.Function, "get", "post", Route = "1/lineage")] HttpRequest req, 22 | ILogger log) 23 | { 24 | log.LogInformation("C# HTTP trigger function processed a request."); 25 | 26 | string eventType = req.Query["eventType"]; 27 | 28 | string[] predefined_class_list = {"org.apache.spark.sql.execution.datasources.CreateTable", 29 | "org.apache.spark.sql.catalyst.plans.logical.CreateViewStatement", 30 | "org.apache.spark.sql.catalyst.plans.logical.CreateTableAsSelectStatement", 31 | "org.apache.spark.sql.catalyst.plans.logical.InsertIntoStatement", 32 | "org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand", 33 | "org.apache.spark.sql.catalyst.plans.logical.MergeIntoTable"}; 34 | 35 | string requestBody = await new StreamReader(req.Body).ReadToEndAsync(); 36 | dynamic data = JsonConvert.DeserializeObject(requestBody); 37 | eventType = eventType ?? 
data?.eventType; 38 | string className = data["run"]["facets"]["spark.logicalPlan"]["plan"][0]["class"]; 39 | string runId = data["run"]["runId"]; 40 | string notebookName = data["job"]["name"]; 41 | notebookName = notebookName.Substring(0, notebookName.IndexOf(".")); 42 | 43 | if (eventType != null && eventType.Equals("COMPLETE") && predefined_class_list.Contains(className)) 44 | { 45 | string connectionString = Environment.GetEnvironmentVariable("ConnectionString"); 46 | string containerName = Environment.GetEnvironmentVariable("ContainerName"); 47 | TableStorage tableStorage = new TableStorage(); 48 | 49 | string currentTimestamp = DateTime.UtcNow.ToString("yyyyMMddHHmmss"); 50 | string fileName = $"{runId}_{notebookName}_{currentTimestamp}.json"; 51 | EventMetadata eventMetadata = new EventMetadata(Utility.getQualifierName(notebookName), $"{runId}_{notebookName}_{currentTimestamp}"); 52 | eventMetadata.Status = Constant.UN_PROCESSED; 53 | eventMetadata.RetryCount = Constant.RETRY_COUNT; 54 | eventMetadata.isArchived = Constant.IS_ARCHIVE; 55 | eventMetadata.FilePath = $"{containerName}/{fileName}"; 56 | 57 | CloudStorageAccount storageAccount = CloudStorageAccount.Parse(connectionString); 58 | CloudBlobClient client = storageAccount.CreateCloudBlobClient(); 59 | CloudBlobContainer container = client.GetContainerReference(containerName); 60 | 61 | CloudBlockBlob blob = container.GetBlockBlobReference(fileName); 62 | blob.Properties.ContentType = "application/json"; 63 | using (Stream stream = new MemoryStream(Encoding.UTF8.GetBytes(requestBody))) 64 | { 65 | await blob.UploadFromStreamAsync(stream); 66 | } 67 | tableStorage.insetEventMetadata(eventMetadata); 68 | return new OkObjectResult("file uploaded successfully"); 69 | } 70 | else 71 | { 72 | return new OkObjectResult("Event Type is not COMPLETE Or ClassName Not Matched"); 73 | } 74 | } 75 | } 76 | 77 | 78 | } 79 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/Constant.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace OpenLineage 6 | { 7 | static class Constant 8 | { 9 | public const String UN_PROCESSED = "Unprocessed"; 10 | public const int RETRY_COUNT = 3; 11 | public const Boolean IS_ARCHIVE = false; 12 | 13 | public const String LEARNING = "learning"; 14 | public const String GAT = "gta"; 15 | public const String HCM = "hcm"; 16 | public const String HRSI = "hrsi"; 17 | public const String ULTP = "ultp"; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/EventMetadata.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using Microsoft.WindowsAzure.Storage.Table; 5 | 6 | namespace OpenLineage 7 | { 8 | class EventMetadata : TableEntity 9 | { 10 | public EventMetadata(String teamName, String fileName) 11 | { 12 | PartitionKey = teamName; 13 | RowKey = fileName; 14 | } 15 | 16 | public EventMetadata() { } 17 | 18 | public string Status { get; set; } 19 | public string Message { get; set; } 20 | public int RetryCount { get; set; } 21 | public String FilePath { get; set; } 22 | 23 | public Boolean isArchived { get; set; } 24 | 25 | } 26 | } 27 | --------------------------------------------------------------------------------
/sparklin/OpenLineage/OpenLineage/OpenLineage.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | netcoreapp3.1 4 | v3 5 | c5590014-6f5a-416a-9f64-80d4ed0332a8 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | PreserveNewest 16 | 17 | 18 | PreserveNewest 19 | Never 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/OpenLineage.csproj.user: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | <_LastSelectedProfileId>C:\Users\yjain\Source\Repos\HR-HRDI-TABI-DL-HRDIDataLineage\OpenLineage\OpenLineage\Properties\PublishProfiles\sparkOpenLineageRead - Zip Deploy.pubxml 5 | 6 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/TableStorage.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using Microsoft.WindowsAzure.Storage.Table; 5 | using Microsoft.Azure; 6 | using Microsoft.WindowsAzure.Storage; 7 | using Microsoft.WindowsAzure.Storage.Auth; 8 | 9 | namespace OpenLineage 10 | { 11 | class TableStorage 12 | { 13 | string connectionString = Environment.GetEnvironmentVariable("ConnectionString"); 14 | string tableName = Environment.GetEnvironmentVariable("TableName"); 15 | CloudTable table = null; 16 | 17 | public TableStorage() 18 | { 19 | createTableIfNotExist(); 20 | } 21 | 22 | public async void createTableIfNotExist() 23 | { 24 | CloudStorageAccount storageAcc = CloudStorageAccount.Parse(connectionString); 25 | CloudTableClient tblclient = storageAcc.CreateCloudTableClient(); 26 | table = tblclient.GetTableReference(tableName); 27 | await table.CreateIfNotExistsAsync(); 28 | 29 | } 30 | 31 | public async void insetEventMetadata(EventMetadata eventMetadata) 32 | { 33 | if(table==null) createTableIfNotExist(); 34 | TableOperation insertOperation = TableOperation.Insert(eventMetadata); 35 | await table.ExecuteAsync(insertOperation); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/Utility.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace OpenLineage 6 | { 7 | class Utility 8 | { 9 | public static String getQualifierName(String notebookName) 10 | { 11 | if (notebookName != null && notebookName.IndexOf(Constant.GAT, StringComparison.OrdinalIgnoreCase) >= 0) 12 | { 13 | return Constant.GAT.ToUpper(); 14 | } 15 | else if (notebookName != null && notebookName.IndexOf(Constant.HRSI, StringComparison.OrdinalIgnoreCase) >= 0) 16 | { 17 | return Constant.HRSI.ToUpper(); 18 | } 19 | else if (notebookName != null && notebookName.IndexOf(Constant.LEARNING, StringComparison.OrdinalIgnoreCase) >= 0) 20 | { 21 | return Constant.LEARNING.ToUpper(); 22 | } 23 | else if (notebookName != null && notebookName.IndexOf(Constant.HCM, StringComparison.OrdinalIgnoreCase) >= 0) 24 | { 25 | return Constant.HCM.ToUpper(); 26 | } 27 | else if (notebookName != null && notebookName.IndexOf(Constant.ULTP, StringComparison.OrdinalIgnoreCase) >= 0) 28 | { 29 | return Constant.ULTP.ToUpper(); 30 | } 31 | else 32 | { 33 | return notebookName.Substring(0, notebookName.LastIndexOf("_")).ToUpper(); 34 | } 35 | 36 | } 37 | 38 | public static void main() 39 | { 
40 | Console.WriteLine(Utility.getQualifierName("sql_test_data_lineage_pool_1650268838")); 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /sparklin/OpenLineage/OpenLineage/host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "logging": { 4 | "applicationInsights": { 5 | "samplingSettings": { 6 | "isEnabled": true, 7 | "excludedTypes": "Request" 8 | } 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /sparklin/README.md: -------------------------------------------------------------------------------- 1 | SparkLin is a Custom Parser which parses the Spark Internal Logical Execution Plan and fetches the required Attributes, entities and the Transformations/functions applied on Columns and Join Conditions on Entities. 2 | 3 | SparkLin uses OpenLineage 0.4 Version Jar which is tightly coupled with Spark 3.1 Version and provides more detailed plan. 4 | 5 | This Project uses two Function Apps which are used for Capturing the Event Json Payloads from OpenLineage into Blob Storage and read or parse the Jsons automatically . 6 | 7 | We are constantly improving the SparkLin to tackle different usecases where the Synapse Notebooks contain Complete Spark Native Code. 8 | -------------------------------------------------------------------------------- /sparklin/openlineage-spark-0.4.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DataLineage/a4390286d334ad02dccd2e10d199487ca13640c2/sparklin/openlineage-spark-0.4.0.jar -------------------------------------------------------------------------------- /tompo/Onboarding.md: -------------------------------------------------------------------------------- 1 | Please refer Onboarding document present in tompo branch 2 | -------------------------------------------------------------------------------- /tompo/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tompo/TOMPo.pbix: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DataLineage/a4390286d334ad02dccd2e10d199487ca13640c2/tompo/TOMPo.pbix -------------------------------------------------------------------------------- /tompo/TOMPo_ModelMetada.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "source": [ 6 | "%%sql\r\n", 7 | "--This notebook should be run after TOMPo_ReportParser notebook is executed as the current notebook(TOMPo_ModelMetadata) utilizes the output of TOMPo_ReportParser\r\n", 8 | "--First Create the TOMPo Database (If not exisits)\r\n", 9 | "CREATE DATABASE IF NOT EXISTS tompo" 10 | ], 11 | "outputs": [], 12 | "execution_count": null, 13 | "metadata": { 14 | "jupyter": { 15 | "source_hidden": false, 16 | "outputs_hidden": false 17 | }, 18 | "nteract": { 19 | "transient": { 20 | "deleting": false 21 | } 22 | }, 23 | "microsoft": { 24 | "language": "sparksql" 25 | }, 26 | "collapsed": false 27 | } 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "%%sql\r\n", 33 | "--Create the tables for TOMPo\r\n", 34 | "use tompo;\r\n", 35 | "DROP TABLE IF EXISTS tompo_datasources;\r\n", 36 | "DROP TABLE IF EXISTS tompo_tables;\r\n", 37 | 
"DROP TABLE IF EXISTS tompo_tablepartitions;\r\n", 38 | "DROP TABLE IF EXISTS tompo_totables;\r\n", 39 | "DROP TABLE IF EXISTS tompo_columns;\r\n", 40 | "DROP TABLE IF EXISTS tompo_measures;\r\n", 41 | "DROP TABLE IF EXISTS tompo_calcdependency;\r\n", 42 | "DROP TABLE IF EXISTS tompo_relationships;\r\n", 43 | "DROP TABLE IF EXISTS tompo_rolememberships; \r\n", 44 | "DROP TABLE IF EXISTS tompo_roles;\r\n", 45 | "DROP TABLE IF EXISTS tompo_dbschema_catalogs;\r\n", 46 | "DROP TABLE IF EXISTS tompo.tompo_reportimpact;" 47 | ], 48 | "outputs": [], 49 | "execution_count": null, 50 | "metadata": { 51 | "jupyter": { 52 | "source_hidden": false, 53 | "outputs_hidden": false 54 | }, 55 | "nteract": { 56 | "transient": { 57 | "deleting": false 58 | } 59 | }, 60 | "microsoft": { 61 | "language": "sparksql" 62 | } 63 | } 64 | }, 65 | { 66 | "cell_type": "code", 67 | "source": [ 68 | "%%spark\r\n", 69 | "//Drop\\Remove the External tables\r\n", 70 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 71 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_datasources\"))) {\r\n", 72 | " mssparkutils.fs.rm(\"/data/tompo/tompo_datasources\", recurse=true)\r\n", 73 | "}\r\n", 74 | "\r\n", 75 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 76 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_tables\"))) {\r\n", 77 | " mssparkutils.fs.rm(\"/data/tompo/tompo_tables\", recurse=true)\r\n", 78 | "}\r\n", 79 | "\r\n", 80 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 81 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_tablepartitions\"))) {\r\n", 82 | " mssparkutils.fs.rm(\"/data/tompo/tompo_tablepartitions\", recurse=true)\r\n", 83 | "}\r\n", 84 | "\r\n", 85 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 86 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_totables\"))) {\r\n", 87 | " mssparkutils.fs.rm(\"/data/tompo/tompo_totables\", recurse=true)\r\n", 88 | "}\r\n", 89 | "\r\n", 90 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 91 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_columns\"))) {\r\n", 92 | " mssparkutils.fs.rm(\"/data/tompo/tompo_columns\", recurse=true)\r\n", 93 | "}\r\n", 94 | "\r\n", 95 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 96 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_measures\"))) {\r\n", 97 | " mssparkutils.fs.rm(\"/data/tompo/tompo_measures\", recurse=true)\r\n", 98 | "}\r\n", 99 | "\r\n", 100 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 101 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_calcdependency\"))) {\r\n", 102 | " mssparkutils.fs.rm(\"/data/tompo/tompo_calcdependency\", recurse=true)\r\n", 103 | "}\r\n", 104 | "\r\n", 105 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 106 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_relationships\"))) {\r\n", 107 | " mssparkutils.fs.rm(\"/data/tompo/tompo_relationships\", recurse=true)\r\n", 108 | "}\r\n", 109 | "\r\n", 110 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 111 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_rolememberships\"))) {\r\n", 112 | " mssparkutils.fs.rm(\"/data/tompo/tompo_rolememberships\", recurse=true)\r\n", 113 | "}\r\n", 114 | "\r\n", 115 | "var fs = 
org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 116 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_roles\"))) {\r\n", 117 | " mssparkutils.fs.rm(\"/data/tompo/tompo_roles\", recurse=true)\r\n", 118 | "}\r\n", 119 | "\r\n", 120 | "\r\n", 121 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 122 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_dbschema_catalogs\"))) {\r\n", 123 | " mssparkutils.fs.rm(\"/data/tompo/tompo_dbschema_catalogs\", recurse=true)\r\n", 124 | "}\r\n", 125 | "\r\n", 126 | "var fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)\r\n", 127 | "if(fs.exists(new org.apache.hadoop.fs.Path(\"/data/tompo/tompo_reportimpact\"))) {\r\n", 128 | " mssparkutils.fs.rm(\"/data/tompo/tompo_reportimpact\", recurse=true)\r\n", 129 | "}\r\n", 130 | "\r\n" 131 | ], 132 | "outputs": [], 133 | "execution_count": null, 134 | "metadata": { 135 | "jupyter": { 136 | "source_hidden": false, 137 | "outputs_hidden": false 138 | }, 139 | "nteract": { 140 | "transient": { 141 | "deleting": false 142 | } 143 | }, 144 | "microsoft": { 145 | "language": "scala" 146 | } 147 | } 148 | }, 149 | { 150 | "cell_type": "code", 151 | "source": [ 152 | "%%sql\r\n", 153 | "--Create TOMPo tables\r\n", 154 | "use tompo;\r\n", 155 | "\r\n", 156 | "CREATE TABLE IF NOT EXISTS tompo_datasources\r\n", 157 | "( \r\n", 158 | " ModelName STRING\r\n", 159 | " , ID BIGINT\r\n", 160 | " , ModelID BIGINT\r\n", 161 | " , Name STRING \r\n", 162 | " , Description STRING\r\n", 163 | " , Type BIGINT\r\n", 164 | " , ConnectionString STRING\r\n", 165 | " , ImpersonationMode BIGINT\r\n", 166 | " , Account STRING\r\n", 167 | " , ModifiedTime TIMESTAMP\r\n", 168 | ")\r\n", 169 | "USING DELTA\r\n", 170 | "LOCATION \"/data/tompo/tompo_datasources\";\r\n", 171 | "\r\n", 172 | "CREATE TABLE IF NOT EXISTS tompo_tables\r\n", 173 | "( \r\n", 174 | " ModelName STRING\r\n", 175 | " , ID BIGINT\r\n", 176 | " , ModelID BIGINT\r\n", 177 | " , Name STRING \r\n", 178 | " , DataCategory STRING\r\n", 179 | " , Description STRING\r\n", 180 | " , IsHidden BOOLEAN\r\n", 181 | ")\r\n", 182 | "USING DELTA\r\n", 183 | "LOCATION \"/data/tompo/tompo_tables\";\r\n", 184 | "\r\n", 185 | "\r\n", 186 | "CREATE TABLE IF NOT EXISTS tompo_tablepartitions\r\n", 187 | "( \r\n", 188 | " ModelName STRING\r\n", 189 | " , ID BIGINT\r\n", 190 | " , TableID BIGINT\r\n", 191 | " , Name STRING \r\n", 192 | " , Description STRING\r\n", 193 | " , DataSourceID INT\r\n", 194 | " , QueryDefinition STRING\r\n", 195 | " , Type BIGINT\r\n", 196 | " , Mode BIGINT\r\n", 197 | " , ModifiedTime TIMESTAMP\r\n", 198 | ")\r\n", 199 | "USING DELTA\r\n", 200 | "LOCATION \"/data/tompo/tompo_tablepartitions\";\r\n", 201 | "\r\n", 202 | "CREATE TABLE IF NOT EXISTS tompo_totables\r\n", 203 | "( \r\n", 204 | " ModelName STRING \r\n", 205 | " , ID BIGINT\r\n", 206 | " , ModelID BIGINT\r\n", 207 | " , Name STRING \r\n", 208 | " , DataCategory STRING\r\n", 209 | " , Description STRING\r\n", 210 | " , IsHidden BOOLEAN\r\n", 211 | ")\r\n", 212 | "USING DELTA\r\n", 213 | "LOCATION \"/data/tompo/tompo_totables\";\r\n", 214 | "\r\n", 215 | "\r\n", 216 | "CREATE TABLE IF NOT EXISTS tompo_columns\r\n", 217 | "( \r\n", 218 | " ModelName STRING\r\n", 219 | " , ID BIGINT\r\n", 220 | " , TableID BIGINT\r\n", 221 | " , Name STRING \r\n", 222 | " , ExplicitDataType STRING\r\n", 223 | " , DataCategory STRING\r\n", 224 | " , Description BOOLEAN\r\n", 225 | " , isHidden BOOLEAN\r\n", 226 | " , isUnique BOOLEAN\r\n", 227 | 
" , isKey BOOLEAN\r\n", 228 | " , SummarizeBy BIGINT\r\n", 229 | " , ColumnStorageID BIGINT\r\n", 230 | " , Type BIGINT\r\n", 231 | " , SourceColumn STRING\r\n", 232 | " , Expression STRING\r\n", 233 | " , FormatString STRING\r\n", 234 | " , SortByColumnID BIGINT\r\n", 235 | " , AttributeHierarchyID BIGINT\r\n", 236 | " , ModifiedTime TIMESTAMP\r\n", 237 | " , StructuredModifiedTime TIMESTAMP\r\n", 238 | " , DisplayFolder STRING\r\n", 239 | ")\r\n", 240 | "USING DELTA\r\n", 241 | "LOCATION \"/data/tompo/tompo_columns\";\r\n", 242 | "\r\n", 243 | "\r\n", 244 | "CREATE TABLE IF NOT EXISTS tompo_measures\r\n", 245 | "( \r\n", 246 | " ModelName STRING\r\n", 247 | " , ID BIGINT\r\n", 248 | " , TableID BIGINT\r\n", 249 | " , Name STRING\r\n", 250 | " , Description STRING \r\n", 251 | " , DataType BIGINT\r\n", 252 | " , Expression STRING\r\n", 253 | " , FormatString STRING\r\n", 254 | " , IsHidden BOOLEAN\r\n", 255 | " , ModifiedTime TIMESTAMP\r\n", 256 | " , StructuredModifiedTime TIMESTAMP\r\n", 257 | " , KPID BIGINT\r\n", 258 | " , IsSimpleMeasure BOOLEAN\r\n", 259 | " , DisplayFolder STRING\r\n", 260 | ")\r\n", 261 | "USING DELTA\r\n", 262 | "LOCATION \"/data/tompo/tompo_measures\";\r\n", 263 | "\r\n", 264 | "CREATE TABLE IF NOT EXISTS tompo_calcdependency\r\n", 265 | "( \r\n", 266 | " ModelName STRING\r\n", 267 | " , Database_Name STRING\r\n", 268 | " , Object_Type STRING\r\n", 269 | " , Table STRING\r\n", 270 | " , Object STRING \r\n", 271 | " , Expression STRING\r\n", 272 | " , ReferenceObjectType STRING\r\n", 273 | " , ReferencedTable STRING\r\n", 274 | " , ReferencedObject STRING\r\n", 275 | " , ReferencedExpression STRING\r\n", 276 | " , Query STRING\r\n", 277 | ")\r\n", 278 | "USING DELTA\r\n", 279 | "LOCATION \"/data/tompo/tompo_calcdependency\";\r\n", 280 | "\r\n", 281 | "\r\n", 282 | "CREATE TABLE IF NOT EXISTS tompo_relationships\r\n", 283 | "( \r\n", 284 | " ModelName STRING\r\n", 285 | " , ID BIGINT\r\n", 286 | " , ModelID BIGINT\r\n", 287 | " , IsActive BIGINT\r\n", 288 | " , Type BIGINT \r\n", 289 | " , CrossFilteringBehavior STRING\r\n", 290 | " , FromTableID STRING\r\n", 291 | " , FromColumnID STRING\r\n", 292 | " , FromCardinality STRING\r\n", 293 | " , ToTableID STRING\r\n", 294 | " , ToColumnID STRING\r\n", 295 | " , ToCardinality BIGINT\r\n", 296 | " , ModifiedTime TIMESTAMP\r\n", 297 | ")\r\n", 298 | "USING DELTA\r\n", 299 | "LOCATION \"/data/tompo/tompo_relationships\";\r\n", 300 | "\r\n", 301 | "\r\n", 302 | "CREATE TABLE IF NOT EXISTS tompo_rolememberships\r\n", 303 | "( \r\n", 304 | " ModelName STRING\r\n", 305 | " , ID BIGINT\r\n", 306 | " , RoleID BIGINT\r\n", 307 | " , MemberName STRING \r\n", 308 | " , IdentityProvider STRING\r\n", 309 | " , ModifiedTime TIMESTAMP\r\n", 310 | ")\r\n", 311 | "USING DELTA\r\n", 312 | "LOCATION \"/data/tompo/tompo_rolememberships\";\r\n", 313 | "\r\n", 314 | "\r\n", 315 | "CREATE TABLE IF NOT EXISTS tompo_roles\r\n", 316 | "( \r\n", 317 | " ModelName STRING\r\n", 318 | " , ID BIGINT\r\n", 319 | " , Name STRING\r\n", 320 | " , Description STRING \r\n", 321 | " , ModelPermission INT\r\n", 322 | " , ModifiedTime TIMESTAMP\r\n", 323 | ")\r\n", 324 | "USING DELTA\r\n", 325 | "LOCATION \"/data/tompo/tompo_roles\";\r\n", 326 | "\r\n", 327 | "CREATE TABLE IF NOT EXISTS tompo_dbschema_catalogs\r\n", 328 | "(\r\n", 329 | " ModelName STRING\r\n", 330 | " , Description STRING\r\n", 331 | " , ROLES STRING\r\n", 332 | " , DATE_MODIFIED TIMESTAMP\r\n", 333 | " , COMPATIBILITY_LEVEL BIGINT\r\n", 334 | " , TYPE STRING\r\n", 335 | ")\r\n", 336 | 
"USING DELTA\r\n", 337 | "LOCATION \"/data/tompo/tompo_dbschema_catalogs\";\r\n", 338 | "\r\n", 339 | "\r\n", 340 | "CREATE TABLE IF NOT EXISTS tompo_reportimpact\r\n", 341 | "( \r\n", 342 | " ModelName STRING\r\n", 343 | " , TableName STRING\r\n", 344 | " , ObjectName STRING\r\n", 345 | " , ReportName STRING\r\n", 346 | " , PageName STRING\r\n", 347 | " , VisualType STRING\r\n", 348 | " , ObjectType STRING\r\n", 349 | ")\r\n", 350 | "USING DELTA\r\n", 351 | "LOCATION \"/data/tompo/tompo_reportimpact\";\r\n" 352 | ], 353 | "outputs": [], 354 | "execution_count": null, 355 | "metadata": { 356 | "jupyter": { 357 | "source_hidden": false, 358 | "outputs_hidden": false 359 | }, 360 | "nteract": { 361 | "transient": { 362 | "deleting": false 363 | } 364 | }, 365 | "microsoft": { 366 | "language": "sparksql" 367 | }, 368 | "collapsed": false 369 | } 370 | }, 371 | { 372 | "cell_type": "code", 373 | "source": [ 374 | "#r \"nuget: Microsoft.AnalysisServices.AdomdClient.NetCore.retail.amd64, 19.51.0\"\r\n", 375 | "\r\n", 376 | "using System.Data;\r\n", 377 | "using Microsoft.Spark.Sql;\r\n", 378 | "using Microsoft.Spark.Sql.Types;\r\n", 379 | "using T=Microsoft.Spark.Sql.Types;\r\n", 380 | "using Microsoft.AnalysisServices.AdomdClient;\r\n", 381 | "\r\n", 382 | "DataFrame RunXmlaQuery(string constr, string query, int topRows=0)\r\n", 383 | "{\r\n", 384 | " using (var con = new AdomdConnection(constr))\r\n", 385 | " {\r\n", 386 | " con.Open();\r\n", 387 | " var cmd = con.CreateCommand();\r\n", 388 | " cmd.CommandText = query;\r\n", 389 | " using (var rdr = cmd.ExecuteReader())\r\n", 390 | " {\r\n", 391 | " return GetDataFrame(rdr,topRows);\r\n", 392 | " }\r\n", 393 | "\r\n", 394 | " }\r\n", 395 | "}\r\n", 396 | "\r\n", 397 | "DataFrame GetDataFrame( IDataReader rdr, int topRows = 0)\r\n", 398 | "{\r\n", 399 | " var fields= GetFields(rdr).ToList();\r\n", 400 | " var type = new StructType(fields);\r\n", 401 | "\r\n", 402 | " //Console.WriteLine(type.SerializeToJson());\r\n", 403 | "\r\n", 404 | " return spark.CreateDataFrame(GetRows(rdr, topRows), type);\r\n", 405 | "}\r\n", 406 | "\r\n", 407 | "IEnumerable GetRows(IDataReader rdr, int topRows = 0)\r\n", 408 | "{ \r\n", 409 | " int rows = 0;\r\n", 410 | " while (rdr.Read())\r\n", 411 | " {\r\n", 412 | " rows++;\r\n", 413 | " var values = new object[rdr.FieldCount];\r\n", 414 | " rdr.GetValues(values);\r\n", 415 | "\r\n", 416 | " for (int i=0;i 0 && rows >= topRows)\r\n", 440 | " break;\r\n", 441 | "\r\n", 442 | " }\r\n", 443 | "}\r\n", 444 | "\r\n", 445 | "DataType GetSparkType(Type t)\r\n", 446 | "{\r\n", 447 | " if (t == typeof(string)) \r\n", 448 | " return new T.StringType();\r\n", 449 | " if (t == typeof(int))\r\n", 450 | " return new T.IntegerType();\r\n", 451 | " if (t == typeof(long) || t == typeof(UInt64))\r\n", 452 | " return new T.LongType();\r\n", 453 | " if (t == typeof(float))\r\n", 454 | " return new T.FloatType();\r\n", 455 | " if (t == typeof(double))\r\n", 456 | " return new T.DoubleType();\r\n", 457 | " if (t == typeof(decimal))\r\n", 458 | " return new T.DecimalType();\r\n", 459 | " if (t == typeof(DateTime) || t == typeof(DateTimeOffset))\r\n", 460 | " //return new T.TimestampType(); not yet supported\r\n", 461 | " return new T.StringType();\r\n", 462 | " if (t == typeof(Date))\r\n", 463 | " //return new T.DateType();\r\n", 464 | " return new T.StringType();\r\n", 465 | " if (t == typeof(Guid))\r\n", 466 | " return new T.StringType();\r\n", 467 | " if (t == typeof(decimal))\r\n", 468 | " return new T.DecimalType();\r\n", 469 | " 
if (t == typeof(long))\r\n", 470 | " return new T.LongType();\r\n", 471 | " if (t == typeof(short))\r\n", 472 | " return new T.ShortType();\r\n", 473 | " if (t == typeof(bool))\r\n", 474 | " return new T.BooleanType();\r\n", 475 | " if (t == typeof(byte))\r\n", 476 | " return new T.ByteType();\r\n", 477 | "\r\n", 478 | "\r\n", 479 | " throw new InvalidOperationException($\"Unsupported Type for DataFrame conversion: {t.Name}\");\r\n", 480 | "}\r\n", 481 | "\r\n", 482 | "IEnumerable GetFields(IDataReader rdr)\r\n", 483 | "{\r\n", 484 | " for (int i = 0; i < rdr.FieldCount; i++)\r\n", 485 | " {\r\n", 486 | " var type = GetSparkType(rdr.GetFieldType(i)); \r\n", 487 | " var name = rdr.GetName(i);\r\n", 488 | "\r\n", 489 | " //Console.WriteLine($\"{name} {type}\");\r\n", 490 | "\r\n", 491 | " yield return new StructField(name, type, isNullable:true);\r\n", 492 | " }\r\n", 493 | "\r\n", 494 | "}\r\n" 495 | ], 496 | "outputs": [], 497 | "execution_count": null, 498 | "metadata": { 499 | "jupyter": { 500 | "source_hidden": false, 501 | "outputs_hidden": false 502 | }, 503 | "nteract": { 504 | "transient": { 505 | "deleting": false 506 | } 507 | } 508 | } 509 | }, 510 | { 511 | "cell_type": "code", 512 | "source": [ 513 | "DataFrame df_parser_output = spark\r\n", 514 | " .Read()\r\n", 515 | " .Option(\"header\", true)\r\n", 516 | " .Option(\"inferShchema\", true)\r\n", 517 | " .Option(\"ignoreLeadingWhiteSpace\", true)\r\n", 518 | " .Option(\"ignoreTrailingWhiteSpace\", true)\r\n", 519 | " .Csv(\"/data/tompo/tompo_parseroutput/tompo_output.csv\");\r\n", 520 | "df_parser_output.CreateOrReplaceTempView(\"vwParserOutput\")" 521 | ], 522 | "outputs": [], 523 | "execution_count": null, 524 | "metadata": { 525 | "jupyter": { 526 | "source_hidden": false, 527 | "outputs_hidden": false 528 | }, 529 | "nteract": { 530 | "transient": { 531 | "deleting": false 532 | } 533 | } 534 | } 535 | }, 536 | { 537 | "cell_type": "code", 538 | "source": [ 539 | "DataFrame source = spark\r\n", 540 | " .Read()\r\n", 541 | " .Option(\"header\", true)\r\n", 542 | " .Option(\"inferShchema\", true)\r\n", 543 | " .Option(\"ignoreLeadingWhiteSpace\", true)\r\n", 544 | " .Option(\"ignoreTrailingWhiteSpace\", true)\r\n", 545 | " .Csv(\"/data/tompo/tompo_model_metadata.csv\");" 546 | ], 547 | "outputs": [], 548 | "execution_count": null, 549 | "metadata": { 550 | "jupyter": { 551 | "source_hidden": false, 552 | "outputs_hidden": false 553 | }, 554 | "nteract": { 555 | "transient": { 556 | "deleting": false 557 | } 558 | }, 559 | "microsoft": {} 560 | } 561 | }, 562 | { 563 | "cell_type": "code", 564 | "source": [ 565 | "%%pyspark\r\n", 566 | "from pyspark.sql.types import StructType,StructField,StringType,DateType\r\n", 567 | "from pyspark.sql.functions import *\r\n", 568 | "import os\r\n", 569 | "import re\r\n", 570 | "from notebookutils import mssparkutils\r\n", 571 | "\r\n", 572 | "keyvaultName = os.getenv(\"keyvaultName\") # Create a spark.yarn.appMasterEnv.keyvaultName property in Apache Spark Configuration and store the keyvalut name\r\n", 573 | "applicationId = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\") \r\n", 574 | "applicationSecret = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\")\r\n", 575 | "storageConnString = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\")\r\n", 576 | "tenantId = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\")\r\n", 577 | "\r\n", 578 | 
"paramData = [(applicationId, applicationSecret, storageConnString, tenantId)]\r\n", 579 | "schema = StructType([ \\\r\n", 580 | " StructField(\"applicationId\", StringType(), True), \\\r\n", 581 | " StructField(\"applicationSecret\", StringType(), True), \\\r\n", 582 | " StructField(\"storageConnString\", StringType(), True), \\\r\n", 583 | " StructField(\"tenantId\", StringType(), True) \\\r\n", 584 | "])\r\n", 585 | "\r\n", 586 | "df = spark.createDataFrame(spark.sparkContext.parallelize(paramData), schema)\r\n", 587 | "df.createOrReplaceTempView(\"vw_tompo_params\")\r\n" 588 | ], 589 | "outputs": [], 590 | "execution_count": null, 591 | "metadata": { 592 | "jupyter": { 593 | "source_hidden": false, 594 | "outputs_hidden": false 595 | }, 596 | "nteract": { 597 | "transient": { 598 | "deleting": false 599 | } 600 | }, 601 | "microsoft": { 602 | "language": "python" 603 | } 604 | } 605 | }, 606 | { 607 | "cell_type": "code", 608 | "source": [ 609 | "public class TompoSecret\r\n", 610 | "{\r\n", 611 | " public string applicationId { get; set; }\r\n", 612 | " public string applicationSecret { get; set; }\r\n", 613 | " public string storageConnString { get; set; }\r\n", 614 | " public string tenantId { get; set; }\r\n", 615 | "}\r\n", 616 | "\r\n", 617 | "\r\n", 618 | "TompoSecret secretObj = new TompoSecret();\r\n", 619 | "var secretsdata = spark.Sql(\"SELECT * FROM vw_tompo_params\");\r\n", 620 | "\r\n", 621 | "secretsdata.Collect().ToList().ForEach(row => {\r\n", 622 | " secretObj.applicationId = row[0].ToString();\r\n", 623 | " secretObj.applicationSecret = row[1].ToString();\r\n", 624 | " secretObj.storageConnString = row[2].ToString();\r\n", 625 | " secretObj.tenantId = row[3].ToString();\r\n", 626 | " }\r\n", 627 | ");\r\n", 628 | "\r\n", 629 | "var applicationId = secretObj.applicationId;\r\n", 630 | "var applicationSecret = secretObj.applicationSecret;\r\n", 631 | "var tenantId= secretObj.tenantId;\r\n", 632 | "\r\n", 633 | "var clientId = applicationId;\r\n", 634 | "//var tenantId = tenantId;\r\n", 635 | "var userId = $\"app:{clientId}@{tenantId}\";\r\n", 636 | "var secret = applicationSecret ; \r\n", 637 | "var xmlaEndpoint=\"\";\r\n", 638 | "var dataset=\"\";\r\n", 639 | "var constr = \"\";\r\n", 640 | "var query=\"\";\r\n", 641 | "var modelname=\"\";\r\n", 642 | "\r\n", 643 | "foreach (var obj in source.Collect())\r\n", 644 | "{\r\n", 645 | " Console.WriteLine(obj[0]);\r\n", 646 | " Console.WriteLine(obj[1]);\r\n", 647 | " Console.WriteLine(obj[2]);\r\n", 648 | " xmlaEndpoint = \"powerbi://api.powerbi.com/v1.0/myorg/\" + obj[1] ;\r\n", 649 | " dataset = obj[2]+ \"\" ;\r\n", 650 | " \r\n", 651 | " //Populate Data Sources\r\n", 652 | " query = \"SELECT [ID], ModelID, [Name], [Description], [Type], ConnectionString, ImpersonationMode, Account, ModifiedTime from $SYSTEM.TMSCHEMA_DATA_SOURCES\";\r\n", 653 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 654 | " var df = RunXmlaQuery(constr,query);\r\n", 655 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\"); \r\n", 656 | " //Insert result into table here - UNION ALL\r\n", 657 | " query=\"INSERT INTO tompo.tompo_datasources SELECT '\" + obj[2] + \"',ID, ModelID, Name, Description, Type, ConnectionString, ImpersonationMode, Account, CAST(ModifiedTime AS DATE) FROM vwTompoMetadata \";\r\n", 658 | " var res = spark.Sql(query);\r\n", 659 | " \r\n", 660 | "\r\n", 661 | " //Populate Tables\r\n", 662 | " query = \"Select [ID], [ModelID], [Name], [DataCategory], [Description], 
[IsHidden] from $SYSTEM.TMSCHEMA_TABLES\";\r\n", 663 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 664 | " df = RunXmlaQuery(constr,query);\r\n", 665 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 666 | " //Insert result into table here - UNION ALL\r\n", 667 | " query=\"INSERT INTO tompo.tompo_tables SELECT '\" + obj[2] + \"', ID, ModelID, Name, DataCategory, Description, IsHidden FROM vwTompoMetadata\";\r\n", 668 | " res = spark.Sql(query);\r\n", 669 | " \r\n", 670 | "\r\n", 671 | " //Populate Table Partitions\r\n", 672 | " query = \"Select [ID], [TableID], [Name], [Description], [DataSourceID], [QueryDefinition], [Type], [Mode], ModifiedTime from $SYSTEM.TMSCHEMA_PARTITIONS\";\r\n", 673 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 674 | " df = RunXmlaQuery(constr,query);\r\n", 675 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 676 | " //Insert result into table here - UNION ALL\r\n", 677 | " query=\"INSERT INTO tompo.tompo_tablepartitions Select '\" + obj[2] + \"', ID, TableID, Name, Description, DataSourceID, QueryDefinition, Type, Mode, CAST(ModifiedTime AS DATE) FROM vwTompoMetadata\";\r\n", 678 | " res = spark.Sql(query);\r\n", 679 | "\r\n", 680 | "\r\n", 681 | " //Populate ToTables (This is to handle relationships)\r\n", 682 | " query = \"Select [ID], [ModelID], [Name], [DataCategory], [Description], [IsHidden] from $SYSTEM.TMSCHEMA_TABLES\";\r\n", 683 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 684 | " df = RunXmlaQuery(constr,query);\r\n", 685 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 686 | " //Insert result into table here - UNION ALL\r\n", 687 | " query=\"INSERT INTO tompo.tompo_totables SELECT '\" + obj[2] + \"' , ID, ModelID, Name, DataCategory, Description, IsHidden FROM vwTompoMetadata\";\r\n", 688 | " res = spark.Sql(query);\r\n", 689 | "\r\n", 690 | "\r\n", 691 | " //Populate Columns\r\n", 692 | " query = \"Select [ID], [TableID], [ExplicitName] , [ExplicitDataType], [DataCategory], [Description], [IsHidden], [IsUnique], [IsKey], [SummarizeBy], [ColumnStorageID], [Type], [SourceColumn], [Expression], [FormatString], [SortByColumnID], [AttributeHierarchyID], [ModifiedTime], [StructureModifiedTime], [DisplayFolder] from $SYSTEM.TMSCHEMA_COLUMNS\";\r\n", 693 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 694 | " df = RunXmlaQuery(constr,query);\r\n", 695 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 696 | " //Insert result into table here - UNION ALL\r\n", 697 | " query=\"INSERT INTO tompo.tompo_columns Select '\" + obj[2] + \"', ID, TableID, ExplicitName , ExplicitDataType, DataCategory, Description, IsHidden, IsUnique, IsKey, SummarizeBy, ColumnStorageID, Type, SourceColumn, Expression, FormatString, SortByColumnID, AttributeHierarchyID, CAST(ModifiedTime AS Date), CAST(StructureModifiedTime AS Date), CAST(DisplayFolder AS Date) FROM vwTompoMetadata\";\r\n", 698 | " res = spark.Sql(query);\r\n", 699 | "\r\n", 700 | "\r\n", 701 | " //Populate Measures\r\n", 702 | " query = \"Select [ID], [TableID], [Name], [Description] , [DataType], [Expression], [FormatString], [IsHidden], [ModifiedTime], [StructureModifiedTime], [KPIID], [IsSimpleMeasure], [DisplayFolder] from $SYSTEM.TMSCHEMA_MEASURES\";\r\n", 703 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user 
id={userId};password={secret}\";\r\n", 704 | " df = RunXmlaQuery(constr,query);\r\n", 705 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 706 | " //Insert result into table here - UNION ALL\r\n", 707 | " query=\"INSERT INTO tompo.tompo_measures Select '\" + obj[2] + \"', ID, TableID, Name, Description , DataType, Expression, FormatString, IsHidden, CAST(ModifiedTime AS Date), CAST(StructureModifiedTime AS Date), KPIID, IsSimpleMeasure, DisplayFolder FROM vwTompoMetadata\";\r\n", 708 | " res = spark.Sql(query);\r\n", 709 | "\r\n", 710 | "\r\n", 711 | " //Populate CALC Dependency\r\n", 712 | " query = \"SELECT DATABASE_NAME, OBJECT_TYPE, [TABLE], OBJECT, EXPRESSION, REFERENCED_OBJECT_TYPE, REFERENCED_TABLE, REFERENCED_OBJECT, REFERENCED_EXPRESSION, QUERY FROM $System.DISCOVER_CALC_DEPENDENCY where OBJECT_TYPE='MEASURE'\";\r\n", 713 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 714 | " df = RunXmlaQuery(constr,query);\r\n", 715 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 716 | " //Insert result into table here - UNION ALL\r\n", 717 | " query=\"INSERT INTO tompo.tompo_calcdependency SELECT '\" + obj[2] + \"', DATABASE_NAME, OBJECT_TYPE, TABLE, OBJECT, EXPRESSION, REFERENCED_OBJECT_TYPE, REFERENCED_TABLE, REFERENCED_OBJECT, REFERENCED_EXPRESSION, QUERY FROM vwTompoMetadata\";\r\n", 718 | " res = spark.Sql(query);\r\n", 719 | "\r\n", 720 | "\r\n", 721 | " //Populate Relationships\r\n", 722 | " query = \"Select [ID], [ModelID], [IsActive], [Type], [CrossfilteringBehavior], [FromTableID], [FromColumnID], [FromCardinality], [ToTableID], [ToColumnID], [ToCardinality], [ModifiedTime] from $SYSTEM.TMSCHEMA_RELATIONSHIPS\";\r\n", 723 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 724 | " df = RunXmlaQuery(constr,query);\r\n", 725 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 726 | " //Insert result into table here - UNION ALL\r\n", 727 | " query=\"INSERT INTO tompo.tompo_relationships Select '\" + obj[2] + \"', ID, ModelID, IsActive, Type, CrossfilteringBehavior, FromTableID, FromColumnID, FromCardinality, ToTableID, ToColumnID, ToCardinality, CAST(ModifiedTime as Date) FROM vwTompoMetadata\";\r\n", 728 | " res = spark.Sql(query);\r\n", 729 | "\r\n", 730 | "\r\n", 731 | " //Populate Rolememberships\r\n", 732 | " query = \"select [ID], RoleID, MemberName,IdentityProvider,ModifiedTime from $System.TMSCHEMA_ROLE_MEMBERSHIPS\";\r\n", 733 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 734 | " df = RunXmlaQuery(constr,query);\r\n", 735 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 736 | " //Insert result into table here - UNION ALL\r\n", 737 | " query=\"INSERT INTO tompo.tompo_rolememberships Select '\" + obj[2] + \"',ID, RoleID, MemberName,IdentityProvider,CAST(ModifiedTime as Date) FROM vwTompoMetadata\";\r\n", 738 | " res = spark.Sql(query);\r\n", 739 | "\r\n", 740 | "\r\n", 741 | " //Populate Roles\r\n", 742 | " query = \"select [ID], [Name], [Description], ModelPermission,ModifiedTime from $System.TMSCHEMA_ROLES\";\r\n", 743 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 744 | " df = RunXmlaQuery(constr,query);\r\n", 745 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 746 | " //Insert result into table here - UNION ALL\r\n", 747 | " query=\"INSERT INTO tompo.tompo_roles select '\" + obj[2] + \"', ID, Name, 
Description, ModelPermission,CAST(ModifiedTime as Date) FROM vwTompoMetadata\";\r\n", 748 | " res = spark.Sql(query);\r\n", 749 | "\r\n", 750 | "\r\n", 751 | " //Populate Roles\r\n", 752 | " query = \"select [CATALOG_NAME], [Description], [ROLES],[DATE_MODIFIED],[COMPATIBILITY_LEVEL],[TYPE] from $SYSTEM.DBSCHEMA_CATALOGS WHERE [CATALOG_NAME]='\" + obj[2] + \"'\";\r\n", 753 | " constr = $\"Data Source={xmlaEndpoint};database={dataset};user id={userId};password={secret}\";\r\n", 754 | " df = RunXmlaQuery(constr,query);\r\n", 755 | " df.CreateOrReplaceTempView(\"vwTompoMetadata\");\r\n", 756 | " //Insert result into table here - UNION ALL\r\n", 757 | " query=\"INSERT INTO tompo.tompo_dbschema_catalogs select CATALOG_NAME, Description, ROLES,CAST(DATE_MODIFIED as Date),COMPATIBILITY_LEVEL,TYPE FROM vwTompoMetadata\";\r\n", 758 | " res = spark.Sql(query);\r\n", 759 | " \r\n", 760 | "\r\n", 761 | " //Display(df);\r\n", 762 | " Console.WriteLine(\"New Line\");\r\n", 763 | "}" 764 | ], 765 | "outputs": [], 766 | "execution_count": null, 767 | "metadata": { 768 | "jupyter": { 769 | "source_hidden": false, 770 | "outputs_hidden": false 771 | }, 772 | "nteract": { 773 | "transient": { 774 | "deleting": false 775 | } 776 | }, 777 | "microsoft": {}, 778 | "collapsed": false 779 | } 780 | }, 781 | { 782 | "cell_type": "code", 783 | "source": [ 784 | "%%sql\r\n", 785 | "USE tompo;\r\n", 786 | "INSERT INTO tompo.tompo_reportimpact\r\n", 787 | "select distinct a.ModelName, b.Name as TableName, c.Object as MeasureName, po.ReportName,po.PageName, po.VisualType, 'Measure'\r\n", 788 | "from tompo.tompo_dbschema_catalogs a\r\n", 789 | "JOIN tompo.tompo_tables b\r\n", 790 | "on a.ModelName= b.ModelName\r\n", 791 | "JOIN tompo.tompo_calcdependency c\r\n", 792 | "on concat(b.ModelName,b.Name) = concat(c.ModelName,c.ReferencedTable)\r\n", 793 | "JOIN vwParserOutput po\r\n", 794 | "on concat(c.ModelName, c.Object) = concat(po.ModelName,po.Column)\r\n", 795 | "\r\n", 796 | "UNION ALL\r\n", 797 | "\r\n", 798 | "select DISTINCT a.ModelName, b.Name as TableName, c.Name as ColumnName,po.ReportName,po.PageName, po.VisualType, 'Column'\r\n", 799 | "FROM tompo.tompo_dbschema_catalogs a\r\n", 800 | "JOIN tompo.tompo_tables b\r\n", 801 | "ON a.ModelName= b.ModelName\r\n", 802 | "JOIN tompo.tompo_columns c\r\n", 803 | "ON concat(b.ModelName, b.ID) = concat(c.ModelName,c.TableID)\r\n", 804 | "JOIN vwParserOutput po\r\n", 805 | "on concat(c.ModelName, c.Name) = concat(po.ModelName,po.Column)\r\n" 806 | ], 807 | "outputs": [], 808 | "execution_count": null, 809 | "metadata": { 810 | "jupyter": { 811 | "source_hidden": false, 812 | "outputs_hidden": false 813 | }, 814 | "nteract": { 815 | "transient": { 816 | "deleting": false 817 | } 818 | }, 819 | "microsoft": { 820 | "language": "sparksql" 821 | }, 822 | "collapsed": false 823 | } 824 | } 825 | ], 826 | "metadata": { 827 | "kernelspec": { 828 | "name": "synapse_pyspark", 829 | "language": "Python", 830 | "display_name": "Synapse PySpark" 831 | }, 832 | "language_info": { 833 | "name": "csharp" 834 | }, 835 | "kernel_info": { 836 | "name": "synapse_pyspark" 837 | }, 838 | "description": null, 839 | "save_output": true, 840 | "synapse_widget": { 841 | "version": "0.1", 842 | "state": {} 843 | } 844 | }, 845 | "nbformat": 4, 846 | "nbformat_minor": 2 847 | } -------------------------------------------------------------------------------- /tompo/TOMPo_OnboardingSteps.docx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/DataLineage/a4390286d334ad02dccd2e10d199487ca13640c2/tompo/TOMPo_OnboardingSteps.docx -------------------------------------------------------------------------------- /tompo/TOMPo_ReportParser.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "source": [ 6 | "#r \"nuget: Microsoft.PowerBI.Api, 4.10.0\"\r\n", 7 | "#r \"nuget: Microsoft.IdentityModel.Clients.ActiveDirectory, 5.2.9\"\r\n", 8 | "#r \"nuget: Azure.Identity, 1.7.0\"\r\n", 9 | "#r \"nuget: Microsoft.Rest.ClientRuntime, 2.3.24\"\r\n", 10 | "#r \"nuget: Microsoft.Azure.Storage.Blob\"\r\n", 11 | "#r \"nuget: Microsoft.Azure.Storage.Common\"\r\n", 12 | "#r \"nuget: Azure.Security.KeyVault.Secrets, 4.4.0\"\r\n", 13 | "#r \"nuget: CsvHelper\"\r\n", 14 | "#r \"nuget: microsoft.aspnetcore.mvc.core\"" 15 | ], 16 | "outputs": [], 17 | "execution_count": null, 18 | "metadata": {} 19 | }, 20 | { 21 | "cell_type": "code", 22 | "source": [ 23 | "%%csharp\r\n", 24 | "using System;\r\n", 25 | "using System.Linq;\r\n", 26 | "using System.Configuration;\r\n", 27 | "using System.IO;\r\n", 28 | "using System.Security;\r\n", 29 | "using System.Reflection;\r\n", 30 | "using Microsoft.Rest;\r\n", 31 | "using Microsoft.Identity.Client;\r\n", 32 | "using Microsoft.PowerBI.Api;\r\n", 33 | "using System.Collections.Generic;\r\n", 34 | "using Microsoft.Spark.Sql;\r\n", 35 | "using System.Collections.Generic;\r\n", 36 | "using System.Net.Http;\r\n", 37 | "using System.Net.Http.Headers;\r\n", 38 | "using Newtonsoft.Json.Linq;\r\n", 39 | "using System.IO.Compression;\r\n", 40 | "using Microsoft.Azure.Storage.Blob;\r\n", 41 | "using System.Threading;\r\n", 42 | "using System.Threading.Tasks;\r\n", 43 | "using Microsoft.Spark.Sql;\r\n", 44 | "using Microsoft.Spark.Sql.Types;\r\n", 45 | "using Microsoft.Azure.Storage.Auth;\r\n", 46 | "using Microsoft.Azure.Storage;\r\n", 47 | "using Microsoft.Extensions.Logging;\r\n", 48 | "using System.Net;\r\n", 49 | "using Newtonsoft.Json;\r\n", 50 | "using System.Globalization;\r\n", 51 | "using Newtonsoft.Json.Serialization;\r\n", 52 | "using CsvHelper;\r\n", 53 | "using Microsoft.AspNetCore.Mvc;\r\n", 54 | "using Microsoft.PowerBI.Api;\r\n", 55 | "\r\n", 56 | "private static string spnAccessToken = string.Empty;\r\n", 57 | "\r\n", 58 | "static string GetSPNAccessToken(string applicationId, string applicationSecret, string tenantSpecificAuthority) {\r\n", 59 | " if (spnAccessToken.Equals(string.Empty)) {\r\n", 60 | " var appConfidential = ConfidentialClientApplicationBuilder.Create(applicationId)\r\n", 61 | " .WithClientSecret(applicationSecret)\r\n", 62 | " .WithAuthority(tenantSpecificAuthority)\r\n", 63 | " .Build();\r\n", 64 | "\r\n", 65 | " string[] scopesDefault = new string[] { \"https://analysis.windows.net/powerbi/api/.default\" };\r\n", 66 | " var authResult = appConfidential.AcquireTokenForClient(scopesDefault).ExecuteAsync().Result;\r\n", 67 | " spnAccessToken = authResult.AccessToken;\r\n", 68 | " }\r\n", 69 | " return spnAccessToken;\r\n", 70 | " }\r\n", 71 | " \r\n", 72 | "public static PowerBIClient GetPowerBiAppOnlyClient(string applicationId\r\n", 73 | " ,string applicationSecret\r\n", 74 | " ,string tenantSpecificAuthority\r\n", 75 | " ,string urlPowerBiServiceApiRoot) {\r\n", 76 | " var tokenCredentials = new TokenCredentials(GetSPNAccessToken(applicationId, applicationSecret, tenantSpecificAuthority), \"Bearer\");\r\n", 77 | " return new PowerBIClient(new 
Uri(urlPowerBiServiceApiRoot), tokenCredentials);\r\n", 78 | "}" 79 | ], 80 | "outputs": [], 81 | "execution_count": null, 82 | "metadata": { 83 | "jupyter": { 84 | "source_hidden": false, 85 | "outputs_hidden": false 86 | }, 87 | "nteract": { 88 | "transient": { 89 | "deleting": false 90 | } 91 | }, 92 | "microsoft": { 93 | "language": "csharp" 94 | } 95 | } 96 | }, 97 | { 98 | "cell_type": "code", 99 | "source": [ 100 | "%%pyspark\r\n", 101 | "from pyspark.sql.types import StructType,StructField,StringType,DateType\r\n", 102 | "from pyspark.sql.functions import *\r\n", 103 | "import os\r\n", 104 | "import re\r\n", 105 | "from notebookutils import mssparkutils\r\n", 106 | "\r\n", 107 | "keyvaultName = os.getenv(\"keyvaultName\") # Create a spark.yarn.appMasterEnv.keyvaultName property in Apache Spark Configuration and store the keyvalut name\r\n", 108 | "applicationId = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\") \r\n", 109 | "applicationSecret = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\")\r\n", 110 | "storageConnString = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\")\r\n", 111 | "tenantId = mssparkutils.credentials.getSecret(keyvaultName,\"secret name\",\"linked service name\")\r\n", 112 | "\r\n", 113 | "\r\n", 114 | "paramData = [(applicationId, applicationSecret, storageConnString, tenantId)]\r\n", 115 | "schema = StructType([ \\\r\n", 116 | " StructField(\"applicationId\", StringType(), True), \\\r\n", 117 | " StructField(\"applicationSecret\", StringType(), True), \\\r\n", 118 | " StructField(\"storageConnString\", StringType(), True), \\\r\n", 119 | " StructField(\"tenantId\", StringType(), True) \\\r\n", 120 | "])\r\n", 121 | "\r\n", 122 | "df = spark.createDataFrame(spark.sparkContext.parallelize(paramData), schema)\r\n", 123 | "df.createOrReplaceTempView(\"vw_tompo_params\")\r\n", 124 | "\r\n", 125 | "metadatadf = spark.read.option(\"header\", True).csv(\"/data/tompo/tompo_report_metadata.csv\")\r\n", 126 | "metadatadf.createOrReplaceTempView(\"vw_report_metadata\")" 127 | ], 128 | "outputs": [], 129 | "execution_count": null, 130 | "metadata": { 131 | "jupyter": { 132 | "source_hidden": false, 133 | "outputs_hidden": false 134 | }, 135 | "nteract": { 136 | "transient": { 137 | "deleting": false 138 | } 139 | }, 140 | "microsoft": { 141 | "language": "python" 142 | } 143 | } 144 | }, 145 | { 146 | "cell_type": "code", 147 | "source": [ 148 | "%%csharp\r\n", 149 | "public class TompoSecret\r\n", 150 | "{\r\n", 151 | " public string applicationId { get; set; }\r\n", 152 | " public string applicationSecret { get; set; }\r\n", 153 | " public string storageConnString { get; set; }\r\n", 154 | " public string tenantId { get; set; }\r\n", 155 | "}" 156 | ], 157 | "outputs": [], 158 | "execution_count": null, 159 | "metadata": { 160 | "jupyter": { 161 | "source_hidden": false, 162 | "outputs_hidden": false 163 | }, 164 | "nteract": { 165 | "transient": { 166 | "deleting": false 167 | } 168 | }, 169 | "microsoft": { 170 | "language": "csharp" 171 | } 172 | } 173 | }, 174 | { 175 | "cell_type": "code", 176 | "source": [ 177 | "%%csharp\r\n", 178 | "using System.Collections.Generic;\r\n", 179 | "using System.IO;\r\n", 180 | "using System.Net.Http;\r\n", 181 | "using System.Net.Http.Headers;\r\n", 182 | "using Newtonsoft.Json.Linq;\r\n", 183 | "using System.IO.Compression;\r\n", 184 | "using Microsoft.Azure.Storage.Blob;\r\n", 185 | "using 
System.Threading;\r\n", 186 | "using System.Threading.Tasks;\r\n", 187 | "using Microsoft.Spark.Sql;\r\n", 188 | "using Microsoft.Spark.Sql.Types;\r\n", 189 | "using Microsoft.Azure.Storage.Auth;\r\n", 190 | "using Microsoft.Azure.Storage;\r\n", 191 | "using Microsoft.Extensions.Logging;\r\n", 192 | "using System.Net;\r\n", 193 | "using Newtonsoft.Json;\r\n", 194 | "using System.Globalization;\r\n", 195 | "using Newtonsoft.Json.Serialization;\r\n", 196 | "using CsvHelper;\r\n", 197 | "using Microsoft.AspNetCore.Mvc;\r\n", 198 | "using Microsoft.PowerBI.Api;\r\n", 199 | "\r\n", 200 | "public const string urlPowerBiServiceApiRoot = \"https://api.powerbi.com/\";\r\n", 201 | "private static string workspaceId = \"\";\r\n", 202 | "private static string reportId = \"\";\r\n", 203 | "private static string reportName = \"\";\r\n", 204 | "private static string applicationId = \"\";\r\n", 205 | "private static string applicationSecret = \"\";\r\n", 206 | "private static string tenantSpecificAuthority = \"\";\r\n", 207 | "private static string storageConnString = \"\";\r\n", 208 | "\r\n", 209 | "\r\n", 210 | "TompoSecret secretObj = new TompoSecret();\r\n", 211 | "var secretsdata = spark.Sql(\"SELECT * FROM vw_tompo_params\");\r\n", 212 | "\r\n", 213 | "secretsdata.Collect().ToList().ForEach(row => {\r\n", 214 | " secretObj.applicationId = row[0].ToString();\r\n", 215 | " secretObj.applicationSecret = row[1].ToString();\r\n", 216 | " secretObj.storageConnString = row[2].ToString();\r\n", 217 | " secretObj.tenantId = row[3].ToString();\r\n", 218 | " }\r\n", 219 | ");\r\n", 220 | "\r\n", 221 | "applicationId = secretObj.applicationId;\r\n", 222 | "applicationSecret = secretObj.applicationSecret;\r\n", 223 | "tenantSpecificAuthority = \"https://login.microsoftonline.com/\" + secretObj.tenantId;\r\n", 224 | "storageConnString = secretObj.storageConnString;\r\n", 225 | "\r\n", 226 | "PowerBIClient pbiClient = GetPowerBiAppOnlyClient(applicationId, applicationSecret, tenantSpecificAuthority, urlPowerBiServiceApiRoot);\r\n", 227 | "var storageAccount = CloudStorageAccount.Parse(storageConnString);\r\n", 228 | "var blobClient = storageAccount.CreateCloudBlobClient();\r\n", 229 | "var container = blobClient.GetContainerReference(\"hrsisynapsefs\");\r\n", 230 | "\r\n", 231 | "var metadata = spark.Sql(\"SELECT * FROM vw_report_metadata where isActive=1\");\r\n", 232 | "\r\n", 233 | "metadata.Collect().ToList().ForEach(row => {\r\n", 234 | " var workspaceName = row[0].ToString();\r\n", 235 | " var workspaceId = row[1].ToString();\r\n", 236 | " var reportName = row[2].ToString();\r\n", 237 | " var reportId = row[3].ToString();\r\n", 238 | " var modelName = row[4].ToString();\r\n", 239 | " \r\n", 240 | " var reportStream = pbiClient.Reports.ExportReport(new Guid(workspaceId), new Guid(reportId));\r\n", 241 | " var blob = container.GetBlockBlobReference(\"data/tompo/tompo_layout/\" + reportName + \".pbix\");\r\n", 242 | " blob.Properties.ContentType = \"multipart/form-data\";\r\n", 243 | " blob.UploadFromStream(reportStream);\r\n", 244 | " reportStream.Close();\r\n", 245 | " Console.WriteLine(reportName + \" Report has been downloaded successfully\");\r\n", 246 | "\r\n", 247 | " }\r\n", 248 | ");" 249 | ], 250 | "outputs": [], 251 | "execution_count": null, 252 | "metadata": { 253 | "jupyter": { 254 | "source_hidden": false, 255 | "outputs_hidden": false 256 | }, 257 | "nteract": { 258 | "transient": { 259 | "deleting": false 260 | } 261 | }, 262 | "microsoft": { 263 | "language": "csharp" 264 | } 265 | } 266 | },
267 | { 268 | "cell_type": "code", 269 | "source": [ 270 | "%%pyspark\r\n", 271 | "from zipfile import ZipFile\r\n", 272 | "import os\r\n", 273 | "import shutil\r\n", 274 | "import pandas as pd\r\n", 275 | "import json\r\n", 276 | "from os.path import exists\r\n", 277 | "\r\n", 278 | "hrsiBasePath = os.getenv(\"hrsiBasePath\")\r\n", 279 | "\r\n", 280 | " #uncomment below block if running for first time, need to ad exists code\r\n", 281 | "#mssparkutils.fs.unmount(\"/hrsisynapsefs_temp\") \r\n", 282 | "#mssparkutils.fs.mount(hrsiBasePath, \r\n", 283 | "#\t\t\t\t\t\"/hrsisynapsefs_temp\",\r\n", 284 | "#\t\t\t\t\t{\"linkedService\":\"HRBIADLS\"}\r\n", 285 | "#\t\t\t\t\t)\r\n", 286 | "\r\n", 287 | "synpasefspath = \"/synfs/\" + mssparkutils.env.getJobId() + \"/hrsisynapsefs_temp\"\r\n", 288 | "\r\n", 289 | "outfilepath = synpasefspath + \"/data/tompo/tompo_parseroutput/tompo_output.csv\"\r\n", 290 | "\r\n", 291 | "file_exists = exists(outfilepath)\r\n", 292 | "\r\n", 293 | "if file_exists:\r\n", 294 | " print(\"output file exists\")\r\n", 295 | "else:\r\n", 296 | " outfiledf = pd.DataFrame(columns = ['WorkspaceName', 'ReportName', 'PageName', 'VisualType', 'Column', 'ModelName', 'LastRefreshedOn'])\r\n", 297 | " outfiledf.to_csv(outfilepath, index=False)\r\n", 298 | " print(\"created empty one time output file\")\r\n", 299 | "\r\n", 300 | "\r\n", 301 | "metadatadfdata = spark.sql(\"SELECT * FROM vw_report_metadata where isActive=1\")\r\n", 302 | "\r\n", 303 | "df = pd.DataFrame(columns = ['WorkspaceName', 'ReportName', 'PageName', 'VisualType', 'Column', 'ModelName', 'LastRefreshedOn'])\r\n", 304 | "\r\n", 305 | "for row in metadatadfdata.collect():\r\n", 306 | " print(\"Working on file: \" + row[\"ReportName\"])\r\n", 307 | " \r\n", 308 | " f = ZipFile(synpasefspath + \"/data/tompo/tompo_layout/\" + row[\"ReportName\"] + \".pbix\", 'r')\r\n", 309 | " f.extractall(synpasefspath + \"/data/tompo/tompo_layout/_temp\" + row[\"ReportName\"])\r\n", 310 | " shutil.copyfile(synpasefspath + \"/data/tompo/tompo_layout/_temp\" + row[\"ReportName\"] + \"/Report/Layout\", synpasefspath + \"/data/tompo/tompo_layout/\" + row[\"ReportName\"] + \"_Layout\")\r\n", 311 | " shutil.rmtree(synpasefspath + \"/data/tompo/tompo_layout/_temp\" + row[\"ReportName\"])\r\n", 312 | " print(\"Retreived layout file for report \" + row[\"ReportName\"] + \".pbix\")\r\n", 313 | "\r\n", 314 | " # code from here is to parse layout file and append in csv for POWER BI reporting\r\n", 315 | " with open(synpasefspath + \"/data/tompo/tompo_layout/\" + row[2] + \"_Layout\", encoding=\"utf-16le\", errors=\"backslashreplace\") as file:\r\n", 316 | " data = file.read().strip()\r\n", 317 | "\r\n", 318 | " layoutdata = json.loads(data)\r\n", 319 | "\r\n", 320 | " outputlist = []\r\n", 321 | "\r\n", 322 | " for section in layoutdata['sections']:\r\n", 323 | "\r\n", 324 | " tabname = section['displayName']\r\n", 325 | "\r\n", 326 | " for container in section['visualContainers']:\r\n", 327 | " # if container['id'] != 0 :\r\n", 328 | " configdict = json.loads(container['config'])\r\n", 329 | "\r\n", 330 | " if 'singleVisual' in configdict:\r\n", 331 | "\r\n", 332 | " visualtype = configdict['singleVisual']['visualType']\r\n", 333 | "\r\n", 334 | " if 'projections' in configdict['singleVisual']:\r\n", 335 | "\r\n", 336 | " for key in configdict['singleVisual']['projections']:\r\n", 337 | " for query in configdict['singleVisual']['projections'][key]:\r\n", 338 | "\r\n", 339 | " data = query['queryRef'].split('.')[-1]\r\n", 340 | " 
outputlist.append(row[\"WorkspaceName\"])\r\n", 341 | " outputlist.append(row[\"ReportName\"])\r\n", 342 | " outputlist.append(tabname)\r\n", 343 | " outputlist.append(visualtype)\r\n", 344 | " outputlist.append(data)\r\n", 345 | " outputlist.append(row[\"ModelName\"])\r\n", 346 | " outputlist.append(str(pd.to_datetime('now').date()))\r\n", 347 | " \r\n", 348 | " df.loc[len(df)] = outputlist\r\n", 349 | " outputlist.clear()\r\n", 350 | "\r\n", 351 | " outputdf = pd.read_csv(outfilepath, header='infer')\r\n", 352 | "\r\n", 353 | " filterval = row[\"WorkspaceName\"].strip() + row[\"ReportName\"].strip()\r\n", 354 | " outfiltereddf = outputdf[ (outputdf[\"WorkspaceName\"]+outputdf[\"ReportName\"] != filterval ) ]\r\n", 355 | "\r\n", 356 | " finaldf = pd.concat([outfiltereddf, df])\r\n", 357 | " \r\n", 358 | " finaldf.to_csv(outfilepath, index=False)\r\n", 359 | " df = df.iloc[:0]\r\n", 360 | " print(\"Layout data is added to final csv for file: \" + row[\"ReportName\"])" 361 | ], 362 | "outputs": [], 363 | "execution_count": null, 364 | "metadata": { 365 | "jupyter": { 366 | "source_hidden": false, 367 | "outputs_hidden": false 368 | }, 369 | "nteract": { 370 | "transient": { 371 | "deleting": false 372 | } 373 | }, 374 | "microsoft": { 375 | "language": "python" 376 | }, 377 | "collapsed": false 378 | } 379 | } 380 | ], 381 | "metadata": { 382 | "kernelspec": { 383 | "name": "synapse_pyspark", 384 | "language": "Python", 385 | "display_name": "Synapse PySpark" 386 | }, 387 | "language_info": { 388 | "name": "csharp" 389 | }, 390 | "kernel_info": { 391 | "name": "synapse_pyspark" 392 | }, 393 | "description": null, 394 | "save_output": true, 395 | "synapse_widget": { 396 | "version": "0.1", 397 | "state": {} 398 | } 399 | }, 400 | "nbformat": 4, 401 | "nbformat_minor": 2 402 | } --------------------------------------------------------------------------------
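The layout-parsing cell in TOMPo_ReportParser.ipynb reads the `Report/Layout` entry of each exported `.pbix` (UTF-16 LE JSON) and collects one row per visual field by walking `sections → visualContainers → config → singleVisual → projections → queryRef`. The sketch below restates that traversal as a small standalone Python function for local testing; the function name `parse_pbix_layout` and the choice to read the Layout entry straight from the zip (rather than extracting and copying it first, as the notebook does) are illustrative assumptions, not code from the repository.

```python
import json
from zipfile import ZipFile


def parse_pbix_layout(pbix_path):
    """Illustrative sketch: yield (page, visual_type, field) tuples from a local .pbix.

    Mirrors the traversal in TOMPo_ReportParser.ipynb: Report/Layout is UTF-16 LE JSON,
    and each visual's config is itself a JSON string whose singleVisual.projections
    buckets carry queryRef values such as "Table.Column".
    """
    with ZipFile(pbix_path) as pbix:
        raw = pbix.read("Report/Layout").decode("utf-16-le", errors="backslashreplace")
    layout = json.loads(raw.lstrip("\ufeff").strip())

    for section in layout.get("sections", []):
        page_name = section.get("displayName")
        for container in section.get("visualContainers", []):
            config = json.loads(container.get("config", "{}"))
            visual = config.get("singleVisual")
            if not visual:
                continue
            visual_type = visual.get("visualType")
            for bucket in visual.get("projections", {}).values():
                for projection in bucket:
                    # Keep only the column/measure name after the last dot, as the notebook does.
                    yield page_name, visual_type, projection["queryRef"].split(".")[-1]
```

For example, `list(parse_pbix_layout("MyReport.pbix"))` returns the same page/visual/field triples the notebook appends to `tompo_output.csv`, minus the workspace, model, and refresh-date columns it adds from `vw_report_metadata`.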
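The export cell in TOMPo_ReportParser.ipynb acquires an app-only token with MSAL and downloads each report through the .NET `PowerBIClient` before uploading it to blob storage. For readers working outside Synapse/.NET, here is a minimal Python sketch of the same two steps using the `msal` and `requests` packages and the public "Export Report In Group" REST endpoint; the placeholder values and the helper names `get_spn_access_token` / `export_report_pbix` are assumptions for illustration, and the blob-upload step is omitted.

```python
import msal
import requests

# Placeholders for illustration only; the notebook reads these secrets from Key Vault
# via mssparkutils and the vw_tompo_params temp view instead of hard-coding them.
TENANT_ID = "<tenant-id>"
APPLICATION_ID = "<service-principal-client-id>"
APPLICATION_SECRET = "<service-principal-secret>"
PBI_SCOPE = ["https://analysis.windows.net/powerbi/api/.default"]
PBI_API_ROOT = "https://api.powerbi.com/v1.0/myorg"


def get_spn_access_token():
    """Client-credentials (app-only) token for the Power BI REST API."""
    app = msal.ConfidentialClientApplication(
        APPLICATION_ID,
        authority=f"https://login.microsoftonline.com/{TENANT_ID}",
        client_credential=APPLICATION_SECRET,
    )
    result = app.acquire_token_for_client(scopes=PBI_SCOPE)
    return result["access_token"]


def export_report_pbix(workspace_id, report_id, out_path):
    """Download one report as .pbix, analogous to pbiClient.Reports.ExportReport in the notebook."""
    url = f"{PBI_API_ROOT}/groups/{workspace_id}/reports/{report_id}/Export"
    response = requests.get(url, headers={"Authorization": f"Bearer {get_spn_access_token()}"})
    response.raise_for_status()
    with open(out_path, "wb") as f:
        f.write(response.content)
```

As with the .NET path, the service principal must be allowed to use Power BI APIs in the tenant settings and must have access to the target workspace for the export call to succeed.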