├── .chglog ├── CHANGELOG.tpl.md └── config.yml ├── .circleci └── config.yml ├── .coveragerc ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── MANIFEST.in ├── README.md ├── api_example.ipynb ├── data_lineage ├── __init__.py ├── __main__.py ├── assets │ └── favicon.ico ├── graph.py ├── parser │ ├── __init__.py │ ├── binder.py │ ├── dml_visitor.py │ └── visitor.py ├── server.py └── worker.py ├── docker ├── Dockerfile ├── build_image.sh └── docker-entrypoint.sh ├── example.ipynb ├── full_graph.png ├── install-manifests ├── docker-compose │ ├── catalog-demo.yml │ ├── tokern-lineage-engine.yml │ └── wikimedia-demo.yml └── dockerfiles │ ├── Dockerfile-demo-catalog │ ├── Dockerfile-demo-wikimedia │ ├── Makefile │ ├── demo-catalog.sql │ └── demo-wikimedia.sql ├── one_task.png ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── setup.cfg └── test ├── catalog.json ├── conftest.py ├── queries.json ├── test_data_lineage.py ├── test_db_graph.py ├── test_dml_visitor.py ├── test_scan.py └── test_server.py /.chglog/CHANGELOG.tpl.md: -------------------------------------------------------------------------------- 1 | {{ range .Versions }} 2 | 3 | ## {{ if .Tag.Previous }}[{{ .Tag.Name }}]({{ $.Info.RepositoryURL }}/compare/{{ .Tag.Previous.Name }}...{{ .Tag.Name }}){{ else }}{{ .Tag.Name }}{{ end }} ({{ datetime "2006-01-02" .Tag.Date }}) 4 | 5 | {{ range .CommitGroups -}} 6 | ### {{ .Title }} 7 | 8 | {{ range .Commits -}} 9 | * {{ .Subject }} 10 | {{ end }} 11 | {{ end -}} 12 | 13 | {{- if .RevertCommits -}} 14 | ### Reverts 15 | 16 | {{ range .RevertCommits -}} 17 | * {{ .Revert.Header }} 18 | {{ end }} 19 | {{ end -}} 20 | 21 | {{- if .NoteGroups -}} 22 | {{ range .NoteGroups -}} 23 | ### {{ .Title }} 24 | 25 | {{ range .Notes }} 26 | {{ .Body }} 27 | {{ end }} 28 | {{ end -}} 29 | {{ end -}} 30 | {{ end -}} -------------------------------------------------------------------------------- /.chglog/config.yml: -------------------------------------------------------------------------------- 1 | style: github 2 | template: CHANGELOG.tpl.md 3 | info: 4 | title: CHANGELOG 5 | repository_url: https://github.com/tokern/data-lineage 6 | options: 7 | commits: 8 | # filters: 9 | # Type: 10 | # - feat 11 | # - fix 12 | # - perf 13 | # - refactor 14 | commit_groups: 15 | # title_maps: 16 | # feat: Features 17 | # fix: Bug Fixes 18 | # perf: Performance Improvements 19 | # refactor: Code Refactoring 20 | header: 21 | pattern: "^(\\w*)\\:\\s(.*)$" 22 | pattern_maps: 23 | - Type 24 | - Subject 25 | notes: 26 | keywords: 27 | - BREAKING CHANGE -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2.1 6 | orbs: 7 | codecov: codecov/codecov@1.0.5 8 | python: circleci/python@1.4.0 9 | workflows: 10 | build_and_deploy: 11 | jobs: 12 | - build: 13 | filters: 14 | tags: 15 | only: /.*/ 16 | - deploy: 17 | requires: 18 | - build 19 | filters: 20 | tags: 21 | only: /v[0-9]+(\.[0-9]+)*/ 22 | branches: 23 | ignore: /.*/ 24 | 25 | jobs: 26 | build: &test-template 27 | docker: 28 | - image: circleci/python:3.8.3 29 | environment: 30 | PIPENV_VENV_IN_PROJECT: true 31 | # Specify service dependencies here if necessary 32 | # CircleCI maintains a library of pre-built images 33 | # documented at https://circleci.com/docs/2.0/circleci-images/ 34 | - 
image: circleci/postgres:12.0-alpine-ram 35 | environment: 36 | POSTGRES_USER: piiuser 37 | POSTGRES_PASSWORD: p11secret 38 | POSTGRES_DB: piidb 39 | 40 | - image: circleci/mysql:8.0.18-ram 41 | environment: 42 | MYSQL_USER: piiuser 43 | MYSQL_PASSWORD: p11secret 44 | MYSQL_DATABASE: piidb 45 | MYSQL_ROOT_PASSWORD: r00tPa33w0rd 46 | environment: 47 | PYVERSION: "3.8.3" 48 | working_directory: ~/repo 49 | 50 | steps: 51 | - checkout 52 | 53 | - run: 54 | name: install dockerize 55 | command: wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 56 | environment: 57 | DOCKERIZE_VERSION: v0.3.0 58 | 59 | - run: 60 | name: Wait for db 61 | command: | 62 | dockerize -wait tcp://localhost:5432 -timeout 1m 63 | dockerize -wait tcp://localhost:3306 -timeout 1m 64 | 65 | - python/install-packages: 66 | pkg-manager: poetry 67 | include-python-in-cache-key: false 68 | include-branch-in-cache-key: false 69 | 70 | # run tests! 71 | - run: 72 | name: run tests 73 | command: | 74 | poetry run isort --check --diff . 75 | poetry run black --check . 76 | poetry run flake8 data_lineage test 77 | poetry run pytest --junitxml=junit/test-results.xml --cov=data_lineage --cov-report=xml --cov-report=html test/ 78 | 79 | - store_test_results: # Upload test results for display in Test Summary: https://circleci.com/docs/2.0/collect-test-data/ 80 | path: test-results 81 | 82 | - store_artifacts: 83 | path: test-reports 84 | destination: test-reports 85 | 86 | - codecov/upload: 87 | file: coverage.xml 88 | 89 | deploy: 90 | environment: 91 | PYVERSION: "3.8.11" 92 | docker: 93 | - image: tokern/python:3.8.11-buster 94 | environment: 95 | PYVERSION: "3.8.11" 96 | steps: 97 | - checkout 98 | - python/install-packages: 99 | pkg-manager: poetry 100 | include-python-in-cache-key: false 101 | include-branch-in-cache-key: false 102 | 103 | - run: 104 | name: create packages 105 | command: | 106 | poetry publish --build --username "${PYPI_USERNAME}" --password "${PYPI_PASSWORD}" 107 | 108 | - run: 109 | name: install git release utilities 110 | command: | 111 | go get github.com/aktau/github-release 112 | GO111MODULE=on go get -u github.com/git-chglog/git-chglog/cmd/git-chglog 113 | 114 | - run: 115 | name: release 116 | command: | 117 | ~/go/bin/git-chglog $CIRCLE_TAG | ~/go/bin/github-release release --description - --tag $CIRCLE_TAG 118 | 119 | - setup_remote_docker 120 | 121 | - run: 122 | name: build docker and publish 123 | command: | 124 | ./docker/build_image.sh $CIRCLE_TAG --publish --latest 125 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = */data_lineage/* 4 | 5 | [report] 6 | exclude_lines = 7 | if self.debug: 8 | pragma: no cover 9 | raise NotImplementedError 10 | if __name__ == .__main__.: 11 | ignore_errors = True 12 | omit = 13 | test/* 14 | setup.py 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/python,pycharm 2 | # Edit at https://www.gitignore.io/?templates=python,pycharm 3 | 4 | ### PyCharm ### 5 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, 
AppCode, PyCharm, CLion, Android Studio and WebStorm 6 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 7 | 8 | .idea 9 | 10 | # User-specific stuff 11 | .idea/**/workspace.xml 12 | .idea/**/tasks.xml 13 | .idea/**/usage.statistics.xml 14 | .idea/**/dictionaries 15 | .idea/**/shelf 16 | 17 | # Generated files 18 | .idea/**/contentModel.xml 19 | 20 | # Sensitive or high-churn files 21 | .idea/**/dataSources/ 22 | .idea/**/dataSources.ids 23 | .idea/**/dataSources.local.xml 24 | .idea/**/sqlDataSources.xml 25 | .idea/**/dynamic.xml 26 | .idea/**/uiDesigner.xml 27 | .idea/**/dbnavigator.xml 28 | 29 | # Gradle 30 | .idea/**/gradle.xml 31 | .idea/**/libraries 32 | 33 | # Gradle and Maven with auto-import 34 | # When using Gradle or Maven with auto-import, you should exclude module files, 35 | # since they will be recreated, and may cause churn. Uncomment if using 36 | # auto-import. 37 | # .idea/modules.xml 38 | # .idea/*.iml 39 | # .idea/modules 40 | # *.iml 41 | # *.ipr 42 | 43 | # CMake 44 | cmake-build-*/ 45 | 46 | # Mongo Explorer plugin 47 | .idea/**/mongoSettings.xml 48 | 49 | # File-based project format 50 | *.iws 51 | 52 | # IntelliJ 53 | out/ 54 | 55 | # mpeltonen/sbt-idea plugin 56 | .idea_modules/ 57 | 58 | # JIRA plugin 59 | atlassian-ide-plugin.xml 60 | 61 | # Cursive Clojure plugin 62 | .idea/replstate.xml 63 | 64 | # Crashlytics plugin (for Android Studio and IntelliJ) 65 | com_crashlytics_export_strings.xml 66 | crashlytics.properties 67 | crashlytics-build.properties 68 | fabric.properties 69 | 70 | # Editor-based Rest Client 71 | .idea/httpRequests 72 | 73 | # Android studio 3.1+ serialized cache file 74 | .idea/caches/build_file_checksums.ser 75 | 76 | ### PyCharm Patch ### 77 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 78 | 79 | # *.iml 80 | # modules.xml 81 | # .idea/misc.xml 82 | # *.ipr 83 | 84 | # Sonarlint plugin 85 | .idea/**/sonarlint/ 86 | 87 | # SonarQube Plugin 88 | .idea/**/sonarIssues.xml 89 | 90 | # Markdown Navigator plugin 91 | .idea/**/markdown-navigator.xml 92 | .idea/**/markdown-navigator/ 93 | 94 | ### Python ### 95 | # Byte-compiled / optimized / DLL files 96 | __pycache__/ 97 | *.py[cod] 98 | *$py.class 99 | 100 | # C extensions 101 | *.so 102 | 103 | # Distribution / packaging 104 | .Python 105 | build/ 106 | develop-eggs/ 107 | dist/ 108 | downloads/ 109 | eggs/ 110 | .eggs/ 111 | lib/ 112 | lib64/ 113 | parts/ 114 | sdist/ 115 | var/ 116 | wheels/ 117 | pip-wheel-metadata/ 118 | share/python-wheels/ 119 | *.egg-info/ 120 | .installed.cfg 121 | *.egg 122 | MANIFEST 123 | 124 | # PyInstaller 125 | # Usually these files are written by a python script from a template 126 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 127 | *.manifest 128 | *.spec 129 | 130 | # Installer logs 131 | pip-log.txt 132 | pip-delete-this-directory.txt 133 | 134 | # Unit test / coverage reports 135 | htmlcov/ 136 | .tox/ 137 | .nox/ 138 | .coverage 139 | .coverage.* 140 | .cache 141 | nosetests.xml 142 | coverage.xml 143 | *.cover 144 | .hypothesis/ 145 | .pytest_cache/ 146 | 147 | # Translations 148 | *.mo 149 | *.pot 150 | 151 | # Scrapy stuff: 152 | .scrapy 153 | 154 | # Sphinx documentation 155 | docs/_build/ 156 | 157 | # PyBuilder 158 | target/ 159 | 160 | # pyenv 161 | .python-version 162 | 163 | # pipenv 164 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
165 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 166 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 167 | # install all needed dependencies. 168 | #Pipfile.lock 169 | 170 | # celery beat schedule file 171 | celerybeat-schedule 172 | 173 | # SageMath parsed files 174 | *.sage.py 175 | 176 | # Spyder project settings 177 | .spyderproject 178 | .spyproject 179 | 180 | # Rope project settings 181 | .ropeproject 182 | 183 | # Mr Developer 184 | .mr.developer.cfg 185 | .project 186 | .pydevproject 187 | 188 | # mkdocs documentation 189 | /site 190 | 191 | # mypy 192 | .mypy_cache/ 193 | .dmypy.json 194 | dmypy.json 195 | 196 | # Pyre type checker 197 | .pyre/ 198 | 199 | junit/ 200 | 201 | .ipynb_checkpoints/ 202 | 203 | # End of https://www.gitignore.io/api/python,pycharm 204 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: isort 5 | name: isort 6 | stages: [commit] 7 | language: system 8 | entry: poetry run isort 9 | types: [python] 10 | 11 | - id: black 12 | name: black 13 | stages: [commit] 14 | language: system 15 | entry: poetry run black 16 | types: [python] 17 | 18 | - id: mypy 19 | name: mypy 20 | stages: [commit] 21 | language: system 22 | entry: poetry run mypy 23 | types: [python] 24 | pass_filenames: false 25 | 26 | 27 | - id: flake8 28 | name: flake8 29 | stages: [commit] 30 | language: system 31 | entry: poetry run flake8 32 | types: [python] 33 | exclude: setup.py 34 | 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Tokern 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include data_lineage/assets/* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tokern Lineage Engine 2 | 3 | [![CircleCI](https://circleci.com/gh/tokern/data-lineage.svg?style=svg)](https://circleci.com/gh/tokern/data-lineage) 4 | [![codecov](https://codecov.io/gh/tokern/data-lineage/branch/master/graph/badge.svg)](https://codecov.io/gh/tokern/data-lineage) 5 | [![PyPI](https://img.shields.io/pypi/v/data-lineage.svg)](https://pypi.python.org/pypi/data-lineage) 6 | [![image](https://img.shields.io/pypi/l/data-lineage.svg)](https://pypi.org/project/data-lineage/) 7 | [![image](https://img.shields.io/pypi/pyversions/data-lineage.svg)](https://pypi.org/project/data-lineage/) 8 | 9 | 10 | Tokern Lineage Engine is _fast_ and _easy to use_ application to collect, visualize and analyze 11 | column-level data lineage in databases, data warehouses and data lakes in AWS and RDS. 12 | 13 | Tokern Lineage helps you browse column-level data lineage 14 | * visually using [kedro-viz](https://github.com/quantumblacklabs/kedro-viz) 15 | * analyze lineage graphs programmatically using the powerful [networkx graph library](https://networkx.org/) 16 | 17 | ## Resources 18 | 19 | * Demo of Tokern Lineage App 20 | 21 | ![data-lineage](https://user-images.githubusercontent.com/1638298/118261607-688a7100-b4d1-11eb-923a-5d2407d6bd8d.gif) 22 | 23 | * Checkout an [example data lineage notebook](http://tokern.io/docs/data-lineage/example/). 24 | 25 | * Check out [the post on using data lineage for cost control](https://tokern.io/blog/data-lineage-on-redshift/) for an 26 | example of how data lineage can be used in production. 27 | 28 | ## Quick Start 29 | 30 | ### Install a demo of using Docker and Docker Compose 31 | 32 | Download the docker-compose file from Github repository. 33 | 34 | 35 | # in a new directory run 36 | wget https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/catalog-demo.yml 37 | # or run 38 | curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/tokern-lineage-engine.yml -o docker-compose.yml 39 | 40 | 41 | Run docker-compose 42 | 43 | 44 | docker-compose up -d 45 | 46 | 47 | Check that the containers are running. 48 | 49 | 50 | docker ps 51 | CONTAINER ID IMAGE CREATED STATUS PORTS NAMES 52 | 3f4e77845b81 tokern/data-lineage-viz:latest ... 4 hours ago Up 4 hours 0.0.0.0:8000->80/tcp tokern-data-lineage-visualizer 53 | 1e1ce4efd792 tokern/data-lineage:latest ... 5 days ago Up 5 days tokern-data-lineage 54 | 38be15bedd39 tokern/demodb:latest ... 
2 weeks ago Up 2 weeks tokern-demodb 55 | 56 | Try out Tokern Lineage App 57 | 58 | Head to `http://localhost:8000/` to open the Tokern Lineage app 59 | 60 | ### Install Tokern Lineage Engine 61 | 62 | # in a new directory run 63 | wget https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/tokern-lineage-engine.yml 64 | # or run 65 | curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/catalog-demo.yml -o tokern-lineage-engine.yml 66 | 67 | Run docker-compose 68 | 69 | 70 | docker-compose up -d 71 | 72 | 73 | If you want to use an external Postgres database, change the following parameters in `tokern-lineage-engine.yml`: 74 | 75 | * CATALOG_HOST 76 | * CATALOG_USER 77 | * CATALOG_PASSWORD 78 | * CATALOG_DB 79 | 80 | You can also override default values using environement variables. 81 | 82 | CATALOG_HOST=... CATALOG_USER=... CATALOG_PASSWORD=... CATALOG_DB=... docker-compose -f ... up -d 83 | 84 | For more advanced usage of environment variables with docker-compose, [refer to docker-compose docs](https://docs.docker.com/compose/environment-variables/) 85 | 86 | **Pro-tip** 87 | 88 | If you want to connect to a database in the host machine, set 89 | 90 | CATALOG_HOST: host.docker.internal # For mac or windows 91 | #OR 92 | CATALOG_HOST: 172.17.0.1 # Linux 93 | 94 | ## Supported Technologies 95 | 96 | * Postgres 97 | * AWS Redshift 98 | * Snowflake 99 | 100 | ### Coming Soon 101 | 102 | * SparkSQL 103 | * Presto 104 | 105 | ## Documentation 106 | 107 | For advanced usage, please refer to [data-lineage documentation](https://tokern.io/docs/data-lineage/index.html) 108 | ## Survey 109 | 110 | Please take this [survey](https://forms.gle/p2oEQBJnpEguhrp3A) if you are a user or considering using data-lineage. Responses will help us prioritize features better. 111 | -------------------------------------------------------------------------------- /api_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "public-income", 6 | "metadata": {}, 7 | "source": [ 8 | "# Overview\n", 9 | "\n", 10 | "This example showcases the API exposed by the data lineage package. The API can be used to build\n", 11 | "a lineage graph by adding nodes and edges that represent columns and transformations. \n", 12 | "\n", 13 | "Note that the goal of the example to explain the building blocks of the lineage graph.\n", 14 | "In practical scenarios, use a pack (e.g. query parser pack) to automate the process.\n", 15 | "\n", 16 | "This example consists of the following sequence of operations:\n", 17 | "* Start docker containers containing a demo. Refer to [docs](https://tokern.io/docs/data-lineage/installation) for detailed instructions on installing demo-wikimedia.\n", 18 | "* Register nodes from columns in the catalog.\n", 19 | "* Register directed edges to represent that a column is the source of data for another column.\n", 20 | "* Visualize the graph by visiting [Tokern UI](http://localhost:8000/).\n", 21 | "* Analyze the graph" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "6a9c9b70", 27 | "metadata": { 28 | "pycharm": { 29 | "name": "#%% md\n" 30 | } 31 | }, 32 | "source": [ 33 | "# Installation\n", 34 | "\n", 35 | "This demo requires wikimedia demo to be running. 
Start the demo using the following instructions:\n", 36 | "\n", 37 | " # in a new directory run\n", 38 | " wget https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/wikimedia-demo.yml\n", 39 | " # or run\n", 40 | " curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/wikimedia-demo.yml -o docker-compose.yml\n", 41 | "\n", 42 | "\n", 43 | "Run docker-compose\n", 44 | "\n", 45 | "\n", 46 | " docker-compose up -d\n", 47 | "\n", 48 | "\n", 49 | "Verify container are running\n", 50 | "\n", 51 | "\n", 52 | " docker container ls | grep tokern\n" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 1, 58 | "id": "37651618", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# Required configuration for API and wikimedia database network address\n", 63 | "\n", 64 | "docker_address = \"http://127.0.0.1:8000\"\n", 65 | "wikimedia_db = {\n", 66 | " \"username\": \"etldev\",\n", 67 | " \"password\": \"3tld3v\",\n", 68 | " \"uri\": \"tokern-demo-wikimedia\",\n", 69 | " \"port\": \"5432\",\n", 70 | " \"database\": \"wikimedia\"\n", 71 | "}" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 2, 77 | "id": "wrong-antigua", 78 | "metadata": { 79 | "scrolled": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "# Setup a connection to catalog using the SDK.\n", 84 | "from data_lineage import Catalog\n", 85 | "\n", 86 | "catalog = Catalog(docker_address)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "id": "23ed8c16", 93 | "metadata": { 94 | "pycharm": { 95 | "name": "#%%\n" 96 | }, 97 | "scrolled": true 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "# Register wikimedia datawarehouse with data-lineage app.\n", 102 | "\n", 103 | "source = catalog.add_source(name=\"wikimedia\", source_type=\"postgresql\", **wikimedia_db)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 4, 109 | "id": "ce6ebf16", 110 | "metadata": { 111 | "scrolled": false 112 | }, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "True" 118 | ] 119 | }, 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "# Scan the wikimedia data warehouse and register all schemata, tables and columns.\n", 127 | "\n", 128 | "catalog.scan_source(source)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 5, 134 | "id": "202c6b63", 135 | "metadata": { 136 | "scrolled": false 137 | }, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "{'attributes': {'context': {'sql': 'insert into page_lookup_nonredirect(redirect_id) select page_id from page'}, 'name': 'insert_into_page_lookup_nonredirect'}, 'id': '1', 'links': {'self': 'http://tokern-api:4142/api/v1/catalog/jobs/1'}, 'type': 'jobs'}\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "# Create a job and job_execution that inserts data from page to page_lookup_nonredirect\n", 149 | "\n", 150 | "job = catalog.add_job(\"insert_into_page_lookup_nonredirect\",\n", 151 | " {\n", 152 | " \"sql\": \"insert into page_lookup_nonredirect(redirect_id) select page_id from page\"\n", 153 | " })" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 6, 159 | "id": "cf308d97", 160 | "metadata": { 161 | "scrolled": true 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "import datetime\n", 166 | "from 
dbcat.catalog.models import JobExecutionStatus\n", 167 | "\n", 168 | "job_execution = catalog.add_job_execution(\n", 169 | " job=job,\n", 170 | " started_at=datetime.datetime.combine(\n", 171 | " datetime.date(2021, 4, 1), datetime.time(1, 0)\n", 172 | " ),\n", 173 | " ended_at=datetime.datetime.combine(\n", 174 | " datetime.date(2021, 4, 1), datetime.time(1, 15)\n", 175 | " ),\n", 176 | " status=JobExecutionStatus.SUCCESS,\n", 177 | ")\n" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 8, 183 | "id": "b45aaac8", 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "# Add an edge between these two columns:\n", 188 | "# (test\", \"default\", \"page\", \"page_id\") -> (\"test\", \"default\", \"page_lookup_nonredirect\", \"redirect_id\"),\n", 189 | "\n", 190 | "source_column = catalog.get_column(source_name=\"wikimedia\", \n", 191 | " schema_name=\"public\", \n", 192 | " table_name=\"page\",\n", 193 | " column_name=\"page_id\")\n", 194 | "target_column = catalog.get_column(source_name=\"wikimedia\", \n", 195 | " schema_name=\"public\", \n", 196 | " table_name=\"page_lookup_nonredirect\",\n", 197 | " column_name=\"redirect_id\")\n", 198 | "\n", 199 | "edge = catalog.add_column_lineage(source=source_column,\n", 200 | " target=target_column,\n", 201 | " job_execution_id=job_execution.id,\n", 202 | " context={})" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "254fb735", 208 | "metadata": {}, 209 | "source": [ 210 | "Visit [Kedro UI](http://localhost:8000/)\n", 211 | "\n", 212 | "![One Task Graph](./one_task.png)" 213 | ] 214 | } 215 | ], 216 | "metadata": { 217 | "kernelspec": { 218 | "display_name": "Python 3", 219 | "language": "python", 220 | "name": "python3" 221 | }, 222 | "language_info": { 223 | "codemirror_mode": { 224 | "name": "ipython", 225 | "version": 3 226 | }, 227 | "file_extension": ".py", 228 | "mimetype": "text/x-python", 229 | "name": "python", 230 | "nbconvert_exporter": "python", 231 | "pygments_lexer": "ipython3", 232 | "version": "3.8.5" 233 | } 234 | }, 235 | "nbformat": 4, 236 | "nbformat_minor": 5 237 | } -------------------------------------------------------------------------------- /data_lineage/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | __version__ = "0.9.0" 3 | 4 | import datetime 5 | import json 6 | import logging 7 | from typing import Any, Dict, Generator, List, Optional, Type, TypeVar 8 | 9 | import requests 10 | from dbcat.catalog.models import JobExecutionStatus 11 | from furl import furl 12 | from requests import HTTPError 13 | 14 | from data_lineage.graph import LineageGraph 15 | 16 | 17 | class SourceNotFound(Exception): 18 | """Source not found in catalog""" 19 | 20 | 21 | class SchemaNotFound(Exception): 22 | """Schema not found in catalog""" 23 | 24 | 25 | class TableNotFound(Exception): 26 | """Table not found in catalog""" 27 | 28 | 29 | class ColumnNotFound(Exception): 30 | """Column not found in catalog""" 31 | 32 | 33 | class ParseError(Exception): 34 | """Parser Error""" 35 | 36 | 37 | class SemanticError(Exception): 38 | """Error due to mismatch in catalog data""" 39 | 40 | 41 | class NoResultFound(Exception): 42 | """Raised when function returns no results""" 43 | 44 | 45 | class MultipleResultsFound(Exception): 46 | """Raised when multiple results are found but expected only one or zero results""" 47 | 48 | 49 | class Graph: 50 | def __init__(self, url: str): 51 | self._base_url = furl(url) / "api/main" 52 | 
self._session = requests.Session() 53 | 54 | def get(self, job_ids: set = None) -> Dict[str, List[Dict[str, str]]]: 55 | if job_ids is not None: 56 | response = self._session.get( 57 | self._base_url, params={"job_ids": list(job_ids)} 58 | ) 59 | else: 60 | response = self._session.get(self._base_url) 61 | return response.json() 62 | 63 | 64 | def load_graph(graphSDK: Graph, job_ids: set = None) -> LineageGraph: 65 | data = graphSDK.get(job_ids) 66 | return LineageGraph(nodes=data["nodes"], edges=data["edges"]) 67 | 68 | 69 | class BaseModel: 70 | def __init__(self, session, attributes, obj_id, relationships): 71 | self._session = session 72 | self._attributes = attributes 73 | self._obj_id = obj_id 74 | self._relationships = relationships 75 | 76 | def __getattr__(self, item): 77 | logging.debug("Attributes: {}".format(self._attributes)) 78 | if item == "id": 79 | return self._obj_id 80 | elif self._attributes and item in self._attributes.keys(): 81 | return self._attributes[item] 82 | elif self._relationships and item in self._relationships.keys(): 83 | return self._relationships[item] 84 | raise AttributeError 85 | 86 | 87 | class Source(BaseModel): 88 | def __init__(self, session, attributes, obj_id, relationships): 89 | super().__init__(session, attributes, obj_id, relationships) 90 | 91 | 92 | class Schema(BaseModel): 93 | def __init__(self, session, attributes, obj_id, relationships): 94 | super().__init__(session, attributes, obj_id, relationships) 95 | 96 | 97 | class Table(BaseModel): 98 | def __init__(self, session, attributes, obj_id, relationships): 99 | super().__init__(session, attributes, obj_id, relationships) 100 | 101 | 102 | class Column(BaseModel): 103 | def __init__(self, session, attributes, obj_id, relationships): 104 | super().__init__(session, attributes, obj_id, relationships) 105 | 106 | 107 | class Job(BaseModel): 108 | def __init__(self, session, attributes, obj_id, relationships): 109 | super().__init__(session, attributes, obj_id, relationships) 110 | 111 | 112 | class JobExecution(BaseModel): 113 | def __init__(self, session, attributes, obj_id, relationships): 114 | super().__init__(session, attributes, obj_id, relationships) 115 | 116 | 117 | class ColumnLineage(BaseModel): 118 | def __init__(self, session, attributes, obj_id, relationships): 119 | super().__init__(session, attributes, obj_id, relationships) 120 | 121 | 122 | class DefaultSchema(BaseModel): 123 | def __init__(self, session, attributes, obj_id, relationships): 124 | super().__init__(session, attributes, obj_id, relationships) 125 | 126 | 127 | ModelType = TypeVar("ModelType", bound=BaseModel) 128 | 129 | 130 | class Catalog: 131 | def __init__(self, url: str): 132 | self._base_url = furl(url) / "api/v1/catalog" 133 | self._session = requests.Session() 134 | self._session.headers.update({"Accept": "application/vnd.api+json"}) 135 | self._session.headers.update({"Content-Type": "application/vnd.api+json"}) 136 | 137 | def _build_url(self, *urls) -> str: 138 | built_url = self._base_url 139 | for url in urls: 140 | built_url = furl(built_url) / url 141 | logging.debug(built_url) 142 | return built_url 143 | 144 | str_to_type = { 145 | "sources": Source, 146 | "schemata": Schema, 147 | } 148 | 149 | def _resolve_relationships(self, relationships) -> Dict[str, BaseModel]: 150 | resolved: Dict[str, BaseModel] = {} 151 | for key, value in relationships.items(): 152 | logging.debug("Resolving {}:{}".format(key, value)) 153 | if value["data"]: 154 | resolved[key] = self._obj_factory( 155 | 
value["data"], 156 | Catalog.str_to_type[value["data"]["type"]], 157 | resolve_relationships=False, 158 | ) 159 | 160 | return resolved 161 | 162 | def _obj_factory( 163 | self, 164 | payload: Dict[str, Any], 165 | clazz: Type[ModelType], 166 | resolve_relationships=False, 167 | ) -> ModelType: 168 | resolved = None 169 | if resolve_relationships and payload.get("relationships"): 170 | resolved = self._resolve_relationships(payload.get("relationships")) 171 | 172 | return clazz( 173 | session=self._session, 174 | attributes=payload.get("attributes"), 175 | obj_id=payload.get("id"), 176 | relationships=resolved, 177 | ) 178 | 179 | def _iterate(self, payload: Dict[str, Any], clazz: Type[BaseModel]): 180 | res: Optional[Dict[str, Any]] = payload 181 | while res is not None: 182 | for item in res["data"]: 183 | yield self._obj_factory(payload=item, clazz=clazz) 184 | 185 | if res["links"]["next"] is not None: 186 | response = self._session.get(res["links"]["next"]) 187 | res = response.json() 188 | else: 189 | res = None 190 | 191 | def _index(self, path: str, clazz: Type[BaseModel]): 192 | response = self._session.get(self._build_url(path)) 193 | logging.debug(response.json()) 194 | return self._iterate(response.json(), clazz) 195 | 196 | def _get( 197 | self, 198 | path: str, 199 | obj_id: int, 200 | clazz: Type[ModelType], 201 | resolve_relationships=False, 202 | ) -> ModelType: 203 | response = self._session.get(self._build_url(path, str(obj_id))) 204 | json_response = response.json() 205 | logging.debug(json_response) 206 | response.raise_for_status() 207 | return self._obj_factory( 208 | json_response["data"], clazz, resolve_relationships=resolve_relationships 209 | ) 210 | 211 | @staticmethod 212 | def _one(response): 213 | json_response = response.json() 214 | logging.debug(json_response) 215 | num_results = json_response["meta"]["total"] 216 | if num_results == 0: 217 | raise NoResultFound 218 | elif num_results > 1: 219 | raise MultipleResultsFound 220 | 221 | return json_response["data"][0] 222 | 223 | def _search_one(self, path: str, filters): 224 | params = {"filter[objects]": json.dumps(filters)} 225 | response = self._session.get(self._build_url(path), params=params) 226 | response.raise_for_status() 227 | return Catalog._one(response) 228 | 229 | def _search(self, path: str, search_string: str, clazz: Type[BaseModel]): 230 | filters = [dict(name="name", op="like", val="%{}%".format(search_string))] 231 | params = {"filter[objects]": json.dumps(filters)} 232 | response = self._session.get(self._build_url(path), params=params) 233 | return self._iterate(response.json(), clazz) 234 | 235 | def _post(self, path: str, data: Dict[str, Any], type: str) -> Dict[Any, Any]: 236 | payload = {"data": {"type": type, "attributes": data}} 237 | response = self._session.post( 238 | url=self._build_url(path), data=json.dumps(payload, default=str) 239 | ) 240 | response.raise_for_status() 241 | logging.debug(response.text) 242 | json_response = response.json() 243 | return json_response["data"] 244 | 245 | def _patch(self, path: str, obj_id: int, data: Dict[str, Any], type: str): 246 | payload = {"data": {"type": type, "attributes": data, "id": obj_id}} 247 | response = self._session.patch( 248 | url=self._build_url(path, str(obj_id)), 249 | data=json.dumps(payload, default=str), 250 | ) 251 | response.raise_for_status() 252 | return 253 | 254 | def get_sources(self) -> Generator[Any, Any, None]: 255 | return self._index("sources", Source) 256 | 257 | def get_schemata(self): 258 | return 
self._index("schemata", Schema) 259 | 260 | def get_tables(self): 261 | return self._index("tables", Table) 262 | 263 | def get_columns(self): 264 | return self._index("columns", Column) 265 | 266 | def get_jobs(self): 267 | return self._index("jobs", Job) 268 | 269 | def get_job_executions(self): 270 | return self._index("job_executions", JobExecution) 271 | 272 | def get_column_lineages(self): 273 | return self._index("column_lineages", ColumnLineage) 274 | 275 | def get_source_by_id(self, obj_id) -> Source: 276 | return self._get("sources", obj_id, Source) 277 | 278 | def get_schema_by_id(self, obj_id) -> Schema: 279 | return self._get("schemata", obj_id, Schema) 280 | 281 | def get_table_by_id(self, obj_id) -> Table: 282 | return self._get("tables", obj_id, Table) 283 | 284 | def get_column_by_id(self, obj_id) -> Column: 285 | return self._get("columns", obj_id, Column) 286 | 287 | def get_job_by_id(self, obj_id) -> Job: 288 | return self._get("jobs", obj_id, Job) 289 | 290 | def get_job_execution_by_id(self, obj_id) -> JobExecution: 291 | return self._get("job_executions", obj_id, JobExecution) 292 | 293 | def get_column_lineage(self, job_ids: List[int]) -> List[ColumnLineage]: 294 | params = {"job_ids": job_ids} 295 | response = self._session.get(self._build_url("column_lineage"), params=params) 296 | logging.debug(response.json()) 297 | response.raise_for_status() 298 | return [ 299 | ColumnLineage( 300 | session=self._session, 301 | attributes=item["attributes"], 302 | obj_id=item["id"], 303 | relationships=item["relationships"], 304 | ) 305 | for item in response.json()["data"] 306 | ] 307 | 308 | def get_source(self, name) -> Source: 309 | filters = [dict(name="name", op="eq", val="{}".format(name))] 310 | try: 311 | payload = self._search_one("sources", filters) 312 | except NoResultFound: 313 | raise SourceNotFound("Source not found: source_name={}".format(name)) 314 | 315 | return self._obj_factory(payload, Source) 316 | 317 | def get_schema(self, source_name: str, schema_name: str) -> Schema: 318 | name_filter = dict(name="name", op="eq", val=schema_name) 319 | source_filter = dict( 320 | name="source", op="has", val=dict(name="name", op="eq", val=source_name) 321 | ) 322 | filters = {"and": [name_filter, source_filter]} 323 | logging.debug(filters) 324 | try: 325 | payload = self._search_one("schemata", [filters]) 326 | except NoResultFound: 327 | raise SchemaNotFound( 328 | "Schema not found, (source_name={}, schema_name={})".format( 329 | source_name, schema_name 330 | ) 331 | ) 332 | return self._obj_factory(payload, Schema) 333 | 334 | def get_table(self, source_name: str, schema_name: str, table_name: str) -> Table: 335 | schema = self.get_schema(source_name, schema_name) 336 | 337 | name_filter = dict(name="name", op="eq", val=table_name) 338 | schema_id_filter = dict(name="schema_id", op="eq", val=str(schema.id)) 339 | filters = {"and": [name_filter, schema_id_filter]} 340 | logging.debug(filters) 341 | try: 342 | payload = self._search_one("tables", [filters]) 343 | except NoResultFound: 344 | raise TableNotFound( 345 | "Table not found, (source_name={}, schema_name={}, table_name={})".format( 346 | source_name, schema_name, table_name 347 | ) 348 | ) 349 | return self._obj_factory(payload, Table) 350 | 351 | def get_columns_for_table(self, table: Table): 352 | return self._index("tables/{}/columns".format(table.id), Column) 353 | 354 | def get_column(self, source_name, schema_name, table_name, column_name) -> Column: 355 | table = self.get_table(source_name, 
schema_name, table_name) 356 | name_filter = dict(name="name", op="eq", val=column_name) 357 | table_filter = dict(name="table_id", op="eq", val=str(table.id)) 358 | filters = {"and": [name_filter, table_filter]} 359 | logging.debug(filters) 360 | try: 361 | payload = self._search_one("columns", [filters]) 362 | except NoResultFound: 363 | raise ColumnNotFound( 364 | "Column not found, (source_name={}, schema_name={}, table_name={}, column_name={})".format( 365 | source_name, schema_name, table_name, column_name 366 | ) 367 | ) 368 | return self._obj_factory(payload, Column) 369 | 370 | def add_source(self, name: str, source_type: str, **kwargs) -> Source: 371 | data = {"name": name, "source_type": source_type, **kwargs} 372 | payload = self._post(path="sources", data=data, type="sources") 373 | return self._obj_factory(payload, Source) 374 | 375 | def add_schema(self, name: str, source: Source) -> Schema: 376 | data = {"name": name, "source_id": source.id} 377 | payload = self._post(path="schemata", data=data, type="schemata") 378 | return self._obj_factory(payload, Schema) 379 | 380 | def add_table(self, name: str, schema: Schema) -> Table: 381 | data = {"name": name, "schema_id": schema.id} 382 | payload = self._post(path="tables", data=data, type="tables") 383 | return self._obj_factory(payload, Table) 384 | 385 | def add_column( 386 | self, name: str, data_type: str, sort_order: int, table: Table 387 | ) -> Column: 388 | data = { 389 | "name": name, 390 | "table_id": table.id, 391 | "data_type": data_type, 392 | "sort_order": sort_order, 393 | } 394 | payload = self._post(path="columns", data=data, type="columns") 395 | return self._obj_factory(payload, Column) 396 | 397 | def add_job(self, name: str, context: Dict[Any, Any]) -> Job: 398 | data = {"name": name, "context": context} 399 | payload = self._post(path="jobs", data=data, type="jobs") 400 | return self._obj_factory(payload, Job) 401 | 402 | def add_job_execution( 403 | self, 404 | job: Job, 405 | started_at: datetime.datetime, 406 | ended_at: datetime.datetime, 407 | status: JobExecutionStatus, 408 | ) -> JobExecution: 409 | data = { 410 | "job_id": job.id, 411 | "started_at": started_at, 412 | "ended_at": ended_at, 413 | "status": status.name, 414 | } 415 | payload = self._post(path="job_executions", data=data, type="job_executions") 416 | return self._obj_factory(payload, JobExecution) 417 | 418 | def add_column_lineage( 419 | self, 420 | source: Column, 421 | target: Column, 422 | job_execution_id: int, 423 | context: Dict[Any, Any], 424 | ) -> ColumnLineage: 425 | data = { 426 | "source_id": source.id, 427 | "target_id": target.id, 428 | "job_execution_id": job_execution_id, 429 | "context": context, 430 | } 431 | payload = self._post(path="column_lineage", data=data, type="column_lineage") 432 | return self._obj_factory(payload, ColumnLineage) 433 | 434 | def update_source(self, source: Source, schema: Schema) -> DefaultSchema: 435 | try: 436 | current_obj = self._get( 437 | path="default_schema", 438 | obj_id=source.id, 439 | clazz=DefaultSchema, 440 | resolve_relationships=True, 441 | ) 442 | if current_obj.schema.id == schema.id: 443 | return current_obj 444 | except HTTPError as error: 445 | if error.response.status_code == 404: 446 | data = {"source_id": source.id, "schema_id": schema.id} 447 | payload = self._post( 448 | path="default_schema", data=data, type="default_schema" 449 | ) 450 | return self._obj_factory( 451 | payload, DefaultSchema, resolve_relationships=True 452 | ) 453 | 454 | # Patch 455 | data = 
{"schema_id": schema.id} 456 | self._patch( 457 | path="default_schema", data=data, type="default_schema", obj_id=source.id 458 | ) 459 | return self._get( 460 | path="default_schema", 461 | obj_id=source.id, 462 | clazz=DefaultSchema, 463 | resolve_relationships=True, 464 | ) 465 | 466 | 467 | class Analyze: 468 | def __init__(self, url: str): 469 | self._base_url = furl(url) / "api/v1/analyze" 470 | self._session = requests.Session() 471 | 472 | def analyze( 473 | self, 474 | query: str, 475 | source: Source, 476 | start_time: datetime.datetime, 477 | end_time: datetime.datetime, 478 | name: str = None, 479 | ) -> JobExecution: 480 | payload = { 481 | "query": query, 482 | "name": name, 483 | "source_id": source.id, 484 | "start_time": start_time.isoformat(), 485 | "end_time": end_time.isoformat(), 486 | } 487 | 488 | response = self._session.post(self._base_url, json=payload,) 489 | if response.status_code == 441: 490 | raise TableNotFound(response.json()["message"]) 491 | elif response.status_code == 442: 492 | raise ColumnNotFound(response.json()["message"]) 493 | elif response.status_code == 422: 494 | raise ParseError(response.json()["message"]) 495 | elif response.status_code == 443: 496 | raise SemanticError(response.json()["message"]) 497 | 498 | logging.debug(response.text) 499 | response.raise_for_status() 500 | payload = response.json()["data"] 501 | return JobExecution( 502 | session=self._session, 503 | attributes=payload.get("attributes"), 504 | obj_id=payload.get("id"), 505 | relationships=None, 506 | ) 507 | 508 | 509 | class Parse: 510 | def __init__(self, url: str): 511 | self._base_url = furl(url) / "api/v1/parse" 512 | self._session = requests.Session() 513 | 514 | def parse(self, query: str, source: Source): 515 | response = self._session.post( 516 | self._base_url, json={"query": query, "source_id": source.id}, 517 | ) 518 | logging.debug(response.text) 519 | response.raise_for_status() 520 | return response.json() 521 | 522 | 523 | class Scan: 524 | def __init__(self, url: str): 525 | self._base_url = furl(url) / "api/v1/scan" 526 | self._session = requests.Session() 527 | 528 | def start(self, source: Source) -> Dict[str, str]: 529 | payload = {"id": source.id} 530 | response = self._session.post(url=self._base_url, json=payload) 531 | response.raise_for_status() 532 | return response.json() 533 | 534 | def list(self) -> List[Dict[str, str]]: 535 | response = self._session.post(url=self._base_url) 536 | response.raise_for_status() 537 | return response.json() 538 | 539 | def get(self, job_id: str) -> Dict[str, str]: 540 | response = self._session.get(url=furl(self._base_url) / job_id) 541 | response.raise_for_status() 542 | return response.json() 543 | 544 | def cancel(self, job_id: str) -> Dict[str, str]: 545 | response = self._session.put(url=furl(self._base_url) / job_id) 546 | response.raise_for_status() 547 | return response.json() 548 | -------------------------------------------------------------------------------- /data_lineage/__main__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import click 4 | from redis import Redis 5 | 6 | from data_lineage import __version__ 7 | from data_lineage.server import create_server 8 | 9 | 10 | @click.command() 11 | @click.version_option(__version__) 12 | @click.option( 13 | "-l", "--log-level", envvar="LOG_LEVEL", help="Logging Level", default="INFO" 14 | ) 15 | @click.option( 16 | "--catalog-user", help="Database user name", envvar="CATALOG_USER", 
required=True 17 | ) 18 | @click.option( 19 | "--catalog-password", 20 | help="Database Password", 21 | envvar="CATALOG_PASSWORD", 22 | required=True, 23 | ) 24 | @click.option( 25 | "--catalog-host", help="Database Host", envvar="CATALOG_HOST", default="localhost" 26 | ) 27 | @click.option( 28 | "--catalog-port", help="Database Password", envvar="CATALOG_PORT", default=5432 29 | ) 30 | @click.option( 31 | "--catalog-db", help="Postgres Database", envvar="CATALOG_DB", default="tokern" 32 | ) 33 | @click.option( 34 | "--redis-host", 35 | help="Redis host for queueing scans", 36 | envvar="REDIS_HOST", 37 | default="localhost", 38 | ) 39 | @click.option( 40 | "--redis-port", 41 | help="Redis port for queueing scans", 42 | envvar="REDIS_PORT", 43 | default="6379", 44 | ) 45 | @click.option( 46 | "--is-production/--not-production", 47 | help="Run server in development mode", 48 | default=True, 49 | ) 50 | def main( 51 | log_level, 52 | catalog_user, 53 | catalog_password, 54 | catalog_host, 55 | catalog_port, 56 | catalog_db, 57 | redis_host, 58 | redis_port, 59 | is_production, 60 | ): 61 | logging.basicConfig(level=getattr(logging, log_level.upper())) 62 | catalog = { 63 | "user": catalog_user, 64 | "password": catalog_password, 65 | "host": catalog_host, 66 | "port": catalog_port, 67 | "database": catalog_db, 68 | } 69 | connection = Redis(redis_host, redis_port) 70 | app, catalog = create_server( 71 | catalog, connection=connection, is_production=is_production 72 | ) 73 | if is_production: 74 | app.run() 75 | else: 76 | app.run(debug=True) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /data_lineage/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokern/data-lineage/5945542742979fe350d313d906440c93ee3d0f36/data_lineage/assets/favicon.ico -------------------------------------------------------------------------------- /data_lineage/graph.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, List 3 | 4 | import networkx as nx 5 | 6 | 7 | class LineageGraph: 8 | def __init__( 9 | self, 10 | nodes: List[Dict[str, str]], 11 | edges: List[Dict[str, str]], 12 | name: str = "Lineage", 13 | ): 14 | self.name = name 15 | self._graph = nx.DiGraph() 16 | for node in nodes: 17 | node_id = node["id"] 18 | node_attributes = {"name": node["name"], "type": node["type"]} 19 | logging.debug("Add Node: {}, {}".format(node_id, node_attributes)) 20 | self._graph.add_node(node_id, **node_attributes) 21 | 22 | for edge in edges: 23 | logging.debug("Edge: <{}>, <{}>".format(edge["source"], edge["target"])) 24 | self._graph.add_edge(edge["source"], edge["target"]) 25 | 26 | @property 27 | def graph(self): 28 | return self._graph 29 | 30 | @graph.setter 31 | def graph(self, new_graph): 32 | self._graph = new_graph 33 | -------------------------------------------------------------------------------- /data_lineage/parser/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List 3 | 4 | from dbcat.catalog import Catalog 5 | from dbcat.catalog.models import CatSource, JobExecution, JobExecutionStatus 6 | from pglast import Node, parse_sql 7 | from pglast.parser import ParseError 8 | 9 | from data_lineage import SemanticError 10 | from data_lineage.parser.binder import SelectBinder 11 | 
from data_lineage.parser.dml_visitor import ( 12 | CTASVisitor, 13 | DmlVisitor, 14 | SelectIntoVisitor, 15 | SelectSourceVisitor, 16 | ) 17 | from data_lineage.parser.visitor import ExprVisitor, RedshiftExprVisitor 18 | 19 | 20 | class Parsed: 21 | def __init__(self, name: str, query: str, node: Node): 22 | self._name = name 23 | self._node = node 24 | self._query = query 25 | 26 | @property 27 | def name(self): 28 | return self._name 29 | 30 | @property 31 | def node(self): 32 | return self._node 33 | 34 | @property 35 | def query(self): 36 | return self._query 37 | 38 | 39 | def parse_queries(queries: List[str]) -> List[Parsed]: 40 | parsed: List[Parsed] = [] 41 | 42 | for query in queries: 43 | try: 44 | parsed.append(parse(query)) 45 | except ParseError as e: 46 | logging.warning("Syntax error while parsing {}.\n{}".format(query, e)) 47 | 48 | return parsed 49 | 50 | 51 | def analyze_dml_query( 52 | catalog: Catalog, parsed: Parsed, source: CatSource, 53 | ) -> DmlVisitor: 54 | chosen_visitor = visit_dml_query(parsed, source) 55 | chosen_visitor.bind(catalog=catalog, source=source) 56 | return chosen_visitor 57 | 58 | 59 | def parse_dml_query( 60 | catalog: Catalog, parsed: Parsed, source: CatSource, 61 | ) -> SelectBinder: 62 | chosen_visitor = visit_dml_query(parsed, source) 63 | 64 | select_binder = SelectBinder( 65 | catalog=catalog, 66 | source=source, 67 | tables=chosen_visitor.select_tables, 68 | columns=chosen_visitor.select_columns, 69 | expr_visitor_clazz=chosen_visitor.expr_visitor_clazz, 70 | alias_generator=("_U{}".format(i) for i in range(0, 1000)), 71 | ) 72 | select_binder.bind() 73 | return select_binder 74 | 75 | 76 | def visit_dml_query(parsed: Parsed, source: CatSource,) -> DmlVisitor: 77 | 78 | expr_visitor_clazz = ExprVisitor 79 | if source.source_type == "redshift": 80 | expr_visitor_clazz = RedshiftExprVisitor 81 | 82 | select_source_visitor: DmlVisitor = SelectSourceVisitor( 83 | parsed.name, expr_visitor_clazz 84 | ) 85 | select_into_visitor: DmlVisitor = SelectIntoVisitor(parsed.name, expr_visitor_clazz) 86 | ctas_visitor: DmlVisitor = CTASVisitor(parsed.name, expr_visitor_clazz) 87 | 88 | for v in [select_source_visitor, select_into_visitor, ctas_visitor]: 89 | v(parsed.node) 90 | if len(v.select_tables) > 0 and v.insert_table is not None: 91 | return v 92 | raise SemanticError("Query is not a DML Query") 93 | 94 | 95 | def extract_lineage( 96 | catalog: Catalog, 97 | visited_query: DmlVisitor, 98 | source: CatSource, 99 | parsed: Parsed, 100 | start_time, 101 | end_time, 102 | ) -> JobExecution: 103 | job = catalog.add_job( 104 | name=parsed.name, source=source, context={"query": parsed.query} 105 | ) 106 | job_execution = catalog.add_job_execution( 107 | job=job, 108 | started_at=start_time, 109 | ended_at=end_time, 110 | status=JobExecutionStatus.SUCCESS, 111 | ) 112 | for source, target in zip( 113 | visited_query.source_columns, visited_query.target_columns 114 | ): 115 | for column in source.columns: 116 | edge = catalog.add_column_lineage(column, target, job_execution.id, {}) 117 | logging.debug("Added {}".format(edge)) 118 | 119 | return job_execution 120 | 121 | 122 | def parse(sql: str, name: str = None) -> Parsed: 123 | if name is None: 124 | name = str(hash(sql)) 125 | node = parse_sql(sql) 126 | 127 | return Parsed(name, sql, node) 128 | -------------------------------------------------------------------------------- /data_lineage/parser/binder.py: -------------------------------------------------------------------------------- 1 | import json 
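# Module overview (descriptive summary of the code below): binder.py resolves the
# table and column references collected from a parsed SQL query against the dbcat
# catalog. SelectBinder walks the RangeVar / RangeSubselect nodes gathered by the
# visitors, looks each table up via Catalog.search_table, and records the matched
# CatColumn objects as ColumnContext entries. AliasContext and WithContext track
# table and subquery aliases so that qualified references and "*" expansions can be
# resolved; TableNotFound / ColumnNotFound are raised when a reference cannot be bound.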
2 | import logging 3 | from abc import ABC, abstractmethod 4 | from json import JSONEncoder 5 | from typing import List, Mapping, Set, Type 6 | 7 | from dbcat.catalog import Catalog, CatColumn, CatSource, CatTable 8 | from pglast import Node 9 | from pglast.ast import RangeSubselect, RangeVar 10 | 11 | from data_lineage import ColumnNotFound, SemanticError, TableNotFound 12 | from data_lineage.parser.visitor import ( 13 | ColumnRefVisitor, 14 | ExprVisitor, 15 | RangeSubselectVisitor, 16 | RangeVarVisitor, 17 | ) 18 | 19 | 20 | class ColumnContext: 21 | def __init__(self, alias: str, columns: Set[CatColumn]): 22 | self._alias = alias.lower() 23 | self._columns = columns 24 | 25 | @property 26 | def alias(self): 27 | return self._alias 28 | 29 | @property 30 | def columns(self) -> Set[CatColumn]: 31 | return self._columns 32 | 33 | 34 | class AliasContext: 35 | def __init__(self, catalog: Catalog, alias: str, tables: Set[CatTable]): 36 | self._catalog = catalog 37 | self._alias = alias.lower() 38 | self._tables = tables 39 | 40 | @property 41 | def alias(self): 42 | return self._alias 43 | 44 | @property 45 | def tables(self): 46 | return self._tables 47 | 48 | def get_columns(self, column_names: List[str] = None) -> List[ColumnContext]: 49 | columns: List[CatColumn] = [] 50 | for table in self._tables: 51 | logging.debug("Searching in {}".format(table.fqdn)) 52 | columns = columns + self._catalog.get_columns_for_table(table, column_names) 53 | 54 | return [ 55 | ColumnContext(alias=column.name, columns={column}) for column in columns 56 | ] 57 | 58 | 59 | class WithContext(AliasContext): 60 | def __init__( 61 | self, 62 | catalog: Catalog, 63 | alias: str, 64 | tables: Set[CatTable], 65 | columns: List[ColumnContext], 66 | ): 67 | super(WithContext, self).__init__(catalog, alias, tables) 68 | self._columns = columns 69 | 70 | def get_columns(self, column_names: List[str] = None) -> List[ColumnContext]: 71 | if column_names is not None: 72 | filtered = [] 73 | for column in self._columns: 74 | logging.debug( 75 | "Comparing with alias: {} - contains columns: {}".format( 76 | column.alias, 77 | json.dumps(list(column.columns), cls=CatColumnEncoder), 78 | ) 79 | ) 80 | if column.alias in column_names: 81 | filtered.append(column) 82 | 83 | return filtered 84 | else: 85 | return self._columns 86 | 87 | 88 | class CatTableEncoder(JSONEncoder): 89 | def default(self, obj): 90 | if isinstance(obj, CatTable): 91 | return { 92 | "name": obj.name, 93 | "schema": obj.schema.name, 94 | "source": obj.schema.source.name, 95 | } 96 | 97 | # Let the base class default method raise the TypeError 98 | return json.JSONEncoder.default(self, obj) 99 | 100 | 101 | class CatColumnEncoder(JSONEncoder): 102 | def default(self, obj): 103 | if isinstance(obj, CatColumn): 104 | return { 105 | "name": obj.name, 106 | "table": obj.table.name, 107 | "schema": obj.table.schema.name, 108 | "source": obj.table.schema.source.name, 109 | } 110 | 111 | # Let the base class default method raise the TypeError 112 | return json.JSONEncoder.default(self, obj) 113 | 114 | 115 | class Binder(ABC): 116 | @property 117 | @abstractmethod 118 | def _visited_tables(self) -> List[Node]: 119 | pass 120 | 121 | @property 122 | @abstractmethod 123 | def _visited_columns(self) -> List[ExprVisitor]: 124 | pass 125 | 126 | @property 127 | def tables(self) -> Set[CatTable]: 128 | return self._tables 129 | 130 | @property 131 | def columns(self) -> List[ColumnContext]: 132 | return self._columns 133 | 134 | def __init__( 135 | self, 136 | 
catalog: Catalog, 137 | source: CatSource, 138 | alias_generator, 139 | expr_visitor_clazz: Type[ExprVisitor], 140 | alias_map: Mapping[str, AliasContext] = None, 141 | ): 142 | self._catalog = catalog 143 | self._source = source 144 | self._tables: Set[CatTable] = set() 145 | self._columns: List[ColumnContext] = [] 146 | self._alias_map: Mapping[str, AliasContext] = alias_map or {} 147 | self._alias_generator = alias_generator 148 | self._expr_visitor_clazz = expr_visitor_clazz 149 | 150 | def bind(self): 151 | bound_tables = self._bind_tables() 152 | 153 | self._tables = set(bound_tables) 154 | self._columns = self._bind_columns() 155 | 156 | def _bind_tables(self): 157 | bound_tables = [] 158 | for table in self._visited_tables: 159 | if isinstance(table, RangeVar): 160 | visitor = RangeVarVisitor() 161 | visitor(table) 162 | 163 | logging.debug("Searching for: {}".format(visitor.search_string)) 164 | 165 | if not visitor.is_qualified and visitor.name in self._alias_map: 166 | bound_tables = bound_tables + list( 167 | self._alias_map[visitor.name].tables 168 | ) 169 | logging.debug("Added tables for alias {}".format(visitor.name)) 170 | else: 171 | try: 172 | candidate_table = self._catalog.search_table( 173 | source_like=self._source.name, **visitor.search_string 174 | ) 175 | except RuntimeError as err: 176 | logging.debug(str(err)) 177 | raise TableNotFound( 178 | '"{schema_like}"."{table_like}" is not found'.format( 179 | **visitor.search_string 180 | ) 181 | ) 182 | logging.debug("Bound source table: {}".format(candidate_table)) 183 | 184 | self._alias_map[visitor.alias] = AliasContext( 185 | catalog=self._catalog, 186 | alias=visitor.alias, 187 | tables={candidate_table}, 188 | ) 189 | bound_tables.append(candidate_table) 190 | elif isinstance(table, RangeSubselect): 191 | visitor = RangeSubselectVisitor(self._expr_visitor_clazz) 192 | visitor(table) 193 | binder = SelectBinder( 194 | self._catalog, 195 | self._source, 196 | visitor.sources, 197 | visitor.columns, 198 | self._alias_generator, 199 | self._expr_visitor_clazz, 200 | ) 201 | binder.bind() 202 | self._alias_map[visitor.alias] = WithContext( 203 | catalog=self._catalog, 204 | alias=visitor.alias, 205 | tables=binder.tables, 206 | columns=binder.columns, 207 | ) 208 | bound_tables = bound_tables + list(binder.tables) 209 | else: 210 | raise SemanticError("Unknown parser state. 
Please contact Support") 211 | return bound_tables 212 | 213 | def _bind_columns(self) -> List[ColumnContext]: 214 | bound_cols: List[ColumnContext] = [] 215 | for expr_visitor in self._visited_columns: 216 | target_cols: Set[ColumnContext] = set() 217 | is_a_star = False 218 | for column in expr_visitor.columns: 219 | column_ref_visitor = ColumnRefVisitor() 220 | column_ref_visitor(column) 221 | is_a_star = column_ref_visitor.is_a_star 222 | alias_list = list(self._alias_map.values()) 223 | if column_ref_visitor.is_qualified: 224 | if column_ref_visitor.table_name not in self._alias_map: 225 | raise TableNotFound( 226 | "{} not found for column ({}).".format( 227 | column_ref_visitor.name[0], column_ref_visitor.name 228 | ) 229 | ) 230 | assert column_ref_visitor.table_name is not None 231 | alias_list = [self._alias_map[column_ref_visitor.table_name]] 232 | target_cols.update( 233 | Binder._search_column_in_tables(column_ref_visitor, alias_list) 234 | ) 235 | 236 | if is_a_star: 237 | for col in target_cols: 238 | bound_cols.append( 239 | ColumnContext(alias=col.alias, columns=col.columns) 240 | ) 241 | else: 242 | if expr_visitor.alias is not None: 243 | alias = expr_visitor.alias 244 | elif len(target_cols) == 1: 245 | alias = list(target_cols)[0].alias 246 | else: 247 | alias = next(self._alias_generator) 248 | cols: Set[CatColumn] = set() 249 | for tgt in target_cols: 250 | for c in tgt.columns: 251 | cols.add(c) 252 | bound_cols.append(ColumnContext(alias=alias, columns=cols)) 253 | 254 | if len(bound_cols) == 0: 255 | raise ColumnNotFound("No source columns found.") 256 | return bound_cols 257 | 258 | @staticmethod 259 | def _search_column_in_tables( 260 | column_ref_visitor, alias_list: List[AliasContext] 261 | ) -> List[ColumnContext]: 262 | found_cols: List[ColumnContext] = [] 263 | if column_ref_visitor.is_a_star: 264 | for alias_context in alias_list: 265 | found_cols = alias_context.get_columns() 266 | logging.debug( 267 | "Bound all source columns in {}".format(alias_context.tables) 268 | ) 269 | else: 270 | candidate_columns: List[ColumnContext] = [] 271 | global_table_list: List[CatTable] = [] 272 | logging.debug("Searching for {}".format(column_ref_visitor.column_name)) 273 | for alias_context in alias_list: 274 | logging.debug("Searching in {}".format(alias_context.alias)) 275 | candidate_columns = candidate_columns + alias_context.get_columns( 276 | [column_ref_visitor.column_name] 277 | ) 278 | global_table_list = global_table_list + list(alias_context.tables) 279 | 280 | if len(candidate_columns) == 0: 281 | raise ColumnNotFound( 282 | '"{}" not found in the following tables: {}'.format( 283 | column_ref_visitor.column_name, 284 | json.dumps(global_table_list, cls=CatTableEncoder), 285 | ) 286 | ) 287 | elif len(candidate_columns) > 1: 288 | column_list = [] 289 | for candidate in candidate_columns: 290 | for col in candidate.columns: 291 | column_list.append(col) 292 | raise ColumnNotFound( 293 | "{} Ambiguous column name. 
Multiple matches found: {}".format( 294 | column_ref_visitor.name, 295 | json.dumps(column_list, cls=CatColumnEncoder), 296 | ) 297 | ) 298 | logging.debug("Bound source column: {}".format(candidate_columns[0])) 299 | found_cols.append(candidate_columns[0]) 300 | return found_cols 301 | 302 | 303 | class SelectBinder(Binder): 304 | def __init__( 305 | self, 306 | catalog: Catalog, 307 | source: CatSource, 308 | tables: List[Node], 309 | columns: List[ExprVisitor], 310 | alias_generator, 311 | expr_visitor_clazz: Type[ExprVisitor], 312 | alias_map: Mapping[str, AliasContext] = None, 313 | ): 314 | super(SelectBinder, self).__init__( 315 | catalog, source, alias_generator, expr_visitor_clazz, alias_map 316 | ) 317 | self._table_nodes: List[Node] = tables 318 | self._column_nodes: List[ExprVisitor] = columns 319 | 320 | @property 321 | def _visited_tables(self) -> List[Node]: 322 | return self._table_nodes 323 | 324 | @property 325 | def _visited_columns(self) -> List[ExprVisitor]: 326 | return self._column_nodes 327 | -------------------------------------------------------------------------------- /data_lineage/parser/dml_visitor.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Any, Dict, List, Optional, Set, Tuple, Type 4 | 5 | from dbcat.catalog import Catalog, CatColumn, CatSource, CatTable 6 | from pglast import Node 7 | from pglast.ast import IntoClause 8 | from pglast.visitors import Ancestor, Continue, Skip, Visitor 9 | 10 | from data_lineage import ColumnNotFound, SemanticError, TableNotFound 11 | from data_lineage.parser.binder import ( 12 | CatTableEncoder, 13 | ColumnContext, 14 | SelectBinder, 15 | WithContext, 16 | ) 17 | from data_lineage.parser.visitor import ( 18 | ColumnRefVisitor, 19 | ExprVisitor, 20 | RangeVarVisitor, 21 | TableVisitor, 22 | ) 23 | 24 | 25 | class DmlVisitor(Visitor): 26 | def __init__(self, name: str, expr_visitor_clazz: Type[ExprVisitor]): 27 | self._name = name 28 | self._insert_table: Optional[Node] = None 29 | self._insert_columns: List[str] = [] 30 | self._target_table: Optional[CatTable] = None 31 | self._target_columns: List[CatColumn] = [] 32 | self._source_tables: Set[CatTable] = set() 33 | self._source_columns: List[ColumnContext] = [] 34 | self._select_tables: List[Node] = [] 35 | self._select_columns: List[ExprVisitor] = [] 36 | self._with_aliases: Dict[str, Dict[str, Any]] = {} 37 | self._alias_map: Dict[str, WithContext] = {} 38 | self._column_alias_generator = ("_U{}".format(i) for i in range(0, 1000)) 39 | self.expr_visitor_clazz = expr_visitor_clazz 40 | 41 | @property 42 | def name(self) -> str: 43 | return self._name 44 | 45 | @property 46 | def insert_table(self) -> Optional[Node]: 47 | return self._insert_table 48 | 49 | @property 50 | def target_table(self) -> CatTable: 51 | return self._target_table 52 | 53 | @property 54 | def target_columns(self) -> List[CatColumn]: 55 | return self._target_columns 56 | 57 | @property 58 | def source_tables(self) -> Set[CatTable]: 59 | return self._source_tables 60 | 61 | @property 62 | def source_columns(self) -> List[ColumnContext]: 63 | return self._source_columns 64 | 65 | @property 66 | def select_tables(self) -> List[Node]: 67 | return self._select_tables 68 | 69 | @property 70 | def select_columns(self) -> List[ExprVisitor]: 71 | return self._select_columns 72 | 73 | def visit_RangeVar(self, ancestors, node): 74 | self._insert_table = node 75 | return Skip 76 | 77 | def visit_ResTarget(self, 
ancestors, node): 78 | self._insert_columns.append(node.name) 79 | return Skip 80 | 81 | def visit_CommonTableExpr(self, ancestors, node): 82 | with_alias = node.ctename 83 | table_visitor = TableVisitor(self.expr_visitor_clazz) 84 | table_visitor(node.ctequery) 85 | 86 | self._with_aliases[with_alias] = { 87 | "tables": table_visitor.sources, 88 | "columns": table_visitor.columns, 89 | } 90 | return Skip 91 | 92 | def visit_CreateTableAsStmt(self, ancestors, node): 93 | """ 94 | Do not process CTAS statement by default. 95 | :param ancestors: 96 | :type ancestors: 97 | :param node: 98 | :type node: 99 | :return: 100 | :rtype: 101 | """ 102 | return Skip 103 | 104 | def bind(self, catalog: Catalog, source: CatSource): 105 | self._bind_target(catalog, source) 106 | 107 | self._bind_with(catalog, source) 108 | binder = SelectBinder( 109 | catalog, 110 | source, 111 | self._select_tables, 112 | self._select_columns, 113 | self._column_alias_generator, 114 | self.expr_visitor_clazz, 115 | self._alias_map, 116 | ) 117 | binder.bind() 118 | 119 | if len(binder.tables) == 0: 120 | raise SemanticError("No source tables found") 121 | 122 | if len(binder.columns) == 0: 123 | raise SemanticError("No source columns found") 124 | 125 | if self.target_table is None: 126 | raise SemanticError("No target table found") 127 | 128 | if len(self.target_columns) == 0: 129 | raise SemanticError( 130 | "No target columns found in {}".format( 131 | json.dumps(self.target_table, cls=CatTableEncoder) 132 | ) 133 | ) 134 | 135 | if len(self.target_columns) != len(binder.columns): 136 | raise SemanticError( 137 | "No. of target columns({}) does not match no. of source columns({})".format( 138 | len(self.target_columns), len(binder.columns) 139 | ) 140 | ) 141 | 142 | self._source_tables = binder.tables 143 | self._source_columns = binder.columns 144 | 145 | def _bind_target(self, catalog: Catalog, source: CatSource): 146 | target_table_visitor = RangeVarVisitor() 147 | target_table_visitor(self._insert_table) 148 | logging.debug("Searching for: {}".format(target_table_visitor.search_string)) 149 | try: 150 | self._target_table = catalog.search_table( 151 | source_like=source.name, **target_table_visitor.search_string 152 | ) 153 | except RuntimeError as error: 154 | logging.debug(str(error)) 155 | raise TableNotFound( 156 | '"{schema_like}"."{table_like}" is not found'.format( 157 | **target_table_visitor.search_string 158 | ) 159 | ) 160 | logging.debug("Bound target table: {}".format(self._target_table)) 161 | if len(self._insert_columns) == 0: 162 | self._target_columns = catalog.get_columns_for_table(self._target_table) 163 | logging.debug("Bound all columns in {}".format(self._target_table)) 164 | else: 165 | bound_cols = catalog.get_columns_for_table( 166 | self._target_table, column_names=self._insert_columns 167 | ) 168 | # Handle error case 169 | if len(bound_cols) != len(self._insert_columns): 170 | for column in self._insert_columns: 171 | found = False 172 | for bound in bound_cols: 173 | if column == bound.name: 174 | found = True 175 | break 176 | 177 | if not found: 178 | raise ColumnNotFound( 179 | '"{}" not found in the following tables: {}'.format( 180 | column, 181 | json.dumps([self._target_table], cls=CatTableEncoder), 182 | ) 183 | ) 184 | 185 | self._target_columns = bound_cols 186 | logging.debug("Bound {} target columns".format(len(bound_cols))) 187 | 188 | def _bind_with(self, catalog: Catalog, source: CatSource): 189 | if self._with_aliases: 190 | # Bind all the WITH expressions 191 | for 
key in self._with_aliases.keys(): 192 | binder = SelectBinder( 193 | catalog, 194 | source, 195 | self._with_aliases[key]["tables"], 196 | self._with_aliases[key]["columns"], 197 | self._column_alias_generator, 198 | self.expr_visitor_clazz, 199 | ) 200 | binder.bind() 201 | self._alias_map[key] = WithContext( 202 | catalog=catalog, 203 | alias=key, 204 | tables=binder.tables, 205 | columns=binder.columns, 206 | ) 207 | 208 | def resolve( 209 | self, 210 | ) -> Tuple[ 211 | Tuple[Optional[str], str], 212 | List[Tuple[Optional[str], str]], 213 | List[Tuple[Optional[str], str]], 214 | ]: 215 | target_table_visitor = RangeVarVisitor() 216 | target_table_visitor(self._insert_table) 217 | 218 | bound_tables = [] 219 | for table in self._select_tables: 220 | visitor = RangeVarVisitor() 221 | visitor(table) 222 | bound_tables.append(visitor.fqdn) 223 | 224 | bound_cols = [] 225 | for expr_visitor in self._select_columns: 226 | for column in expr_visitor.columns: 227 | column_ref_visitor = ColumnRefVisitor() 228 | column_ref_visitor(column) 229 | bound_cols.append(column_ref_visitor.name[0]) 230 | 231 | return target_table_visitor.fqdn, bound_tables, bound_cols 232 | 233 | 234 | class SelectSourceVisitor(DmlVisitor): 235 | def __init__(self, name: str, expr_visitor_clazz: Type[ExprVisitor] = ExprVisitor): 236 | super(SelectSourceVisitor, self).__init__(name, expr_visitor_clazz) 237 | 238 | def visit_SelectStmt(self, ancestors, node): 239 | table_visitor = TableVisitor(self.expr_visitor_clazz) 240 | table_visitor(node) 241 | self._select_tables = table_visitor.sources 242 | self._select_columns = table_visitor.columns 243 | for key in table_visitor.with_aliases.keys(): 244 | self._with_aliases[key] = table_visitor.with_aliases[key] 245 | 246 | return Skip 247 | 248 | 249 | class SelectIntoVisitor(DmlVisitor): 250 | def __init__(self, name: str, expr_visitor_clazz: Type[ExprVisitor] = ExprVisitor): 251 | super(SelectIntoVisitor, self).__init__(name, expr_visitor_clazz) 252 | 253 | def visit_SelectStmt(self, ancestors, node): 254 | super().__call__(node.intoClause) 255 | table_visitor = TableVisitor(self.expr_visitor_clazz) 256 | table_visitor(node.targetList) 257 | table_visitor(node.fromClause) 258 | self._select_tables = table_visitor.sources 259 | self._select_columns = table_visitor.columns 260 | for key in table_visitor.with_aliases.keys(): 261 | self._with_aliases[key] = table_visitor.with_aliases[key] 262 | 263 | return Skip 264 | 265 | 266 | class CTASVisitor(SelectSourceVisitor): 267 | def __init__(self, name: str, expr_visitor_clazz: Type[ExprVisitor] = ExprVisitor): 268 | super(CTASVisitor, self).__init__(name, expr_visitor_clazz) 269 | 270 | def visit_CreateTableAsStmt(self, ancestors, node): 271 | return Continue 272 | 273 | def visit_String(self, ancestors: Ancestor, node): 274 | # Check if parent is IntoClause 275 | parent = ancestors 276 | in_into_clause = False 277 | while parent is not None and not in_into_clause: 278 | in_into_clause = isinstance(parent.node, IntoClause) 279 | parent = parent.parent 280 | 281 | if in_into_clause: 282 | self._insert_columns.append(node.val) 283 | 284 | def _bind_target(self, catalog: Catalog, source: CatSource): 285 | target_table_visitor = RangeVarVisitor() 286 | target_table_visitor(self._insert_table) 287 | 288 | if target_table_visitor.is_qualified: 289 | schema = catalog.get_schema( 290 | source_name=source.name, schema_name=target_table_visitor.schema_name 291 | ) 292 | elif source.default_schema is not None: 293 | schema = 
source.default_schema.schema 294 | else: 295 | raise SemanticError( 296 | "No default schema set for source {}".format(source.fqdn) 297 | ) 298 | 299 | self._target_table = catalog.add_table( 300 | table_name=target_table_visitor.name, schema=schema 301 | ) 302 | 303 | sort_order = 1 304 | for col in self._insert_columns: 305 | self._target_columns.append( 306 | catalog.add_column( 307 | column_name=col, 308 | data_type="varchar", 309 | sort_order=sort_order, 310 | table=self._target_table, 311 | ) 312 | ) 313 | -------------------------------------------------------------------------------- /data_lineage/parser/visitor.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional, Tuple, Type 2 | 3 | from pglast import Node 4 | from pglast.visitors import Skip, Visitor 5 | 6 | 7 | class ExprVisitor(Visitor): 8 | def __init__(self, alias: str = None): 9 | self._alias: Optional[str] = alias 10 | self._columns: List[Node] = [] 11 | 12 | @property 13 | def alias(self) -> Optional[str]: 14 | return self._alias 15 | 16 | @property 17 | def columns(self) -> List[Node]: 18 | return self._columns 19 | 20 | def visit_FuncCall(self, ancestors, node): 21 | super().__call__(node.args) 22 | 23 | def visit_TypeCast(self, ancestors, node): 24 | super().__call__(node.arg) 25 | 26 | def visit_A_Expr(self, ancestors, node): 27 | super().__call__(node.lexpr) 28 | super().__call__(node.rexpr) 29 | 30 | def visit_ColumnRef(self, ancestors, node): 31 | self._columns.append(node) 32 | 33 | 34 | class RedshiftExprVisitor(ExprVisitor): 35 | class FuncNameVisitor(Visitor): 36 | def __init__(self): 37 | self._name = None 38 | 39 | @property 40 | def name(self): 41 | return self._name 42 | 43 | def visit_String(self, ancestors, obj): 44 | self._name = obj.val 45 | 46 | def visit_FuncCall(self, ancestors, node): 47 | name_visitor = RedshiftExprVisitor.FuncNameVisitor() 48 | name_visitor(node.funcname) 49 | if name_visitor.name == "dateadd": 50 | super().__call__(node.args[2]) 51 | return Skip 52 | 53 | 54 | class TableVisitor(Visitor): 55 | def __init__(self, expr_visitor_clazz: Type[ExprVisitor]): 56 | self._sources: List[Node] = [] 57 | self._columns: List[ExprVisitor] = [] 58 | self._expr_visitor_clazz = expr_visitor_clazz 59 | self._with_aliases: Dict[str, Dict[str, Any]] = {} 60 | 61 | @property 62 | def sources(self) -> List[Node]: 63 | return self._sources 64 | 65 | @property 66 | def columns(self) -> List[ExprVisitor]: 67 | return self._columns 68 | 69 | @property 70 | def with_aliases(self) -> Dict[str, Dict[str, Any]]: 71 | return self._with_aliases 72 | 73 | def visit_ResTarget(self, ancestors, node): 74 | name = None 75 | if node.name is not None: 76 | name = node.name 77 | 78 | expr_visitor = self._expr_visitor_clazz(name) 79 | expr_visitor(node.val) 80 | self._columns.append(expr_visitor) 81 | return Skip 82 | 83 | def visit_RangeVar(self, ancestors, node): 84 | self._sources.append(node) 85 | return Skip 86 | 87 | def visit_RangeSubselect(self, ancestors, node): 88 | self._sources.append(node) 89 | return Skip 90 | 91 | def visit_CommonTableExpr(self, ancestors, node): 92 | with_alias = node.ctename 93 | table_visitor = TableVisitor(self._expr_visitor_clazz) 94 | table_visitor(node.ctequery) 95 | 96 | self._with_aliases[with_alias] = { 97 | "tables": table_visitor.sources, 98 | "columns": table_visitor.columns, 99 | } 100 | return Skip 101 | 102 | 103 | class ColumnRefVisitor(Visitor): 104 | def __init__(self): 105 | self._name: 
List[str] = [] 106 | self._is_a_star: bool = False 107 | 108 | @property 109 | def name(self) -> Tuple: 110 | return tuple(self._name) 111 | 112 | @property 113 | def is_a_star(self) -> bool: 114 | return self._is_a_star 115 | 116 | @property 117 | def is_qualified(self) -> bool: 118 | return len(self._name) == 2 or (len(self._name) == 1 and self._is_a_star) 119 | 120 | @property 121 | def column_name(self) -> Optional[str]: 122 | if len(self._name) == 2: 123 | return self._name[1] 124 | elif len(self._name) == 1: 125 | return self._name[0] 126 | return None 127 | 128 | @property 129 | def table_name(self) -> Optional[str]: 130 | if len(self._name) == 2 or (self._is_a_star and len(self._name) == 1): 131 | return self._name[0] 132 | 133 | return None 134 | 135 | def visit_String(self, ancestors, node): 136 | self._name.append(node.val) 137 | 138 | def visit_A_Star(self, ancestors, node): 139 | self._is_a_star = True 140 | 141 | 142 | class RangeVarVisitor(Visitor): 143 | def __init__(self): 144 | self._schema_name = None 145 | self._name = None 146 | self._alias = None 147 | 148 | @property 149 | def alias(self) -> Optional[str]: 150 | if self._alias is not None: 151 | return self._alias 152 | elif self._schema_name is not None and self._name is not None: 153 | return "{}.{}".format(self._schema_name, self._name) 154 | elif self._name is not None: 155 | return self._name 156 | return None 157 | 158 | @property 159 | def fqdn(self): 160 | return self._schema_name, self._name 161 | 162 | @property 163 | def search_string(self): 164 | return {"schema_like": self._schema_name, "table_like": self._name} 165 | 166 | @property 167 | def is_qualified(self) -> bool: 168 | return self._schema_name is not None 169 | 170 | @property 171 | def schema_name(self) -> Optional[str]: 172 | return self._schema_name 173 | 174 | @property 175 | def name(self) -> str: 176 | return self._name 177 | 178 | def visit_Alias(self, ancestors, node): 179 | self._alias = node.aliasname.lower() 180 | 181 | def visit_RangeVar(self, ancestors, node): 182 | if node.schemaname: 183 | self._schema_name = node.schemaname.lower() 184 | self._name = node.relname.lower() 185 | 186 | 187 | class RangeSubselectVisitor(Visitor): 188 | def __init__(self, expr_visitor_clazz: Type[ExprVisitor]): 189 | self._alias: Optional[str] = None 190 | self._table_visitor: TableVisitor = TableVisitor(expr_visitor_clazz) 191 | 192 | @property 193 | def alias(self) -> Optional[str]: 194 | if self._alias is not None: 195 | return self._alias 196 | return None 197 | 198 | @property 199 | def sources(self) -> List[Node]: 200 | return self._table_visitor.sources 201 | 202 | @property 203 | def columns(self) -> List[ExprVisitor]: 204 | return self._table_visitor.columns 205 | 206 | def visit_Alias(self, ancestors, node): 207 | self._alias = node.aliasname 208 | 209 | def visit_RangeSubselect(self, ancestors, node): 210 | super().__call__(node.alias) 211 | self._table_visitor(node.subquery) 212 | return Skip 213 | -------------------------------------------------------------------------------- /data_lineage/server.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | from typing import Any, Dict, List, Tuple 4 | 5 | import flask_restless 6 | import gunicorn.app.base 7 | from dbcat import Catalog, PGCatalog, init_db 8 | from dbcat.catalog import CatColumn 9 | from dbcat.catalog.models import ( 10 | CatSchema, 11 | CatSource, 12 | CatTable, 13 | ColumnLineage, 14 | DefaultSchema, 15 | 
Job, 16 | JobExecution, 17 | JobExecutionStatus, 18 | ) 19 | from flask import Flask 20 | from flask_restful import Api, Resource, reqparse 21 | from pglast.parser import ParseError 22 | from rq import Queue 23 | from rq import job as RqJob 24 | from werkzeug.exceptions import NotFound, UnprocessableEntity 25 | 26 | from data_lineage import ColumnNotFound, SemanticError, TableNotFound 27 | from data_lineage.parser import ( 28 | analyze_dml_query, 29 | extract_lineage, 30 | parse, 31 | parse_dml_query, 32 | ) 33 | from data_lineage.worker import scan 34 | 35 | 36 | class TableNotFoundHTTP(NotFound): 37 | """Table not found in catalog""" 38 | 39 | code = 441 40 | 41 | 42 | class ColumnNotFoundHTTP(NotFound): 43 | """Column not found in catalog""" 44 | 45 | code = 442 46 | 47 | 48 | class ParseErrorHTTP(UnprocessableEntity): 49 | """Parser Error""" 50 | 51 | 52 | class SemanticErrorHTTP(UnprocessableEntity): 53 | """Semantic Error""" 54 | 55 | code = 443 56 | 57 | 58 | class Kedro(Resource): 59 | def __init__(self, catalog: Catalog): 60 | self._catalog = catalog 61 | self._parser = reqparse.RequestParser() 62 | self._parser.add_argument( 63 | "job_ids", action="append", help="List of job ids for a sub graph" 64 | ) 65 | 66 | def get(self): 67 | nodes = [] 68 | edges = [] 69 | 70 | args = self._parser.parse_args() 71 | with self._catalog.managed_session: 72 | column_edges = self._catalog.get_column_lineages(args["job_ids"]) 73 | for edge in column_edges: 74 | nodes.append(self._column_info(edge.source)) 75 | nodes.append(self._column_info(edge.target)) 76 | nodes.append(self._job_info(edge.job_execution.job)) 77 | edges.append( 78 | { 79 | "source": "column:{}".format(edge.source_id), 80 | "target": "task:{}".format(edge.job_execution.job_id), 81 | } 82 | ) 83 | edges.append( 84 | { 85 | "source": "task:{}".format(edge.job_execution.job_id), 86 | "target": "column:{}".format(edge.target_id), 87 | } 88 | ) 89 | 90 | return {"nodes": nodes, "edges": edges} 91 | 92 | @staticmethod 93 | def _column_info(node: CatColumn): 94 | return { 95 | "id": "column:{}".format(node.id), 96 | "name": ".".join(node.fqdn), 97 | "type": "data", 98 | } 99 | 100 | @staticmethod 101 | def _job_info(node: Job): 102 | return {"id": "task:{}".format(node.id), "name": node.name, "type": "task"} 103 | 104 | 105 | class ScanList(Resource): 106 | def __init__(self, catalog: PGCatalog, queue: Queue): 107 | self._catalog = catalog 108 | self._queue = queue 109 | self._parser = reqparse.RequestParser() 110 | self._parser.add_argument("id", required=True, help="ID of the resource") 111 | 112 | def post(self): 113 | args = self._parser.parse_args() 114 | logging.info("Args for scanning: {}".format(args)) 115 | job = self._queue.enqueue( 116 | scan, 117 | { 118 | "user": self._catalog.user, 119 | "password": self._catalog.password, 120 | "database": self._catalog.database, 121 | "host": self._catalog.host, 122 | "port": self._catalog.port, 123 | }, 124 | int(args["id"]), 125 | ) 126 | 127 | return {"id": job.id, "status": "queued"}, 200 128 | 129 | def get(self): 130 | job_list = [] 131 | for job in self._queue.started_job_registry.get_job_ids(): 132 | job_list.append({"id": job, "status": "started"}) 133 | 134 | for job in self._queue.finished_job_registry.get_job_ids(): 135 | job_list.append({"id": job, "status": "finished"}) 136 | 137 | for job in self._queue.failed_job_registry.get_job_ids(): 138 | job_list.append({"id": job, "status": "failed"}) 139 | 140 | return job_list, 200 141 | 142 | 143 | class Scan(Resource): 144 | 
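    # GET fetches the underlying RQ job by id from the queue's Redis connection and
    # returns its current status; PUT fetches the same job and cancels it. The jobs
    # themselves are enqueued by ScanList.post() above via the shared Queue instance.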
def __init__(self, catalog: PGCatalog, queue: Queue): 145 | self._catalog = catalog 146 | self._queue = queue 147 | self._parser = reqparse.RequestParser() 148 | self._parser.add_argument("id", required=True, help="ID of the resource") 149 | 150 | def get(self, job_id): 151 | status = RqJob.Job.fetch(job_id, connection=self._queue.connection).get_status() 152 | return {"id": job_id, "status": status}, 200 153 | 154 | def put(self, job_id): 155 | RqJob.Job.fetch(job_id, connection=self._queue.connection).cancel() 156 | return {"message": "Job {} cancelled".format(job_id)}, 200 157 | 158 | 159 | class Parse(Resource): 160 | def __init__(self, catalog: Catalog): 161 | self._catalog = catalog 162 | self._parser = reqparse.RequestParser() 163 | self._parser.add_argument("query", required=True, help="Query to parse") 164 | self._parser.add_argument( 165 | "source_id", help="Source database of the query", required=True 166 | ) 167 | 168 | def post(self): 169 | args = self._parser.parse_args() 170 | logging.debug("Parse query: {}".format(args["query"])) 171 | try: 172 | parsed = parse(args["query"], "parse_api") 173 | except ParseError as error: 174 | raise ParseErrorHTTP(description=str(error)) 175 | 176 | try: 177 | with self._catalog.managed_session: 178 | source = self._catalog.get_source_by_id(args["source_id"]) 179 | logging.debug("Parsing query for source {}".format(source)) 180 | binder = parse_dml_query( 181 | catalog=self._catalog, parsed=parsed, source=source 182 | ) 183 | 184 | return ( 185 | { 186 | "select_tables": [table.name for table in binder.tables], 187 | "select_columns": [context.alias for context in binder.columns], 188 | }, 189 | 200, 190 | ) 191 | except TableNotFound as table_error: 192 | raise TableNotFoundHTTP(description=str(table_error)) 193 | except ColumnNotFound as column_error: 194 | raise ColumnNotFoundHTTP(description=str(column_error)) 195 | except SemanticError as semantic_error: 196 | raise SemanticErrorHTTP(description=str(semantic_error)) 197 | 198 | 199 | class Analyze(Resource): 200 | def __init__(self, catalog: Catalog): 201 | self._catalog = catalog 202 | self._parser = reqparse.RequestParser() 203 | self._parser.add_argument("query", required=True, help="Query to parse") 204 | self._parser.add_argument("name", help="Name of the ETL job") 205 | self._parser.add_argument( 206 | "start_time", required=True, help="Start time of the task" 207 | ) 208 | self._parser.add_argument( 209 | "end_time", required=True, help="End time of the task" 210 | ) 211 | self._parser.add_argument( 212 | "source_id", help="Source database of the query", required=True 213 | ) 214 | 215 | def post(self): 216 | args = self._parser.parse_args() 217 | logging.debug("Parse query: {}".format(args["query"])) 218 | try: 219 | parsed = parse(args["query"], args["name"]) 220 | except ParseError as error: 221 | raise ParseErrorHTTP(description=str(error)) 222 | 223 | try: 224 | with self._catalog.managed_session: 225 | source = self._catalog.get_source_by_id(args["source_id"]) 226 | logging.debug("Parsing query for source {}".format(source)) 227 | chosen_visitor = analyze_dml_query(self._catalog, parsed, source) 228 | job_execution = extract_lineage( 229 | catalog=self._catalog, 230 | visited_query=chosen_visitor, 231 | source=source, 232 | parsed=parsed, 233 | start_time=datetime.datetime.fromisoformat(args["start_time"]), 234 | end_time=datetime.datetime.fromisoformat(args["end_time"]), 235 | ) 236 | 237 | return ( 238 | { 239 | "data": { 240 | "id": job_execution.id, 241 | "type": 
"job_executions", 242 | "attributes": { 243 | "job_id": job_execution.job_id, 244 | "started_at": job_execution.started_at.strftime( 245 | "%Y-%m-%d %H:%M:%S" 246 | ), 247 | "ended_at": job_execution.ended_at.strftime( 248 | "%Y-%m-%d %H:%M:%S" 249 | ), 250 | "status": job_execution.status.name, 251 | }, 252 | } 253 | }, 254 | 200, 255 | ) 256 | except TableNotFound as table_error: 257 | raise TableNotFoundHTTP(description=str(table_error)) 258 | except ColumnNotFound as column_error: 259 | raise ColumnNotFoundHTTP(description=str(column_error)) 260 | except SemanticError as semantic_error: 261 | raise SemanticErrorHTTP(description=str(semantic_error)) 262 | 263 | 264 | class Server(gunicorn.app.base.BaseApplication): 265 | def __init__(self, app): 266 | self.application = app 267 | super().__init__() 268 | 269 | def load_config(self): 270 | # parse console args 271 | parser = self.cfg.parser() 272 | env_args = parser.parse_args(self.cfg.get_cmd_args_from_env()) 273 | 274 | # Load up environment configuration 275 | for k, v in vars(env_args).items(): 276 | if v is None: 277 | continue 278 | if k == "args": 279 | continue 280 | self.cfg.set(k.lower(), v) 281 | 282 | def load(self): 283 | return self.application 284 | 285 | 286 | def job_execution_serializer(instance: JobExecution, only: List[str]): 287 | return { 288 | "id": instance.id, 289 | "type": "job_executions", 290 | "attributes": { 291 | "job_id": instance.job_id, 292 | "started_at": instance.started_at.strftime("%Y-%m-%d %H:%M:%S"), 293 | "ended_at": instance.ended_at.strftime("%Y-%m-%d %H:%M:%S"), 294 | "status": instance.status.name, 295 | }, 296 | } 297 | 298 | 299 | def job_execution_deserializer(data: Dict["str", Any]): 300 | attributes = data["data"]["attributes"] 301 | logging.debug(attributes) 302 | job_execution = JobExecution() 303 | job_execution.job_id = int(attributes["job_id"]) 304 | job_execution.started_at = datetime.datetime.strptime( 305 | attributes["started_at"], "%Y-%m-%d %H:%M:%S" 306 | ) 307 | job_execution.ended_at = datetime.datetime.strptime( 308 | attributes["ended_at"], "%Y-%m-%d %H:%M:%S" 309 | ) 310 | job_execution.status = ( 311 | JobExecutionStatus.SUCCESS 312 | if attributes["status"] == "SUCCESS" 313 | else JobExecutionStatus.SUCCESS 314 | ) 315 | 316 | logging.debug(job_execution) 317 | logging.debug(job_execution.status == JobExecutionStatus.SUCCESS) 318 | return job_execution 319 | 320 | 321 | def create_server( 322 | catalog_options: Dict[str, str], connection, is_production=True 323 | ) -> Tuple[Any, PGCatalog]: 324 | logging.debug(catalog_options) 325 | catalog = PGCatalog( 326 | **catalog_options, 327 | connect_args={"application_name": "data-lineage:flask-restless"}, 328 | max_overflow=40, 329 | pool_size=20, 330 | pool_pre_ping=True 331 | ) 332 | 333 | init_db(catalog) 334 | 335 | restful_catalog = PGCatalog( 336 | **catalog_options, 337 | connect_args={"application_name": "data-lineage:restful"}, 338 | pool_pre_ping=True 339 | ) 340 | 341 | app = Flask(__name__) 342 | queue = Queue(is_async=is_production, connection=connection) 343 | 344 | # Create CRUD APIs 345 | methods = ["DELETE", "GET", "PATCH", "POST"] 346 | url_prefix = "/api/v1/catalog" 347 | api_manager = flask_restless.APIManager(app, catalog.get_scoped_session()) 348 | api_manager.create_api( 349 | CatSource, 350 | methods=methods, 351 | url_prefix=url_prefix, 352 | additional_attributes=["fqdn"], 353 | ) 354 | api_manager.create_api( 355 | CatSchema, 356 | methods=methods, 357 | url_prefix=url_prefix, 358 | 
additional_attributes=["fqdn"], 359 | ) 360 | api_manager.create_api( 361 | CatTable, 362 | methods=methods, 363 | url_prefix=url_prefix, 364 | additional_attributes=["fqdn"], 365 | ) 366 | api_manager.create_api( 367 | CatColumn, 368 | methods=methods, 369 | url_prefix=url_prefix, 370 | additional_attributes=["fqdn"], 371 | ) 372 | api_manager.create_api(Job, methods=methods, url_prefix=url_prefix) 373 | api_manager.create_api( 374 | JobExecution, 375 | methods=methods, 376 | url_prefix=url_prefix, 377 | serializer=job_execution_serializer, 378 | deserializer=job_execution_deserializer, 379 | ) 380 | api_manager.create_api( 381 | ColumnLineage, 382 | methods=methods, 383 | url_prefix=url_prefix, 384 | collection_name="column_lineage", 385 | ) 386 | 387 | api_manager.create_api( 388 | DefaultSchema, 389 | methods=methods, 390 | url_prefix=url_prefix, 391 | collection_name="default_schema", 392 | primary_key="source_id", 393 | ) 394 | 395 | restful_manager = Api(app) 396 | restful_manager.add_resource( 397 | Kedro, "/api/main", resource_class_kwargs={"catalog": restful_catalog} 398 | ) 399 | restful_manager.add_resource( 400 | ScanList, 401 | "/api/v1/scan", 402 | resource_class_kwargs={"catalog": restful_catalog, "queue": queue}, 403 | ) 404 | 405 | restful_manager.add_resource( 406 | Scan, 407 | "/api/v1/scan/", 408 | resource_class_kwargs={"catalog": restful_catalog, "queue": queue}, 409 | ) 410 | 411 | restful_manager.add_resource( 412 | Analyze, "/api/v1/analyze", resource_class_kwargs={"catalog": restful_catalog} 413 | ) 414 | 415 | restful_manager.add_resource( 416 | Parse, "/api/v1/parse", resource_class_kwargs={"catalog": restful_catalog} 417 | ) 418 | 419 | for rule in app.url_map.iter_rules(): 420 | rule_methods = ",".join(rule.methods) 421 | logging.debug("{:50s} {:20s} {}".format(rule.endpoint, rule_methods, rule)) 422 | 423 | if is_production: 424 | return Server(app=app), catalog 425 | else: 426 | return app, catalog 427 | -------------------------------------------------------------------------------- /data_lineage/worker.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import closing 3 | 4 | from dbcat import DbScanner, PGCatalog 5 | 6 | 7 | def scan(connection_args, source_id): 8 | logging.info("{}".format(connection_args)) 9 | catalog = PGCatalog( 10 | **connection_args, 11 | connect_args={"application_name": "data-lineage:worker"}, 12 | max_overflow=40, 13 | pool_size=20, 14 | pool_pre_ping=True 15 | ) 16 | 17 | with closing(catalog): 18 | with catalog.managed_session: 19 | source = catalog.get_source_by_id(source_id) 20 | DbScanner(catalog, source).scan() 21 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile 2 | # Uses multi-stage builds requiring Docker 17.05 or higher 3 | # See https://docs.docker.com/develop/develop-images/multistage-build/ 4 | 5 | # Creating a python base with shared environment variables 6 | FROM python:3.8.1-slim as python-base 7 | ENV PYTHONUNBUFFERED=1 \ 8 | PYTHONDONTWRITEBYTECODE=1 \ 9 | PIP_NO_CACHE_DIR=off \ 10 | PIP_DISABLE_PIP_VERSION_CHECK=on \ 11 | PIP_DEFAULT_TIMEOUT=100 \ 12 | POETRY_HOME="/opt/poetry" \ 13 | POETRY_VIRTUALENVS_IN_PROJECT=true \ 14 | POETRY_NO_INTERACTION=1 \ 15 | PYSETUP_PATH="/opt/pysetup" \ 16 | VENV_PATH="/opt/pysetup/.venv" 17 | 18 | ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" 19 | 20 | 
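# Stage overview: builder-base installs Poetry, resolves the runtime dependencies
# into $VENV_PATH and builds/installs the data_lineage wheel; the production stage
# restarts from python-base and copies only that virtual environment and the
# entrypoint script, so compilers and other build tooling never reach the final image.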
21 | # builder-base is used to build dependencies 22 | FROM python-base as builder-base 23 | RUN apt-get update \ 24 | && apt-get install --no-install-recommends -y \ 25 | curl gcc python3-dev default-libmysqlclient-dev \ 26 | build-essential libpq-dev musl-dev 27 | 28 | # Install Poetry - respects $POETRY_VERSION & $POETRY_HOME 29 | ENV POETRY_VERSION=1.1.6 30 | RUN curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py | python 31 | 32 | # We copy our Python requirements here to cache them 33 | # and install only runtime deps using poetry 34 | WORKDIR $PYSETUP_PATH 35 | COPY ./poetry.lock ./pyproject.toml ./ 36 | RUN poetry install --no-dev # respects 37 | 38 | WORKDIR /src 39 | COPY . . 40 | RUN poetry build 41 | ENV PATH="${VENV_PATH}/bin:$PATH" 42 | RUN pip install dist/data_lineage-*.whl 43 | 44 | # 'production' stage uses the clean 'python-base' stage and copyies 45 | # in only our runtime deps that were installed in the 'builder-base' 46 | FROM python-base as production 47 | 48 | RUN apt-get update \ 49 | && apt-get install --no-install-recommends -y \ 50 | libpq5 51 | 52 | COPY --from=builder-base $VENV_PATH $VENV_PATH 53 | COPY ./docker/docker-entrypoint.sh /docker-entrypoint.sh 54 | RUN chmod +x /docker-entrypoint.sh 55 | 56 | ENTRYPOINT /docker-entrypoint.sh $0 $@ 57 | CMD [ "data_lineage"] -------------------------------------------------------------------------------- /docker/build_image.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | set -e 4 | 5 | PROJECT_ROOT=$(dirname $(dirname $0)) 6 | 7 | echo "$PROJECT_ROOT" 8 | 9 | DOCKERHUB_NAMESPACE=tokern 10 | 11 | 12 | TAG=$1 13 | if [ -z $TAG ]; then 14 | echo "usage: $0 [--publish] [--latest]" 15 | exit 1 16 | fi 17 | 18 | if [ "$2" == "--publish" ]; then 19 | PUBLISH="YES" 20 | fi 21 | 22 | if [ "$3" == "--latest" ]; then 23 | LATEST="YES" 24 | fi 25 | 26 | if [ "$PUBLISH" == "YES" ] && [ -z "$DOCKERHUB_USERNAME" -o -z "$DOCKERHUB_PASSWORD" ]; then 27 | echo "In order to publish an image to Dockerhub you must set \$DOCKERHUB_USERNAME and \$DOCKERHUB_PASSWORD before running." 
28 | exit 1 29 | fi 30 | 31 | DOCKERHUB_REPOSITORY=data-lineage 32 | DOCKER_IMAGE="${DOCKERHUB_NAMESPACE}/${DOCKERHUB_REPOSITORY}:${TAG}" 33 | 34 | echo "Building Docker image ${DOCKER_IMAGE} from official Tokern release ${TAG}" 35 | 36 | # now tell docker to build our image 37 | 38 | docker build -t "${DOCKER_IMAGE}" -f "$PROJECT_ROOT"/docker/Dockerfile "${PROJECT_ROOT}" 39 | 40 | if [ "$PUBLISH" == "YES" ]; then 41 | echo "Publishing image ${DOCKER_IMAGE} to Dockerhub" 42 | 43 | # make sure that we are logged into dockerhub 44 | docker login --username="${DOCKERHUB_USERNAME}" --password="${DOCKERHUB_PASSWORD}" 45 | 46 | # push the built image to dockerhub 47 | docker push "${DOCKER_IMAGE}" 48 | 49 | # TODO: quick check against dockerhub to see that our new image made it 50 | 51 | if [ "$LATEST" == "YES" ]; then 52 | # tag our recent versioned image as "latest" 53 | docker tag "${DOCKER_IMAGE}" ${DOCKERHUB_NAMESPACE}/${DOCKERHUB_REPOSITORY}:latest 54 | 55 | # then push it as well 56 | docker push ${DOCKERHUB_NAMESPACE}/${DOCKERHUB_REPOSITORY}:latest 57 | 58 | # TODO: validate push succeeded 59 | fi 60 | fi 61 | 62 | echo "Done" -------------------------------------------------------------------------------- /docker/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | # activate our virtual environment here 6 | . /opt/pysetup/.venv/bin/activate 7 | 8 | # You can put other setup logic here 9 | 10 | # Evaluating passed command: 11 | exec "$@" -------------------------------------------------------------------------------- /example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Overview\n", 8 | "\n", 9 | "This example demonstrates how to scan query history from a data warehouse and save it in the data lineage app. The app automatically parses and extracts data lineage from the queries.\n", 10 | "\n", 11 | "The example consists of the following sequence of operations:\n", 12 | "\n", 13 | "* Start docker containers containing a demo. Refer to [docs](https://tokern.io/docs/data-lineage/installation) for detailed instructions on installing demo-wikimedia.\n", 14 | "* Scan and send queries from query history to data lineage app.\n", 15 | "* Visualize the graph by visiting Tokern UI.\n", 16 | "* Analyze the graph" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# Installation\n", 24 | "\n", 25 | "This demo requires wikimedia demo to be running. 
Start the demo using the following instructions:\n", 26 | "\n", 27 | " # in a new directory run\n", 28 | " wget https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/wikimedia-demo.yml\n", 29 | " # or run\n", 30 | " curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/wikimedia-demo.yml -o docker-compose.yml\n", 31 | "\n", 32 | "\n", 33 | "Run docker-compose\n", 34 | "\n", 35 | "\n", 36 | " docker-compose up -d\n", 37 | "\n", 38 | "\n", 39 | "Verify container are running\n", 40 | "\n", 41 | "\n", 42 | " docker container ls | grep tokern\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# Required configuration for API and wikimedia database network address\n", 52 | "\n", 53 | "docker_address = \"http://127.0.0.1:8000\"\n", 54 | "wikimedia_db = {\n", 55 | " \"username\": \"etldev\",\n", 56 | " \"password\": \"3tld3v\",\n", 57 | " \"uri\": \"tokern-demo-wikimedia\",\n", 58 | " \"port\": \"5432\",\n", 59 | " \"database\": \"wikimedia\"\n", 60 | "}" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "import time\n", 70 | "# Setup a connection to catalog using the SDK.\n", 71 | "from data_lineage import Catalog, Scan\n", 72 | "\n", 73 | "catalog = Catalog(docker_address)\n", 74 | "\n", 75 | "# Register wikimedia datawarehouse with data-lineage app.\n", 76 | "\n", 77 | "source = catalog.add_source(name=\"wikimedia\", source_type=\"postgresql\", **wikimedia_db)\n", 78 | "\n", 79 | "# Scan the wikimedia data warehouse and register all schemata, tables and columns.\n", 80 | "scan = Scan(docker_address)\n", 81 | "job = scan.start(source)\n", 82 | "\n", 83 | "# Wait for scan to complete\n", 84 | "\n", 85 | "status = \"\"\n", 86 | "\n", 87 | "while (status != \"finished\" and status != \"failed\"):\n", 88 | " time.sleep(5)\n", 89 | " status = scan.get(job[\"id\"])[\"status\"]\n", 90 | " print(\"Status is {}\".format(status))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "import json\n", 100 | "\n", 101 | "with open(\"test/queries.json\", \"r\") as file:\n", 102 | " queries = json.load(file)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "scrolled": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "from datetime import datetime\n", 114 | "from data_lineage import Analyze\n", 115 | "\n", 116 | "analyze = Analyze(docker_address)\n", 117 | "\n", 118 | "for query in queries:\n", 119 | " print(query)\n", 120 | " analyze.analyze(**query, source=source, start_time=datetime.now(), end_time=datetime.now())" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Visit [Kedro UI](http://localhost:8000/)\n", 128 | "\n", 129 | "![One Task Graph](./full_graph.png)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Python 3 (ipykernel)", 143 | "language": "python", 144 | "name": "python3" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 
153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.8.10" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 4 161 | } -------------------------------------------------------------------------------- /full_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokern/data-lineage/5945542742979fe350d313d906440c93ee3d0f36/full_graph.png -------------------------------------------------------------------------------- /install-manifests/docker-compose/catalog-demo.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | tokern-demo-catalog: 4 | image: tokern/demo-catalog:latest 5 | container_name: tokern-demo-catalog 6 | restart: unless-stopped 7 | networks: 8 | - tokern-internal 9 | volumes: 10 | - tokern_demo_catalog_data:/var/lib/postgresql/data 11 | environment: 12 | POSTGRES_PASSWORD: catal0g_passw0rd 13 | POSTGRES_USER: catalog_user 14 | POSTGRES_DB: tokern 15 | tokern-api: 16 | image: tokern/data-lineage:latest 17 | container_name: tokern-data-lineage 18 | restart: unless-stopped 19 | networks: 20 | - tokern-internal 21 | environment: 22 | CATALOG_PASSWORD: catal0g_passw0rd 23 | CATALOG_USER: catalog_user 24 | CATALOG_DB: tokern 25 | CATALOG_HOST: tokern-demo-catalog 26 | GUNICORN_CMD_ARGS: "--bind 0.0.0.0:4142" 27 | toker-viz: 28 | image: tokern/data-lineage-viz:latest 29 | container_name: tokern-data-lineage-visualizer 30 | restart: unless-stopped 31 | networks: 32 | - tokern-internal 33 | - tokern-net 34 | ports: 35 | - "8000:80" 36 | networks: 37 | tokern-net: # Exposed by your host. 38 | # external: true 39 | name: "tokern-net" 40 | driver: bridge 41 | ipam: 42 | driver: default 43 | config: 44 | - subnet: 10.10.0.0/24 45 | tokern-internal: 46 | name: "tokern-internal" 47 | driver: bridge 48 | internal: true 49 | ipam: 50 | driver: default 51 | config: 52 | - subnet: 10.11.0.0/24 53 | 54 | volumes: 55 | tokern_demo_catalog_data: 56 | -------------------------------------------------------------------------------- /install-manifests/docker-compose/tokern-lineage-engine.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | tokern-catalog: 4 | image: postgres:13.2-alpine 5 | container_name: tokern-catalog 6 | restart: unless-stopped 7 | networks: 8 | - tokern-internal 9 | volumes: 10 | - tokern_catalog_data:/var/lib/postgresql/data 11 | environment: 12 | POSTGRES_PASSWORD: catal0g_passw0rd 13 | POSTGRES_USER: catalog_user 14 | POSTGRES_DB: tokern 15 | tokern-redis: 16 | image: redis:6.2.6-alpine 17 | container_name: tokern-redis 18 | restart: unless-stopped 19 | networks: 20 | - tokern-internal 21 | tokern-api: 22 | image: tokern/data-lineage:latest 23 | container_name: tokern-data-lineage 24 | restart: unless-stopped 25 | depends_on: 26 | - tokern-redis 27 | networks: 28 | - tokern-internal 29 | - tokern-net 30 | environment: 31 | CATALOG_PASSWORD: ${CATALOG_PASSWORD:-catal0g_passw0rd} 32 | CATALOG_USER: ${CATALOG_USER:-catalog_user} 33 | CATALOG_DB: ${CATALOG_DB:-tokern} 34 | CATALOG_HOST: ${CATALOG_HOST:-tokern-catalog} 35 | CATALOG_PORT: ${CATALOG_PORT:-5432} 36 | GUNICORN_CMD_ARGS: "--bind 0.0.0.0:4142" 37 | LOG_LEVEL: ${LOG_LEVEL:-INFO} 38 | REDIS_HOST: ${REDIS_HOST:-tokern-redis} 39 | REDIS_PORT: ${REDIS_PORT:-6379} 40 | REDIS_HOST: "tokern-redis" 41 | tokern-worker: 42 | image: 
tokern/data-lineage:latest 43 | container_name: tokern_worker 44 | restart: unless-stopped 45 | depends_on: 46 | - tokern-redis 47 | networks: 48 | - tokern-internal 49 | command: rq worker --url redis://tokern-redis:6379 50 | tokern-viz: 51 | image: tokern/data-lineage-viz:latest 52 | container_name: tokern-data-lineage-visualizer 53 | restart: unless-stopped 54 | networks: 55 | - tokern-internal 56 | - tokern-net 57 | ports: 58 | - "8000:80" 59 | networks: 60 | tokern-net: # Exposed by your host. 61 | # external: true 62 | name: "tokern-net" 63 | driver: bridge 64 | ipam: 65 | driver: default 66 | config: 67 | - subnet: 10.10.0.0/24 68 | tokern-internal: 69 | name: "tokern-internal" 70 | driver: bridge 71 | internal: true 72 | ipam: 73 | driver: default 74 | config: 75 | - subnet: 10.11.0.0/24 76 | 77 | volumes: 78 | tokern_catalog_data: 79 | -------------------------------------------------------------------------------- /install-manifests/docker-compose/wikimedia-demo.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | tokern-catalog: 4 | image: postgres:13.2-alpine 5 | container_name: tokern-catalog 6 | restart: unless-stopped 7 | networks: 8 | - tokern-internal 9 | volumes: 10 | - tokern_wikimedia_catalog_data:/var/lib/postgresql/data 11 | environment: 12 | POSTGRES_PASSWORD: catal0g_passw0rd 13 | POSTGRES_USER: catalog_user 14 | POSTGRES_DB: tokern 15 | tokern-redis: 16 | image: redis:6.2.6-alpine 17 | container_name: tokern-redis 18 | restart: unless-stopped 19 | networks: 20 | - tokern-internal 21 | tokern-wikimedia: 22 | image: tokern/demo-wikimedia:latest 23 | container_name: tokern-demo-wikimedia 24 | restart: unless-stopped 25 | networks: 26 | - tokern-internal 27 | volumes: 28 | - tokern_wikimedia_data:/var/lib/postgresql/data 29 | environment: 30 | POSTGRES_PASSWORD: 3tld3v 31 | POSTGRES_USER: etldev 32 | POSTGRES_DB: wikimedia 33 | tokern-api: 34 | image: tokern/data-lineage:latest 35 | container_name: tokern-data-lineage 36 | restart: unless-stopped 37 | depends_on: 38 | - tokern-redis 39 | networks: 40 | - tokern-internal 41 | environment: 42 | CATALOG_PASSWORD: catal0g_passw0rd 43 | CATALOG_USER: catalog_user 44 | CATALOG_DB: tokern 45 | CATALOG_HOST: tokern-catalog 46 | GUNICORN_CMD_ARGS: "--bind 0.0.0.0:4142" 47 | REDIS_HOST: "tokern-redis" 48 | tokern-worker: 49 | image: tokern/data-lineage:latest 50 | container_name: tokern_worker 51 | restart: unless-stopped 52 | depends_on: 53 | - tokern-redis 54 | networks: 55 | - tokern-internal 56 | command: rq worker --url redis://tokern-redis:6379 57 | toker-viz: 58 | image: tokern/data-lineage-viz:latest 59 | container_name: tokern-data-lineage-visualizer 60 | restart: unless-stopped 61 | networks: 62 | - tokern-internal 63 | - tokern-net 64 | ports: 65 | - "8000:80" 66 | networks: 67 | tokern-net: # Exposed by your host. 
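  # tokern-net is the only network reachable from the host (the visualizer publishes
  # port 8000 on it); tokern-internal below is marked internal, keeping the catalog,
  # redis and the demo wikimedia database isolated from the host.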
68 | # external: true 69 | name: "tokern-net" 70 | driver: bridge 71 | ipam: 72 | driver: default 73 | config: 74 | - subnet: 10.10.0.0/24 75 | tokern-internal: 76 | name: "tokern-internal" 77 | driver: bridge 78 | internal: true 79 | ipam: 80 | driver: default 81 | config: 82 | - subnet: 10.11.0.0/24 83 | 84 | volumes: 85 | tokern_wikimedia_catalog_data: 86 | tokern_wikimedia_data: -------------------------------------------------------------------------------- /install-manifests/dockerfiles/Dockerfile-demo-catalog: -------------------------------------------------------------------------------- 1 | FROM postgres:13.2-alpine 2 | COPY demo-catalog.sql /docker-entrypoint-initdb.d/ -------------------------------------------------------------------------------- /install-manifests/dockerfiles/Dockerfile-demo-wikimedia: -------------------------------------------------------------------------------- 1 | FROM postgres:13.2-alpine 2 | COPY demo-wikimedia.sql /docker-entrypoint-initdb.d/ -------------------------------------------------------------------------------- /install-manifests/dockerfiles/Makefile: -------------------------------------------------------------------------------- 1 | default: all 2 | 3 | .PHONY: default all fetch_dump 4 | 5 | date := `date '+%Y-%m-%d'` 6 | TARGET_IMAGE ?= demo-catalog 7 | VERSION ?= "0.2.0" 8 | DESTINATION_REPOSITORY ?= "tokern" 9 | 10 | all: generate_image push_to_registry finished 11 | 12 | check_vars: 13 | @test -n "$DESTINATION_REPOSITORY" || (echo "You need to set DESTINATION_REPOSITORY environment variable" >&2 && exit 1) 14 | 15 | generate_image: 16 | @docker build . -f Dockerfile-$(TARGET_IMAGE) -t $(TARGET_IMAGE)\:latest -t $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE)\:latest -t $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE)\:$(VERSION) 17 | 18 | push_to_registry: 19 | @echo "" 20 | @echo "====== Pushing image to repository ======" 21 | @docker push $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE):latest 22 | @docker push $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE):$(VERSION) 23 | 24 | finished: 25 | @echo "" 26 | @echo "Finished with success. 
Pushed image to $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE)" -------------------------------------------------------------------------------- /install-manifests/dockerfiles/demo-catalog.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- PostgreSQL database dump 3 | -- 4 | 5 | -- Dumped from database version 13.2 (Debian 13.2-1.pgdg100+1) 6 | -- Dumped by pg_dump version 13.3 (Ubuntu 13.3-1.pgdg20.04+1) 7 | 8 | SET statement_timeout = 0; 9 | SET lock_timeout = 0; 10 | SET idle_in_transaction_session_timeout = 0; 11 | SET client_encoding = 'UTF8'; 12 | SET standard_conforming_strings = on; 13 | SELECT pg_catalog.set_config('search_path', '', false); 14 | SET check_function_bodies = false; 15 | SET xmloption = content; 16 | SET client_min_messages = warning; 17 | SET row_security = off; 18 | 19 | -- 20 | -- Name: jobexecutionstatus; Type: TYPE; Schema: public; Owner: catalog_user 21 | -- 22 | 23 | CREATE TYPE public.jobexecutionstatus AS ENUM ( 24 | 'SUCCESS', 25 | 'FAILURE' 26 | ); 27 | 28 | 29 | ALTER TYPE public.jobexecutionstatus OWNER TO catalog_user; 30 | 31 | SET default_tablespace = ''; 32 | 33 | SET default_table_access_method = heap; 34 | 35 | -- 36 | -- Name: alembic_version; Type: TABLE; Schema: public; Owner: catalog_user 37 | -- 38 | 39 | CREATE TABLE public.alembic_version ( 40 | version_num character varying(32) NOT NULL 41 | ); 42 | 43 | 44 | ALTER TABLE public.alembic_version OWNER TO catalog_user; 45 | 46 | -- 47 | -- Name: column_lineage; Type: TABLE; Schema: public; Owner: catalog_user 48 | -- 49 | 50 | CREATE TABLE public.column_lineage ( 51 | id integer NOT NULL, 52 | context jsonb, 53 | source_id integer, 54 | target_id integer, 55 | job_execution_id integer 56 | ); 57 | 58 | 59 | ALTER TABLE public.column_lineage OWNER TO catalog_user; 60 | 61 | -- 62 | -- Name: column_lineage_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 63 | -- 64 | 65 | CREATE SEQUENCE public.column_lineage_id_seq 66 | AS integer 67 | START WITH 1 68 | INCREMENT BY 1 69 | NO MINVALUE 70 | NO MAXVALUE 71 | CACHE 1; 72 | 73 | 74 | ALTER TABLE public.column_lineage_id_seq OWNER TO catalog_user; 75 | 76 | -- 77 | -- Name: column_lineage_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 78 | -- 79 | 80 | ALTER SEQUENCE public.column_lineage_id_seq OWNED BY public.column_lineage.id; 81 | 82 | 83 | -- 84 | -- Name: columns; Type: TABLE; Schema: public; Owner: catalog_user 85 | -- 86 | 87 | CREATE TABLE public.columns ( 88 | id integer NOT NULL, 89 | name character varying, 90 | data_type character varying, 91 | sort_order integer, 92 | table_id integer 93 | ); 94 | 95 | 96 | ALTER TABLE public.columns OWNER TO catalog_user; 97 | 98 | -- 99 | -- Name: columns_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 100 | -- 101 | 102 | CREATE SEQUENCE public.columns_id_seq 103 | AS integer 104 | START WITH 1 105 | INCREMENT BY 1 106 | NO MINVALUE 107 | NO MAXVALUE 108 | CACHE 1; 109 | 110 | 111 | ALTER TABLE public.columns_id_seq OWNER TO catalog_user; 112 | 113 | -- 114 | -- Name: columns_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 115 | -- 116 | 117 | ALTER SEQUENCE public.columns_id_seq OWNED BY public.columns.id; 118 | 119 | 120 | -- 121 | -- Name: default_schema; Type: TABLE; Schema: public; Owner: catalog_user 122 | -- 123 | 124 | CREATE TABLE public.default_schema ( 125 | source_id integer NOT NULL, 126 | schema_id integer 127 | ); 128 | 129 | 130 | ALTER TABLE public.default_schema OWNER TO 
catalog_user; 131 | 132 | -- 133 | -- Name: job_executions; Type: TABLE; Schema: public; Owner: catalog_user 134 | -- 135 | 136 | CREATE TABLE public.job_executions ( 137 | id integer NOT NULL, 138 | job_id integer, 139 | started_at timestamp without time zone, 140 | ended_at timestamp without time zone, 141 | status public.jobexecutionstatus 142 | ); 143 | 144 | 145 | ALTER TABLE public.job_executions OWNER TO catalog_user; 146 | 147 | -- 148 | -- Name: job_executions_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 149 | -- 150 | 151 | CREATE SEQUENCE public.job_executions_id_seq 152 | AS integer 153 | START WITH 1 154 | INCREMENT BY 1 155 | NO MINVALUE 156 | NO MAXVALUE 157 | CACHE 1; 158 | 159 | 160 | ALTER TABLE public.job_executions_id_seq OWNER TO catalog_user; 161 | 162 | -- 163 | -- Name: job_executions_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 164 | -- 165 | 166 | ALTER SEQUENCE public.job_executions_id_seq OWNED BY public.job_executions.id; 167 | 168 | 169 | -- 170 | -- Name: jobs; Type: TABLE; Schema: public; Owner: catalog_user 171 | -- 172 | 173 | CREATE TABLE public.jobs ( 174 | id integer NOT NULL, 175 | name character varying, 176 | context jsonb, 177 | source_id integer 178 | ); 179 | 180 | 181 | ALTER TABLE public.jobs OWNER TO catalog_user; 182 | 183 | -- 184 | -- Name: jobs_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 185 | -- 186 | 187 | CREATE SEQUENCE public.jobs_id_seq 188 | AS integer 189 | START WITH 1 190 | INCREMENT BY 1 191 | NO MINVALUE 192 | NO MAXVALUE 193 | CACHE 1; 194 | 195 | 196 | ALTER TABLE public.jobs_id_seq OWNER TO catalog_user; 197 | 198 | -- 199 | -- Name: jobs_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 200 | -- 201 | 202 | ALTER SEQUENCE public.jobs_id_seq OWNED BY public.jobs.id; 203 | 204 | 205 | -- 206 | -- Name: schemata; Type: TABLE; Schema: public; Owner: catalog_user 207 | -- 208 | 209 | CREATE TABLE public.schemata ( 210 | id integer NOT NULL, 211 | name character varying, 212 | source_id integer 213 | ); 214 | 215 | 216 | ALTER TABLE public.schemata OWNER TO catalog_user; 217 | 218 | -- 219 | -- Name: schemata_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 220 | -- 221 | 222 | CREATE SEQUENCE public.schemata_id_seq 223 | AS integer 224 | START WITH 1 225 | INCREMENT BY 1 226 | NO MINVALUE 227 | NO MAXVALUE 228 | CACHE 1; 229 | 230 | 231 | ALTER TABLE public.schemata_id_seq OWNER TO catalog_user; 232 | 233 | -- 234 | -- Name: schemata_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 235 | -- 236 | 237 | ALTER SEQUENCE public.schemata_id_seq OWNED BY public.schemata.id; 238 | 239 | 240 | -- 241 | -- Name: sources; Type: TABLE; Schema: public; Owner: catalog_user 242 | -- 243 | 244 | CREATE TABLE public.sources ( 245 | id integer NOT NULL, 246 | source_type character varying, 247 | name character varying, 248 | dialect character varying, 249 | uri character varying, 250 | port character varying, 251 | username character varying, 252 | password character varying, 253 | database character varying, 254 | instance character varying, 255 | cluster character varying, 256 | project_id character varying, 257 | project_credentials character varying, 258 | page_size character varying, 259 | filter_key character varying, 260 | included_tables_regex character varying, 261 | key_path character varying, 262 | account character varying, 263 | role character varying, 264 | warehouse character varying 265 | ); 266 | 267 | 268 | ALTER TABLE 
public.sources OWNER TO catalog_user; 269 | 270 | -- 271 | -- Name: sources_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 272 | -- 273 | 274 | CREATE SEQUENCE public.sources_id_seq 275 | AS integer 276 | START WITH 1 277 | INCREMENT BY 1 278 | NO MINVALUE 279 | NO MAXVALUE 280 | CACHE 1; 281 | 282 | 283 | ALTER TABLE public.sources_id_seq OWNER TO catalog_user; 284 | 285 | -- 286 | -- Name: sources_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 287 | -- 288 | 289 | ALTER SEQUENCE public.sources_id_seq OWNED BY public.sources.id; 290 | 291 | 292 | -- 293 | -- Name: tables; Type: TABLE; Schema: public; Owner: catalog_user 294 | -- 295 | 296 | CREATE TABLE public.tables ( 297 | id integer NOT NULL, 298 | name character varying, 299 | schema_id integer 300 | ); 301 | 302 | 303 | ALTER TABLE public.tables OWNER TO catalog_user; 304 | 305 | -- 306 | -- Name: tables_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 307 | -- 308 | 309 | CREATE SEQUENCE public.tables_id_seq 310 | AS integer 311 | START WITH 1 312 | INCREMENT BY 1 313 | NO MINVALUE 314 | NO MAXVALUE 315 | CACHE 1; 316 | 317 | 318 | ALTER TABLE public.tables_id_seq OWNER TO catalog_user; 319 | 320 | -- 321 | -- Name: tables_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 322 | -- 323 | 324 | ALTER SEQUENCE public.tables_id_seq OWNED BY public.tables.id; 325 | 326 | 327 | -- 328 | -- Name: column_lineage id; Type: DEFAULT; Schema: public; Owner: catalog_user 329 | -- 330 | 331 | ALTER TABLE ONLY public.column_lineage ALTER COLUMN id SET DEFAULT nextval('public.column_lineage_id_seq'::regclass); 332 | 333 | 334 | -- 335 | -- Name: columns id; Type: DEFAULT; Schema: public; Owner: catalog_user 336 | -- 337 | 338 | ALTER TABLE ONLY public.columns ALTER COLUMN id SET DEFAULT nextval('public.columns_id_seq'::regclass); 339 | 340 | 341 | -- 342 | -- Name: job_executions id; Type: DEFAULT; Schema: public; Owner: catalog_user 343 | -- 344 | 345 | ALTER TABLE ONLY public.job_executions ALTER COLUMN id SET DEFAULT nextval('public.job_executions_id_seq'::regclass); 346 | 347 | 348 | -- 349 | -- Name: jobs id; Type: DEFAULT; Schema: public; Owner: catalog_user 350 | -- 351 | 352 | ALTER TABLE ONLY public.jobs ALTER COLUMN id SET DEFAULT nextval('public.jobs_id_seq'::regclass); 353 | 354 | 355 | -- 356 | -- Name: schemata id; Type: DEFAULT; Schema: public; Owner: catalog_user 357 | -- 358 | 359 | ALTER TABLE ONLY public.schemata ALTER COLUMN id SET DEFAULT nextval('public.schemata_id_seq'::regclass); 360 | 361 | 362 | -- 363 | -- Name: sources id; Type: DEFAULT; Schema: public; Owner: catalog_user 364 | -- 365 | 366 | ALTER TABLE ONLY public.sources ALTER COLUMN id SET DEFAULT nextval('public.sources_id_seq'::regclass); 367 | 368 | 369 | -- 370 | -- Name: tables id; Type: DEFAULT; Schema: public; Owner: catalog_user 371 | -- 372 | 373 | ALTER TABLE ONLY public.tables ALTER COLUMN id SET DEFAULT nextval('public.tables_id_seq'::regclass); 374 | 375 | 376 | -- 377 | -- Data for Name: alembic_version; Type: TABLE DATA; Schema: public; Owner: catalog_user 378 | -- 379 | 380 | COPY public.alembic_version (version_num) FROM stdin; 381 | d1daff1715f7 382 | \. 
383 | 384 | 385 | -- 386 | -- Data for Name: column_lineage; Type: TABLE DATA; Schema: public; Owner: catalog_user 387 | -- 388 | 389 | COPY public.column_lineage (id, context, source_id, target_id, job_execution_id) FROM stdin; 390 | 1 {} 5 10 1 391 | 2 {} 7 11 1 392 | 3 {} 7 12 1 393 | 4 {} 5 13 1 394 | 5 {} 6 14 1 395 | 6 {} 5 15 2 396 | 7 {} 7 16 2 397 | 8 {} 7 17 2 398 | 9 {} 5 18 2 399 | 10 {} 6 19 2 400 | 11 {} 15 20 3 401 | 12 {} 16 21 3 402 | 13 {} 17 22 3 403 | 14 {} 18 23 3 404 | 15 {} 19 24 3 405 | 16 {} 2 25 4 406 | 17 {} 3 26 4 407 | 18 {} 4 27 4 408 | 19 {} 23 29 5 409 | 20 {} 22 30 5 410 | 21 {} 22 31 5 411 | 22 {} 27 32 5 412 | 23 {} 28 33 5 413 | \. 414 | 415 | 416 | -- 417 | -- Data for Name: columns; Type: TABLE DATA; Schema: public; Owner: catalog_user 418 | -- 419 | 420 | COPY public.columns (id, name, data_type, sort_order, table_id) FROM stdin; 421 | 1 group STRING 0 1 422 | 2 page_title STRING 1 1 423 | 3 views BIGINT 2 1 424 | 4 bytes_sent BIGINT 3 1 425 | 5 page_id BIGINT 0 2 426 | 6 page_latest BIGINT 1 2 427 | 7 page_title STRING 2 2 428 | 8 rd_from BIGINT 0 3 429 | 9 page_title STRING 1 3 430 | 10 redirect_id BIGINT 0 4 431 | 11 redirect_title STRING 1 4 432 | 12 true_title STRING 2 4 433 | 13 page_id BIGINT 3 4 434 | 14 page_version BIGINT 4 4 435 | 15 redirect_id BIGINT 0 5 436 | 16 redirect_title STRING 1 5 437 | 17 true_title STRING 2 5 438 | 18 page_id BIGINT 3 5 439 | 19 page_version BIGINT 4 5 440 | 20 redirect_id bigint 0 6 441 | 21 redirect_title STRING 1 6 442 | 22 true_title STRING 2 6 443 | 23 page_id BIGINT 3 6 444 | 24 page_version BIGINT 4 6 445 | 25 group STRING 0 7 446 | 26 page_title STRING 1 7 447 | 27 views BIGINT 2 7 448 | 28 bytes_sent BIGINT 3 7 449 | 29 page_id BIGINT 0 8 450 | 30 page_title STRING 1 8 451 | 31 page_url STRING 2 8 452 | 32 views BIGINT 3 8 453 | 33 bytes_sent BIGINT 4 8 454 | \. 455 | 456 | 457 | -- 458 | -- Data for Name: default_schema; Type: TABLE DATA; Schema: public; Owner: catalog_user 459 | -- 460 | 461 | COPY public.default_schema (source_id, schema_id) FROM stdin; 462 | \. 463 | 464 | 465 | -- 466 | -- Data for Name: job_executions; Type: TABLE DATA; Schema: public; Owner: catalog_user 467 | -- 468 | 469 | COPY public.job_executions (id, job_id, started_at, ended_at, status) FROM stdin; 470 | 1 1 2021-07-29 23:11:44.470984 2021-07-29 23:11:44.470993 SUCCESS 471 | 2 2 2021-07-29 23:11:44.61084 2021-07-29 23:11:44.610849 SUCCESS 472 | 3 3 2021-07-29 23:11:44.717093 2021-07-29 23:11:44.717101 SUCCESS 473 | 4 4 2021-07-29 23:11:44.842395 2021-07-29 23:11:44.84241 SUCCESS 474 | 5 5 2021-07-29 23:11:44.949858 2021-07-29 23:11:44.949867 SUCCESS 475 | \. 
476 | 477 | 478 | -- 479 | -- Data for Name: jobs; Type: TABLE DATA; Schema: public; Owner: catalog_user 480 | -- 481 | 482 | COPY public.jobs (id, name, context, source_id) FROM stdin; 483 | 1 LOAD page_lookup_nonredirect {"query": "INSERT INTO page_lookup_nonredirect SELECT page.page_id as redircet_id, page.page_title as redirect_title, page.page_title true_title, page.page_id, page.page_latest FROM page LEFT OUTER JOIN redirect ON page.page_id = redirect.rd_from WHERE redirect.rd_from IS NULL "} 1 484 | 2 LOAD page_lookup_redirect {"query": "insert into page_lookup_redirect select original_page.page_id redirect_id, original_page.page_title redirect_title, final_page.page_title as true_title, final_page.page_id, final_page.page_latest from page final_page join redirect on (redirect.page_title = final_page.page_title) join page original_page on (redirect.rd_from = original_page.page_id)"} 1 485 | 3 LOAD page_lookup {"query": "INSERT INTO page_lookup SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, plr.page_version FROM page_lookup_redirect plr"} 1 486 | 4 LOAD filtered_pagecounts {"query": "INSERT INTO filtered_pagecounts(\\"group\\", page_title, views) SELECT regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\\\\s*([a-zA-Z0-9]+).*','$1') page_title,SUM (pvs.views) AS total_views, SUM (pvs.bytes_sent) AS total_bytes_sent FROM pagecounts as pvs WHERE not pvs.page_title LIKE '(MEDIA|SPECIAL||Talk|User|User_talk|Project|Project_talk|File|File_talk|MediaWiki|MediaWiki_talk|Template|Template_talk|Help|Help_talk|Category|Category_talk|Portal|Wikipedia|Wikipedia_talk|upload|Special)\\\\:(.*)' and pvs.page_title LIKE '^([A-Z])(.*)' and not pvs.page_title LIKE '(.*).(jpg|gif|png|JPG|GIF|PNG|txt|ico)$' and pvs.page_title <> '404_error/' and pvs.page_title <> 'Main_Page' and pvs.page_title <> 'Hypertext_Transfer_Protocol' and pvs.page_title <> 'Favicon.ico' and pvs.page_title <> 'Search' and pvs.dt = '2020-01-01' GROUP BY regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\\\\s*([a-zA-Z0-9]+).*','$1')"} 1 487 | 5 LOAD normalized_pagecounts {"query": "INSERT INTO normalized_pagecounts SELECT pl.page_id page_id, REGEXP_REPLACE(pl.true_title, '_', ' ') page_title, pl.true_title page_url, fp.views, fp.bytes_sent FROM page_lookup pl JOIN filtered_pagecounts fp ON fp.page_title = pl.redirect_title where fp.dt='2020-01-01'"} 1 488 | \. 489 | 490 | 491 | -- 492 | -- Data for Name: schemata; Type: TABLE DATA; Schema: public; Owner: catalog_user 493 | -- 494 | 495 | COPY public.schemata (id, name, source_id) FROM stdin; 496 | 1 default 1 497 | \. 498 | 499 | 500 | -- 501 | -- Data for Name: sources; Type: TABLE DATA; Schema: public; Owner: catalog_user 502 | -- 503 | 504 | COPY public.sources (id, source_type, name, dialect, uri, port, username, password, database, instance, cluster, project_id, project_credentials, page_size, filter_key, included_tables_regex, key_path, account, role, warehouse) FROM stdin; 505 | 1 redshift test \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N 506 | \. 
507 | 508 | 509 | -- 510 | -- Data for Name: tables; Type: TABLE DATA; Schema: public; Owner: catalog_user 511 | -- 512 | 513 | COPY public.tables (id, name, schema_id) FROM stdin; 514 | 1 pagecounts 1 515 | 2 page 1 516 | 3 redirect 1 517 | 4 page_lookup_nonredirect 1 518 | 5 page_lookup_redirect 1 519 | 6 page_lookup 1 520 | 7 filtered_pagecounts 1 521 | 8 normalized_pagecounts 1 522 | \. 523 | 524 | 525 | -- 526 | -- Name: column_lineage_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 527 | -- 528 | 529 | SELECT pg_catalog.setval('public.column_lineage_id_seq', 23, true); 530 | 531 | 532 | -- 533 | -- Name: columns_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 534 | -- 535 | 536 | SELECT pg_catalog.setval('public.columns_id_seq', 33, true); 537 | 538 | 539 | -- 540 | -- Name: job_executions_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 541 | -- 542 | 543 | SELECT pg_catalog.setval('public.job_executions_id_seq', 5, true); 544 | 545 | 546 | -- 547 | -- Name: jobs_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 548 | -- 549 | 550 | SELECT pg_catalog.setval('public.jobs_id_seq', 5, true); 551 | 552 | 553 | -- 554 | -- Name: schemata_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 555 | -- 556 | 557 | SELECT pg_catalog.setval('public.schemata_id_seq', 1, true); 558 | 559 | 560 | -- 561 | -- Name: sources_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 562 | -- 563 | 564 | SELECT pg_catalog.setval('public.sources_id_seq', 1, true); 565 | 566 | 567 | -- 568 | -- Name: tables_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 569 | -- 570 | 571 | SELECT pg_catalog.setval('public.tables_id_seq', 8, true); 572 | 573 | 574 | -- 575 | -- Name: alembic_version alembic_version_pkc; Type: CONSTRAINT; Schema: public; Owner: catalog_user 576 | -- 577 | 578 | ALTER TABLE ONLY public.alembic_version 579 | ADD CONSTRAINT alembic_version_pkc PRIMARY KEY (version_num); 580 | 581 | 582 | -- 583 | -- Name: column_lineage column_lineage_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 584 | -- 585 | 586 | ALTER TABLE ONLY public.column_lineage 587 | ADD CONSTRAINT column_lineage_pkey PRIMARY KEY (id); 588 | 589 | 590 | -- 591 | -- Name: columns columns_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 592 | -- 593 | 594 | ALTER TABLE ONLY public.columns 595 | ADD CONSTRAINT columns_pkey PRIMARY KEY (id); 596 | 597 | 598 | -- 599 | -- Name: default_schema default_schema_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 600 | -- 601 | 602 | ALTER TABLE ONLY public.default_schema 603 | ADD CONSTRAINT default_schema_pkey PRIMARY KEY (source_id); 604 | 605 | 606 | -- 607 | -- Name: job_executions job_executions_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 608 | -- 609 | 610 | ALTER TABLE ONLY public.job_executions 611 | ADD CONSTRAINT job_executions_pkey PRIMARY KEY (id); 612 | 613 | 614 | -- 615 | -- Name: jobs jobs_name_key; Type: CONSTRAINT; Schema: public; Owner: catalog_user 616 | -- 617 | 618 | ALTER TABLE ONLY public.jobs 619 | ADD CONSTRAINT jobs_name_key UNIQUE (name); 620 | 621 | 622 | -- 623 | -- Name: jobs jobs_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 624 | -- 625 | 626 | ALTER TABLE ONLY public.jobs 627 | ADD CONSTRAINT jobs_pkey PRIMARY KEY (id); 628 | 629 | 630 | -- 631 | -- Name: jobs jobs_source_id_name_key; Type: CONSTRAINT; Schema: public; Owner: catalog_user 632 | -- 633 | 634 | ALTER TABLE ONLY public.jobs 635 | ADD 
CONSTRAINT jobs_source_id_name_key UNIQUE (source_id, name); 636 | 637 | 638 | -- 639 | -- Name: schemata schemata_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 640 | -- 641 | 642 | ALTER TABLE ONLY public.schemata 643 | ADD CONSTRAINT schemata_pkey PRIMARY KEY (id); 644 | 645 | 646 | -- 647 | -- Name: sources sources_name_key; Type: CONSTRAINT; Schema: public; Owner: catalog_user 648 | -- 649 | 650 | ALTER TABLE ONLY public.sources 651 | ADD CONSTRAINT sources_name_key UNIQUE (name); 652 | 653 | 654 | -- 655 | -- Name: sources sources_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 656 | -- 657 | 658 | ALTER TABLE ONLY public.sources 659 | ADD CONSTRAINT sources_pkey PRIMARY KEY (id); 660 | 661 | 662 | -- 663 | -- Name: tables tables_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 664 | -- 665 | 666 | ALTER TABLE ONLY public.tables 667 | ADD CONSTRAINT tables_pkey PRIMARY KEY (id); 668 | 669 | 670 | -- 671 | -- Name: columns unique_column_name; Type: CONSTRAINT; Schema: public; Owner: catalog_user 672 | -- 673 | 674 | ALTER TABLE ONLY public.columns 675 | ADD CONSTRAINT unique_column_name UNIQUE (table_id, name); 676 | 677 | 678 | -- 679 | -- Name: column_lineage unique_lineage; Type: CONSTRAINT; Schema: public; Owner: catalog_user 680 | -- 681 | 682 | ALTER TABLE ONLY public.column_lineage 683 | ADD CONSTRAINT unique_lineage UNIQUE (source_id, target_id, job_execution_id); 684 | 685 | 686 | -- 687 | -- Name: schemata unique_schema_name; Type: CONSTRAINT; Schema: public; Owner: catalog_user 688 | -- 689 | 690 | ALTER TABLE ONLY public.schemata 691 | ADD CONSTRAINT unique_schema_name UNIQUE (source_id, name); 692 | 693 | 694 | -- 695 | -- Name: tables unique_table_name; Type: CONSTRAINT; Schema: public; Owner: catalog_user 696 | -- 697 | 698 | ALTER TABLE ONLY public.tables 699 | ADD CONSTRAINT unique_table_name UNIQUE (schema_id, name); 700 | 701 | 702 | -- 703 | -- Name: column_lineage column_lineage_job_execution_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 704 | -- 705 | 706 | ALTER TABLE ONLY public.column_lineage 707 | ADD CONSTRAINT column_lineage_job_execution_id_fkey FOREIGN KEY (job_execution_id) REFERENCES public.job_executions(id); 708 | 709 | 710 | -- 711 | -- Name: column_lineage column_lineage_source_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 712 | -- 713 | 714 | ALTER TABLE ONLY public.column_lineage 715 | ADD CONSTRAINT column_lineage_source_id_fkey FOREIGN KEY (source_id) REFERENCES public.columns(id); 716 | 717 | 718 | -- 719 | -- Name: column_lineage column_lineage_target_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 720 | -- 721 | 722 | ALTER TABLE ONLY public.column_lineage 723 | ADD CONSTRAINT column_lineage_target_id_fkey FOREIGN KEY (target_id) REFERENCES public.columns(id); 724 | 725 | 726 | -- 727 | -- Name: columns columns_table_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 728 | -- 729 | 730 | ALTER TABLE ONLY public.columns 731 | ADD CONSTRAINT columns_table_id_fkey FOREIGN KEY (table_id) REFERENCES public.tables(id); 732 | 733 | 734 | -- 735 | -- Name: default_schema default_schema_schema_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 736 | -- 737 | 738 | ALTER TABLE ONLY public.default_schema 739 | ADD CONSTRAINT default_schema_schema_id_fkey FOREIGN KEY (schema_id) REFERENCES public.schemata(id); 740 | 741 | 742 | -- 743 | -- Name: default_schema default_schema_source_id_fkey; Type: FK CONSTRAINT; Schema: public; 
Owner: catalog_user 744 | -- 745 | 746 | ALTER TABLE ONLY public.default_schema 747 | ADD CONSTRAINT default_schema_source_id_fkey FOREIGN KEY (source_id) REFERENCES public.sources(id); 748 | 749 | 750 | -- 751 | -- Name: job_executions job_executions_job_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 752 | -- 753 | 754 | ALTER TABLE ONLY public.job_executions 755 | ADD CONSTRAINT job_executions_job_id_fkey FOREIGN KEY (job_id) REFERENCES public.jobs(id); 756 | 757 | 758 | -- 759 | -- Name: jobs jobs_source_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 760 | -- 761 | 762 | ALTER TABLE ONLY public.jobs 763 | ADD CONSTRAINT jobs_source_id_fkey FOREIGN KEY (source_id) REFERENCES public.sources(id); 764 | 765 | 766 | -- 767 | -- Name: schemata schemata_source_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 768 | -- 769 | 770 | ALTER TABLE ONLY public.schemata 771 | ADD CONSTRAINT schemata_source_id_fkey FOREIGN KEY (source_id) REFERENCES public.sources(id); 772 | 773 | 774 | -- 775 | -- Name: tables tables_schema_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 776 | -- 777 | 778 | ALTER TABLE ONLY public.tables 779 | ADD CONSTRAINT tables_schema_id_fkey FOREIGN KEY (schema_id) REFERENCES public.schemata(id); 780 | 781 | 782 | -- 783 | -- PostgreSQL database dump complete 784 | -- 785 | 786 | -------------------------------------------------------------------------------- /install-manifests/dockerfiles/demo-wikimedia.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- PostgreSQL database dump 3 | -- 4 | 5 | -- Dumped from database version 13.2 (Debian 13.2-1.pgdg100+1) 6 | -- Dumped by pg_dump version 13.3 (Ubuntu 13.3-1.pgdg20.04+1) 7 | 8 | SET statement_timeout = 0; 9 | SET lock_timeout = 0; 10 | SET idle_in_transaction_session_timeout = 0; 11 | SET client_encoding = 'UTF8'; 12 | SET standard_conforming_strings = on; 13 | SELECT pg_catalog.set_config('search_path', '', false); 14 | SET check_function_bodies = false; 15 | SET xmloption = content; 16 | SET client_min_messages = warning; 17 | SET row_security = off; 18 | 19 | SET default_tablespace = ''; 20 | 21 | SET default_table_access_method = heap; 22 | 23 | -- 24 | -- Name: filtered_pagecounts; Type: TABLE; Schema: public; Owner: etldev 25 | -- 26 | 27 | CREATE TABLE public.filtered_pagecounts ( 28 | "group" character varying, 29 | page_title character varying, 30 | views bigint, 31 | bytes_sent bigint 32 | ); 33 | 34 | 35 | ALTER TABLE public.filtered_pagecounts OWNER TO etldev; 36 | 37 | -- 38 | -- Name: page_lookup; Type: TABLE; Schema: public; Owner: etldev 39 | -- 40 | 41 | CREATE TABLE public.page_lookup ( 42 | redirect_id bigint, 43 | redirect_title bigint, 44 | true_title character varying, 45 | page_id bigint, 46 | page_version bigint 47 | ); 48 | 49 | 50 | ALTER TABLE public.page_lookup OWNER TO etldev; 51 | 52 | -- 53 | -- Name: normalized_pagecounts; Type: TABLE; Schema: public; Owner: etldev 54 | -- 55 | 56 | CREATE TABLE public.normalized_pagecounts ( 57 | page_id bigint, 58 | page_title character varying, 59 | page_url character varying, 60 | views bigint, 61 | bytes_sent bigint 62 | ); 63 | 64 | 65 | ALTER TABLE public.normalized_pagecounts OWNER TO etldev; 66 | 67 | -- 68 | -- Name: page; Type: TABLE; Schema: public; Owner: etldev 69 | -- 70 | 71 | CREATE TABLE public.page ( 72 | page_id bigint, 73 | page_latest bigint, 74 | page_title character varying 75 | ); 76 | 77 | 78 | ALTER TABLE public.page 
OWNER TO etldev; 79 | 80 | -- 81 | -- Name: page_lookup_nonredirect; Type: TABLE; Schema: public; Owner: etldev 82 | -- 83 | 84 | CREATE TABLE public.page_lookup_nonredirect ( 85 | redirect_id bigint, 86 | redirect_title bigint, 87 | true_title character varying, 88 | page_id bigint, 89 | page_version bigint 90 | ); 91 | 92 | 93 | ALTER TABLE public.page_lookup_nonredirect OWNER TO etldev; 94 | 95 | -- 96 | -- Name: page_lookup_redirect; Type: TABLE; Schema: public; Owner: etldev 97 | -- 98 | 99 | CREATE TABLE public.page_lookup_redirect ( 100 | redirect_id bigint, 101 | redirect_title bigint, 102 | true_title character varying, 103 | page_id bigint, 104 | page_version bigint 105 | ); 106 | 107 | 108 | ALTER TABLE public.page_lookup_redirect OWNER TO etldev; 109 | 110 | -- 111 | -- Name: pagecounts; Type: TABLE; Schema: public; Owner: etldev 112 | -- 113 | 114 | CREATE TABLE public.pagecounts ( 115 | "group" character varying, 116 | page_title character varying, 117 | views bigint, 118 | bytes_sent bigint 119 | ); 120 | 121 | 122 | ALTER TABLE public.pagecounts OWNER TO etldev; 123 | 124 | -- 125 | -- Name: redirect; Type: TABLE; Schema: public; Owner: etldev 126 | -- 127 | 128 | CREATE TABLE public.redirect ( 129 | rd_from bigint, 130 | page_title character varying 131 | ); 132 | 133 | 134 | ALTER TABLE public.redirect OWNER TO etldev; 135 | 136 | -- 137 | -- Data for Name: filtered_pagecounts; Type: TABLE DATA; Schema: public; Owner: etldev 138 | -- 139 | 140 | COPY public.filtered_pagecounts ("group", page_title, views, bytes_sent) FROM stdin; 141 | \. 142 | 143 | 144 | -- 145 | -- Data for Name: lookup; Type: TABLE DATA; Schema: public; Owner: etldev 146 | -- 147 | 148 | COPY public.page_lookup (redirect_id, redirect_title, true_title, page_id, page_version) FROM stdin; 149 | \. 150 | 151 | 152 | -- 153 | -- Data for Name: normalized_pagecounts; Type: TABLE DATA; Schema: public; Owner: etldev 154 | -- 155 | 156 | COPY public.normalized_pagecounts ("group", page_title, views, bytes_sent) FROM stdin; 157 | \. 158 | 159 | 160 | -- 161 | -- Data for Name: page; Type: TABLE DATA; Schema: public; Owner: etldev 162 | -- 163 | 164 | COPY public.page (page_id, page_latest, page_title) FROM stdin; 165 | \. 166 | 167 | 168 | -- 169 | -- Data for Name: page_lookup_nonredirect; Type: TABLE DATA; Schema: public; Owner: etldev 170 | -- 171 | 172 | COPY public.page_lookup_nonredirect (redirect_id, redirect_title, true_title, page_id, page_version) FROM stdin; 173 | \. 174 | 175 | 176 | -- 177 | -- Data for Name: page_lookup_redirect; Type: TABLE DATA; Schema: public; Owner: etldev 178 | -- 179 | 180 | COPY public.page_lookup_redirect (redirect_id, redirect_title, true_title, page_id, page_version) FROM stdin; 181 | \. 182 | 183 | 184 | -- 185 | -- Data for Name: pagecounts; Type: TABLE DATA; Schema: public; Owner: etldev 186 | -- 187 | 188 | COPY public.pagecounts ("group", page_title, views, bytes_sent) FROM stdin; 189 | \. 190 | 191 | 192 | -- 193 | -- Data for Name: redirect; Type: TABLE DATA; Schema: public; Owner: etldev 194 | -- 195 | 196 | COPY public.redirect (rd_from, page_title) FROM stdin; 197 | \. 
198 | 199 | 200 | -- 201 | -- PostgreSQL database dump complete 202 | -- 203 | 204 | -------------------------------------------------------------------------------- /one_task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokern/data-lineage/5945542742979fe350d313d906440c93ee3d0f36/one_task.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "data-lineage" 3 | version = "0.9.0" 4 | description = "Open Source Data Lineage Tool for Redshift. Snowflake and many other databases" 5 | authors = ["Tokern "] 6 | license = "MIT" 7 | classifiers = [ 8 | "Development Status :: 3 - Alpha", 9 | "Intended Audience :: Developers", 10 | "Programming Language :: Python", 11 | "Programming Language :: Python :: 3", 12 | "Programming Language :: Python :: 3.7", 13 | "Programming Language :: Python :: 3.8", 14 | "Topic :: Database", 15 | "Topic :: Software Development", 16 | "Topic :: Software Development :: Libraries :: Python Modules", 17 | ] 18 | keywords=["data-lineage","postgres","snowflake","redshift","glue"] 19 | readme="README.md" 20 | homepage="https://tokern.io/" 21 | repository="https://github.com/tokern/data-lineage/" 22 | 23 | [tool.poetry.dependencies] 24 | python = "^3.8" 25 | pglast = "*" 26 | inflection = "*" 27 | networkx = "*" 28 | click = "^7" 29 | PyYAML = "*" 30 | dbcat = "^0.7.1" 31 | gunicorn = "*" 32 | flask = "~=1.1" 33 | flask-restless-ng = "*" 34 | requests = "*" 35 | furl = "*" 36 | flask-restful = "*" 37 | psycopg2 = "^2.9.1" 38 | SQLAlchemy = "^1.3" 39 | botocore = "^1.20" 40 | rq = "^1.10.0" 41 | redis = "^3.5.3" 42 | 43 | [tool.poetry.dev-dependencies] 44 | black = "==19.10b0" 45 | flake8 = "*" 46 | isort = "*" 47 | pre-commit = "*" 48 | pytest = "*" 49 | pytest-cov = "*" 50 | pipenv-setup = "*" 51 | mypy = "*" 52 | jupyter = "*" 53 | pytest-flask = "*" 54 | types-requests = "^0.1.13" 55 | types-Flask = "^1.1.1" 56 | types-PyYAML = "^5.4.3" 57 | types-click = "^7.1.2" 58 | fakeredis = "^1.6.1" 59 | types-redis = "^3.5.15" 60 | 61 | [build-system] 62 | requires = ["poetry-core>=1.0.0"] 63 | build-backend = "poetry.core.masonry.api" 64 | 65 | [tool.poetry.scripts] 66 | data_lineage = "data_lineage.__main__:main" 67 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths= 3 | test 4 | 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503 3 | max-line-length = 88 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4 6 | 7 | [isort] 8 | multi_line_output=3 9 | include_trailing_comma=True 10 | force_grid_wrap=0 11 | use_parentheses=True 12 | line_length=88 13 | 14 | [mypy] 15 | files=data_lineage,test 16 | ignore_missing_imports=true 17 | -------------------------------------------------------------------------------- /test/catalog.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "source_type": "redshift", 4 | "schemata": [ 5 | { 6 | "name": "default", 7 | "tables": [ 8 | { 9 | "name": "pagecounts", 10 | "columns": [ 11 | { 12 | "name": "group", 13 | "data_type": "STRING" 14 
| }, 15 | { 16 | "name": "page_title", 17 | "data_type": "STRING" 18 | }, 19 | { 20 | "name": "views", 21 | "data_type": "BIGINT" 22 | }, 23 | { 24 | "name": "bytes_sent", 25 | "data_type": "BIGINT" 26 | } 27 | ] 28 | }, 29 | { 30 | "name": "page", 31 | "columns": [ 32 | { 33 | "name": "page_id", 34 | "data_type": "BIGINT" 35 | }, 36 | { 37 | "name": "page_latest", 38 | "data_type": "BIGINT" 39 | }, 40 | { 41 | "name": "page_title", 42 | "data_type": "STRING" 43 | } 44 | ] 45 | }, 46 | { 47 | "name": "redirect", 48 | "columns": [ 49 | { 50 | "name": "rd_from", 51 | "data_type": "BIGINT" 52 | }, 53 | { 54 | "name": "page_title", 55 | "data_type": "STRING" 56 | } 57 | 58 | ] 59 | }, 60 | { 61 | "name": "page_lookup_nonredirect", 62 | "columns": [ 63 | { 64 | "name": "redirect_id", 65 | "data_type": "BIGINT" 66 | }, 67 | { 68 | "name": "redirect_title", 69 | "data_type": "STRING" 70 | }, 71 | { 72 | "name": "true_title", 73 | "data_type": "STRING" 74 | }, 75 | { 76 | "name": "page_id", 77 | "data_type": "BIGINT" 78 | }, 79 | { 80 | "name": "page_version", 81 | "data_type": "BIGINT" 82 | } 83 | ] 84 | }, 85 | { 86 | "name": "page_lookup_redirect", 87 | "columns": [ 88 | { 89 | "name": "redirect_id", 90 | "data_type": "BIGINT" 91 | }, 92 | { 93 | "name": "redirect_title", 94 | "data_type": "STRING" 95 | }, 96 | { 97 | "name": "true_title", 98 | "data_type": "STRING" 99 | }, 100 | { 101 | "name": "page_id", 102 | "data_type": "BIGINT" 103 | }, 104 | { 105 | "name": "page_version", 106 | "data_type": "BIGINT" 107 | } 108 | ] 109 | }, 110 | { 111 | "name": "page_lookup", 112 | "columns": [ 113 | { 114 | "name": "redirect_id", 115 | "data_type": "bigint" 116 | }, 117 | { 118 | "name": "redirect_title", 119 | "data_type": "STRING" 120 | }, 121 | { 122 | "name": "true_title", 123 | "data_type": "STRING" 124 | }, 125 | { 126 | "name": "page_id", 127 | "data_type": "BIGINT" 128 | }, 129 | { 130 | "name": "page_version", 131 | "data_type": "BIGINT" 132 | } 133 | ] 134 | }, 135 | { 136 | "name": "filtered_pagecounts", 137 | "columns": [ 138 | { 139 | "name": "group", 140 | "data_type": "STRING" 141 | }, 142 | { 143 | "name": "page_title", 144 | "data_type": "STRING" 145 | }, 146 | { 147 | "name": "views", 148 | "data_type": "BIGINT" 149 | }, 150 | { 151 | "name": "bytes_sent", 152 | "data_type": "BIGINT" 153 | } 154 | ] 155 | }, 156 | { 157 | "name": "normalized_pagecounts", 158 | "columns": [ 159 | { 160 | "name": "page_id", 161 | "data_type": "BIGINT" 162 | }, 163 | { 164 | "name": "page_title", 165 | "data_type": "STRING" 166 | }, 167 | { 168 | "name": "page_url", 169 | "data_type": "STRING" 170 | }, 171 | { 172 | "name": "views", 173 | "data_type": "BIGINT" 174 | }, 175 | { 176 | "name": "bytes_sent", 177 | "data_type": "BIGINT" 178 | } 179 | ] 180 | } 181 | ] 182 | } 183 | ] 184 | } -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | from contextlib import closing 2 | 3 | import pytest 4 | import yaml 5 | from dbcat import PGCatalog as DbCatalog 6 | from dbcat import catalog_connection, init_db 7 | from dbcat.catalog import CatSource 8 | from fakeredis import FakeStrictRedis 9 | 10 | from data_lineage import Analyze, Catalog, Graph, Scan 11 | from data_lineage.parser import parse 12 | from data_lineage.server import create_server 13 | 14 | 15 | @pytest.fixture(scope="session") 16 | def load_queries(): 17 | import json 18 | 19 | with open("test/queries.json", "r") as 
file: 20 | queries = json.load(file) 21 | 22 | yield queries 23 | 24 | 25 | @pytest.fixture(scope="session") 26 | def parse_queries_fixture(load_queries): 27 | parsed = [parse(sql=query["query"], name=query["name"]) for query in load_queries] 28 | yield parsed 29 | 30 | 31 | postgres_conf = """ 32 | catalog: 33 | user: piiuser 34 | password: p11secret 35 | host: 127.0.0.1 36 | port: 5432 37 | database: piidb 38 | """ 39 | 40 | 41 | @pytest.fixture(scope="session") 42 | def root_connection() -> DbCatalog: 43 | config = yaml.safe_load(postgres_conf) 44 | with closing(DbCatalog(**config["catalog"])) as conn: 45 | yield conn 46 | 47 | 48 | @pytest.fixture(scope="session") 49 | def setup_catalog(root_connection): 50 | with root_connection.engine.connect() as conn: 51 | conn.execute("CREATE USER catalog_user PASSWORD 'catal0g_passw0rd'") 52 | conn.execution_options(isolation_level="AUTOCOMMIT").execute( 53 | "CREATE DATABASE tokern" 54 | ) 55 | conn.execution_options(isolation_level="AUTOCOMMIT").execute( 56 | "GRANT ALL PRIVILEGES ON DATABASE tokern TO catalog_user" 57 | ) 58 | 59 | yield root_connection 60 | 61 | with root_connection.engine.connect() as conn: 62 | conn.execution_options(isolation_level="AUTOCOMMIT").execute( 63 | "DROP DATABASE tokern" 64 | ) 65 | 66 | conn.execution_options(isolation_level="AUTOCOMMIT").execute( 67 | "DROP USER catalog_user" 68 | ) 69 | 70 | 71 | catalog_conf = """ 72 | catalog: 73 | user: catalog_user 74 | password: catal0g_passw0rd 75 | host: 127.0.0.1 76 | port: 5432 77 | database: tokern 78 | """ 79 | 80 | 81 | @pytest.fixture(scope="session") 82 | def open_catalog_connection(setup_catalog): 83 | with closing(catalog_connection(catalog_conf)) as conn: 84 | init_db(conn) 85 | yield conn 86 | 87 | 88 | class File: 89 | def __init__(self, name: str, path: str, catalog: DbCatalog): 90 | self.name = name 91 | self._path = path 92 | self._catalog = catalog 93 | 94 | @property 95 | def path(self): 96 | return self._path 97 | 98 | def scan(self): 99 | import json 100 | 101 | with open(self.path, "r") as file: 102 | content = json.load(file) 103 | 104 | with self._catalog.managed_session: 105 | source = self._catalog.add_source( 106 | name=content["name"], source_type=content["source_type"] 107 | ) 108 | for s in content["schemata"]: 109 | schema = self._catalog.add_schema(s["name"], source=source) 110 | 111 | for t in s["tables"]: 112 | table = self._catalog.add_table(t["name"], schema) 113 | 114 | index = 0 115 | for c in t["columns"]: 116 | self._catalog.add_column( 117 | column_name=c["name"], 118 | data_type=c["data_type"], 119 | sort_order=index, 120 | table=table, 121 | ) 122 | index += 1 123 | 124 | 125 | @pytest.fixture(scope="session") 126 | def save_catalog(open_catalog_connection): 127 | scanner = File("test", "test/catalog.json", open_catalog_connection) 128 | scanner.scan() 129 | yield open_catalog_connection 130 | with open_catalog_connection.managed_session as session: 131 | [session.delete(db) for db in session.query(CatSource).all()] 132 | session.commit() 133 | 134 | 135 | @pytest.fixture(scope="function") 136 | def managed_session(save_catalog): 137 | with save_catalog.managed_session: 138 | yield save_catalog 139 | 140 | 141 | @pytest.fixture(scope="session") 142 | def app(setup_catalog): 143 | config = yaml.safe_load(catalog_conf) 144 | app, catalog = create_server( 145 | config["catalog"], connection=FakeStrictRedis(), is_production=False 146 | ) 147 | yield app 148 | catalog.close() 149 | 150 | 151 | @pytest.fixture(scope="session") 152 | 
def rest_catalog(live_server, save_catalog): 153 | yield Catalog("http://{}:{}".format(live_server.host, live_server.port)) 154 | 155 | 156 | @pytest.fixture(scope="session") 157 | def graph_sdk(live_server): 158 | yield Graph("http://{}:{}".format(live_server.host, live_server.port)) 159 | 160 | 161 | @pytest.fixture(scope="session") 162 | def parser_sdk(live_server): 163 | yield Analyze("http://{}:{}".format(live_server.host, live_server.port)) 164 | 165 | 166 | @pytest.fixture(scope="session") 167 | def scan_sdk(live_server): 168 | yield Scan("http://{}:{}".format(live_server.host, live_server.port)) 169 | -------------------------------------------------------------------------------- /test/queries.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "LOAD page_lookup_nonredirect", 4 | "query": "INSERT INTO page_lookup_nonredirect SELECT page.page_id as redircet_id, page.page_title as redirect_title, page.page_title true_title, page.page_id, page.page_latest FROM page LEFT OUTER JOIN redirect ON page.page_id = redirect.rd_from WHERE redirect.rd_from IS NULL " 5 | }, 6 | { 7 | "name": "LOAD page_lookup_redirect", 8 | "query": "insert into page_lookup_redirect select original_page.page_id redirect_id, original_page.page_title redirect_title, final_page.page_title as true_title, final_page.page_id, final_page.page_latest from page final_page join redirect on (redirect.page_title = final_page.page_title) join page original_page on (redirect.rd_from = original_page.page_id)" 9 | }, 10 | { 11 | "name": "LOAD page_lookup", 12 | "query": "INSERT INTO page_lookup SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, plr.page_version FROM page_lookup_redirect plr" 13 | }, 14 | { 15 | "name": "LOAD filtered_pagecounts", 16 | "query": "INSERT INTO filtered_pagecounts(\"group\", page_title, views) SELECT regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\\s*([a-zA-Z0-9]+).*','$1') page_title,SUM (pvs.views) AS total_views, SUM (pvs.bytes_sent) AS total_bytes_sent FROM pagecounts as pvs WHERE not pvs.page_title LIKE '(MEDIA|SPECIAL||Talk|User|User_talk|Project|Project_talk|File|File_talk|MediaWiki|MediaWiki_talk|Template|Template_talk|Help|Help_talk|Category|Category_talk|Portal|Wikipedia|Wikipedia_talk|upload|Special)\\:(.*)' and pvs.page_title LIKE '^([A-Z])(.*)' and not pvs.page_title LIKE '(.*).(jpg|gif|png|JPG|GIF|PNG|txt|ico)$' and pvs.page_title <> '404_error/' and pvs.page_title <> 'Main_Page' and pvs.page_title <> 'Hypertext_Transfer_Protocol' and pvs.page_title <> 'Favicon.ico' and pvs.page_title <> 'Search' and pvs.dt = '2020-01-01' GROUP BY regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\\s*([a-zA-Z0-9]+).*','$1')" 17 | }, 18 | { 19 | "name": "LOAD normalized_pagecounts", 20 | "query": "INSERT INTO normalized_pagecounts SELECT pl.page_id page_id, REGEXP_REPLACE(pl.true_title, '_', ' ') page_title, pl.true_title page_url, fp.views, fp.bytes_sent FROM page_lookup pl JOIN filtered_pagecounts fp ON fp.page_title = pl.redirect_title where fp.dt='2020-01-01'" 21 | } 22 | ] 23 | -------------------------------------------------------------------------------- /test/test_data_lineage.py: -------------------------------------------------------------------------------- 1 | from data_lineage.parser import analyze_dml_query 2 | 3 | 4 | def test_parser(parse_queries_fixture): 5 | assert 
len(parse_queries_fixture) == 5 6 | 7 | 8 | def test_visitor(save_catalog, parse_queries_fixture): 9 | catalog = save_catalog 10 | with catalog.managed_session: 11 | source = catalog.get_source("test") 12 | 13 | dml = [ 14 | analyze_dml_query(catalog, parsed, source) 15 | for parsed in parse_queries_fixture 16 | ] 17 | assert len(dml) == 5 18 | 19 | for d in dml: 20 | assert len(d.source_tables) > 0 and d.target_table is not None 21 | -------------------------------------------------------------------------------- /test/test_db_graph.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | 4 | import pytest 5 | from dbcat.catalog import ColumnLineage 6 | from networkx import edges 7 | 8 | from data_lineage import load_graph 9 | from data_lineage.parser import analyze_dml_query, extract_lineage, parse 10 | from data_lineage.parser.dml_visitor import SelectSourceVisitor 11 | 12 | logging.basicConfig(level=getattr(logging, "DEBUG")) 13 | 14 | 15 | def test_no_insert_column_graph(managed_session, graph_sdk): 16 | catalog = managed_session 17 | query = """ 18 | INSERT INTO page_lookup_nonredirect 19 | SELECT page.page_id as redirect_id, page.page_title as redirect_title, 20 | page.page_title true_title, page.page_id, page.page_latest 21 | FROM page 22 | """ 23 | 24 | parsed = parse( 25 | query, name="LOAD page_lookup_nonredirect-test_no_insert_column_graph" 26 | ) 27 | visitor = SelectSourceVisitor(parsed.name) 28 | visitor(parsed.node) 29 | source = catalog.get_source("test") 30 | visitor.bind(catalog, source) 31 | 32 | job_execution = extract_lineage( 33 | catalog, 34 | visitor, 35 | source, 36 | parsed, 37 | datetime.datetime.now(), 38 | datetime.datetime.now(), 39 | ) 40 | graph = load_graph(graph_sdk, [job_execution.job_id]) 41 | 42 | assert sorted([node[1]["name"] for node in list(graph.graph.nodes(data=True))]) == [ 43 | "LOAD page_lookup_nonredirect-test_no_insert_column_graph", 44 | "test.default.page.page_id", 45 | "test.default.page.page_latest", 46 | "test.default.page.page_title", 47 | "test.default.page_lookup_nonredirect.page_id", 48 | "test.default.page_lookup_nonredirect.page_version", 49 | "test.default.page_lookup_nonredirect.redirect_id", 50 | "test.default.page_lookup_nonredirect.redirect_title", 51 | "test.default.page_lookup_nonredirect.true_title", 52 | ] 53 | 54 | expected_edges = [ 55 | ("column:5", "task:1"), 56 | ("task:1", "column:10"), 57 | ("task:1", "column:11"), 58 | ("task:1", "column:12"), 59 | ("task:1", "column:13"), 60 | ("task:1", "column:14"), 61 | ("column:7", "task:1"), 62 | ("column:6", "task:1"), 63 | ] 64 | 65 | assert [(edge[0], edge[1]) for edge in list(edges(graph.graph))] == expected_edges 66 | 67 | expected_db_edges = [ 68 | ( 69 | ("test", "default", "page", "page_id"), 70 | ("test", "default", "page_lookup_nonredirect", "redirect_id"), 71 | ), 72 | ( 73 | ("test", "default", "page", "page_id"), 74 | ("test", "default", "page_lookup_nonredirect", "page_id"), 75 | ), 76 | ( 77 | ("test", "default", "page", "page_title"), 78 | ("test", "default", "page_lookup_nonredirect", "redirect_title"), 79 | ), 80 | ( 81 | ("test", "default", "page", "page_title"), 82 | ("test", "default", "page_lookup_nonredirect", "true_title"), 83 | ), 84 | ( 85 | ("test", "default", "page", "page_latest"), 86 | ("test", "default", "page_lookup_nonredirect", "page_version"), 87 | ), 88 | ] 89 | with catalog.managed_session as session: 90 | all_edges = session.query(ColumnLineage).all() 91 | assert 
set([(e.source.fqdn, e.target.fqdn) for e in all_edges]) == set( 92 | expected_db_edges 93 | ) 94 | 95 | 96 | def test_basic_column_graph(managed_session, graph_sdk): 97 | catalog = managed_session 98 | 99 | query = "INSERT INTO page_lookup_nonredirect(page_id, page_version) SELECT page.page_id, page.page_latest FROM page" 100 | parsed = parse(query, "basic_column_graph") 101 | visitor = SelectSourceVisitor(parsed.name) 102 | visitor(parsed.node) 103 | source = catalog.get_source("test") 104 | visitor.bind(catalog, source) 105 | 106 | job_execution = extract_lineage( 107 | catalog, 108 | visitor, 109 | source, 110 | parsed, 111 | datetime.datetime.now(), 112 | datetime.datetime.now(), 113 | ) 114 | graph = load_graph(graph_sdk, [job_execution.job_id]) 115 | 116 | assert sorted([node[1]["name"] for node in list(graph.graph.nodes(data=True))]) == [ 117 | "basic_column_graph", 118 | "test.default.page.page_id", 119 | "test.default.page.page_latest", 120 | "test.default.page_lookup_nonredirect.page_id", 121 | "test.default.page_lookup_nonredirect.page_version", 122 | ] 123 | 124 | expected_edges = [ 125 | ("column:5", "task:2"), 126 | ("task:2", "column:13"), 127 | ("task:2", "column:14"), 128 | ("column:6", "task:2"), 129 | ] 130 | 131 | assert [(edge[0], edge[1]) for edge in list(edges(graph.graph))] == expected_edges 132 | 133 | table = catalog.get_table( 134 | source_name="test", schema_name="default", table_name="page_lookup_nonredirect", 135 | ) 136 | columns = catalog.get_columns_for_table( 137 | table, column_names=["page_id", "page_version"] 138 | ) 139 | 140 | assert len(columns) == 2 141 | 142 | expected_db_edges = [ 143 | ( 144 | ("test", "default", "page", "page_id"), 145 | ("test", "default", "page_lookup_nonredirect", "page_id"), 146 | ), 147 | ( 148 | ("test", "default", "page", "page_latest"), 149 | ("test", "default", "page_lookup_nonredirect", "page_version"), 150 | ), 151 | ] 152 | 153 | with catalog.managed_session as session: 154 | all_edges = ( 155 | session.query(ColumnLineage) 156 | .filter(ColumnLineage.target_id.in_([c.id for c in columns])) 157 | .all() 158 | ) 159 | assert set([(e.source.fqdn, e.target.fqdn) for e in all_edges]) == set( 160 | expected_db_edges 161 | ) 162 | 163 | 164 | @pytest.fixture(scope="module") 165 | def get_graph(save_catalog, parse_queries_fixture, graph_sdk): 166 | catalog = save_catalog 167 | job_ids = [] 168 | 169 | with catalog.managed_session: 170 | source = catalog.get_source("test") 171 | for parsed in parse_queries_fixture: 172 | visitor = analyze_dml_query(catalog, parsed, source) 173 | job_execution = extract_lineage( 174 | catalog, 175 | visitor, 176 | source, 177 | parsed, 178 | datetime.datetime.now(), 179 | datetime.datetime.now(), 180 | ) 181 | job_ids.append(job_execution.job_id) 182 | graph = load_graph(graph_sdk, job_ids) 183 | yield graph, catalog 184 | 185 | 186 | def test_column_graph(get_graph): 187 | graph, catalog = get_graph 188 | assert sorted([node[1]["name"] for node in list(graph.graph.nodes(data=True))]) == [ 189 | "LOAD filtered_pagecounts", 190 | "LOAD normalized_pagecounts", 191 | "LOAD page_lookup", 192 | "LOAD page_lookup_nonredirect", 193 | "LOAD page_lookup_redirect", 194 | "test.default.filtered_pagecounts.bytes_sent", 195 | "test.default.filtered_pagecounts.group", 196 | "test.default.filtered_pagecounts.page_title", 197 | "test.default.filtered_pagecounts.views", 198 | "test.default.normalized_pagecounts.bytes_sent", 199 | "test.default.normalized_pagecounts.page_id", 200 | 
"test.default.normalized_pagecounts.page_title", 201 | "test.default.normalized_pagecounts.page_url", 202 | "test.default.normalized_pagecounts.views", 203 | "test.default.page.page_id", 204 | "test.default.page.page_latest", 205 | "test.default.page.page_title", 206 | "test.default.page_lookup.page_id", 207 | "test.default.page_lookup.page_version", 208 | "test.default.page_lookup.redirect_id", 209 | "test.default.page_lookup.redirect_title", 210 | "test.default.page_lookup.true_title", 211 | "test.default.page_lookup_nonredirect.page_id", 212 | "test.default.page_lookup_nonredirect.page_version", 213 | "test.default.page_lookup_nonredirect.redirect_id", 214 | "test.default.page_lookup_nonredirect.redirect_title", 215 | "test.default.page_lookup_nonredirect.true_title", 216 | "test.default.page_lookup_redirect.page_id", 217 | "test.default.page_lookup_redirect.page_version", 218 | "test.default.page_lookup_redirect.redirect_id", 219 | "test.default.page_lookup_redirect.redirect_title", 220 | "test.default.page_lookup_redirect.true_title", 221 | "test.default.pagecounts.bytes_sent", 222 | "test.default.pagecounts.page_title", 223 | "test.default.pagecounts.views", 224 | ] 225 | # expected_edges = [ 226 | # ("column:4", "task:1"), 227 | # ("column:4", "task:3"), 228 | # ("task:1", "column:9"), 229 | # ("task:1", "column:10"), 230 | # ("task:1", "column:11"), 231 | # ("task:1", "column:12"), 232 | # ("task:1", "column:13"), 233 | # ("column:6", "task:1"), 234 | # ("column:6", "task:3"), 235 | # ("column:5", "task:1"), 236 | # ("column:5", "task:3"), 237 | # ("column:14", "task:4"), 238 | # ("task:3", "column:14"), 239 | # ("task:3", "column:15"), 240 | # ("task:3", "column:16"), 241 | # ("task:3", "column:17"), 242 | # ("task:3", "column:18"), 243 | # ("column:15", "task:4"), 244 | # ("column:16", "task:4"), 245 | # ("column:17", "task:4"), 246 | # ("column:18", "task:4"), 247 | # ("task:4", "column:19"), 248 | # ("task:4", "column:20"), 249 | # ("task:4", "column:21"), 250 | # ("task:4", "column:22"), 251 | # ("task:4", "column:23"), 252 | # ("column:21", "task:6"), 253 | # ("column:22", "task:6"), 254 | # ("task:6", "column:28"), 255 | # ("task:6", "column:29"), 256 | # ("task:6", "column:30"), 257 | # ("task:6", "column:31"), 258 | # ("column:26", "task:6"), 259 | # ("column:27", "task:6"), 260 | # ] 261 | 262 | 263 | # assert [ 264 | # (edge[0], edge[1]) for edge in list(edges(graph.graph)) 265 | # ] == expected_edges 266 | -------------------------------------------------------------------------------- /test/test_dml_visitor.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from data_lineage.parser import analyze_dml_query, parse, parse_dml_query, parse_queries 4 | from data_lineage.parser.dml_visitor import ( 5 | CTASVisitor, 6 | SelectIntoVisitor, 7 | SelectSourceVisitor, 8 | ) 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "target, sources, sql", 13 | [ 14 | ((None, "c"), [(None, "a")], "insert into c select x,y from a"), 15 | ( 16 | (None, "c"), 17 | [(None, "a"), (None, "b")], 18 | "insert into c select x,y from a join b on a.id = b.id", 19 | ), 20 | ( 21 | (None, "c"), 22 | [(None, "a"), (None, "b")], 23 | "insert into c select x,y from a join b on a.id = b.id", 24 | ), 25 | ( 26 | (None, "c"), 27 | [(None, "a"), (None, "b")], 28 | "insert into c select x,y from a as aa join b on " "aa.id = b.id", 29 | ), 30 | ], 31 | ) 32 | def test_sanity_insert(target, sources, sql): 33 | parsed = parse(sql) 34 | insert_visitor = 
SelectSourceVisitor("test_sanity_insert") 35 | insert_visitor(parsed.node) 36 | bound_target, bound_tables, bound_cols = insert_visitor.resolve() 37 | 38 | assert bound_target == target 39 | assert bound_tables == sources 40 | 41 | 42 | @pytest.mark.parametrize( 43 | "target, sources, sql", 44 | [ 45 | ((None, "c"), [(None, "a")], "create table c as select x,y from a"), 46 | ( 47 | (None, "c"), 48 | [(None, "a"), (None, "b")], 49 | "create table c as select x,y from a join b on a.id = b.id", 50 | ), 51 | ( 52 | (None, "c"), 53 | [(None, "a"), (None, "b")], 54 | "create table c as select x,y from a join b on a.id = b.id", 55 | ), 56 | ( 57 | (None, "c"), 58 | [(None, "a"), (None, "b")], 59 | "create table c as select x,y from a as aa join b on aa.id = b.id", 60 | ), 61 | ], 62 | ) 63 | def test_sanity_ctas(target, sources, sql): 64 | parsed = parse(sql) 65 | visitor = CTASVisitor("test_sanity_ctas") 66 | visitor(parsed.node) 67 | bound_target, bound_tables, bound_cols = visitor.resolve() 68 | 69 | assert bound_target == target 70 | assert bound_tables == sources 71 | 72 | 73 | @pytest.mark.parametrize( 74 | "target, sources, sql", 75 | [ 76 | ( 77 | (None, "c"), 78 | [(None, "a"), (None, "b")], 79 | "select x,y into c from a join b on a.id = b.id", 80 | ), 81 | ( 82 | (None, "c"), 83 | [(None, "a"), (None, "b")], 84 | "select x,y into c from a join b on a.id = b.id", 85 | ), 86 | ( 87 | (None, "c"), 88 | [(None, "a"), (None, "b")], 89 | "select x,y into c from a as aa join b on aa.id = b.id", 90 | ), 91 | ], 92 | ) 93 | def test_sanity_select_into(target, sources, sql): 94 | parsed = parse(sql) 95 | visitor = SelectIntoVisitor("test_sanity_select_into") 96 | visitor(parsed.node) 97 | bound_target, bound_tables, bound_cols = visitor.resolve() 98 | 99 | assert bound_target == target 100 | assert bound_tables == sources 101 | 102 | 103 | @pytest.mark.parametrize( 104 | "query", 105 | [ 106 | "INSERT INTO page_lookup SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, plr.page_version FROM page_lookup_redirect plr", 107 | "INSERT INTO page_lookup SELECT redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect", 108 | "INSERT INTO page_lookup SELECT page_lookup_redirect.* FROM page_lookup_redirect", 109 | "INSERT INTO page_lookup SELECT * FROM page_lookup_redirect", 110 | 'INSERT INTO "default".page_lookup SELECT * FROM page_lookup_redirect', 111 | "SELECT * INTO page_lookup from page_lookup_redirect", 112 | 'SELECT * INTO "default".page_lookup from page_lookup_redirect', 113 | """ 114 | INSERT INTO page_lookup 115 | SELECT * FROM ( 116 | select redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect 117 | ) plr 118 | """, 119 | """ 120 | INSERT INTO page_lookup 121 | SELECT plr.* FROM ( 122 | select redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect 123 | ) plr 124 | """, 125 | """ 126 | INSERT INTO page_lookup 127 | SELECT redirect_id, redirect_title, true_title, page_id, page_version FROM ( 128 | select redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect 129 | ) plr 130 | """, 131 | """ 132 | INSERT INTO page_lookup 133 | SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, plr.page_version FROM ( 134 | select redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect 135 | ) plr 136 | """, 137 | ], 138 | ) 139 | def test_insert(managed_session, query): 140 | source = 
managed_session.get_source("test") 141 | parsed = parse(query) 142 | visitor = analyze_dml_query(managed_session, parsed, source) 143 | assert visitor is not None 144 | 145 | assert len(visitor.target_columns) == 5 146 | assert visitor.target_table.fqdn == ("test", "default", "page_lookup") 147 | assert len(visitor.source_columns) == 5 148 | assert [table.fqdn for table in visitor.source_tables] == [ 149 | ("test", "default", "page_lookup_redirect") 150 | ] 151 | 152 | 153 | def test_insert_cols(managed_session): 154 | source = managed_session.get_source("test") 155 | query = "INSERT INTO page_lookup_nonredirect(page_id, page_version) SELECT page.page_id, page.page_latest FROM page" 156 | parsed = parse(query) 157 | visitor = analyze_dml_query(managed_session, parsed, source) 158 | assert visitor is not None 159 | 160 | assert len(visitor.target_columns) == 2 161 | assert visitor.target_table.fqdn == ("test", "default", "page_lookup_nonredirect") 162 | assert len(visitor.source_columns) == 2 163 | assert [table.fqdn for table in visitor.source_tables] == [ 164 | ("test", "default", "page") 165 | ] 166 | 167 | 168 | def test_insert_with_join(managed_session): 169 | source = managed_session.get_source("test") 170 | query = "insert into page_lookup_redirect select original_page.page_id redirect_id, original_page.page_title redirect_title, final_page.page_title as true_title, final_page.page_id, final_page.page_latest from page final_page join redirect on (redirect.page_title = final_page.page_title) join page original_page on (redirect.rd_from = original_page.page_id)" 171 | parsed = parse(query) 172 | visitor = analyze_dml_query(managed_session, parsed, source) 173 | assert visitor is not None 174 | 175 | assert len(visitor.target_columns) == 5 176 | assert visitor.target_table.fqdn == ("test", "default", "page_lookup_redirect") 177 | assert len(visitor.source_columns) == 5 178 | assert sorted([table.fqdn for table in visitor.source_tables]) == [ 179 | ("test", "default", "page"), 180 | ("test", "default", "redirect"), 181 | ] 182 | 183 | 184 | @pytest.mark.parametrize( 185 | "query", 186 | [ 187 | "with pln as (select redirect_title, true_title, page_id, page_version from page_lookup_nonredirect) insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) select redirect_title, true_title, page_id, page_version from pln;", 188 | "with pln as (select * from page_lookup_nonredirect) insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) select redirect_title, true_title, page_id, page_version from pln;", 189 | "with pln as (select redirect_title, true_title, page_id, page_version from page_lookup_nonredirect) insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) select * from pln;", 190 | "with pln as (select redirect_title as t1, true_title as t2, page_id as t3, page_version as t4 from page_lookup_nonredirect) insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) select t1, t2, t3, t4 from pln;", 191 | "insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) with pln as (select redirect_title, true_title, page_id, page_version from page_lookup_nonredirect) select redirect_title, true_title, page_id, page_version from pln;", 192 | ], 193 | ) 194 | def test_with_clause(managed_session, query): 195 | source = managed_session.get_source("test") 196 | parsed = parse(query) 197 | visitor = analyze_dml_query(managed_session, parsed, source) 198 | assert 
visitor is not None
199 | 
200 |     assert len(visitor.target_columns) == 4
201 |     assert visitor.target_table.fqdn == ("test", "default", "page_lookup_redirect")
202 |     assert len(visitor.source_columns) == 4
203 |     assert [table.fqdn for table in visitor.source_tables] == [
204 |         ("test", "default", "page_lookup_nonredirect")
205 |     ]
206 | 
207 | 
208 | def test_col_exprs(managed_session):
209 |     query = """
210 |     INSERT INTO page_lookup_redirect(true_title)
211 |     SELECT
212 |     BTRIM(TO_CHAR(DATEADD (MONTH,-1,('20' ||MAX ("redirect_id") || '-01')::DATE)::DATE,'YY-MM')) AS "max_month"
213 |     FROM page_lookup_nonredirect;
214 |     """
215 |     source = managed_session.get_source("test")
216 |     parsed = parse(query)
217 |     visitor = analyze_dml_query(catalog=managed_session, parsed=parsed, source=source)
218 |     assert visitor is not None
219 | 
220 |     assert len(visitor.target_columns) == 1
221 |     assert visitor.target_table.fqdn == ("test", "default", "page_lookup_redirect")
222 |     assert len(visitor.source_columns) == 1
223 |     assert [table.fqdn for table in visitor.source_tables] == [
224 |         ("test", "default", "page_lookup_nonredirect")
225 |     ]
226 | 
227 | 
228 | def test_syntax_errors():
229 |     queries = [
230 |         "INSERT INTO page_lookup_nonredirect(page_id, latest) SELECT page.page_id, page.page_latest FROM page",
231 |         "select a from table(b)",
232 |         "INSERT INTO page_lookup_nonredirect SELECT page.page_id, page.page_latest FROM page",
233 |     ]
234 | 
235 |     parsed = parse_queries(queries)
236 | 
237 |     assert len(parsed) == 2
238 | 
239 | 
240 | def test_parse_query(managed_session):
241 |     query = """
242 |     SELECT BTRIM(TO_CHAR(DATEADD (MONTH,-1,(\'20\' ||MAX ("group") || \'-01\')::DATE)::DATE,\'YY-MM\')) AS "max_month",
243 |     DATEADD(YEAR,-1,DATEADD (MONTH,-3,LAST_DAY (DATEADD (MONTH,-1,(\'20\' ||MAX ("group") || \'-01\')::DATE)::DATE))::DATE)::DATE AS "min_date",
244 |     DATEADD(MONTH,-3,LAST_DAY (DATEADD (MONTH,-1,(\'20\' ||MAX ("group") || \'-01\')::DATE)::DATE))::DATE AS "max_date",
245 |     page_title,
246 |     bytes_sent as mb_sent
247 |     INTO "new_table"
248 |     FROM pagecounts;
249 |     """
250 |     source = managed_session.get_source("test")
251 |     parsed = parse(query)
252 |     binder = parse_dml_query(catalog=managed_session, parsed=parsed, source=source)
253 |     assert [context.alias for context in binder.columns] == [
254 |         "max_month",
255 |         "min_date",
256 |         "max_date",
257 |         "page_title",
258 |         "mb_sent",
259 |     ]
260 | 
261 | 
262 | def test_ctas(managed_session):
263 |     query = """
264 |     CREATE TEMP TABLE temp_table_x(page_title) AS select redirect_title from page_lookup_nonredirect
265 |     where redirect_title is not null
266 |     """
267 |     source = managed_session.get_source("test")
268 |     schema = managed_session.get_schema("test", "default")
269 |     managed_session.update_source(source, schema)
270 |     parsed = parse(query)
271 |     visitor = analyze_dml_query(managed_session, parsed, source)
272 |     assert visitor is not None
273 | 
274 |     assert len(visitor.target_columns) == 1
275 |     assert visitor.target_table.fqdn == ("test", "default", "temp_table_x")
276 |     assert len(visitor.source_columns) == 1
277 |     assert [table.fqdn for table in visitor.source_tables] == [
278 |         ("test", "default", "page_lookup_nonredirect")
279 |     ]
280 | 
--------------------------------------------------------------------------------
/test/test_scan.py:
--------------------------------------------------------------------------------
1 | import psycopg2
2 | import pytest
3 | from fakeredis import FakeStrictRedis
4 | from rq import Queue
5 | 
6 | pii_data_script = """
7 | create table no_pii(a text, b text);
8 | insert into no_pii values ('abc', 'def');
9 | insert into no_pii values ('xsfr', 'asawe');
10 | 
11 | create table partial_pii(a text, b text);
12 | insert into partial_pii values ('917-908-2234', 'plkj');
13 | insert into partial_pii values ('215-099-2234', 'sfrf');
14 | 
15 | create table full_pii(name text, location text);
16 | insert into full_pii values ('Jonathan Smith', 'Virginia');
17 | insert into full_pii values ('Chase Ryan', 'Chennai');
18 | 
19 | """
20 | 
21 | 
22 | pii_data_load = [
23 |     "create table no_pii(a text, b text)",
24 |     "insert into no_pii values ('abc', 'def')",
25 |     "insert into no_pii values ('xsfr', 'asawe')",
26 |     "create table partial_pii(a text, b text)",
27 |     "insert into partial_pii values ('917-908-2234', 'plkj')",
28 |     "insert into partial_pii values ('215-099-2234', 'sfrf')",
29 |     "create table full_pii(name text, location text)",
30 |     "insert into full_pii values ('Jonathan Smith', 'Virginia')",
31 |     "insert into full_pii values ('Chase Ryan', 'Chennai')",
32 | ]
33 | 
34 | pii_data_drop = ["DROP TABLE full_pii", "DROP TABLE partial_pii", "DROP TABLE no_pii"]
35 | 
36 | 
37 | def pg_conn():
38 |     return (
39 |         psycopg2.connect(
40 |             host="127.0.0.1", user="piiuser", password="p11secret", database="piidb"
41 |         ),
42 |         "public",
43 |     )
44 | 
45 | 
46 | @pytest.fixture(scope="module")
47 | def load_all_data():
48 |     params = [pg_conn()]
49 |     for p in params:
50 |         db_conn, expected_schema = p
51 |         with db_conn.cursor() as cursor:
52 |             for statement in pii_data_load:
53 |                 cursor.execute(statement)
54 |             cursor.execute("commit")
55 |     yield params
56 |     for p in params:
57 |         db_conn, expected_schema = p
58 |         with db_conn.cursor() as cursor:
59 |             for statement in pii_data_drop:
60 |                 cursor.execute(statement)
61 |             cursor.execute("commit")
62 | 
63 |     for p in params:
64 |         db_conn, expected_schema = p
65 |         db_conn.close()
66 | 
67 | 
68 | @pytest.fixture(scope="module")
69 | def setup_catalog_and_data(load_all_data, rest_catalog):
70 |     catalog = rest_catalog
71 |     source = catalog.add_source(
72 |         name="pg_scan",
73 |         source_type="postgresql",
74 |         uri="127.0.0.1",
75 |         username="piiuser",
76 |         password="p11secret",
77 |         database="piidb",
78 |         cluster="public",
79 |     )
80 |     yield catalog, source
81 | 
82 | 
83 | @pytest.fixture(scope="module")
84 | def fake_queue():
85 |     yield Queue(is_async=False, connection=FakeStrictRedis())
86 | 
87 | 
88 | def test_scan_source(setup_catalog_and_data, scan_sdk):
89 |     catalog, source = setup_catalog_and_data
90 |     scan_sdk.start(source)
91 | 
92 |     pg_source = catalog.get_source("pg_scan")
93 |     assert pg_source is not None
94 | 
95 |     no_pii = catalog.get_table("pg_scan", "public", "no_pii")
96 |     assert no_pii is not None
97 | 
--------------------------------------------------------------------------------
/test/test_server.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | 
4 | import pytest
5 | from dbcat.catalog.models import ColumnLineage, Job, JobExecution, JobExecutionStatus
6 | 
7 | from data_lineage import (
8 |     ColumnNotFound,
9 |     ParseError,
10 |     SchemaNotFound,
11 |     SourceNotFound,
12 |     TableNotFound,
13 | )
14 | 
15 | 
16 | def test_get_sources(rest_catalog):
17 |     source = rest_catalog.get_source("test")
18 |     assert source.name == "test"
19 |     assert source.id is not None
20 | 
21 | 
22 | def test_get_schemata(rest_catalog):
23 |     schema = rest_catalog.get_schema("test", "default")
24 |     assert schema.name == "default"
25 |     assert schema.id is not None
26 | 
27 | 
28 | def test_get_tables(rest_catalog):
29 |     num = 0
30 |     for table in rest_catalog.get_tables():
31 |         assert table.id is not None
32 |         assert table.name is not None
33 |         num += 1
34 |     assert num == 12
35 | 
36 | 
37 | def test_get_columns(rest_catalog):
38 |     num = 0
39 |     for column in rest_catalog.get_columns():
40 |         assert column.id is not None
41 |         assert column.name is not None
42 |         assert column.data_type is not None
43 |         assert column.sort_order is not None
44 |         num += 1
45 | 
46 |     assert num == 40
47 | 
48 | 
49 | def test_get_source_by_id(rest_catalog):
50 |     source = rest_catalog.get_source_by_id(1)
51 |     print(source.__class__.__name__)
52 |     assert source.name == "test"
53 |     assert source.fqdn == "test"
54 |     assert source.source_type == "redshift"
55 | 
56 | 
57 | def test_get_schema_by_id(rest_catalog):
58 |     schema = rest_catalog.get_schema_by_id(1)
59 |     assert schema.name == "default"
60 |     assert schema.fqdn == ["test", "default"]
61 | 
62 | 
63 | def test_get_table_by_id(rest_catalog):
64 |     table = rest_catalog.get_table_by_id(1)
65 |     assert table.name == "pagecounts"
66 |     assert table.fqdn == ["test", "default", "pagecounts"]
67 | 
68 | 
69 | def test_get_column_by_id(rest_catalog):
70 |     column = rest_catalog.get_column_by_id(1)
71 |     assert column.name == "group"
72 |     assert column.fqdn == ["test", "default", "pagecounts", "group"]
73 | 
74 | 
75 | def test_get_source(rest_catalog):
76 |     source = rest_catalog.get_source("test")
77 |     assert source.name == "test"
78 |     assert source.id is not None
79 | 
80 | 
81 | def test_get_schema(rest_catalog):
82 |     schema = rest_catalog.get_schema("test", "default")
83 |     assert schema.name == "default"
84 |     assert schema.id is not None
85 | 
86 | 
87 | def test_get_table(rest_catalog):
88 |     table = rest_catalog.get_table("test", "default", "normalized_pagecounts")
89 |     assert table.id is not None
90 |     assert table.name == "normalized_pagecounts"
91 | 
92 | 
93 | def test_get_column(rest_catalog):
94 |     column = rest_catalog.get_column("test", "default", "pagecounts", "bytes_sent")
95 |     assert column.id is not None
96 |     assert column.name is not None
97 |     assert column.sort_order is not None
98 | 
99 | 
100 | def test_get_source_exception(rest_catalog):
101 |     with pytest.raises(SourceNotFound):
102 |         rest_catalog.get_source("tes")
103 | 
104 | 
105 | @pytest.mark.parametrize(
106 |     "source_name, schema_name", [("test", "def"), ("tes", "default")]
107 | )
108 | def test_get_schema_exception(rest_catalog, source_name, schema_name):
109 |     with pytest.raises(SchemaNotFound):
110 |         rest_catalog.get_schema(source_name, schema_name)
111 | 
112 | 
113 | def test_add_source_pg(rest_catalog):
114 |     data = {
115 |         "name": "pg",
116 |         "source_type": "postgres",
117 |         "database": "db_database",
118 |         "username": "db_user",
119 |         "password": "db_password",
120 |         "port": "db_port",
121 |         "uri": "db_uri",
122 |     }
123 | 
124 |     pg_connection = rest_catalog.add_source(**data)
125 |     assert pg_connection.name == "pg"
126 |     assert pg_connection.source_type == "postgres"
127 |     assert pg_connection.database == "db_database"
128 |     assert pg_connection.username == "db_user"
129 |     assert pg_connection.password == "db_password"
130 |     assert pg_connection.port == "db_port"
131 |     assert pg_connection.uri == "db_uri"
132 | 
133 | 
134 | def test_add_source_mysql(rest_catalog):
135 |     data = {
136 |         "name": "mys",
137 |         "source_type": "mysql",
138 |         "database": "db_database",
139 |         "username": "db_user",
140 |         "password": "db_password",
141 |         "port": "db_port",
142 |         "uri": "db_uri",
143 |     }
144 | 
145 |     mysql_conn = rest_catalog.add_source(**data)
146 | 
147 |     assert mysql_conn.name == "mys"
148 |     assert mysql_conn.source_type == "mysql"
149 |     assert mysql_conn.database == "db_database"
150 |     assert mysql_conn.username == "db_user"
151 |     assert mysql_conn.password == "db_password"
152 |     assert mysql_conn.port == "db_port"
153 |     assert mysql_conn.uri == "db_uri"
154 | 
155 | 
156 | def test_add_source_bq(rest_catalog):
157 |     bq_conn = rest_catalog.add_source(
158 |         name="bq",
159 |         source_type="bigquery",
160 |         key_path="db_key_path",
161 |         project_credentials="db_creds",
162 |         project_id="db_project_id",
163 |     )
164 |     assert bq_conn.name == "bq"
165 |     assert bq_conn.source_type == "bigquery"
166 |     assert bq_conn.key_path == "db_key_path"
167 |     assert bq_conn.project_credentials == "db_creds"
168 |     assert bq_conn.project_id == "db_project_id"
169 | 
170 | 
171 | def test_add_source_glue(rest_catalog):
172 |     glue_conn = rest_catalog.add_source(name="gl", source_type="glue")
173 |     assert glue_conn.name == "gl"
174 |     assert glue_conn.source_type == "glue"
175 | 
176 | 
177 | def test_add_source_snowflake(rest_catalog):
178 |     sf_conn = rest_catalog.add_source(
179 |         name="sf",
180 |         source_type="snowflake",
181 |         database="db_database",
182 |         username="db_user",
183 |         password="db_password",
184 |         account="db_account",
185 |         role="db_role",
186 |         warehouse="db_warehouse",
187 |     )
188 |     assert sf_conn.name == "sf"
189 |     assert sf_conn.source_type == "snowflake"
190 |     assert sf_conn.database == "db_database"
191 |     assert sf_conn.username == "db_user"
192 |     assert sf_conn.password == "db_password"
193 |     assert sf_conn.account == "db_account"
194 |     assert sf_conn.role == "db_role"
195 |     assert sf_conn.warehouse == "db_warehouse"
196 | 
197 | 
198 | def test_update_source(rest_catalog):
199 |     glue_conn = rest_catalog.add_source(name="gl_2", source_type="glue")
200 |     schema_1 = rest_catalog.add_schema("schema_1", glue_conn)
201 | 
202 |     default_schema = rest_catalog.update_source(glue_conn, schema_1)
203 | 
204 |     assert default_schema.source.id == glue_conn.id
205 |     assert default_schema.schema.id == schema_1.id
206 | 
207 |     schema_2 = rest_catalog.add_schema("schema_2", glue_conn)
208 | 
209 |     default_schema = rest_catalog.update_source(glue_conn, schema_2)
210 | 
211 |     assert default_schema.source.id == glue_conn.id
212 |     assert default_schema.schema.id == schema_2.id
213 | 
214 | 
215 | def load_edges(catalog, expected_edges, job_execution_id):
216 |     column_edge_ids = []
217 |     for edge in expected_edges:
218 |         source = catalog.get_column(
219 |             source_name=edge[0][0],
220 |             schema_name=edge[0][1],
221 |             table_name=edge[0][2],
222 |             column_name=edge[0][3],
223 |         )
224 | 
225 |         target = catalog.get_column(
226 |             source_name=edge[1][0],
227 |             schema_name=edge[1][1],
228 |             table_name=edge[1][2],
229 |             column_name=edge[1][3],
230 |         )
231 | 
232 |         added_edge = catalog.add_column_lineage(source, target, job_execution_id, {})
233 | 
234 |         column_edge_ids.append(added_edge.id)
235 |     return column_edge_ids
236 | 
237 | 
238 | @pytest.fixture(scope="module")
239 | def load_page_lookup_nonredirect_edges(save_catalog):
240 |     catalog = save_catalog
241 | 
242 |     expected_edges = [
243 |         (
244 |             ("test", "default", "page", "page_id"),
245 |             ("test", "default", "page_lookup_nonredirect", "redirect_id"),
246 |         ),
247 |         (
248 |             ("test", "default", "page", "page_id"),
249 |             ("test", "default", "page_lookup_nonredirect", "page_id"),
250 |         ),
251 |         (
252 |             ("test", "default", "page", "page_title"),
253 |             ("test", "default", "page_lookup_nonredirect", "redirect_title"),
254 |         ),
255 |         (
256 |             ("test", "default", "page", "page_title"),
257 |             ("test", "default", "page_lookup_nonredirect", "true_title"),
258 |         ),
259 |         (
260 |             ("test", "default", "page", "page_latest"),
261 |             ("test", "default", "page_lookup_nonredirect", "page_version"),
262 |         ),
263 |     ]
264 | 
265 |     job_id = None
266 | 
267 |     with catalog.managed_session:
268 |         job = catalog.add_job(
269 |             "insert_page_lookup_nonredirect",
270 |             catalog.get_source("test"),
271 |             {"sql": "insert into page_lookup_nonredirect select from page"},
272 |         )
273 |         e1 = catalog.add_job_execution(
274 |             job=job,
275 |             started_at=datetime.datetime.combine(
276 |                 datetime.date(2021, 4, 1), datetime.time(1, 0)
277 |             ),
278 |             ended_at=datetime.datetime.combine(
279 |                 datetime.date(2021, 4, 1), datetime.time(1, 15)
280 |             ),
281 |             status=JobExecutionStatus.SUCCESS,
282 |         )
283 | 
284 |         executions = [e1.id]
285 |         name = job.name
286 |         job_id = job.id
287 | 
288 |         print("Inserted job {}".format(name))
289 |         print("Inserted executions {}".format(",".join(str(v) for v in executions)))
290 | 
291 |         column_edge_ids = load_edges(catalog, expected_edges, executions[0])
292 |         print("Inserted edges {}".format(",".join(str(v) for v in column_edge_ids)))
293 | 
294 |     yield catalog, job_id, expected_edges
295 | 
296 |     with catalog.managed_session as session:
297 |         session.query(ColumnLineage).filter(
298 |             ColumnLineage.id.in_(column_edge_ids)
299 |         ).delete(synchronize_session=False)
300 |         print("DELETED edges {}".format(",".join(str(v) for v in column_edge_ids)))
301 |         session.commit()
302 | 
303 |         session.query(JobExecution).filter(JobExecution.id.in_(executions)).delete(
304 |             synchronize_session=False
305 |         )
306 |         print("DELETED executions {}".format(",".join(str(v) for v in executions)))
307 |         session.commit()
308 | 
309 |         session.query(Job).filter(Job.name == name).delete(synchronize_session=False)
310 |         print("DELETED job {}".format(name))
311 |         session.commit()
312 | 
313 | 
314 | def test_api_main(graph_sdk, load_page_lookup_nonredirect_edges):
315 |     catalog, job_id, expected_edges = load_page_lookup_nonredirect_edges
316 |     graph = graph_sdk.get([job_id])
317 |     assert len(graph["edges"]) == 10
318 |     assert len(graph["nodes"]) == 15
319 | 
320 | 
321 | def test_parser(rest_catalog, parser_sdk, graph_sdk, save_catalog):
322 |     source = rest_catalog.get_source("test")
323 |     data = {
324 |         "name": "LOAD page_lookup",
325 |         "query": "INSERT INTO page_lookup SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, "
326 |         "plr.page_version FROM page_lookup_redirect plr",
327 |         "source": source,
328 |         "start_time": datetime.datetime.now(),
329 |         "end_time": datetime.datetime.now(),
330 |     }
331 | 
332 |     job_execution = parser_sdk.analyze(**data)
333 |     assert job_execution is not None
334 | 
335 |     graph = graph_sdk.get([job_execution.job_id])
336 | 
337 |     assert len(graph["edges"]) == 10
338 |     assert len(graph["nodes"]) == 15
339 | 
340 |     column_lineages = rest_catalog.get_column_lineage([job_execution.job_id])
341 |     assert (len(column_lineages)) == 10
342 | 
343 | 
344 | @pytest.mark.parametrize(
345 |     "query",
346 |     [
347 |         "insert into p_lookup select * from page_lookup_redirect",
348 |         "insert into page_lookup select * from pg_lp_rt",
349 |         "insert into page_lookup select plr.page_id, true_title from page_lookup_redirect",
350 |     ],
351 | )
352 | def test_parser_table_not_found(rest_catalog, parser_sdk, managed_session, query):
353 |     source = rest_catalog.get_source("test")
354 | 
355 |     with pytest.raises(TableNotFound) as exc:
356 |         parser_sdk.analyze(
357 |             query=query,
358 |             source=source,
359 |             start_time=datetime.datetime.now(),
360 |             end_time=datetime.datetime.now(),
361 |         )
362 |     logging.debug(exc)
363 | 
364 | 
365 | @pytest.mark.parametrize(
366 |     "query",
367 |     [
368 |         "insert into page_lookup(title) select true_title from page_lookup_redirect",
369 |         "insert into page_lookup(true_title) select title from page_lookup_redirect",
370 |     ],
371 | )
372 | def test_parser_column_not_found(rest_catalog, parser_sdk, managed_session, query):
373 |     source = rest_catalog.get_source("test")
374 | 
375 |     with pytest.raises(ColumnNotFound) as exc:
376 |         parser_sdk.analyze(
377 |             query=query,
378 |             source=source,
379 |             start_time=datetime.datetime.now(),
380 |             end_time=datetime.datetime.now(),
381 |         )
382 |     logging.debug(exc)
383 | 
384 | 
385 | @pytest.mark.parametrize(
386 |     "query", ["insert page_lookup select * from page_lookup_redirect"]
387 | )
388 | def test_parser_parse_error(rest_catalog, parser_sdk, managed_session, query):
389 |     source = rest_catalog.get_source("test")
390 | 
391 |     with pytest.raises(ParseError) as exc:
392 |         parser_sdk.analyze(
393 |             query=query,
394 |             source=source,
395 |             start_time=datetime.datetime.now(),
396 |             end_time=datetime.datetime.now(),
397 |         )
398 |     logging.debug(exc)
399 | 
--------------------------------------------------------------------------------