├── .chglog ├── CHANGELOG.tpl.md └── config.yml ├── .circleci └── config.yml ├── .coveragerc ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── MANIFEST.in ├── README.md ├── api_example.ipynb ├── data_lineage ├── __init__.py ├── __main__.py ├── assets │ └── favicon.ico ├── graph.py ├── parser │ ├── __init__.py │ ├── binder.py │ ├── dml_visitor.py │ └── visitor.py ├── server.py └── worker.py ├── docker ├── Dockerfile ├── build_image.sh └── docker-entrypoint.sh ├── example.ipynb ├── full_graph.png ├── install-manifests ├── docker-compose │ ├── catalog-demo.yml │ ├── tokern-lineage-engine.yml │ └── wikimedia-demo.yml └── dockerfiles │ ├── Dockerfile-demo-catalog │ ├── Dockerfile-demo-wikimedia │ ├── Makefile │ ├── demo-catalog.sql │ └── demo-wikimedia.sql ├── one_task.png ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── setup.cfg └── test ├── catalog.json ├── conftest.py ├── queries.json ├── test_data_lineage.py ├── test_db_graph.py ├── test_dml_visitor.py ├── test_scan.py └── test_server.py /.chglog/CHANGELOG.tpl.md: -------------------------------------------------------------------------------- 1 | {{ range .Versions }} 2 | 3 | ## {{ if .Tag.Previous }}[{{ .Tag.Name }}]({{ $.Info.RepositoryURL }}/compare/{{ .Tag.Previous.Name }}...{{ .Tag.Name }}){{ else }}{{ .Tag.Name }}{{ end }} ({{ datetime "2006-01-02" .Tag.Date }}) 4 | 5 | {{ range .CommitGroups -}} 6 | ### {{ .Title }} 7 | 8 | {{ range .Commits -}} 9 | * {{ .Subject }} 10 | {{ end }} 11 | {{ end -}} 12 | 13 | {{- if .RevertCommits -}} 14 | ### Reverts 15 | 16 | {{ range .RevertCommits -}} 17 | * {{ .Revert.Header }} 18 | {{ end }} 19 | {{ end -}} 20 | 21 | {{- if .NoteGroups -}} 22 | {{ range .NoteGroups -}} 23 | ### {{ .Title }} 24 | 25 | {{ range .Notes }} 26 | {{ .Body }} 27 | {{ end }} 28 | {{ end -}} 29 | {{ end -}} 30 | {{ end -}} -------------------------------------------------------------------------------- /.chglog/config.yml: -------------------------------------------------------------------------------- 1 | style: github 2 | template: CHANGELOG.tpl.md 3 | info: 4 | title: CHANGELOG 5 | repository_url: https://github.com/tokern/data-lineage 6 | options: 7 | commits: 8 | # filters: 9 | # Type: 10 | # - feat 11 | # - fix 12 | # - perf 13 | # - refactor 14 | commit_groups: 15 | # title_maps: 16 | # feat: Features 17 | # fix: Bug Fixes 18 | # perf: Performance Improvements 19 | # refactor: Code Refactoring 20 | header: 21 | pattern: "^(\\w*)\\:\\s(.*)$" 22 | pattern_maps: 23 | - Type 24 | - Subject 25 | notes: 26 | keywords: 27 | - BREAKING CHANGE -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2.1 6 | orbs: 7 | codecov: codecov/codecov@1.0.5 8 | python: circleci/python@1.4.0 9 | workflows: 10 | build_and_deploy: 11 | jobs: 12 | - build: 13 | filters: 14 | tags: 15 | only: /.*/ 16 | - deploy: 17 | requires: 18 | - build 19 | filters: 20 | tags: 21 | only: /v[0-9]+(\.[0-9]+)*/ 22 | branches: 23 | ignore: /.*/ 24 | 25 | jobs: 26 | build: &test-template 27 | docker: 28 | - image: circleci/python:3.8.3 29 | environment: 30 | PIPENV_VENV_IN_PROJECT: true 31 | # Specify service dependencies here if necessary 32 | # CircleCI maintains a library of pre-built images 33 | # documented at https://circleci.com/docs/2.0/circleci-images/ 34 | - 
image: circleci/postgres:12.0-alpine-ram 35 | environment: 36 | POSTGRES_USER: piiuser 37 | POSTGRES_PASSWORD: p11secret 38 | POSTGRES_DB: piidb 39 | 40 | - image: circleci/mysql:8.0.18-ram 41 | environment: 42 | MYSQL_USER: piiuser 43 | MYSQL_PASSWORD: p11secret 44 | MYSQL_DATABASE: piidb 45 | MYSQL_ROOT_PASSWORD: r00tPa33w0rd 46 | environment: 47 | PYVERSION: "3.8.3" 48 | working_directory: ~/repo 49 | 50 | steps: 51 | - checkout 52 | 53 | - run: 54 | name: install dockerize 55 | command: wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz 56 | environment: 57 | DOCKERIZE_VERSION: v0.3.0 58 | 59 | - run: 60 | name: Wait for db 61 | command: | 62 | dockerize -wait tcp://localhost:5432 -timeout 1m 63 | dockerize -wait tcp://localhost:3306 -timeout 1m 64 | 65 | - python/install-packages: 66 | pkg-manager: poetry 67 | include-python-in-cache-key: false 68 | include-branch-in-cache-key: false 69 | 70 | # run tests! 71 | - run: 72 | name: run tests 73 | command: | 74 | poetry run isort --check --diff . 75 | poetry run black --check . 76 | poetry run flake8 data_lineage test 77 | poetry run pytest --junitxml=junit/test-results.xml --cov=data_lineage --cov-report=xml --cov-report=html test/ 78 | 79 | - store_test_results: # Upload test results for display in Test Summary: https://circleci.com/docs/2.0/collect-test-data/ 80 | path: test-results 81 | 82 | - store_artifacts: 83 | path: test-reports 84 | destination: test-reports 85 | 86 | - codecov/upload: 87 | file: coverage.xml 88 | 89 | deploy: 90 | environment: 91 | PYVERSION: "3.8.11" 92 | docker: 93 | - image: tokern/python:3.8.11-buster 94 | environment: 95 | PYVERSION: "3.8.11" 96 | steps: 97 | - checkout 98 | - python/install-packages: 99 | pkg-manager: poetry 100 | include-python-in-cache-key: false 101 | include-branch-in-cache-key: false 102 | 103 | - run: 104 | name: create packages 105 | command: | 106 | poetry publish --build --username "${PYPI_USERNAME}" --password "${PYPI_PASSWORD}" 107 | 108 | - run: 109 | name: install git release utilities 110 | command: | 111 | go get github.com/aktau/github-release 112 | GO111MODULE=on go get -u github.com/git-chglog/git-chglog/cmd/git-chglog 113 | 114 | - run: 115 | name: release 116 | command: | 117 | ~/go/bin/git-chglog $CIRCLE_TAG | ~/go/bin/github-release release --description - --tag $CIRCLE_TAG 118 | 119 | - setup_remote_docker 120 | 121 | - run: 122 | name: build docker and publish 123 | command: | 124 | ./docker/build_image.sh $CIRCLE_TAG --publish --latest 125 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = */data_lineage/* 4 | 5 | [report] 6 | exclude_lines = 7 | if self.debug: 8 | pragma: no cover 9 | raise NotImplementedError 10 | if __name__ == .__main__.: 11 | ignore_errors = True 12 | omit = 13 | test/* 14 | setup.py 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/python,pycharm 2 | # Edit at https://www.gitignore.io/?templates=python,pycharm 3 | 4 | ### PyCharm ### 5 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, 
AppCode, PyCharm, CLion, Android Studio and WebStorm 6 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 7 | 8 | .idea 9 | 10 | # User-specific stuff 11 | .idea/**/workspace.xml 12 | .idea/**/tasks.xml 13 | .idea/**/usage.statistics.xml 14 | .idea/**/dictionaries 15 | .idea/**/shelf 16 | 17 | # Generated files 18 | .idea/**/contentModel.xml 19 | 20 | # Sensitive or high-churn files 21 | .idea/**/dataSources/ 22 | .idea/**/dataSources.ids 23 | .idea/**/dataSources.local.xml 24 | .idea/**/sqlDataSources.xml 25 | .idea/**/dynamic.xml 26 | .idea/**/uiDesigner.xml 27 | .idea/**/dbnavigator.xml 28 | 29 | # Gradle 30 | .idea/**/gradle.xml 31 | .idea/**/libraries 32 | 33 | # Gradle and Maven with auto-import 34 | # When using Gradle or Maven with auto-import, you should exclude module files, 35 | # since they will be recreated, and may cause churn. Uncomment if using 36 | # auto-import. 37 | # .idea/modules.xml 38 | # .idea/*.iml 39 | # .idea/modules 40 | # *.iml 41 | # *.ipr 42 | 43 | # CMake 44 | cmake-build-*/ 45 | 46 | # Mongo Explorer plugin 47 | .idea/**/mongoSettings.xml 48 | 49 | # File-based project format 50 | *.iws 51 | 52 | # IntelliJ 53 | out/ 54 | 55 | # mpeltonen/sbt-idea plugin 56 | .idea_modules/ 57 | 58 | # JIRA plugin 59 | atlassian-ide-plugin.xml 60 | 61 | # Cursive Clojure plugin 62 | .idea/replstate.xml 63 | 64 | # Crashlytics plugin (for Android Studio and IntelliJ) 65 | com_crashlytics_export_strings.xml 66 | crashlytics.properties 67 | crashlytics-build.properties 68 | fabric.properties 69 | 70 | # Editor-based Rest Client 71 | .idea/httpRequests 72 | 73 | # Android studio 3.1+ serialized cache file 74 | .idea/caches/build_file_checksums.ser 75 | 76 | ### PyCharm Patch ### 77 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 78 | 79 | # *.iml 80 | # modules.xml 81 | # .idea/misc.xml 82 | # *.ipr 83 | 84 | # Sonarlint plugin 85 | .idea/**/sonarlint/ 86 | 87 | # SonarQube Plugin 88 | .idea/**/sonarIssues.xml 89 | 90 | # Markdown Navigator plugin 91 | .idea/**/markdown-navigator.xml 92 | .idea/**/markdown-navigator/ 93 | 94 | ### Python ### 95 | # Byte-compiled / optimized / DLL files 96 | __pycache__/ 97 | *.py[cod] 98 | *$py.class 99 | 100 | # C extensions 101 | *.so 102 | 103 | # Distribution / packaging 104 | .Python 105 | build/ 106 | develop-eggs/ 107 | dist/ 108 | downloads/ 109 | eggs/ 110 | .eggs/ 111 | lib/ 112 | lib64/ 113 | parts/ 114 | sdist/ 115 | var/ 116 | wheels/ 117 | pip-wheel-metadata/ 118 | share/python-wheels/ 119 | *.egg-info/ 120 | .installed.cfg 121 | *.egg 122 | MANIFEST 123 | 124 | # PyInstaller 125 | # Usually these files are written by a python script from a template 126 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 127 | *.manifest 128 | *.spec 129 | 130 | # Installer logs 131 | pip-log.txt 132 | pip-delete-this-directory.txt 133 | 134 | # Unit test / coverage reports 135 | htmlcov/ 136 | .tox/ 137 | .nox/ 138 | .coverage 139 | .coverage.* 140 | .cache 141 | nosetests.xml 142 | coverage.xml 143 | *.cover 144 | .hypothesis/ 145 | .pytest_cache/ 146 | 147 | # Translations 148 | *.mo 149 | *.pot 150 | 151 | # Scrapy stuff: 152 | .scrapy 153 | 154 | # Sphinx documentation 155 | docs/_build/ 156 | 157 | # PyBuilder 158 | target/ 159 | 160 | # pyenv 161 | .python-version 162 | 163 | # pipenv 164 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
165 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 166 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 167 | # install all needed dependencies. 168 | #Pipfile.lock 169 | 170 | # celery beat schedule file 171 | celerybeat-schedule 172 | 173 | # SageMath parsed files 174 | *.sage.py 175 | 176 | # Spyder project settings 177 | .spyderproject 178 | .spyproject 179 | 180 | # Rope project settings 181 | .ropeproject 182 | 183 | # Mr Developer 184 | .mr.developer.cfg 185 | .project 186 | .pydevproject 187 | 188 | # mkdocs documentation 189 | /site 190 | 191 | # mypy 192 | .mypy_cache/ 193 | .dmypy.json 194 | dmypy.json 195 | 196 | # Pyre type checker 197 | .pyre/ 198 | 199 | junit/ 200 | 201 | .ipynb_checkpoints/ 202 | 203 | # End of https://www.gitignore.io/api/python,pycharm 204 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: isort 5 | name: isort 6 | stages: [commit] 7 | language: system 8 | entry: poetry run isort 9 | types: [python] 10 | 11 | - id: black 12 | name: black 13 | stages: [commit] 14 | language: system 15 | entry: poetry run black 16 | types: [python] 17 | 18 | - id: mypy 19 | name: mypy 20 | stages: [commit] 21 | language: system 22 | entry: poetry run mypy 23 | types: [python] 24 | pass_filenames: false 25 | 26 | 27 | - id: flake8 28 | name: flake8 29 | stages: [commit] 30 | language: system 31 | entry: poetry run flake8 32 | types: [python] 33 | exclude: setup.py 34 | 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Tokern 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include data_lineage/assets/* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tokern Lineage Engine 2 | 3 | [![CircleCI](https://circleci.com/gh/tokern/data-lineage.svg?style=svg)](https://circleci.com/gh/tokern/data-lineage) 4 | [![codecov](https://codecov.io/gh/tokern/data-lineage/branch/master/graph/badge.svg)](https://codecov.io/gh/tokern/data-lineage) 5 | [![PyPI](https://img.shields.io/pypi/v/data-lineage.svg)](https://pypi.python.org/pypi/data-lineage) 6 | [![image](https://img.shields.io/pypi/l/data-lineage.svg)](https://pypi.org/project/data-lineage/) 7 | [![image](https://img.shields.io/pypi/pyversions/data-lineage.svg)](https://pypi.org/project/data-lineage/) 8 | 9 | 10 | Tokern Lineage Engine is _fast_ and _easy to use_ application to collect, visualize and analyze 11 | column-level data lineage in databases, data warehouses and data lakes in AWS and RDS. 12 | 13 | Tokern Lineage helps you browse column-level data lineage 14 | * visually using [kedro-viz](https://github.com/quantumblacklabs/kedro-viz) 15 | * analyze lineage graphs programmatically using the powerful [networkx graph library](https://networkx.org/) 16 | 17 | ## Resources 18 | 19 | * Demo of Tokern Lineage App 20 | 21 | ![data-lineage](https://user-images.githubusercontent.com/1638298/118261607-688a7100-b4d1-11eb-923a-5d2407d6bd8d.gif) 22 | 23 | * Checkout an [example data lineage notebook](http://tokern.io/docs/data-lineage/example/). 24 | 25 | * Check out [the post on using data lineage for cost control](https://tokern.io/blog/data-lineage-on-redshift/) for an 26 | example of how data lineage can be used in production. 27 | 28 | ## Quick Start 29 | 30 | ### Install a demo of using Docker and Docker Compose 31 | 32 | Download the docker-compose file from Github repository. 33 | 34 | 35 | # in a new directory run 36 | wget https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/catalog-demo.yml 37 | # or run 38 | curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/tokern-lineage-engine.yml -o docker-compose.yml 39 | 40 | 41 | Run docker-compose 42 | 43 | 44 | docker-compose up -d 45 | 46 | 47 | Check that the containers are running. 48 | 49 | 50 | docker ps 51 | CONTAINER ID IMAGE CREATED STATUS PORTS NAMES 52 | 3f4e77845b81 tokern/data-lineage-viz:latest ... 4 hours ago Up 4 hours 0.0.0.0:8000->80/tcp tokern-data-lineage-visualizer 53 | 1e1ce4efd792 tokern/data-lineage:latest ... 5 days ago Up 5 days tokern-data-lineage 54 | 38be15bedd39 tokern/demodb:latest ... 
2 weeks ago Up 2 weeks tokern-demodb 55 | 56 | Try out Tokern Lineage App 57 | 58 | Head to `http://localhost:8000/` to open the Tokern Lineage app 59 | 60 | ### Install Tokern Lineage Engine 61 | 62 | # in a new directory run 63 | wget https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/tokern-lineage-engine.yml 64 | # or run 65 | curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/catalog-demo.yml -o tokern-lineage-engine.yml 66 | 67 | Run docker-compose 68 | 69 | 70 | docker-compose up -d 71 | 72 | 73 | If you want to use an external Postgres database, change the following parameters in `tokern-lineage-engine.yml`: 74 | 75 | * CATALOG_HOST 76 | * CATALOG_USER 77 | * CATALOG_PASSWORD 78 | * CATALOG_DB 79 | 80 | You can also override default values using environement variables. 81 | 82 | CATALOG_HOST=... CATALOG_USER=... CATALOG_PASSWORD=... CATALOG_DB=... docker-compose -f ... up -d 83 | 84 | For more advanced usage of environment variables with docker-compose, [refer to docker-compose docs](https://docs.docker.com/compose/environment-variables/) 85 | 86 | **Pro-tip** 87 | 88 | If you want to connect to a database in the host machine, set 89 | 90 | CATALOG_HOST: host.docker.internal # For mac or windows 91 | #OR 92 | CATALOG_HOST: 172.17.0.1 # Linux 93 | 94 | ## Supported Technologies 95 | 96 | * Postgres 97 | * AWS Redshift 98 | * Snowflake 99 | 100 | ### Coming Soon 101 | 102 | * SparkSQL 103 | * Presto 104 | 105 | ## Documentation 106 | 107 | For advanced usage, please refer to [data-lineage documentation](https://tokern.io/docs/data-lineage/index.html) 108 | ## Survey 109 | 110 | Please take this [survey](https://forms.gle/p2oEQBJnpEguhrp3A) if you are a user or considering using data-lineage. Responses will help us prioritize features better. 111 | -------------------------------------------------------------------------------- /api_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "public-income", 6 | "metadata": {}, 7 | "source": [ 8 | "# Overview\n", 9 | "\n", 10 | "This example showcases the API exposed by the data lineage package. The API can be used to build\n", 11 | "a lineage graph by adding nodes and edges that represent columns and transformations. \n", 12 | "\n", 13 | "Note that the goal of the example to explain the building blocks of the lineage graph.\n", 14 | "In practical scenarios, use a pack (e.g. query parser pack) to automate the process.\n", 15 | "\n", 16 | "This example consists of the following sequence of operations:\n", 17 | "* Start docker containers containing a demo. Refer to [docs](https://tokern.io/docs/data-lineage/installation) for detailed instructions on installing demo-wikimedia.\n", 18 | "* Register nodes from columns in the catalog.\n", 19 | "* Register directed edges to represent that a column is the source of data for another column.\n", 20 | "* Visualize the graph by visiting [Tokern UI](http://localhost:8000/).\n", 21 | "* Analyze the graph" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "6a9c9b70", 27 | "metadata": { 28 | "pycharm": { 29 | "name": "#%% md\n" 30 | } 31 | }, 32 | "source": [ 33 | "# Installation\n", 34 | "\n", 35 | "This demo requires wikimedia demo to be running. 
Start the demo using the following instructions:\n", 36 | "\n", 37 | " # in a new directory run\n", 38 | " wget https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/wikimedia-demo.yml\n", 39 | " # or run\n", 40 | " curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/wikimedia-demo.yml -o docker-compose.yml\n", 41 | "\n", 42 | "\n", 43 | "Run docker-compose\n", 44 | "\n", 45 | "\n", 46 | " docker-compose up -d\n", 47 | "\n", 48 | "\n", 49 | "Verify container are running\n", 50 | "\n", 51 | "\n", 52 | " docker container ls | grep tokern\n" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 1, 58 | "id": "37651618", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# Required configuration for API and wikimedia database network address\n", 63 | "\n", 64 | "docker_address = \"http://127.0.0.1:8000\"\n", 65 | "wikimedia_db = {\n", 66 | " \"username\": \"etldev\",\n", 67 | " \"password\": \"3tld3v\",\n", 68 | " \"uri\": \"tokern-demo-wikimedia\",\n", 69 | " \"port\": \"5432\",\n", 70 | " \"database\": \"wikimedia\"\n", 71 | "}" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 2, 77 | "id": "wrong-antigua", 78 | "metadata": { 79 | "scrolled": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "# Setup a connection to catalog using the SDK.\n", 84 | "from data_lineage import Catalog\n", 85 | "\n", 86 | "catalog = Catalog(docker_address)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "id": "23ed8c16", 93 | "metadata": { 94 | "pycharm": { 95 | "name": "#%%\n" 96 | }, 97 | "scrolled": true 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "# Register wikimedia datawarehouse with data-lineage app.\n", 102 | "\n", 103 | "source = catalog.add_source(name=\"wikimedia\", source_type=\"postgresql\", **wikimedia_db)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 4, 109 | "id": "ce6ebf16", 110 | "metadata": { 111 | "scrolled": false 112 | }, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "True" 118 | ] 119 | }, 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "# Scan the wikimedia data warehouse and register all schemata, tables and columns.\n", 127 | "\n", 128 | "catalog.scan_source(source)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 5, 134 | "id": "202c6b63", 135 | "metadata": { 136 | "scrolled": false 137 | }, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "{'attributes': {'context': {'sql': 'insert into page_lookup_nonredirect(redirect_id) select page_id from page'}, 'name': 'insert_into_page_lookup_nonredirect'}, 'id': '1', 'links': {'self': 'http://tokern-api:4142/api/v1/catalog/jobs/1'}, 'type': 'jobs'}\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "# Create a job and job_execution that inserts data from page to page_lookup_nonredirect\n", 149 | "\n", 150 | "job = catalog.add_job(\"insert_into_page_lookup_nonredirect\",\n", 151 | " {\n", 152 | " \"sql\": \"insert into page_lookup_nonredirect(redirect_id) select page_id from page\"\n", 153 | " })" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 6, 159 | "id": "cf308d97", 160 | "metadata": { 161 | "scrolled": true 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "import datetime\n", 166 | "from 
dbcat.catalog.models import JobExecutionStatus\n", 167 | "\n", 168 | "job_execution = catalog.add_job_execution(\n", 169 | " job=job,\n", 170 | " started_at=datetime.datetime.combine(\n", 171 | " datetime.date(2021, 4, 1), datetime.time(1, 0)\n", 172 | " ),\n", 173 | " ended_at=datetime.datetime.combine(\n", 174 | " datetime.date(2021, 4, 1), datetime.time(1, 15)\n", 175 | " ),\n", 176 | " status=JobExecutionStatus.SUCCESS,\n", 177 | ")\n" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 8, 183 | "id": "b45aaac8", 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "# Add an edge between these two columns:\n", 188 | "# (test\", \"default\", \"page\", \"page_id\") -> (\"test\", \"default\", \"page_lookup_nonredirect\", \"redirect_id\"),\n", 189 | "\n", 190 | "source_column = catalog.get_column(source_name=\"wikimedia\", \n", 191 | " schema_name=\"public\", \n", 192 | " table_name=\"page\",\n", 193 | " column_name=\"page_id\")\n", 194 | "target_column = catalog.get_column(source_name=\"wikimedia\", \n", 195 | " schema_name=\"public\", \n", 196 | " table_name=\"page_lookup_nonredirect\",\n", 197 | " column_name=\"redirect_id\")\n", 198 | "\n", 199 | "edge = catalog.add_column_lineage(source=source_column,\n", 200 | " target=target_column,\n", 201 | " job_execution_id=job_execution.id,\n", 202 | " context={})" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "254fb735", 208 | "metadata": {}, 209 | "source": [ 210 | "Visit [Kedro UI](http://localhost:8000/)\n", 211 | "\n", 212 | "![One Task Graph](./one_task.png)" 213 | ] 214 | } 215 | ], 216 | "metadata": { 217 | "kernelspec": { 218 | "display_name": "Python 3", 219 | "language": "python", 220 | "name": "python3" 221 | }, 222 | "language_info": { 223 | "codemirror_mode": { 224 | "name": "ipython", 225 | "version": 3 226 | }, 227 | "file_extension": ".py", 228 | "mimetype": "text/x-python", 229 | "name": "python", 230 | "nbconvert_exporter": "python", 231 | "pygments_lexer": "ipython3", 232 | "version": "3.8.5" 233 | } 234 | }, 235 | "nbformat": 4, 236 | "nbformat_minor": 5 237 | } -------------------------------------------------------------------------------- /data_lineage/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | __version__ = "0.9.0" 3 | 4 | import datetime 5 | import json 6 | import logging 7 | from typing import Any, Dict, Generator, List, Optional, Type, TypeVar 8 | 9 | import requests 10 | from dbcat.catalog.models import JobExecutionStatus 11 | from furl import furl 12 | from requests import HTTPError 13 | 14 | from data_lineage.graph import LineageGraph 15 | 16 | 17 | class SourceNotFound(Exception): 18 | """Source not found in catalog""" 19 | 20 | 21 | class SchemaNotFound(Exception): 22 | """Schema not found in catalog""" 23 | 24 | 25 | class TableNotFound(Exception): 26 | """Table not found in catalog""" 27 | 28 | 29 | class ColumnNotFound(Exception): 30 | """Column not found in catalog""" 31 | 32 | 33 | class ParseError(Exception): 34 | """Parser Error""" 35 | 36 | 37 | class SemanticError(Exception): 38 | """Error due to mismatch in catalog data""" 39 | 40 | 41 | class NoResultFound(Exception): 42 | """Raised when function returns no results""" 43 | 44 | 45 | class MultipleResultsFound(Exception): 46 | """Raised when multiple results are found but expected only one or zero results""" 47 | 48 | 49 | class Graph: 50 | def __init__(self, url: str): 51 | self._base_url = furl(url) / "api/main" 52 | 
self._session = requests.Session() 53 | 54 | def get(self, job_ids: set = None) -> Dict[str, List[Dict[str, str]]]: 55 | if job_ids is not None: 56 | response = self._session.get( 57 | self._base_url, params={"job_ids": list(job_ids)} 58 | ) 59 | else: 60 | response = self._session.get(self._base_url) 61 | return response.json() 62 | 63 | 64 | def load_graph(graphSDK: Graph, job_ids: set = None) -> LineageGraph: 65 | data = graphSDK.get(job_ids) 66 | return LineageGraph(nodes=data["nodes"], edges=data["edges"]) 67 | 68 | 69 | class BaseModel: 70 | def __init__(self, session, attributes, obj_id, relationships): 71 | self._session = session 72 | self._attributes = attributes 73 | self._obj_id = obj_id 74 | self._relationships = relationships 75 | 76 | def __getattr__(self, item): 77 | logging.debug("Attributes: {}".format(self._attributes)) 78 | if item == "id": 79 | return self._obj_id 80 | elif self._attributes and item in self._attributes.keys(): 81 | return self._attributes[item] 82 | elif self._relationships and item in self._relationships.keys(): 83 | return self._relationships[item] 84 | raise AttributeError 85 | 86 | 87 | class Source(BaseModel): 88 | def __init__(self, session, attributes, obj_id, relationships): 89 | super().__init__(session, attributes, obj_id, relationships) 90 | 91 | 92 | class Schema(BaseModel): 93 | def __init__(self, session, attributes, obj_id, relationships): 94 | super().__init__(session, attributes, obj_id, relationships) 95 | 96 | 97 | class Table(BaseModel): 98 | def __init__(self, session, attributes, obj_id, relationships): 99 | super().__init__(session, attributes, obj_id, relationships) 100 | 101 | 102 | class Column(BaseModel): 103 | def __init__(self, session, attributes, obj_id, relationships): 104 | super().__init__(session, attributes, obj_id, relationships) 105 | 106 | 107 | class Job(BaseModel): 108 | def __init__(self, session, attributes, obj_id, relationships): 109 | super().__init__(session, attributes, obj_id, relationships) 110 | 111 | 112 | class JobExecution(BaseModel): 113 | def __init__(self, session, attributes, obj_id, relationships): 114 | super().__init__(session, attributes, obj_id, relationships) 115 | 116 | 117 | class ColumnLineage(BaseModel): 118 | def __init__(self, session, attributes, obj_id, relationships): 119 | super().__init__(session, attributes, obj_id, relationships) 120 | 121 | 122 | class DefaultSchema(BaseModel): 123 | def __init__(self, session, attributes, obj_id, relationships): 124 | super().__init__(session, attributes, obj_id, relationships) 125 | 126 | 127 | ModelType = TypeVar("ModelType", bound=BaseModel) 128 | 129 | 130 | class Catalog: 131 | def __init__(self, url: str): 132 | self._base_url = furl(url) / "api/v1/catalog" 133 | self._session = requests.Session() 134 | self._session.headers.update({"Accept": "application/vnd.api+json"}) 135 | self._session.headers.update({"Content-Type": "application/vnd.api+json"}) 136 | 137 | def _build_url(self, *urls) -> str: 138 | built_url = self._base_url 139 | for url in urls: 140 | built_url = furl(built_url) / url 141 | logging.debug(built_url) 142 | return built_url 143 | 144 | str_to_type = { 145 | "sources": Source, 146 | "schemata": Schema, 147 | } 148 | 149 | def _resolve_relationships(self, relationships) -> Dict[str, BaseModel]: 150 | resolved: Dict[str, BaseModel] = {} 151 | for key, value in relationships.items(): 152 | logging.debug("Resolving {}:{}".format(key, value)) 153 | if value["data"]: 154 | resolved[key] = self._obj_factory( 155 | 
value["data"], 156 | Catalog.str_to_type[value["data"]["type"]], 157 | resolve_relationships=False, 158 | ) 159 | 160 | return resolved 161 | 162 | def _obj_factory( 163 | self, 164 | payload: Dict[str, Any], 165 | clazz: Type[ModelType], 166 | resolve_relationships=False, 167 | ) -> ModelType: 168 | resolved = None 169 | if resolve_relationships and payload.get("relationships"): 170 | resolved = self._resolve_relationships(payload.get("relationships")) 171 | 172 | return clazz( 173 | session=self._session, 174 | attributes=payload.get("attributes"), 175 | obj_id=payload.get("id"), 176 | relationships=resolved, 177 | ) 178 | 179 | def _iterate(self, payload: Dict[str, Any], clazz: Type[BaseModel]): 180 | res: Optional[Dict[str, Any]] = payload 181 | while res is not None: 182 | for item in res["data"]: 183 | yield self._obj_factory(payload=item, clazz=clazz) 184 | 185 | if res["links"]["next"] is not None: 186 | response = self._session.get(res["links"]["next"]) 187 | res = response.json() 188 | else: 189 | res = None 190 | 191 | def _index(self, path: str, clazz: Type[BaseModel]): 192 | response = self._session.get(self._build_url(path)) 193 | logging.debug(response.json()) 194 | return self._iterate(response.json(), clazz) 195 | 196 | def _get( 197 | self, 198 | path: str, 199 | obj_id: int, 200 | clazz: Type[ModelType], 201 | resolve_relationships=False, 202 | ) -> ModelType: 203 | response = self._session.get(self._build_url(path, str(obj_id))) 204 | json_response = response.json() 205 | logging.debug(json_response) 206 | response.raise_for_status() 207 | return self._obj_factory( 208 | json_response["data"], clazz, resolve_relationships=resolve_relationships 209 | ) 210 | 211 | @staticmethod 212 | def _one(response): 213 | json_response = response.json() 214 | logging.debug(json_response) 215 | num_results = json_response["meta"]["total"] 216 | if num_results == 0: 217 | raise NoResultFound 218 | elif num_results > 1: 219 | raise MultipleResultsFound 220 | 221 | return json_response["data"][0] 222 | 223 | def _search_one(self, path: str, filters): 224 | params = {"filter[objects]": json.dumps(filters)} 225 | response = self._session.get(self._build_url(path), params=params) 226 | response.raise_for_status() 227 | return Catalog._one(response) 228 | 229 | def _search(self, path: str, search_string: str, clazz: Type[BaseModel]): 230 | filters = [dict(name="name", op="like", val="%{}%".format(search_string))] 231 | params = {"filter[objects]": json.dumps(filters)} 232 | response = self._session.get(self._build_url(path), params=params) 233 | return self._iterate(response.json(), clazz) 234 | 235 | def _post(self, path: str, data: Dict[str, Any], type: str) -> Dict[Any, Any]: 236 | payload = {"data": {"type": type, "attributes": data}} 237 | response = self._session.post( 238 | url=self._build_url(path), data=json.dumps(payload, default=str) 239 | ) 240 | response.raise_for_status() 241 | logging.debug(response.text) 242 | json_response = response.json() 243 | return json_response["data"] 244 | 245 | def _patch(self, path: str, obj_id: int, data: Dict[str, Any], type: str): 246 | payload = {"data": {"type": type, "attributes": data, "id": obj_id}} 247 | response = self._session.patch( 248 | url=self._build_url(path, str(obj_id)), 249 | data=json.dumps(payload, default=str), 250 | ) 251 | response.raise_for_status() 252 | return 253 | 254 | def get_sources(self) -> Generator[Any, Any, None]: 255 | return self._index("sources", Source) 256 | 257 | def get_schemata(self): 258 | return 
self._index("schemata", Schema) 259 | 260 | def get_tables(self): 261 | return self._index("tables", Table) 262 | 263 | def get_columns(self): 264 | return self._index("columns", Column) 265 | 266 | def get_jobs(self): 267 | return self._index("jobs", Job) 268 | 269 | def get_job_executions(self): 270 | return self._index("job_executions", JobExecution) 271 | 272 | def get_column_lineages(self): 273 | return self._index("column_lineages", ColumnLineage) 274 | 275 | def get_source_by_id(self, obj_id) -> Source: 276 | return self._get("sources", obj_id, Source) 277 | 278 | def get_schema_by_id(self, obj_id) -> Schema: 279 | return self._get("schemata", obj_id, Schema) 280 | 281 | def get_table_by_id(self, obj_id) -> Table: 282 | return self._get("tables", obj_id, Table) 283 | 284 | def get_column_by_id(self, obj_id) -> Column: 285 | return self._get("columns", obj_id, Column) 286 | 287 | def get_job_by_id(self, obj_id) -> Job: 288 | return self._get("jobs", obj_id, Job) 289 | 290 | def get_job_execution_by_id(self, obj_id) -> JobExecution: 291 | return self._get("job_executions", obj_id, JobExecution) 292 | 293 | def get_column_lineage(self, job_ids: List[int]) -> List[ColumnLineage]: 294 | params = {"job_ids": job_ids} 295 | response = self._session.get(self._build_url("column_lineage"), params=params) 296 | logging.debug(response.json()) 297 | response.raise_for_status() 298 | return [ 299 | ColumnLineage( 300 | session=self._session, 301 | attributes=item["attributes"], 302 | obj_id=item["id"], 303 | relationships=item["relationships"], 304 | ) 305 | for item in response.json()["data"] 306 | ] 307 | 308 | def get_source(self, name) -> Source: 309 | filters = [dict(name="name", op="eq", val="{}".format(name))] 310 | try: 311 | payload = self._search_one("sources", filters) 312 | except NoResultFound: 313 | raise SourceNotFound("Source not found: source_name={}".format(name)) 314 | 315 | return self._obj_factory(payload, Source) 316 | 317 | def get_schema(self, source_name: str, schema_name: str) -> Schema: 318 | name_filter = dict(name="name", op="eq", val=schema_name) 319 | source_filter = dict( 320 | name="source", op="has", val=dict(name="name", op="eq", val=source_name) 321 | ) 322 | filters = {"and": [name_filter, source_filter]} 323 | logging.debug(filters) 324 | try: 325 | payload = self._search_one("schemata", [filters]) 326 | except NoResultFound: 327 | raise SchemaNotFound( 328 | "Schema not found, (source_name={}, schema_name={})".format( 329 | source_name, schema_name 330 | ) 331 | ) 332 | return self._obj_factory(payload, Schema) 333 | 334 | def get_table(self, source_name: str, schema_name: str, table_name: str) -> Table: 335 | schema = self.get_schema(source_name, schema_name) 336 | 337 | name_filter = dict(name="name", op="eq", val=table_name) 338 | schema_id_filter = dict(name="schema_id", op="eq", val=str(schema.id)) 339 | filters = {"and": [name_filter, schema_id_filter]} 340 | logging.debug(filters) 341 | try: 342 | payload = self._search_one("tables", [filters]) 343 | except NoResultFound: 344 | raise TableNotFound( 345 | "Table not found, (source_name={}, schema_name={}, table_name={})".format( 346 | source_name, schema_name, table_name 347 | ) 348 | ) 349 | return self._obj_factory(payload, Table) 350 | 351 | def get_columns_for_table(self, table: Table): 352 | return self._index("tables/{}/columns".format(table.id), Column) 353 | 354 | def get_column(self, source_name, schema_name, table_name, column_name) -> Column: 355 | table = self.get_table(source_name, 
schema_name, table_name) 356 | name_filter = dict(name="name", op="eq", val=column_name) 357 | table_filter = dict(name="table_id", op="eq", val=str(table.id)) 358 | filters = {"and": [name_filter, table_filter]} 359 | logging.debug(filters) 360 | try: 361 | payload = self._search_one("columns", [filters]) 362 | except NoResultFound: 363 | raise ColumnNotFound( 364 | "Column not found, (source_name={}, schema_name={}, table_name={}, column_name={})".format( 365 | source_name, schema_name, table_name, column_name 366 | ) 367 | ) 368 | return self._obj_factory(payload, Column) 369 | 370 | def add_source(self, name: str, source_type: str, **kwargs) -> Source: 371 | data = {"name": name, "source_type": source_type, **kwargs} 372 | payload = self._post(path="sources", data=data, type="sources") 373 | return self._obj_factory(payload, Source) 374 | 375 | def add_schema(self, name: str, source: Source) -> Schema: 376 | data = {"name": name, "source_id": source.id} 377 | payload = self._post(path="schemata", data=data, type="schemata") 378 | return self._obj_factory(payload, Schema) 379 | 380 | def add_table(self, name: str, schema: Schema) -> Table: 381 | data = {"name": name, "schema_id": schema.id} 382 | payload = self._post(path="tables", data=data, type="tables") 383 | return self._obj_factory(payload, Table) 384 | 385 | def add_column( 386 | self, name: str, data_type: str, sort_order: int, table: Table 387 | ) -> Column: 388 | data = { 389 | "name": name, 390 | "table_id": table.id, 391 | "data_type": data_type, 392 | "sort_order": sort_order, 393 | } 394 | payload = self._post(path="columns", data=data, type="columns") 395 | return self._obj_factory(payload, Column) 396 | 397 | def add_job(self, name: str, context: Dict[Any, Any]) -> Job: 398 | data = {"name": name, "context": context} 399 | payload = self._post(path="jobs", data=data, type="jobs") 400 | return self._obj_factory(payload, Job) 401 | 402 | def add_job_execution( 403 | self, 404 | job: Job, 405 | started_at: datetime.datetime, 406 | ended_at: datetime.datetime, 407 | status: JobExecutionStatus, 408 | ) -> JobExecution: 409 | data = { 410 | "job_id": job.id, 411 | "started_at": started_at, 412 | "ended_at": ended_at, 413 | "status": status.name, 414 | } 415 | payload = self._post(path="job_executions", data=data, type="job_executions") 416 | return self._obj_factory(payload, JobExecution) 417 | 418 | def add_column_lineage( 419 | self, 420 | source: Column, 421 | target: Column, 422 | job_execution_id: int, 423 | context: Dict[Any, Any], 424 | ) -> ColumnLineage: 425 | data = { 426 | "source_id": source.id, 427 | "target_id": target.id, 428 | "job_execution_id": job_execution_id, 429 | "context": context, 430 | } 431 | payload = self._post(path="column_lineage", data=data, type="column_lineage") 432 | return self._obj_factory(payload, ColumnLineage) 433 | 434 | def update_source(self, source: Source, schema: Schema) -> DefaultSchema: 435 | try: 436 | current_obj = self._get( 437 | path="default_schema", 438 | obj_id=source.id, 439 | clazz=DefaultSchema, 440 | resolve_relationships=True, 441 | ) 442 | if current_obj.schema.id == schema.id: 443 | return current_obj 444 | except HTTPError as error: 445 | if error.response.status_code == 404: 446 | data = {"source_id": source.id, "schema_id": schema.id} 447 | payload = self._post( 448 | path="default_schema", data=data, type="default_schema" 449 | ) 450 | return self._obj_factory( 451 | payload, DefaultSchema, resolve_relationships=True 452 | ) 453 | 454 | # Patch 455 | data = 
{"schema_id": schema.id} 456 | self._patch( 457 | path="default_schema", data=data, type="default_schema", obj_id=source.id 458 | ) 459 | return self._get( 460 | path="default_schema", 461 | obj_id=source.id, 462 | clazz=DefaultSchema, 463 | resolve_relationships=True, 464 | ) 465 | 466 | 467 | class Analyze: 468 | def __init__(self, url: str): 469 | self._base_url = furl(url) / "api/v1/analyze" 470 | self._session = requests.Session() 471 | 472 | def analyze( 473 | self, 474 | query: str, 475 | source: Source, 476 | start_time: datetime.datetime, 477 | end_time: datetime.datetime, 478 | name: str = None, 479 | ) -> JobExecution: 480 | payload = { 481 | "query": query, 482 | "name": name, 483 | "source_id": source.id, 484 | "start_time": start_time.isoformat(), 485 | "end_time": end_time.isoformat(), 486 | } 487 | 488 | response = self._session.post(self._base_url, json=payload,) 489 | if response.status_code == 441: 490 | raise TableNotFound(response.json()["message"]) 491 | elif response.status_code == 442: 492 | raise ColumnNotFound(response.json()["message"]) 493 | elif response.status_code == 422: 494 | raise ParseError(response.json()["message"]) 495 | elif response.status_code == 443: 496 | raise SemanticError(response.json()["message"]) 497 | 498 | logging.debug(response.text) 499 | response.raise_for_status() 500 | payload = response.json()["data"] 501 | return JobExecution( 502 | session=self._session, 503 | attributes=payload.get("attributes"), 504 | obj_id=payload.get("id"), 505 | relationships=None, 506 | ) 507 | 508 | 509 | class Parse: 510 | def __init__(self, url: str): 511 | self._base_url = furl(url) / "api/v1/parse" 512 | self._session = requests.Session() 513 | 514 | def parse(self, query: str, source: Source): 515 | response = self._session.post( 516 | self._base_url, json={"query": query, "source_id": source.id}, 517 | ) 518 | logging.debug(response.text) 519 | response.raise_for_status() 520 | return response.json() 521 | 522 | 523 | class Scan: 524 | def __init__(self, url: str): 525 | self._base_url = furl(url) / "api/v1/scan" 526 | self._session = requests.Session() 527 | 528 | def start(self, source: Source) -> Dict[str, str]: 529 | payload = {"id": source.id} 530 | response = self._session.post(url=self._base_url, json=payload) 531 | response.raise_for_status() 532 | return response.json() 533 | 534 | def list(self) -> List[Dict[str, str]]: 535 | response = self._session.post(url=self._base_url) 536 | response.raise_for_status() 537 | return response.json() 538 | 539 | def get(self, job_id: str) -> Dict[str, str]: 540 | response = self._session.get(url=furl(self._base_url) / job_id) 541 | response.raise_for_status() 542 | return response.json() 543 | 544 | def cancel(self, job_id: str) -> Dict[str, str]: 545 | response = self._session.put(url=furl(self._base_url) / job_id) 546 | response.raise_for_status() 547 | return response.json() 548 | -------------------------------------------------------------------------------- /data_lineage/__main__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import click 4 | from redis import Redis 5 | 6 | from data_lineage import __version__ 7 | from data_lineage.server import create_server 8 | 9 | 10 | @click.command() 11 | @click.version_option(__version__) 12 | @click.option( 13 | "-l", "--log-level", envvar="LOG_LEVEL", help="Logging Level", default="INFO" 14 | ) 15 | @click.option( 16 | "--catalog-user", help="Database user name", envvar="CATALOG_USER", 
required=True 17 | ) 18 | @click.option( 19 | "--catalog-password", 20 | help="Database Password", 21 | envvar="CATALOG_PASSWORD", 22 | required=True, 23 | ) 24 | @click.option( 25 | "--catalog-host", help="Database Host", envvar="CATALOG_HOST", default="localhost" 26 | ) 27 | @click.option( 28 | "--catalog-port", help="Database Password", envvar="CATALOG_PORT", default=5432 29 | ) 30 | @click.option( 31 | "--catalog-db", help="Postgres Database", envvar="CATALOG_DB", default="tokern" 32 | ) 33 | @click.option( 34 | "--redis-host", 35 | help="Redis host for queueing scans", 36 | envvar="REDIS_HOST", 37 | default="localhost", 38 | ) 39 | @click.option( 40 | "--redis-port", 41 | help="Redis port for queueing scans", 42 | envvar="REDIS_PORT", 43 | default="6379", 44 | ) 45 | @click.option( 46 | "--is-production/--not-production", 47 | help="Run server in development mode", 48 | default=True, 49 | ) 50 | def main( 51 | log_level, 52 | catalog_user, 53 | catalog_password, 54 | catalog_host, 55 | catalog_port, 56 | catalog_db, 57 | redis_host, 58 | redis_port, 59 | is_production, 60 | ): 61 | logging.basicConfig(level=getattr(logging, log_level.upper())) 62 | catalog = { 63 | "user": catalog_user, 64 | "password": catalog_password, 65 | "host": catalog_host, 66 | "port": catalog_port, 67 | "database": catalog_db, 68 | } 69 | connection = Redis(redis_host, redis_port) 70 | app, catalog = create_server( 71 | catalog, connection=connection, is_production=is_production 72 | ) 73 | if is_production: 74 | app.run() 75 | else: 76 | app.run(debug=True) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /data_lineage/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokern/data-lineage/5945542742979fe350d313d906440c93ee3d0f36/data_lineage/assets/favicon.ico -------------------------------------------------------------------------------- /data_lineage/graph.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, List 3 | 4 | import networkx as nx 5 | 6 | 7 | class LineageGraph: 8 | def __init__( 9 | self, 10 | nodes: List[Dict[str, str]], 11 | edges: List[Dict[str, str]], 12 | name: str = "Lineage", 13 | ): 14 | self.name = name 15 | self._graph = nx.DiGraph() 16 | for node in nodes: 17 | node_id = node["id"] 18 | node_attributes = {"name": node["name"], "type": node["type"]} 19 | logging.debug("Add Node: {}, {}".format(node_id, node_attributes)) 20 | self._graph.add_node(node_id, **node_attributes) 21 | 22 | for edge in edges: 23 | logging.debug("Edge: <{}>, <{}>".format(edge["source"], edge["target"])) 24 | self._graph.add_edge(edge["source"], edge["target"]) 25 | 26 | @property 27 | def graph(self): 28 | return self._graph 29 | 30 | @graph.setter 31 | def graph(self, new_graph): 32 | self._graph = new_graph 33 | -------------------------------------------------------------------------------- /data_lineage/parser/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List 3 | 4 | from dbcat.catalog import Catalog 5 | from dbcat.catalog.models import CatSource, JobExecution, JobExecutionStatus 6 | from pglast import Node, parse_sql 7 | from pglast.parser import ParseError 8 | 9 | from data_lineage import SemanticError 10 | from data_lineage.parser.binder import SelectBinder 11 | 
from data_lineage.parser.dml_visitor import ( 12 | CTASVisitor, 13 | DmlVisitor, 14 | SelectIntoVisitor, 15 | SelectSourceVisitor, 16 | ) 17 | from data_lineage.parser.visitor import ExprVisitor, RedshiftExprVisitor 18 | 19 | 20 | class Parsed: 21 | def __init__(self, name: str, query: str, node: Node): 22 | self._name = name 23 | self._node = node 24 | self._query = query 25 | 26 | @property 27 | def name(self): 28 | return self._name 29 | 30 | @property 31 | def node(self): 32 | return self._node 33 | 34 | @property 35 | def query(self): 36 | return self._query 37 | 38 | 39 | def parse_queries(queries: List[str]) -> List[Parsed]: 40 | parsed: List[Parsed] = [] 41 | 42 | for query in queries: 43 | try: 44 | parsed.append(parse(query)) 45 | except ParseError as e: 46 | logging.warning("Syntax error while parsing {}.\n{}".format(query, e)) 47 | 48 | return parsed 49 | 50 | 51 | def analyze_dml_query( 52 | catalog: Catalog, parsed: Parsed, source: CatSource, 53 | ) -> DmlVisitor: 54 | chosen_visitor = visit_dml_query(parsed, source) 55 | chosen_visitor.bind(catalog=catalog, source=source) 56 | return chosen_visitor 57 | 58 | 59 | def parse_dml_query( 60 | catalog: Catalog, parsed: Parsed, source: CatSource, 61 | ) -> SelectBinder: 62 | chosen_visitor = visit_dml_query(parsed, source) 63 | 64 | select_binder = SelectBinder( 65 | catalog=catalog, 66 | source=source, 67 | tables=chosen_visitor.select_tables, 68 | columns=chosen_visitor.select_columns, 69 | expr_visitor_clazz=chosen_visitor.expr_visitor_clazz, 70 | alias_generator=("_U{}".format(i) for i in range(0, 1000)), 71 | ) 72 | select_binder.bind() 73 | return select_binder 74 | 75 | 76 | def visit_dml_query(parsed: Parsed, source: CatSource,) -> DmlVisitor: 77 | 78 | expr_visitor_clazz = ExprVisitor 79 | if source.source_type == "redshift": 80 | expr_visitor_clazz = RedshiftExprVisitor 81 | 82 | select_source_visitor: DmlVisitor = SelectSourceVisitor( 83 | parsed.name, expr_visitor_clazz 84 | ) 85 | select_into_visitor: DmlVisitor = SelectIntoVisitor(parsed.name, expr_visitor_clazz) 86 | ctas_visitor: DmlVisitor = CTASVisitor(parsed.name, expr_visitor_clazz) 87 | 88 | for v in [select_source_visitor, select_into_visitor, ctas_visitor]: 89 | v(parsed.node) 90 | if len(v.select_tables) > 0 and v.insert_table is not None: 91 | return v 92 | raise SemanticError("Query is not a DML Query") 93 | 94 | 95 | def extract_lineage( 96 | catalog: Catalog, 97 | visited_query: DmlVisitor, 98 | source: CatSource, 99 | parsed: Parsed, 100 | start_time, 101 | end_time, 102 | ) -> JobExecution: 103 | job = catalog.add_job( 104 | name=parsed.name, source=source, context={"query": parsed.query} 105 | ) 106 | job_execution = catalog.add_job_execution( 107 | job=job, 108 | started_at=start_time, 109 | ended_at=end_time, 110 | status=JobExecutionStatus.SUCCESS, 111 | ) 112 | for source, target in zip( 113 | visited_query.source_columns, visited_query.target_columns 114 | ): 115 | for column in source.columns: 116 | edge = catalog.add_column_lineage(column, target, job_execution.id, {}) 117 | logging.debug("Added {}".format(edge)) 118 | 119 | return job_execution 120 | 121 | 122 | def parse(sql: str, name: str = None) -> Parsed: 123 | if name is None: 124 | name = str(hash(sql)) 125 | node = parse_sql(sql) 126 | 127 | return Parsed(name, sql, node) 128 | -------------------------------------------------------------------------------- /data_lineage/parser/binder.py: -------------------------------------------------------------------------------- 1 | import json 
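# Module overview (descriptive summary of the code below): binder.py resolves the
# table and column references collected from a parsed SQL query against the dbcat
# catalog. SelectBinder walks the RangeVar / RangeSubselect nodes gathered by the
# visitors, looks each table up via Catalog.search_table, and records the matched
# CatColumn objects as ColumnContext entries. AliasContext and WithContext track
# table and subquery aliases so that qualified references and "*" expansions can be
# resolved; TableNotFound / ColumnNotFound are raised when a reference cannot be bound.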
2 | import logging 3 | from abc import ABC, abstractmethod 4 | from json import JSONEncoder 5 | from typing import List, Mapping, Set, Type 6 | 7 | from dbcat.catalog import Catalog, CatColumn, CatSource, CatTable 8 | from pglast import Node 9 | from pglast.ast import RangeSubselect, RangeVar 10 | 11 | from data_lineage import ColumnNotFound, SemanticError, TableNotFound 12 | from data_lineage.parser.visitor import ( 13 | ColumnRefVisitor, 14 | ExprVisitor, 15 | RangeSubselectVisitor, 16 | RangeVarVisitor, 17 | ) 18 | 19 | 20 | class ColumnContext: 21 | def __init__(self, alias: str, columns: Set[CatColumn]): 22 | self._alias = alias.lower() 23 | self._columns = columns 24 | 25 | @property 26 | def alias(self): 27 | return self._alias 28 | 29 | @property 30 | def columns(self) -> Set[CatColumn]: 31 | return self._columns 32 | 33 | 34 | class AliasContext: 35 | def __init__(self, catalog: Catalog, alias: str, tables: Set[CatTable]): 36 | self._catalog = catalog 37 | self._alias = alias.lower() 38 | self._tables = tables 39 | 40 | @property 41 | def alias(self): 42 | return self._alias 43 | 44 | @property 45 | def tables(self): 46 | return self._tables 47 | 48 | def get_columns(self, column_names: List[str] = None) -> List[ColumnContext]: 49 | columns: List[CatColumn] = [] 50 | for table in self._tables: 51 | logging.debug("Searching in {}".format(table.fqdn)) 52 | columns = columns + self._catalog.get_columns_for_table(table, column_names) 53 | 54 | return [ 55 | ColumnContext(alias=column.name, columns={column}) for column in columns 56 | ] 57 | 58 | 59 | class WithContext(AliasContext): 60 | def __init__( 61 | self, 62 | catalog: Catalog, 63 | alias: str, 64 | tables: Set[CatTable], 65 | columns: List[ColumnContext], 66 | ): 67 | super(WithContext, self).__init__(catalog, alias, tables) 68 | self._columns = columns 69 | 70 | def get_columns(self, column_names: List[str] = None) -> List[ColumnContext]: 71 | if column_names is not None: 72 | filtered = [] 73 | for column in self._columns: 74 | logging.debug( 75 | "Comparing with alias: {} - contains columns: {}".format( 76 | column.alias, 77 | json.dumps(list(column.columns), cls=CatColumnEncoder), 78 | ) 79 | ) 80 | if column.alias in column_names: 81 | filtered.append(column) 82 | 83 | return filtered 84 | else: 85 | return self._columns 86 | 87 | 88 | class CatTableEncoder(JSONEncoder): 89 | def default(self, obj): 90 | if isinstance(obj, CatTable): 91 | return { 92 | "name": obj.name, 93 | "schema": obj.schema.name, 94 | "source": obj.schema.source.name, 95 | } 96 | 97 | # Let the base class default method raise the TypeError 98 | return json.JSONEncoder.default(self, obj) 99 | 100 | 101 | class CatColumnEncoder(JSONEncoder): 102 | def default(self, obj): 103 | if isinstance(obj, CatColumn): 104 | return { 105 | "name": obj.name, 106 | "table": obj.table.name, 107 | "schema": obj.table.schema.name, 108 | "source": obj.table.schema.source.name, 109 | } 110 | 111 | # Let the base class default method raise the TypeError 112 | return json.JSONEncoder.default(self, obj) 113 | 114 | 115 | class Binder(ABC): 116 | @property 117 | @abstractmethod 118 | def _visited_tables(self) -> List[Node]: 119 | pass 120 | 121 | @property 122 | @abstractmethod 123 | def _visited_columns(self) -> List[ExprVisitor]: 124 | pass 125 | 126 | @property 127 | def tables(self) -> Set[CatTable]: 128 | return self._tables 129 | 130 | @property 131 | def columns(self) -> List[ColumnContext]: 132 | return self._columns 133 | 134 | def __init__( 135 | self, 136 | 
catalog: Catalog, 137 | source: CatSource, 138 | alias_generator, 139 | expr_visitor_clazz: Type[ExprVisitor], 140 | alias_map: Mapping[str, AliasContext] = None, 141 | ): 142 | self._catalog = catalog 143 | self._source = source 144 | self._tables: Set[CatTable] = set() 145 | self._columns: List[ColumnContext] = [] 146 | self._alias_map: Mapping[str, AliasContext] = alias_map or {} 147 | self._alias_generator = alias_generator 148 | self._expr_visitor_clazz = expr_visitor_clazz 149 | 150 | def bind(self): 151 | bound_tables = self._bind_tables() 152 | 153 | self._tables = set(bound_tables) 154 | self._columns = self._bind_columns() 155 | 156 | def _bind_tables(self): 157 | bound_tables = [] 158 | for table in self._visited_tables: 159 | if isinstance(table, RangeVar): 160 | visitor = RangeVarVisitor() 161 | visitor(table) 162 | 163 | logging.debug("Searching for: {}".format(visitor.search_string)) 164 | 165 | if not visitor.is_qualified and visitor.name in self._alias_map: 166 | bound_tables = bound_tables + list( 167 | self._alias_map[visitor.name].tables 168 | ) 169 | logging.debug("Added tables for alias {}".format(visitor.name)) 170 | else: 171 | try: 172 | candidate_table = self._catalog.search_table( 173 | source_like=self._source.name, **visitor.search_string 174 | ) 175 | except RuntimeError as err: 176 | logging.debug(str(err)) 177 | raise TableNotFound( 178 | '"{schema_like}"."{table_like}" is not found'.format( 179 | **visitor.search_string 180 | ) 181 | ) 182 | logging.debug("Bound source table: {}".format(candidate_table)) 183 | 184 | self._alias_map[visitor.alias] = AliasContext( 185 | catalog=self._catalog, 186 | alias=visitor.alias, 187 | tables={candidate_table}, 188 | ) 189 | bound_tables.append(candidate_table) 190 | elif isinstance(table, RangeSubselect): 191 | visitor = RangeSubselectVisitor(self._expr_visitor_clazz) 192 | visitor(table) 193 | binder = SelectBinder( 194 | self._catalog, 195 | self._source, 196 | visitor.sources, 197 | visitor.columns, 198 | self._alias_generator, 199 | self._expr_visitor_clazz, 200 | ) 201 | binder.bind() 202 | self._alias_map[visitor.alias] = WithContext( 203 | catalog=self._catalog, 204 | alias=visitor.alias, 205 | tables=binder.tables, 206 | columns=binder.columns, 207 | ) 208 | bound_tables = bound_tables + list(binder.tables) 209 | else: 210 | raise SemanticError("Unknown parser state. 
Please contact Support") 211 | return bound_tables 212 | 213 | def _bind_columns(self) -> List[ColumnContext]: 214 | bound_cols: List[ColumnContext] = [] 215 | for expr_visitor in self._visited_columns: 216 | target_cols: Set[ColumnContext] = set() 217 | is_a_star = False 218 | for column in expr_visitor.columns: 219 | column_ref_visitor = ColumnRefVisitor() 220 | column_ref_visitor(column) 221 | is_a_star = column_ref_visitor.is_a_star 222 | alias_list = list(self._alias_map.values()) 223 | if column_ref_visitor.is_qualified: 224 | if column_ref_visitor.table_name not in self._alias_map: 225 | raise TableNotFound( 226 | "{} not found for column ({}).".format( 227 | column_ref_visitor.name[0], column_ref_visitor.name 228 | ) 229 | ) 230 | assert column_ref_visitor.table_name is not None 231 | alias_list = [self._alias_map[column_ref_visitor.table_name]] 232 | target_cols.update( 233 | Binder._search_column_in_tables(column_ref_visitor, alias_list) 234 | ) 235 | 236 | if is_a_star: 237 | for col in target_cols: 238 | bound_cols.append( 239 | ColumnContext(alias=col.alias, columns=col.columns) 240 | ) 241 | else: 242 | if expr_visitor.alias is not None: 243 | alias = expr_visitor.alias 244 | elif len(target_cols) == 1: 245 | alias = list(target_cols)[0].alias 246 | else: 247 | alias = next(self._alias_generator) 248 | cols: Set[CatColumn] = set() 249 | for tgt in target_cols: 250 | for c in tgt.columns: 251 | cols.add(c) 252 | bound_cols.append(ColumnContext(alias=alias, columns=cols)) 253 | 254 | if len(bound_cols) == 0: 255 | raise ColumnNotFound("No source columns found.") 256 | return bound_cols 257 | 258 | @staticmethod 259 | def _search_column_in_tables( 260 | column_ref_visitor, alias_list: List[AliasContext] 261 | ) -> List[ColumnContext]: 262 | found_cols: List[ColumnContext] = [] 263 | if column_ref_visitor.is_a_star: 264 | for alias_context in alias_list: 265 | found_cols = alias_context.get_columns() 266 | logging.debug( 267 | "Bound all source columns in {}".format(alias_context.tables) 268 | ) 269 | else: 270 | candidate_columns: List[ColumnContext] = [] 271 | global_table_list: List[CatTable] = [] 272 | logging.debug("Searching for {}".format(column_ref_visitor.column_name)) 273 | for alias_context in alias_list: 274 | logging.debug("Searching in {}".format(alias_context.alias)) 275 | candidate_columns = candidate_columns + alias_context.get_columns( 276 | [column_ref_visitor.column_name] 277 | ) 278 | global_table_list = global_table_list + list(alias_context.tables) 279 | 280 | if len(candidate_columns) == 0: 281 | raise ColumnNotFound( 282 | '"{}" not found in the following tables: {}'.format( 283 | column_ref_visitor.column_name, 284 | json.dumps(global_table_list, cls=CatTableEncoder), 285 | ) 286 | ) 287 | elif len(candidate_columns) > 1: 288 | column_list = [] 289 | for candidate in candidate_columns: 290 | for col in candidate.columns: 291 | column_list.append(col) 292 | raise ColumnNotFound( 293 | "{} Ambiguous column name. 
Multiple matches found: {}".format( 294 | column_ref_visitor.name, 295 | json.dumps(column_list, cls=CatColumnEncoder), 296 | ) 297 | ) 298 | logging.debug("Bound source column: {}".format(candidate_columns[0])) 299 | found_cols.append(candidate_columns[0]) 300 | return found_cols 301 | 302 | 303 | class SelectBinder(Binder): 304 | def __init__( 305 | self, 306 | catalog: Catalog, 307 | source: CatSource, 308 | tables: List[Node], 309 | columns: List[ExprVisitor], 310 | alias_generator, 311 | expr_visitor_clazz: Type[ExprVisitor], 312 | alias_map: Mapping[str, AliasContext] = None, 313 | ): 314 | super(SelectBinder, self).__init__( 315 | catalog, source, alias_generator, expr_visitor_clazz, alias_map 316 | ) 317 | self._table_nodes: List[Node] = tables 318 | self._column_nodes: List[ExprVisitor] = columns 319 | 320 | @property 321 | def _visited_tables(self) -> List[Node]: 322 | return self._table_nodes 323 | 324 | @property 325 | def _visited_columns(self) -> List[ExprVisitor]: 326 | return self._column_nodes 327 | -------------------------------------------------------------------------------- /data_lineage/parser/dml_visitor.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Any, Dict, List, Optional, Set, Tuple, Type 4 | 5 | from dbcat.catalog import Catalog, CatColumn, CatSource, CatTable 6 | from pglast import Node 7 | from pglast.ast import IntoClause 8 | from pglast.visitors import Ancestor, Continue, Skip, Visitor 9 | 10 | from data_lineage import ColumnNotFound, SemanticError, TableNotFound 11 | from data_lineage.parser.binder import ( 12 | CatTableEncoder, 13 | ColumnContext, 14 | SelectBinder, 15 | WithContext, 16 | ) 17 | from data_lineage.parser.visitor import ( 18 | ColumnRefVisitor, 19 | ExprVisitor, 20 | RangeVarVisitor, 21 | TableVisitor, 22 | ) 23 | 24 | 25 | class DmlVisitor(Visitor): 26 | def __init__(self, name: str, expr_visitor_clazz: Type[ExprVisitor]): 27 | self._name = name 28 | self._insert_table: Optional[Node] = None 29 | self._insert_columns: List[str] = [] 30 | self._target_table: Optional[CatTable] = None 31 | self._target_columns: List[CatColumn] = [] 32 | self._source_tables: Set[CatTable] = set() 33 | self._source_columns: List[ColumnContext] = [] 34 | self._select_tables: List[Node] = [] 35 | self._select_columns: List[ExprVisitor] = [] 36 | self._with_aliases: Dict[str, Dict[str, Any]] = {} 37 | self._alias_map: Dict[str, WithContext] = {} 38 | self._column_alias_generator = ("_U{}".format(i) for i in range(0, 1000)) 39 | self.expr_visitor_clazz = expr_visitor_clazz 40 | 41 | @property 42 | def name(self) -> str: 43 | return self._name 44 | 45 | @property 46 | def insert_table(self) -> Optional[Node]: 47 | return self._insert_table 48 | 49 | @property 50 | def target_table(self) -> CatTable: 51 | return self._target_table 52 | 53 | @property 54 | def target_columns(self) -> List[CatColumn]: 55 | return self._target_columns 56 | 57 | @property 58 | def source_tables(self) -> Set[CatTable]: 59 | return self._source_tables 60 | 61 | @property 62 | def source_columns(self) -> List[ColumnContext]: 63 | return self._source_columns 64 | 65 | @property 66 | def select_tables(self) -> List[Node]: 67 | return self._select_tables 68 | 69 | @property 70 | def select_columns(self) -> List[ExprVisitor]: 71 | return self._select_columns 72 | 73 | def visit_RangeVar(self, ancestors, node): 74 | self._insert_table = node 75 | return Skip 76 | 77 | def visit_ResTarget(self, 
ancestors, node): 78 | self._insert_columns.append(node.name) 79 | return Skip 80 | 81 | def visit_CommonTableExpr(self, ancestors, node): 82 | with_alias = node.ctename 83 | table_visitor = TableVisitor(self.expr_visitor_clazz) 84 | table_visitor(node.ctequery) 85 | 86 | self._with_aliases[with_alias] = { 87 | "tables": table_visitor.sources, 88 | "columns": table_visitor.columns, 89 | } 90 | return Skip 91 | 92 | def visit_CreateTableAsStmt(self, ancestors, node): 93 | """ 94 | Do not process CTAS statement by default. 95 | :param ancestors: 96 | :type ancestors: 97 | :param node: 98 | :type node: 99 | :return: 100 | :rtype: 101 | """ 102 | return Skip 103 | 104 | def bind(self, catalog: Catalog, source: CatSource): 105 | self._bind_target(catalog, source) 106 | 107 | self._bind_with(catalog, source) 108 | binder = SelectBinder( 109 | catalog, 110 | source, 111 | self._select_tables, 112 | self._select_columns, 113 | self._column_alias_generator, 114 | self.expr_visitor_clazz, 115 | self._alias_map, 116 | ) 117 | binder.bind() 118 | 119 | if len(binder.tables) == 0: 120 | raise SemanticError("No source tables found") 121 | 122 | if len(binder.columns) == 0: 123 | raise SemanticError("No source columns found") 124 | 125 | if self.target_table is None: 126 | raise SemanticError("No target table found") 127 | 128 | if len(self.target_columns) == 0: 129 | raise SemanticError( 130 | "No target columns found in {}".format( 131 | json.dumps(self.target_table, cls=CatTableEncoder) 132 | ) 133 | ) 134 | 135 | if len(self.target_columns) != len(binder.columns): 136 | raise SemanticError( 137 | "No. of target columns({}) does not match no. of source columns({})".format( 138 | len(self.target_columns), len(binder.columns) 139 | ) 140 | ) 141 | 142 | self._source_tables = binder.tables 143 | self._source_columns = binder.columns 144 | 145 | def _bind_target(self, catalog: Catalog, source: CatSource): 146 | target_table_visitor = RangeVarVisitor() 147 | target_table_visitor(self._insert_table) 148 | logging.debug("Searching for: {}".format(target_table_visitor.search_string)) 149 | try: 150 | self._target_table = catalog.search_table( 151 | source_like=source.name, **target_table_visitor.search_string 152 | ) 153 | except RuntimeError as error: 154 | logging.debug(str(error)) 155 | raise TableNotFound( 156 | '"{schema_like}"."{table_like}" is not found'.format( 157 | **target_table_visitor.search_string 158 | ) 159 | ) 160 | logging.debug("Bound target table: {}".format(self._target_table)) 161 | if len(self._insert_columns) == 0: 162 | self._target_columns = catalog.get_columns_for_table(self._target_table) 163 | logging.debug("Bound all columns in {}".format(self._target_table)) 164 | else: 165 | bound_cols = catalog.get_columns_for_table( 166 | self._target_table, column_names=self._insert_columns 167 | ) 168 | # Handle error case 169 | if len(bound_cols) != len(self._insert_columns): 170 | for column in self._insert_columns: 171 | found = False 172 | for bound in bound_cols: 173 | if column == bound.name: 174 | found = True 175 | break 176 | 177 | if not found: 178 | raise ColumnNotFound( 179 | '"{}" not found in the following tables: {}'.format( 180 | column, 181 | json.dumps([self._target_table], cls=CatTableEncoder), 182 | ) 183 | ) 184 | 185 | self._target_columns = bound_cols 186 | logging.debug("Bound {} target columns".format(len(bound_cols))) 187 | 188 | def _bind_with(self, catalog: Catalog, source: CatSource): 189 | if self._with_aliases: 190 | # Bind all the WITH expressions 191 | for 
key in self._with_aliases.keys(): 192 | binder = SelectBinder( 193 | catalog, 194 | source, 195 | self._with_aliases[key]["tables"], 196 | self._with_aliases[key]["columns"], 197 | self._column_alias_generator, 198 | self.expr_visitor_clazz, 199 | ) 200 | binder.bind() 201 | self._alias_map[key] = WithContext( 202 | catalog=catalog, 203 | alias=key, 204 | tables=binder.tables, 205 | columns=binder.columns, 206 | ) 207 | 208 | def resolve( 209 | self, 210 | ) -> Tuple[ 211 | Tuple[Optional[str], str], 212 | List[Tuple[Optional[str], str]], 213 | List[Tuple[Optional[str], str]], 214 | ]: 215 | target_table_visitor = RangeVarVisitor() 216 | target_table_visitor(self._insert_table) 217 | 218 | bound_tables = [] 219 | for table in self._select_tables: 220 | visitor = RangeVarVisitor() 221 | visitor(table) 222 | bound_tables.append(visitor.fqdn) 223 | 224 | bound_cols = [] 225 | for expr_visitor in self._select_columns: 226 | for column in expr_visitor.columns: 227 | column_ref_visitor = ColumnRefVisitor() 228 | column_ref_visitor(column) 229 | bound_cols.append(column_ref_visitor.name[0]) 230 | 231 | return target_table_visitor.fqdn, bound_tables, bound_cols 232 | 233 | 234 | class SelectSourceVisitor(DmlVisitor): 235 | def __init__(self, name: str, expr_visitor_clazz: Type[ExprVisitor] = ExprVisitor): 236 | super(SelectSourceVisitor, self).__init__(name, expr_visitor_clazz) 237 | 238 | def visit_SelectStmt(self, ancestors, node): 239 | table_visitor = TableVisitor(self.expr_visitor_clazz) 240 | table_visitor(node) 241 | self._select_tables = table_visitor.sources 242 | self._select_columns = table_visitor.columns 243 | for key in table_visitor.with_aliases.keys(): 244 | self._with_aliases[key] = table_visitor.with_aliases[key] 245 | 246 | return Skip 247 | 248 | 249 | class SelectIntoVisitor(DmlVisitor): 250 | def __init__(self, name: str, expr_visitor_clazz: Type[ExprVisitor] = ExprVisitor): 251 | super(SelectIntoVisitor, self).__init__(name, expr_visitor_clazz) 252 | 253 | def visit_SelectStmt(self, ancestors, node): 254 | super().__call__(node.intoClause) 255 | table_visitor = TableVisitor(self.expr_visitor_clazz) 256 | table_visitor(node.targetList) 257 | table_visitor(node.fromClause) 258 | self._select_tables = table_visitor.sources 259 | self._select_columns = table_visitor.columns 260 | for key in table_visitor.with_aliases.keys(): 261 | self._with_aliases[key] = table_visitor.with_aliases[key] 262 | 263 | return Skip 264 | 265 | 266 | class CTASVisitor(SelectSourceVisitor): 267 | def __init__(self, name: str, expr_visitor_clazz: Type[ExprVisitor] = ExprVisitor): 268 | super(CTASVisitor, self).__init__(name, expr_visitor_clazz) 269 | 270 | def visit_CreateTableAsStmt(self, ancestors, node): 271 | return Continue 272 | 273 | def visit_String(self, ancestors: Ancestor, node): 274 | # Check if parent is IntoClause 275 | parent = ancestors 276 | in_into_clause = False 277 | while parent is not None and not in_into_clause: 278 | in_into_clause = isinstance(parent.node, IntoClause) 279 | parent = parent.parent 280 | 281 | if in_into_clause: 282 | self._insert_columns.append(node.val) 283 | 284 | def _bind_target(self, catalog: Catalog, source: CatSource): 285 | target_table_visitor = RangeVarVisitor() 286 | target_table_visitor(self._insert_table) 287 | 288 | if target_table_visitor.is_qualified: 289 | schema = catalog.get_schema( 290 | source_name=source.name, schema_name=target_table_visitor.schema_name 291 | ) 292 | elif source.default_schema is not None: 293 | schema = 
source.default_schema.schema 294 | else: 295 | raise SemanticError( 296 | "No default schema set for source {}".format(source.fqdn) 297 | ) 298 | 299 | self._target_table = catalog.add_table( 300 | table_name=target_table_visitor.name, schema=schema 301 | ) 302 | 303 | sort_order = 1 304 | for col in self._insert_columns: 305 | self._target_columns.append( 306 | catalog.add_column( 307 | column_name=col, 308 | data_type="varchar", 309 | sort_order=sort_order, 310 | table=self._target_table, 311 | ) 312 | ) 313 | -------------------------------------------------------------------------------- /data_lineage/parser/visitor.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional, Tuple, Type 2 | 3 | from pglast import Node 4 | from pglast.visitors import Skip, Visitor 5 | 6 | 7 | class ExprVisitor(Visitor): 8 | def __init__(self, alias: str = None): 9 | self._alias: Optional[str] = alias 10 | self._columns: List[Node] = [] 11 | 12 | @property 13 | def alias(self) -> Optional[str]: 14 | return self._alias 15 | 16 | @property 17 | def columns(self) -> List[Node]: 18 | return self._columns 19 | 20 | def visit_FuncCall(self, ancestors, node): 21 | super().__call__(node.args) 22 | 23 | def visit_TypeCast(self, ancestors, node): 24 | super().__call__(node.arg) 25 | 26 | def visit_A_Expr(self, ancestors, node): 27 | super().__call__(node.lexpr) 28 | super().__call__(node.rexpr) 29 | 30 | def visit_ColumnRef(self, ancestors, node): 31 | self._columns.append(node) 32 | 33 | 34 | class RedshiftExprVisitor(ExprVisitor): 35 | class FuncNameVisitor(Visitor): 36 | def __init__(self): 37 | self._name = None 38 | 39 | @property 40 | def name(self): 41 | return self._name 42 | 43 | def visit_String(self, ancestors, obj): 44 | self._name = obj.val 45 | 46 | def visit_FuncCall(self, ancestors, node): 47 | name_visitor = RedshiftExprVisitor.FuncNameVisitor() 48 | name_visitor(node.funcname) 49 | if name_visitor.name == "dateadd": 50 | super().__call__(node.args[2]) 51 | return Skip 52 | 53 | 54 | class TableVisitor(Visitor): 55 | def __init__(self, expr_visitor_clazz: Type[ExprVisitor]): 56 | self._sources: List[Node] = [] 57 | self._columns: List[ExprVisitor] = [] 58 | self._expr_visitor_clazz = expr_visitor_clazz 59 | self._with_aliases: Dict[str, Dict[str, Any]] = {} 60 | 61 | @property 62 | def sources(self) -> List[Node]: 63 | return self._sources 64 | 65 | @property 66 | def columns(self) -> List[ExprVisitor]: 67 | return self._columns 68 | 69 | @property 70 | def with_aliases(self) -> Dict[str, Dict[str, Any]]: 71 | return self._with_aliases 72 | 73 | def visit_ResTarget(self, ancestors, node): 74 | name = None 75 | if node.name is not None: 76 | name = node.name 77 | 78 | expr_visitor = self._expr_visitor_clazz(name) 79 | expr_visitor(node.val) 80 | self._columns.append(expr_visitor) 81 | return Skip 82 | 83 | def visit_RangeVar(self, ancestors, node): 84 | self._sources.append(node) 85 | return Skip 86 | 87 | def visit_RangeSubselect(self, ancestors, node): 88 | self._sources.append(node) 89 | return Skip 90 | 91 | def visit_CommonTableExpr(self, ancestors, node): 92 | with_alias = node.ctename 93 | table_visitor = TableVisitor(self._expr_visitor_clazz) 94 | table_visitor(node.ctequery) 95 | 96 | self._with_aliases[with_alias] = { 97 | "tables": table_visitor.sources, 98 | "columns": table_visitor.columns, 99 | } 100 | return Skip 101 | 102 | 103 | class ColumnRefVisitor(Visitor): 104 | def __init__(self): 105 | self._name: 
List[str] = [] 106 | self._is_a_star: bool = False 107 | 108 | @property 109 | def name(self) -> Tuple: 110 | return tuple(self._name) 111 | 112 | @property 113 | def is_a_star(self) -> bool: 114 | return self._is_a_star 115 | 116 | @property 117 | def is_qualified(self) -> bool: 118 | return len(self._name) == 2 or (len(self._name) == 1 and self._is_a_star) 119 | 120 | @property 121 | def column_name(self) -> Optional[str]: 122 | if len(self._name) == 2: 123 | return self._name[1] 124 | elif len(self._name) == 1: 125 | return self._name[0] 126 | return None 127 | 128 | @property 129 | def table_name(self) -> Optional[str]: 130 | if len(self._name) == 2 or (self._is_a_star and len(self._name) == 1): 131 | return self._name[0] 132 | 133 | return None 134 | 135 | def visit_String(self, ancestors, node): 136 | self._name.append(node.val) 137 | 138 | def visit_A_Star(self, ancestors, node): 139 | self._is_a_star = True 140 | 141 | 142 | class RangeVarVisitor(Visitor): 143 | def __init__(self): 144 | self._schema_name = None 145 | self._name = None 146 | self._alias = None 147 | 148 | @property 149 | def alias(self) -> Optional[str]: 150 | if self._alias is not None: 151 | return self._alias 152 | elif self._schema_name is not None and self._name is not None: 153 | return "{}.{}".format(self._schema_name, self._name) 154 | elif self._name is not None: 155 | return self._name 156 | return None 157 | 158 | @property 159 | def fqdn(self): 160 | return self._schema_name, self._name 161 | 162 | @property 163 | def search_string(self): 164 | return {"schema_like": self._schema_name, "table_like": self._name} 165 | 166 | @property 167 | def is_qualified(self) -> bool: 168 | return self._schema_name is not None 169 | 170 | @property 171 | def schema_name(self) -> Optional[str]: 172 | return self._schema_name 173 | 174 | @property 175 | def name(self) -> str: 176 | return self._name 177 | 178 | def visit_Alias(self, ancestors, node): 179 | self._alias = node.aliasname.lower() 180 | 181 | def visit_RangeVar(self, ancestors, node): 182 | if node.schemaname: 183 | self._schema_name = node.schemaname.lower() 184 | self._name = node.relname.lower() 185 | 186 | 187 | class RangeSubselectVisitor(Visitor): 188 | def __init__(self, expr_visitor_clazz: Type[ExprVisitor]): 189 | self._alias: Optional[str] = None 190 | self._table_visitor: TableVisitor = TableVisitor(expr_visitor_clazz) 191 | 192 | @property 193 | def alias(self) -> Optional[str]: 194 | if self._alias is not None: 195 | return self._alias 196 | return None 197 | 198 | @property 199 | def sources(self) -> List[Node]: 200 | return self._table_visitor.sources 201 | 202 | @property 203 | def columns(self) -> List[ExprVisitor]: 204 | return self._table_visitor.columns 205 | 206 | def visit_Alias(self, ancestors, node): 207 | self._alias = node.aliasname 208 | 209 | def visit_RangeSubselect(self, ancestors, node): 210 | super().__call__(node.alias) 211 | self._table_visitor(node.subquery) 212 | return Skip 213 | -------------------------------------------------------------------------------- /data_lineage/server.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | from typing import Any, Dict, List, Tuple 4 | 5 | import flask_restless 6 | import gunicorn.app.base 7 | from dbcat import Catalog, PGCatalog, init_db 8 | from dbcat.catalog import CatColumn 9 | from dbcat.catalog.models import ( 10 | CatSchema, 11 | CatSource, 12 | CatTable, 13 | ColumnLineage, 14 | DefaultSchema, 15 | 
Job, 16 | JobExecution, 17 | JobExecutionStatus, 18 | ) 19 | from flask import Flask 20 | from flask_restful import Api, Resource, reqparse 21 | from pglast.parser import ParseError 22 | from rq import Queue 23 | from rq import job as RqJob 24 | from werkzeug.exceptions import NotFound, UnprocessableEntity 25 | 26 | from data_lineage import ColumnNotFound, SemanticError, TableNotFound 27 | from data_lineage.parser import ( 28 | analyze_dml_query, 29 | extract_lineage, 30 | parse, 31 | parse_dml_query, 32 | ) 33 | from data_lineage.worker import scan 34 | 35 | 36 | class TableNotFoundHTTP(NotFound): 37 | """Table not found in catalog""" 38 | 39 | code = 441 40 | 41 | 42 | class ColumnNotFoundHTTP(NotFound): 43 | """Column not found in catalog""" 44 | 45 | code = 442 46 | 47 | 48 | class ParseErrorHTTP(UnprocessableEntity): 49 | """Parser Error""" 50 | 51 | 52 | class SemanticErrorHTTP(UnprocessableEntity): 53 | """Semantic Error""" 54 | 55 | code = 443 56 | 57 | 58 | class Kedro(Resource): 59 | def __init__(self, catalog: Catalog): 60 | self._catalog = catalog 61 | self._parser = reqparse.RequestParser() 62 | self._parser.add_argument( 63 | "job_ids", action="append", help="List of job ids for a sub graph" 64 | ) 65 | 66 | def get(self): 67 | nodes = [] 68 | edges = [] 69 | 70 | args = self._parser.parse_args() 71 | with self._catalog.managed_session: 72 | column_edges = self._catalog.get_column_lineages(args["job_ids"]) 73 | for edge in column_edges: 74 | nodes.append(self._column_info(edge.source)) 75 | nodes.append(self._column_info(edge.target)) 76 | nodes.append(self._job_info(edge.job_execution.job)) 77 | edges.append( 78 | { 79 | "source": "column:{}".format(edge.source_id), 80 | "target": "task:{}".format(edge.job_execution.job_id), 81 | } 82 | ) 83 | edges.append( 84 | { 85 | "source": "task:{}".format(edge.job_execution.job_id), 86 | "target": "column:{}".format(edge.target_id), 87 | } 88 | ) 89 | 90 | return {"nodes": nodes, "edges": edges} 91 | 92 | @staticmethod 93 | def _column_info(node: CatColumn): 94 | return { 95 | "id": "column:{}".format(node.id), 96 | "name": ".".join(node.fqdn), 97 | "type": "data", 98 | } 99 | 100 | @staticmethod 101 | def _job_info(node: Job): 102 | return {"id": "task:{}".format(node.id), "name": node.name, "type": "task"} 103 | 104 | 105 | class ScanList(Resource): 106 | def __init__(self, catalog: PGCatalog, queue: Queue): 107 | self._catalog = catalog 108 | self._queue = queue 109 | self._parser = reqparse.RequestParser() 110 | self._parser.add_argument("id", required=True, help="ID of the resource") 111 | 112 | def post(self): 113 | args = self._parser.parse_args() 114 | logging.info("Args for scanning: {}".format(args)) 115 | job = self._queue.enqueue( 116 | scan, 117 | { 118 | "user": self._catalog.user, 119 | "password": self._catalog.password, 120 | "database": self._catalog.database, 121 | "host": self._catalog.host, 122 | "port": self._catalog.port, 123 | }, 124 | int(args["id"]), 125 | ) 126 | 127 | return {"id": job.id, "status": "queued"}, 200 128 | 129 | def get(self): 130 | job_list = [] 131 | for job in self._queue.started_job_registry.get_job_ids(): 132 | job_list.append({"id": job, "status": "started"}) 133 | 134 | for job in self._queue.finished_job_registry.get_job_ids(): 135 | job_list.append({"id": job, "status": "finished"}) 136 | 137 | for job in self._queue.failed_job_registry.get_job_ids(): 138 | job_list.append({"id": job, "status": "failed"}) 139 | 140 | return job_list, 200 141 | 142 | 143 | class Scan(Resource): 144 | 
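    # GET fetches the underlying RQ job by id from the queue's Redis connection and
    # returns its current status; PUT fetches the same job and cancels it. The jobs
    # themselves are enqueued by ScanList.post() above via the shared Queue instance.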
def __init__(self, catalog: PGCatalog, queue: Queue): 145 | self._catalog = catalog 146 | self._queue = queue 147 | self._parser = reqparse.RequestParser() 148 | self._parser.add_argument("id", required=True, help="ID of the resource") 149 | 150 | def get(self, job_id): 151 | status = RqJob.Job.fetch(job_id, connection=self._queue.connection).get_status() 152 | return {"id": job_id, "status": status}, 200 153 | 154 | def put(self, job_id): 155 | RqJob.Job.fetch(job_id, connection=self._queue.connection).cancel() 156 | return {"message": "Job {} cancelled".format(job_id)}, 200 157 | 158 | 159 | class Parse(Resource): 160 | def __init__(self, catalog: Catalog): 161 | self._catalog = catalog 162 | self._parser = reqparse.RequestParser() 163 | self._parser.add_argument("query", required=True, help="Query to parse") 164 | self._parser.add_argument( 165 | "source_id", help="Source database of the query", required=True 166 | ) 167 | 168 | def post(self): 169 | args = self._parser.parse_args() 170 | logging.debug("Parse query: {}".format(args["query"])) 171 | try: 172 | parsed = parse(args["query"], "parse_api") 173 | except ParseError as error: 174 | raise ParseErrorHTTP(description=str(error)) 175 | 176 | try: 177 | with self._catalog.managed_session: 178 | source = self._catalog.get_source_by_id(args["source_id"]) 179 | logging.debug("Parsing query for source {}".format(source)) 180 | binder = parse_dml_query( 181 | catalog=self._catalog, parsed=parsed, source=source 182 | ) 183 | 184 | return ( 185 | { 186 | "select_tables": [table.name for table in binder.tables], 187 | "select_columns": [context.alias for context in binder.columns], 188 | }, 189 | 200, 190 | ) 191 | except TableNotFound as table_error: 192 | raise TableNotFoundHTTP(description=str(table_error)) 193 | except ColumnNotFound as column_error: 194 | raise ColumnNotFoundHTTP(description=str(column_error)) 195 | except SemanticError as semantic_error: 196 | raise SemanticErrorHTTP(description=str(semantic_error)) 197 | 198 | 199 | class Analyze(Resource): 200 | def __init__(self, catalog: Catalog): 201 | self._catalog = catalog 202 | self._parser = reqparse.RequestParser() 203 | self._parser.add_argument("query", required=True, help="Query to parse") 204 | self._parser.add_argument("name", help="Name of the ETL job") 205 | self._parser.add_argument( 206 | "start_time", required=True, help="Start time of the task" 207 | ) 208 | self._parser.add_argument( 209 | "end_time", required=True, help="End time of the task" 210 | ) 211 | self._parser.add_argument( 212 | "source_id", help="Source database of the query", required=True 213 | ) 214 | 215 | def post(self): 216 | args = self._parser.parse_args() 217 | logging.debug("Parse query: {}".format(args["query"])) 218 | try: 219 | parsed = parse(args["query"], args["name"]) 220 | except ParseError as error: 221 | raise ParseErrorHTTP(description=str(error)) 222 | 223 | try: 224 | with self._catalog.managed_session: 225 | source = self._catalog.get_source_by_id(args["source_id"]) 226 | logging.debug("Parsing query for source {}".format(source)) 227 | chosen_visitor = analyze_dml_query(self._catalog, parsed, source) 228 | job_execution = extract_lineage( 229 | catalog=self._catalog, 230 | visited_query=chosen_visitor, 231 | source=source, 232 | parsed=parsed, 233 | start_time=datetime.datetime.fromisoformat(args["start_time"]), 234 | end_time=datetime.datetime.fromisoformat(args["end_time"]), 235 | ) 236 | 237 | return ( 238 | { 239 | "data": { 240 | "id": job_execution.id, 241 | "type": 
"job_executions", 242 | "attributes": { 243 | "job_id": job_execution.job_id, 244 | "started_at": job_execution.started_at.strftime( 245 | "%Y-%m-%d %H:%M:%S" 246 | ), 247 | "ended_at": job_execution.ended_at.strftime( 248 | "%Y-%m-%d %H:%M:%S" 249 | ), 250 | "status": job_execution.status.name, 251 | }, 252 | } 253 | }, 254 | 200, 255 | ) 256 | except TableNotFound as table_error: 257 | raise TableNotFoundHTTP(description=str(table_error)) 258 | except ColumnNotFound as column_error: 259 | raise ColumnNotFoundHTTP(description=str(column_error)) 260 | except SemanticError as semantic_error: 261 | raise SemanticErrorHTTP(description=str(semantic_error)) 262 | 263 | 264 | class Server(gunicorn.app.base.BaseApplication): 265 | def __init__(self, app): 266 | self.application = app 267 | super().__init__() 268 | 269 | def load_config(self): 270 | # parse console args 271 | parser = self.cfg.parser() 272 | env_args = parser.parse_args(self.cfg.get_cmd_args_from_env()) 273 | 274 | # Load up environment configuration 275 | for k, v in vars(env_args).items(): 276 | if v is None: 277 | continue 278 | if k == "args": 279 | continue 280 | self.cfg.set(k.lower(), v) 281 | 282 | def load(self): 283 | return self.application 284 | 285 | 286 | def job_execution_serializer(instance: JobExecution, only: List[str]): 287 | return { 288 | "id": instance.id, 289 | "type": "job_executions", 290 | "attributes": { 291 | "job_id": instance.job_id, 292 | "started_at": instance.started_at.strftime("%Y-%m-%d %H:%M:%S"), 293 | "ended_at": instance.ended_at.strftime("%Y-%m-%d %H:%M:%S"), 294 | "status": instance.status.name, 295 | }, 296 | } 297 | 298 | 299 | def job_execution_deserializer(data: Dict["str", Any]): 300 | attributes = data["data"]["attributes"] 301 | logging.debug(attributes) 302 | job_execution = JobExecution() 303 | job_execution.job_id = int(attributes["job_id"]) 304 | job_execution.started_at = datetime.datetime.strptime( 305 | attributes["started_at"], "%Y-%m-%d %H:%M:%S" 306 | ) 307 | job_execution.ended_at = datetime.datetime.strptime( 308 | attributes["ended_at"], "%Y-%m-%d %H:%M:%S" 309 | ) 310 | job_execution.status = ( 311 | JobExecutionStatus.SUCCESS 312 | if attributes["status"] == "SUCCESS" 313 | else JobExecutionStatus.SUCCESS 314 | ) 315 | 316 | logging.debug(job_execution) 317 | logging.debug(job_execution.status == JobExecutionStatus.SUCCESS) 318 | return job_execution 319 | 320 | 321 | def create_server( 322 | catalog_options: Dict[str, str], connection, is_production=True 323 | ) -> Tuple[Any, PGCatalog]: 324 | logging.debug(catalog_options) 325 | catalog = PGCatalog( 326 | **catalog_options, 327 | connect_args={"application_name": "data-lineage:flask-restless"}, 328 | max_overflow=40, 329 | pool_size=20, 330 | pool_pre_ping=True 331 | ) 332 | 333 | init_db(catalog) 334 | 335 | restful_catalog = PGCatalog( 336 | **catalog_options, 337 | connect_args={"application_name": "data-lineage:restful"}, 338 | pool_pre_ping=True 339 | ) 340 | 341 | app = Flask(__name__) 342 | queue = Queue(is_async=is_production, connection=connection) 343 | 344 | # Create CRUD APIs 345 | methods = ["DELETE", "GET", "PATCH", "POST"] 346 | url_prefix = "/api/v1/catalog" 347 | api_manager = flask_restless.APIManager(app, catalog.get_scoped_session()) 348 | api_manager.create_api( 349 | CatSource, 350 | methods=methods, 351 | url_prefix=url_prefix, 352 | additional_attributes=["fqdn"], 353 | ) 354 | api_manager.create_api( 355 | CatSchema, 356 | methods=methods, 357 | url_prefix=url_prefix, 358 | 
additional_attributes=["fqdn"], 359 | ) 360 | api_manager.create_api( 361 | CatTable, 362 | methods=methods, 363 | url_prefix=url_prefix, 364 | additional_attributes=["fqdn"], 365 | ) 366 | api_manager.create_api( 367 | CatColumn, 368 | methods=methods, 369 | url_prefix=url_prefix, 370 | additional_attributes=["fqdn"], 371 | ) 372 | api_manager.create_api(Job, methods=methods, url_prefix=url_prefix) 373 | api_manager.create_api( 374 | JobExecution, 375 | methods=methods, 376 | url_prefix=url_prefix, 377 | serializer=job_execution_serializer, 378 | deserializer=job_execution_deserializer, 379 | ) 380 | api_manager.create_api( 381 | ColumnLineage, 382 | methods=methods, 383 | url_prefix=url_prefix, 384 | collection_name="column_lineage", 385 | ) 386 | 387 | api_manager.create_api( 388 | DefaultSchema, 389 | methods=methods, 390 | url_prefix=url_prefix, 391 | collection_name="default_schema", 392 | primary_key="source_id", 393 | ) 394 | 395 | restful_manager = Api(app) 396 | restful_manager.add_resource( 397 | Kedro, "/api/main", resource_class_kwargs={"catalog": restful_catalog} 398 | ) 399 | restful_manager.add_resource( 400 | ScanList, 401 | "/api/v1/scan", 402 | resource_class_kwargs={"catalog": restful_catalog, "queue": queue}, 403 | ) 404 | 405 | restful_manager.add_resource( 406 | Scan, 407 | "/api/v1/scan/", 408 | resource_class_kwargs={"catalog": restful_catalog, "queue": queue}, 409 | ) 410 | 411 | restful_manager.add_resource( 412 | Analyze, "/api/v1/analyze", resource_class_kwargs={"catalog": restful_catalog} 413 | ) 414 | 415 | restful_manager.add_resource( 416 | Parse, "/api/v1/parse", resource_class_kwargs={"catalog": restful_catalog} 417 | ) 418 | 419 | for rule in app.url_map.iter_rules(): 420 | rule_methods = ",".join(rule.methods) 421 | logging.debug("{:50s} {:20s} {}".format(rule.endpoint, rule_methods, rule)) 422 | 423 | if is_production: 424 | return Server(app=app), catalog 425 | else: 426 | return app, catalog 427 | -------------------------------------------------------------------------------- /data_lineage/worker.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import closing 3 | 4 | from dbcat import DbScanner, PGCatalog 5 | 6 | 7 | def scan(connection_args, source_id): 8 | logging.info("{}".format(connection_args)) 9 | catalog = PGCatalog( 10 | **connection_args, 11 | connect_args={"application_name": "data-lineage:worker"}, 12 | max_overflow=40, 13 | pool_size=20, 14 | pool_pre_ping=True 15 | ) 16 | 17 | with closing(catalog): 18 | with catalog.managed_session: 19 | source = catalog.get_source_by_id(source_id) 20 | DbScanner(catalog, source).scan() 21 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile 2 | # Uses multi-stage builds requiring Docker 17.05 or higher 3 | # See https://docs.docker.com/develop/develop-images/multistage-build/ 4 | 5 | # Creating a python base with shared environment variables 6 | FROM python:3.8.1-slim as python-base 7 | ENV PYTHONUNBUFFERED=1 \ 8 | PYTHONDONTWRITEBYTECODE=1 \ 9 | PIP_NO_CACHE_DIR=off \ 10 | PIP_DISABLE_PIP_VERSION_CHECK=on \ 11 | PIP_DEFAULT_TIMEOUT=100 \ 12 | POETRY_HOME="/opt/poetry" \ 13 | POETRY_VIRTUALENVS_IN_PROJECT=true \ 14 | POETRY_NO_INTERACTION=1 \ 15 | PYSETUP_PATH="/opt/pysetup" \ 16 | VENV_PATH="/opt/pysetup/.venv" 17 | 18 | ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" 19 | 20 | 
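# Stage overview: builder-base installs Poetry, resolves the runtime dependencies
# into $VENV_PATH and builds/installs the data_lineage wheel; the production stage
# restarts from python-base and copies only that virtual environment and the
# entrypoint script, so compilers and other build tooling never reach the final image.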
21 | # builder-base is used to build dependencies 22 | FROM python-base as builder-base 23 | RUN apt-get update \ 24 | && apt-get install --no-install-recommends -y \ 25 | curl gcc python3-dev default-libmysqlclient-dev \ 26 | build-essential libpq-dev musl-dev 27 | 28 | # Install Poetry - respects $POETRY_VERSION & $POETRY_HOME 29 | ENV POETRY_VERSION=1.1.6 30 | RUN curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py | python 31 | 32 | # We copy our Python requirements here to cache them 33 | # and install only runtime deps using poetry 34 | WORKDIR $PYSETUP_PATH 35 | COPY ./poetry.lock ./pyproject.toml ./ 36 | RUN poetry install --no-dev # respects 37 | 38 | WORKDIR /src 39 | COPY . . 40 | RUN poetry build 41 | ENV PATH="${VENV_PATH}/bin:$PATH" 42 | RUN pip install dist/data_lineage-*.whl 43 | 44 | # 'production' stage uses the clean 'python-base' stage and copyies 45 | # in only our runtime deps that were installed in the 'builder-base' 46 | FROM python-base as production 47 | 48 | RUN apt-get update \ 49 | && apt-get install --no-install-recommends -y \ 50 | libpq5 51 | 52 | COPY --from=builder-base $VENV_PATH $VENV_PATH 53 | COPY ./docker/docker-entrypoint.sh /docker-entrypoint.sh 54 | RUN chmod +x /docker-entrypoint.sh 55 | 56 | ENTRYPOINT /docker-entrypoint.sh $0 $@ 57 | CMD [ "data_lineage"] -------------------------------------------------------------------------------- /docker/build_image.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | set -e 4 | 5 | PROJECT_ROOT=$(dirname $(dirname $0)) 6 | 7 | echo "$PROJECT_ROOT" 8 | 9 | DOCKERHUB_NAMESPACE=tokern 10 | 11 | 12 | TAG=$1 13 | if [ -z $TAG ]; then 14 | echo "usage: $0 [--publish] [--latest]" 15 | exit 1 16 | fi 17 | 18 | if [ "$2" == "--publish" ]; then 19 | PUBLISH="YES" 20 | fi 21 | 22 | if [ "$3" == "--latest" ]; then 23 | LATEST="YES" 24 | fi 25 | 26 | if [ "$PUBLISH" == "YES" ] && [ -z "$DOCKERHUB_USERNAME" -o -z "$DOCKERHUB_PASSWORD" ]; then 27 | echo "In order to publish an image to Dockerhub you must set \$DOCKERHUB_USERNAME and \$DOCKERHUB_PASSWORD before running." 
28 | exit 1 29 | fi 30 | 31 | DOCKERHUB_REPOSITORY=data-lineage 32 | DOCKER_IMAGE="${DOCKERHUB_NAMESPACE}/${DOCKERHUB_REPOSITORY}:${TAG}" 33 | 34 | echo "Building Docker image ${DOCKER_IMAGE} from official Tokern release ${TAG}" 35 | 36 | # now tell docker to build our image 37 | 38 | docker build -t "${DOCKER_IMAGE}" -f "$PROJECT_ROOT"/docker/Dockerfile "${PROJECT_ROOT}" 39 | 40 | if [ "$PUBLISH" == "YES" ]; then 41 | echo "Publishing image ${DOCKER_IMAGE} to Dockerhub" 42 | 43 | # make sure that we are logged into dockerhub 44 | docker login --username="${DOCKERHUB_USERNAME}" --password="${DOCKERHUB_PASSWORD}" 45 | 46 | # push the built image to dockerhub 47 | docker push "${DOCKER_IMAGE}" 48 | 49 | # TODO: quick check against dockerhub to see that our new image made it 50 | 51 | if [ "$LATEST" == "YES" ]; then 52 | # tag our recent versioned image as "latest" 53 | docker tag "${DOCKER_IMAGE}" ${DOCKERHUB_NAMESPACE}/${DOCKERHUB_REPOSITORY}:latest 54 | 55 | # then push it as well 56 | docker push ${DOCKERHUB_NAMESPACE}/${DOCKERHUB_REPOSITORY}:latest 57 | 58 | # TODO: validate push succeeded 59 | fi 60 | fi 61 | 62 | echo "Done" -------------------------------------------------------------------------------- /docker/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | # activate our virtual environment here 6 | . /opt/pysetup/.venv/bin/activate 7 | 8 | # You can put other setup logic here 9 | 10 | # Evaluating passed command: 11 | exec "$@" -------------------------------------------------------------------------------- /example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Overview\n", 8 | "\n", 9 | "This example demonstrates how to scan query history from a data warehouse and save it in the data lineage app. The app automatically parses and extracts data lineage from the queries.\n", 10 | "\n", 11 | "The example consists of the following sequence of operations:\n", 12 | "\n", 13 | "* Start docker containers containing a demo. Refer to [docs](https://tokern.io/docs/data-lineage/installation) for detailed instructions on installing demo-wikimedia.\n", 14 | "* Scan and send queries from query history to data lineage app.\n", 15 | "* Visualize the graph by visiting Tokern UI.\n", 16 | "* Analyze the graph" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# Installation\n", 24 | "\n", 25 | "This demo requires wikimedia demo to be running. 
Start the demo using the following instructions:\n", 26 | "\n", 27 | " # in a new directory run\n", 28 | " wget https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/wikimedia-demo.yml\n", 29 | " # or run\n", 30 | " curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/wikimedia-demo.yml -o docker-compose.yml\n", 31 | "\n", 32 | "\n", 33 | "Run docker-compose\n", 34 | "\n", 35 | "\n", 36 | " docker-compose up -d\n", 37 | "\n", 38 | "\n", 39 | "Verify container are running\n", 40 | "\n", 41 | "\n", 42 | " docker container ls | grep tokern\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# Required configuration for API and wikimedia database network address\n", 52 | "\n", 53 | "docker_address = \"http://127.0.0.1:8000\"\n", 54 | "wikimedia_db = {\n", 55 | " \"username\": \"etldev\",\n", 56 | " \"password\": \"3tld3v\",\n", 57 | " \"uri\": \"tokern-demo-wikimedia\",\n", 58 | " \"port\": \"5432\",\n", 59 | " \"database\": \"wikimedia\"\n", 60 | "}" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "import time\n", 70 | "# Setup a connection to catalog using the SDK.\n", 71 | "from data_lineage import Catalog, Scan\n", 72 | "\n", 73 | "catalog = Catalog(docker_address)\n", 74 | "\n", 75 | "# Register wikimedia datawarehouse with data-lineage app.\n", 76 | "\n", 77 | "source = catalog.add_source(name=\"wikimedia\", source_type=\"postgresql\", **wikimedia_db)\n", 78 | "\n", 79 | "# Scan the wikimedia data warehouse and register all schemata, tables and columns.\n", 80 | "scan = Scan(docker_address)\n", 81 | "job = scan.start(source)\n", 82 | "\n", 83 | "# Wait for scan to complete\n", 84 | "\n", 85 | "status = \"\"\n", 86 | "\n", 87 | "while (status != \"finished\" and status != \"failed\"):\n", 88 | " time.sleep(5)\n", 89 | " status = scan.get(job[\"id\"])[\"status\"]\n", 90 | " print(\"Status is {}\".format(status))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "import json\n", 100 | "\n", 101 | "with open(\"test/queries.json\", \"r\") as file:\n", 102 | " queries = json.load(file)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "scrolled": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "from datetime import datetime\n", 114 | "from data_lineage import Analyze\n", 115 | "\n", 116 | "analyze = Analyze(docker_address)\n", 117 | "\n", 118 | "for query in queries:\n", 119 | " print(query)\n", 120 | " analyze.analyze(**query, source=source, start_time=datetime.now(), end_time=datetime.now())" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Visit [Kedro UI](http://localhost:8000/)\n", 128 | "\n", 129 | "![One Task Graph](./full_graph.png)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Python 3 (ipykernel)", 143 | "language": "python", 144 | "name": "python3" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 
153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.8.10" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 4 161 | } -------------------------------------------------------------------------------- /full_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokern/data-lineage/5945542742979fe350d313d906440c93ee3d0f36/full_graph.png -------------------------------------------------------------------------------- /install-manifests/docker-compose/catalog-demo.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | tokern-demo-catalog: 4 | image: tokern/demo-catalog:latest 5 | container_name: tokern-demo-catalog 6 | restart: unless-stopped 7 | networks: 8 | - tokern-internal 9 | volumes: 10 | - tokern_demo_catalog_data:/var/lib/postgresql/data 11 | environment: 12 | POSTGRES_PASSWORD: catal0g_passw0rd 13 | POSTGRES_USER: catalog_user 14 | POSTGRES_DB: tokern 15 | tokern-api: 16 | image: tokern/data-lineage:latest 17 | container_name: tokern-data-lineage 18 | restart: unless-stopped 19 | networks: 20 | - tokern-internal 21 | environment: 22 | CATALOG_PASSWORD: catal0g_passw0rd 23 | CATALOG_USER: catalog_user 24 | CATALOG_DB: tokern 25 | CATALOG_HOST: tokern-demo-catalog 26 | GUNICORN_CMD_ARGS: "--bind 0.0.0.0:4142" 27 | toker-viz: 28 | image: tokern/data-lineage-viz:latest 29 | container_name: tokern-data-lineage-visualizer 30 | restart: unless-stopped 31 | networks: 32 | - tokern-internal 33 | - tokern-net 34 | ports: 35 | - "8000:80" 36 | networks: 37 | tokern-net: # Exposed by your host. 38 | # external: true 39 | name: "tokern-net" 40 | driver: bridge 41 | ipam: 42 | driver: default 43 | config: 44 | - subnet: 10.10.0.0/24 45 | tokern-internal: 46 | name: "tokern-internal" 47 | driver: bridge 48 | internal: true 49 | ipam: 50 | driver: default 51 | config: 52 | - subnet: 10.11.0.0/24 53 | 54 | volumes: 55 | tokern_demo_catalog_data: 56 | -------------------------------------------------------------------------------- /install-manifests/docker-compose/tokern-lineage-engine.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | tokern-catalog: 4 | image: postgres:13.2-alpine 5 | container_name: tokern-catalog 6 | restart: unless-stopped 7 | networks: 8 | - tokern-internal 9 | volumes: 10 | - tokern_catalog_data:/var/lib/postgresql/data 11 | environment: 12 | POSTGRES_PASSWORD: catal0g_passw0rd 13 | POSTGRES_USER: catalog_user 14 | POSTGRES_DB: tokern 15 | tokern-redis: 16 | image: redis:6.2.6-alpine 17 | container_name: tokern-redis 18 | restart: unless-stopped 19 | networks: 20 | - tokern-internal 21 | tokern-api: 22 | image: tokern/data-lineage:latest 23 | container_name: tokern-data-lineage 24 | restart: unless-stopped 25 | depends_on: 26 | - tokern-redis 27 | networks: 28 | - tokern-internal 29 | - tokern-net 30 | environment: 31 | CATALOG_PASSWORD: ${CATALOG_PASSWORD:-catal0g_passw0rd} 32 | CATALOG_USER: ${CATALOG_USER:-catalog_user} 33 | CATALOG_DB: ${CATALOG_DB:-tokern} 34 | CATALOG_HOST: ${CATALOG_HOST:-tokern-catalog} 35 | CATALOG_PORT: ${CATALOG_PORT:-5432} 36 | GUNICORN_CMD_ARGS: "--bind 0.0.0.0:4142" 37 | LOG_LEVEL: ${LOG_LEVEL:-INFO} 38 | REDIS_HOST: ${REDIS_HOST:-tokern-redis} 39 | REDIS_PORT: ${REDIS_PORT:-6379} 40 | REDIS_HOST: "tokern-redis" 41 | tokern-worker: 42 | image: 
tokern/data-lineage:latest 43 | container_name: tokern_worker 44 | restart: unless-stopped 45 | depends_on: 46 | - tokern-redis 47 | networks: 48 | - tokern-internal 49 | command: rq worker --url redis://tokern-redis:6379 50 | tokern-viz: 51 | image: tokern/data-lineage-viz:latest 52 | container_name: tokern-data-lineage-visualizer 53 | restart: unless-stopped 54 | networks: 55 | - tokern-internal 56 | - tokern-net 57 | ports: 58 | - "8000:80" 59 | networks: 60 | tokern-net: # Exposed by your host. 61 | # external: true 62 | name: "tokern-net" 63 | driver: bridge 64 | ipam: 65 | driver: default 66 | config: 67 | - subnet: 10.10.0.0/24 68 | tokern-internal: 69 | name: "tokern-internal" 70 | driver: bridge 71 | internal: true 72 | ipam: 73 | driver: default 74 | config: 75 | - subnet: 10.11.0.0/24 76 | 77 | volumes: 78 | tokern_catalog_data: 79 | -------------------------------------------------------------------------------- /install-manifests/docker-compose/wikimedia-demo.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | tokern-catalog: 4 | image: postgres:13.2-alpine 5 | container_name: tokern-catalog 6 | restart: unless-stopped 7 | networks: 8 | - tokern-internal 9 | volumes: 10 | - tokern_wikimedia_catalog_data:/var/lib/postgresql/data 11 | environment: 12 | POSTGRES_PASSWORD: catal0g_passw0rd 13 | POSTGRES_USER: catalog_user 14 | POSTGRES_DB: tokern 15 | tokern-redis: 16 | image: redis:6.2.6-alpine 17 | container_name: tokern-redis 18 | restart: unless-stopped 19 | networks: 20 | - tokern-internal 21 | tokern-wikimedia: 22 | image: tokern/demo-wikimedia:latest 23 | container_name: tokern-demo-wikimedia 24 | restart: unless-stopped 25 | networks: 26 | - tokern-internal 27 | volumes: 28 | - tokern_wikimedia_data:/var/lib/postgresql/data 29 | environment: 30 | POSTGRES_PASSWORD: 3tld3v 31 | POSTGRES_USER: etldev 32 | POSTGRES_DB: wikimedia 33 | tokern-api: 34 | image: tokern/data-lineage:latest 35 | container_name: tokern-data-lineage 36 | restart: unless-stopped 37 | depends_on: 38 | - tokern-redis 39 | networks: 40 | - tokern-internal 41 | environment: 42 | CATALOG_PASSWORD: catal0g_passw0rd 43 | CATALOG_USER: catalog_user 44 | CATALOG_DB: tokern 45 | CATALOG_HOST: tokern-catalog 46 | GUNICORN_CMD_ARGS: "--bind 0.0.0.0:4142" 47 | REDIS_HOST: "tokern-redis" 48 | tokern-worker: 49 | image: tokern/data-lineage:latest 50 | container_name: tokern_worker 51 | restart: unless-stopped 52 | depends_on: 53 | - tokern-redis 54 | networks: 55 | - tokern-internal 56 | command: rq worker --url redis://tokern-redis:6379 57 | toker-viz: 58 | image: tokern/data-lineage-viz:latest 59 | container_name: tokern-data-lineage-visualizer 60 | restart: unless-stopped 61 | networks: 62 | - tokern-internal 63 | - tokern-net 64 | ports: 65 | - "8000:80" 66 | networks: 67 | tokern-net: # Exposed by your host. 
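  # tokern-net is the only network reachable from the host (the visualizer publishes
  # port 8000 on it); tokern-internal below is marked internal, keeping the catalog,
  # redis and the demo wikimedia database isolated from the host.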
68 | # external: true 69 | name: "tokern-net" 70 | driver: bridge 71 | ipam: 72 | driver: default 73 | config: 74 | - subnet: 10.10.0.0/24 75 | tokern-internal: 76 | name: "tokern-internal" 77 | driver: bridge 78 | internal: true 79 | ipam: 80 | driver: default 81 | config: 82 | - subnet: 10.11.0.0/24 83 | 84 | volumes: 85 | tokern_wikimedia_catalog_data: 86 | tokern_wikimedia_data: -------------------------------------------------------------------------------- /install-manifests/dockerfiles/Dockerfile-demo-catalog: -------------------------------------------------------------------------------- 1 | FROM postgres:13.2-alpine 2 | COPY demo-catalog.sql /docker-entrypoint-initdb.d/ -------------------------------------------------------------------------------- /install-manifests/dockerfiles/Dockerfile-demo-wikimedia: -------------------------------------------------------------------------------- 1 | FROM postgres:13.2-alpine 2 | COPY demo-wikimedia.sql /docker-entrypoint-initdb.d/ -------------------------------------------------------------------------------- /install-manifests/dockerfiles/Makefile: -------------------------------------------------------------------------------- 1 | default: all 2 | 3 | .PHONY: default all fetch_dump 4 | 5 | date := `date '+%Y-%m-%d'` 6 | TARGET_IMAGE ?= demo-catalog 7 | VERSION ?= "0.2.0" 8 | DESTINATION_REPOSITORY ?= "tokern" 9 | 10 | all: generate_image push_to_registry finished 11 | 12 | check_vars: 13 | @test -n "$DESTINATION_REPOSITORY" || (echo "You need to set DESTINATION_REPOSITORY environment variable" >&2 && exit 1) 14 | 15 | generate_image: 16 | @docker build . -f Dockerfile-$(TARGET_IMAGE) -t $(TARGET_IMAGE)\:latest -t $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE)\:latest -t $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE)\:$(VERSION) 17 | 18 | push_to_registry: 19 | @echo "" 20 | @echo "====== Pushing image to repository ======" 21 | @docker push $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE):latest 22 | @docker push $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE):$(VERSION) 23 | 24 | finished: 25 | @echo "" 26 | @echo "Finished with success. 
Pushed image to $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE)" -------------------------------------------------------------------------------- /install-manifests/dockerfiles/demo-catalog.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- PostgreSQL database dump 3 | -- 4 | 5 | -- Dumped from database version 13.2 (Debian 13.2-1.pgdg100+1) 6 | -- Dumped by pg_dump version 13.3 (Ubuntu 13.3-1.pgdg20.04+1) 7 | 8 | SET statement_timeout = 0; 9 | SET lock_timeout = 0; 10 | SET idle_in_transaction_session_timeout = 0; 11 | SET client_encoding = 'UTF8'; 12 | SET standard_conforming_strings = on; 13 | SELECT pg_catalog.set_config('search_path', '', false); 14 | SET check_function_bodies = false; 15 | SET xmloption = content; 16 | SET client_min_messages = warning; 17 | SET row_security = off; 18 | 19 | -- 20 | -- Name: jobexecutionstatus; Type: TYPE; Schema: public; Owner: catalog_user 21 | -- 22 | 23 | CREATE TYPE public.jobexecutionstatus AS ENUM ( 24 | 'SUCCESS', 25 | 'FAILURE' 26 | ); 27 | 28 | 29 | ALTER TYPE public.jobexecutionstatus OWNER TO catalog_user; 30 | 31 | SET default_tablespace = ''; 32 | 33 | SET default_table_access_method = heap; 34 | 35 | -- 36 | -- Name: alembic_version; Type: TABLE; Schema: public; Owner: catalog_user 37 | -- 38 | 39 | CREATE TABLE public.alembic_version ( 40 | version_num character varying(32) NOT NULL 41 | ); 42 | 43 | 44 | ALTER TABLE public.alembic_version OWNER TO catalog_user; 45 | 46 | -- 47 | -- Name: column_lineage; Type: TABLE; Schema: public; Owner: catalog_user 48 | -- 49 | 50 | CREATE TABLE public.column_lineage ( 51 | id integer NOT NULL, 52 | context jsonb, 53 | source_id integer, 54 | target_id integer, 55 | job_execution_id integer 56 | ); 57 | 58 | 59 | ALTER TABLE public.column_lineage OWNER TO catalog_user; 60 | 61 | -- 62 | -- Name: column_lineage_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 63 | -- 64 | 65 | CREATE SEQUENCE public.column_lineage_id_seq 66 | AS integer 67 | START WITH 1 68 | INCREMENT BY 1 69 | NO MINVALUE 70 | NO MAXVALUE 71 | CACHE 1; 72 | 73 | 74 | ALTER TABLE public.column_lineage_id_seq OWNER TO catalog_user; 75 | 76 | -- 77 | -- Name: column_lineage_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 78 | -- 79 | 80 | ALTER SEQUENCE public.column_lineage_id_seq OWNED BY public.column_lineage.id; 81 | 82 | 83 | -- 84 | -- Name: columns; Type: TABLE; Schema: public; Owner: catalog_user 85 | -- 86 | 87 | CREATE TABLE public.columns ( 88 | id integer NOT NULL, 89 | name character varying, 90 | data_type character varying, 91 | sort_order integer, 92 | table_id integer 93 | ); 94 | 95 | 96 | ALTER TABLE public.columns OWNER TO catalog_user; 97 | 98 | -- 99 | -- Name: columns_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 100 | -- 101 | 102 | CREATE SEQUENCE public.columns_id_seq 103 | AS integer 104 | START WITH 1 105 | INCREMENT BY 1 106 | NO MINVALUE 107 | NO MAXVALUE 108 | CACHE 1; 109 | 110 | 111 | ALTER TABLE public.columns_id_seq OWNER TO catalog_user; 112 | 113 | -- 114 | -- Name: columns_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 115 | -- 116 | 117 | ALTER SEQUENCE public.columns_id_seq OWNED BY public.columns.id; 118 | 119 | 120 | -- 121 | -- Name: default_schema; Type: TABLE; Schema: public; Owner: catalog_user 122 | -- 123 | 124 | CREATE TABLE public.default_schema ( 125 | source_id integer NOT NULL, 126 | schema_id integer 127 | ); 128 | 129 | 130 | ALTER TABLE public.default_schema OWNER TO 
catalog_user; 131 | 132 | -- 133 | -- Name: job_executions; Type: TABLE; Schema: public; Owner: catalog_user 134 | -- 135 | 136 | CREATE TABLE public.job_executions ( 137 | id integer NOT NULL, 138 | job_id integer, 139 | started_at timestamp without time zone, 140 | ended_at timestamp without time zone, 141 | status public.jobexecutionstatus 142 | ); 143 | 144 | 145 | ALTER TABLE public.job_executions OWNER TO catalog_user; 146 | 147 | -- 148 | -- Name: job_executions_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 149 | -- 150 | 151 | CREATE SEQUENCE public.job_executions_id_seq 152 | AS integer 153 | START WITH 1 154 | INCREMENT BY 1 155 | NO MINVALUE 156 | NO MAXVALUE 157 | CACHE 1; 158 | 159 | 160 | ALTER TABLE public.job_executions_id_seq OWNER TO catalog_user; 161 | 162 | -- 163 | -- Name: job_executions_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 164 | -- 165 | 166 | ALTER SEQUENCE public.job_executions_id_seq OWNED BY public.job_executions.id; 167 | 168 | 169 | -- 170 | -- Name: jobs; Type: TABLE; Schema: public; Owner: catalog_user 171 | -- 172 | 173 | CREATE TABLE public.jobs ( 174 | id integer NOT NULL, 175 | name character varying, 176 | context jsonb, 177 | source_id integer 178 | ); 179 | 180 | 181 | ALTER TABLE public.jobs OWNER TO catalog_user; 182 | 183 | -- 184 | -- Name: jobs_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 185 | -- 186 | 187 | CREATE SEQUENCE public.jobs_id_seq 188 | AS integer 189 | START WITH 1 190 | INCREMENT BY 1 191 | NO MINVALUE 192 | NO MAXVALUE 193 | CACHE 1; 194 | 195 | 196 | ALTER TABLE public.jobs_id_seq OWNER TO catalog_user; 197 | 198 | -- 199 | -- Name: jobs_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 200 | -- 201 | 202 | ALTER SEQUENCE public.jobs_id_seq OWNED BY public.jobs.id; 203 | 204 | 205 | -- 206 | -- Name: schemata; Type: TABLE; Schema: public; Owner: catalog_user 207 | -- 208 | 209 | CREATE TABLE public.schemata ( 210 | id integer NOT NULL, 211 | name character varying, 212 | source_id integer 213 | ); 214 | 215 | 216 | ALTER TABLE public.schemata OWNER TO catalog_user; 217 | 218 | -- 219 | -- Name: schemata_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 220 | -- 221 | 222 | CREATE SEQUENCE public.schemata_id_seq 223 | AS integer 224 | START WITH 1 225 | INCREMENT BY 1 226 | NO MINVALUE 227 | NO MAXVALUE 228 | CACHE 1; 229 | 230 | 231 | ALTER TABLE public.schemata_id_seq OWNER TO catalog_user; 232 | 233 | -- 234 | -- Name: schemata_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 235 | -- 236 | 237 | ALTER SEQUENCE public.schemata_id_seq OWNED BY public.schemata.id; 238 | 239 | 240 | -- 241 | -- Name: sources; Type: TABLE; Schema: public; Owner: catalog_user 242 | -- 243 | 244 | CREATE TABLE public.sources ( 245 | id integer NOT NULL, 246 | source_type character varying, 247 | name character varying, 248 | dialect character varying, 249 | uri character varying, 250 | port character varying, 251 | username character varying, 252 | password character varying, 253 | database character varying, 254 | instance character varying, 255 | cluster character varying, 256 | project_id character varying, 257 | project_credentials character varying, 258 | page_size character varying, 259 | filter_key character varying, 260 | included_tables_regex character varying, 261 | key_path character varying, 262 | account character varying, 263 | role character varying, 264 | warehouse character varying 265 | ); 266 | 267 | 268 | ALTER TABLE 
public.sources OWNER TO catalog_user; 269 | 270 | -- 271 | -- Name: sources_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 272 | -- 273 | 274 | CREATE SEQUENCE public.sources_id_seq 275 | AS integer 276 | START WITH 1 277 | INCREMENT BY 1 278 | NO MINVALUE 279 | NO MAXVALUE 280 | CACHE 1; 281 | 282 | 283 | ALTER TABLE public.sources_id_seq OWNER TO catalog_user; 284 | 285 | -- 286 | -- Name: sources_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 287 | -- 288 | 289 | ALTER SEQUENCE public.sources_id_seq OWNED BY public.sources.id; 290 | 291 | 292 | -- 293 | -- Name: tables; Type: TABLE; Schema: public; Owner: catalog_user 294 | -- 295 | 296 | CREATE TABLE public.tables ( 297 | id integer NOT NULL, 298 | name character varying, 299 | schema_id integer 300 | ); 301 | 302 | 303 | ALTER TABLE public.tables OWNER TO catalog_user; 304 | 305 | -- 306 | -- Name: tables_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user 307 | -- 308 | 309 | CREATE SEQUENCE public.tables_id_seq 310 | AS integer 311 | START WITH 1 312 | INCREMENT BY 1 313 | NO MINVALUE 314 | NO MAXVALUE 315 | CACHE 1; 316 | 317 | 318 | ALTER TABLE public.tables_id_seq OWNER TO catalog_user; 319 | 320 | -- 321 | -- Name: tables_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user 322 | -- 323 | 324 | ALTER SEQUENCE public.tables_id_seq OWNED BY public.tables.id; 325 | 326 | 327 | -- 328 | -- Name: column_lineage id; Type: DEFAULT; Schema: public; Owner: catalog_user 329 | -- 330 | 331 | ALTER TABLE ONLY public.column_lineage ALTER COLUMN id SET DEFAULT nextval('public.column_lineage_id_seq'::regclass); 332 | 333 | 334 | -- 335 | -- Name: columns id; Type: DEFAULT; Schema: public; Owner: catalog_user 336 | -- 337 | 338 | ALTER TABLE ONLY public.columns ALTER COLUMN id SET DEFAULT nextval('public.columns_id_seq'::regclass); 339 | 340 | 341 | -- 342 | -- Name: job_executions id; Type: DEFAULT; Schema: public; Owner: catalog_user 343 | -- 344 | 345 | ALTER TABLE ONLY public.job_executions ALTER COLUMN id SET DEFAULT nextval('public.job_executions_id_seq'::regclass); 346 | 347 | 348 | -- 349 | -- Name: jobs id; Type: DEFAULT; Schema: public; Owner: catalog_user 350 | -- 351 | 352 | ALTER TABLE ONLY public.jobs ALTER COLUMN id SET DEFAULT nextval('public.jobs_id_seq'::regclass); 353 | 354 | 355 | -- 356 | -- Name: schemata id; Type: DEFAULT; Schema: public; Owner: catalog_user 357 | -- 358 | 359 | ALTER TABLE ONLY public.schemata ALTER COLUMN id SET DEFAULT nextval('public.schemata_id_seq'::regclass); 360 | 361 | 362 | -- 363 | -- Name: sources id; Type: DEFAULT; Schema: public; Owner: catalog_user 364 | -- 365 | 366 | ALTER TABLE ONLY public.sources ALTER COLUMN id SET DEFAULT nextval('public.sources_id_seq'::regclass); 367 | 368 | 369 | -- 370 | -- Name: tables id; Type: DEFAULT; Schema: public; Owner: catalog_user 371 | -- 372 | 373 | ALTER TABLE ONLY public.tables ALTER COLUMN id SET DEFAULT nextval('public.tables_id_seq'::regclass); 374 | 375 | 376 | -- 377 | -- Data for Name: alembic_version; Type: TABLE DATA; Schema: public; Owner: catalog_user 378 | -- 379 | 380 | COPY public.alembic_version (version_num) FROM stdin; 381 | d1daff1715f7 382 | \. 
383 | 384 | 385 | -- 386 | -- Data for Name: column_lineage; Type: TABLE DATA; Schema: public; Owner: catalog_user 387 | -- 388 | 389 | COPY public.column_lineage (id, context, source_id, target_id, job_execution_id) FROM stdin; 390 | 1 {} 5 10 1 391 | 2 {} 7 11 1 392 | 3 {} 7 12 1 393 | 4 {} 5 13 1 394 | 5 {} 6 14 1 395 | 6 {} 5 15 2 396 | 7 {} 7 16 2 397 | 8 {} 7 17 2 398 | 9 {} 5 18 2 399 | 10 {} 6 19 2 400 | 11 {} 15 20 3 401 | 12 {} 16 21 3 402 | 13 {} 17 22 3 403 | 14 {} 18 23 3 404 | 15 {} 19 24 3 405 | 16 {} 2 25 4 406 | 17 {} 3 26 4 407 | 18 {} 4 27 4 408 | 19 {} 23 29 5 409 | 20 {} 22 30 5 410 | 21 {} 22 31 5 411 | 22 {} 27 32 5 412 | 23 {} 28 33 5 413 | \. 414 | 415 | 416 | -- 417 | -- Data for Name: columns; Type: TABLE DATA; Schema: public; Owner: catalog_user 418 | -- 419 | 420 | COPY public.columns (id, name, data_type, sort_order, table_id) FROM stdin; 421 | 1 group STRING 0 1 422 | 2 page_title STRING 1 1 423 | 3 views BIGINT 2 1 424 | 4 bytes_sent BIGINT 3 1 425 | 5 page_id BIGINT 0 2 426 | 6 page_latest BIGINT 1 2 427 | 7 page_title STRING 2 2 428 | 8 rd_from BIGINT 0 3 429 | 9 page_title STRING 1 3 430 | 10 redirect_id BIGINT 0 4 431 | 11 redirect_title STRING 1 4 432 | 12 true_title STRING 2 4 433 | 13 page_id BIGINT 3 4 434 | 14 page_version BIGINT 4 4 435 | 15 redirect_id BIGINT 0 5 436 | 16 redirect_title STRING 1 5 437 | 17 true_title STRING 2 5 438 | 18 page_id BIGINT 3 5 439 | 19 page_version BIGINT 4 5 440 | 20 redirect_id bigint 0 6 441 | 21 redirect_title STRING 1 6 442 | 22 true_title STRING 2 6 443 | 23 page_id BIGINT 3 6 444 | 24 page_version BIGINT 4 6 445 | 25 group STRING 0 7 446 | 26 page_title STRING 1 7 447 | 27 views BIGINT 2 7 448 | 28 bytes_sent BIGINT 3 7 449 | 29 page_id BIGINT 0 8 450 | 30 page_title STRING 1 8 451 | 31 page_url STRING 2 8 452 | 32 views BIGINT 3 8 453 | 33 bytes_sent BIGINT 4 8 454 | \. 455 | 456 | 457 | -- 458 | -- Data for Name: default_schema; Type: TABLE DATA; Schema: public; Owner: catalog_user 459 | -- 460 | 461 | COPY public.default_schema (source_id, schema_id) FROM stdin; 462 | \. 463 | 464 | 465 | -- 466 | -- Data for Name: job_executions; Type: TABLE DATA; Schema: public; Owner: catalog_user 467 | -- 468 | 469 | COPY public.job_executions (id, job_id, started_at, ended_at, status) FROM stdin; 470 | 1 1 2021-07-29 23:11:44.470984 2021-07-29 23:11:44.470993 SUCCESS 471 | 2 2 2021-07-29 23:11:44.61084 2021-07-29 23:11:44.610849 SUCCESS 472 | 3 3 2021-07-29 23:11:44.717093 2021-07-29 23:11:44.717101 SUCCESS 473 | 4 4 2021-07-29 23:11:44.842395 2021-07-29 23:11:44.84241 SUCCESS 474 | 5 5 2021-07-29 23:11:44.949858 2021-07-29 23:11:44.949867 SUCCESS 475 | \. 
476 | 477 | 478 | -- 479 | -- Data for Name: jobs; Type: TABLE DATA; Schema: public; Owner: catalog_user 480 | -- 481 | 482 | COPY public.jobs (id, name, context, source_id) FROM stdin; 483 | 1 LOAD page_lookup_nonredirect {"query": "INSERT INTO page_lookup_nonredirect SELECT page.page_id as redircet_id, page.page_title as redirect_title, page.page_title true_title, page.page_id, page.page_latest FROM page LEFT OUTER JOIN redirect ON page.page_id = redirect.rd_from WHERE redirect.rd_from IS NULL "} 1 484 | 2 LOAD page_lookup_redirect {"query": "insert into page_lookup_redirect select original_page.page_id redirect_id, original_page.page_title redirect_title, final_page.page_title as true_title, final_page.page_id, final_page.page_latest from page final_page join redirect on (redirect.page_title = final_page.page_title) join page original_page on (redirect.rd_from = original_page.page_id)"} 1 485 | 3 LOAD page_lookup {"query": "INSERT INTO page_lookup SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, plr.page_version FROM page_lookup_redirect plr"} 1 486 | 4 LOAD filtered_pagecounts {"query": "INSERT INTO filtered_pagecounts(\\"group\\", page_title, views) SELECT regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\\\\s*([a-zA-Z0-9]+).*','$1') page_title,SUM (pvs.views) AS total_views, SUM (pvs.bytes_sent) AS total_bytes_sent FROM pagecounts as pvs WHERE not pvs.page_title LIKE '(MEDIA|SPECIAL||Talk|User|User_talk|Project|Project_talk|File|File_talk|MediaWiki|MediaWiki_talk|Template|Template_talk|Help|Help_talk|Category|Category_talk|Portal|Wikipedia|Wikipedia_talk|upload|Special)\\\\:(.*)' and pvs.page_title LIKE '^([A-Z])(.*)' and not pvs.page_title LIKE '(.*).(jpg|gif|png|JPG|GIF|PNG|txt|ico)$' and pvs.page_title <> '404_error/' and pvs.page_title <> 'Main_Page' and pvs.page_title <> 'Hypertext_Transfer_Protocol' and pvs.page_title <> 'Favicon.ico' and pvs.page_title <> 'Search' and pvs.dt = '2020-01-01' GROUP BY regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\\\\s*([a-zA-Z0-9]+).*','$1')"} 1 487 | 5 LOAD normalized_pagecounts {"query": "INSERT INTO normalized_pagecounts SELECT pl.page_id page_id, REGEXP_REPLACE(pl.true_title, '_', ' ') page_title, pl.true_title page_url, fp.views, fp.bytes_sent FROM page_lookup pl JOIN filtered_pagecounts fp ON fp.page_title = pl.redirect_title where fp.dt='2020-01-01'"} 1 488 | \. 489 | 490 | 491 | -- 492 | -- Data for Name: schemata; Type: TABLE DATA; Schema: public; Owner: catalog_user 493 | -- 494 | 495 | COPY public.schemata (id, name, source_id) FROM stdin; 496 | 1 default 1 497 | \. 498 | 499 | 500 | -- 501 | -- Data for Name: sources; Type: TABLE DATA; Schema: public; Owner: catalog_user 502 | -- 503 | 504 | COPY public.sources (id, source_type, name, dialect, uri, port, username, password, database, instance, cluster, project_id, project_credentials, page_size, filter_key, included_tables_regex, key_path, account, role, warehouse) FROM stdin; 505 | 1 redshift test \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N 506 | \. 
507 | 508 | 509 | -- 510 | -- Data for Name: tables; Type: TABLE DATA; Schema: public; Owner: catalog_user 511 | -- 512 | 513 | COPY public.tables (id, name, schema_id) FROM stdin; 514 | 1 pagecounts 1 515 | 2 page 1 516 | 3 redirect 1 517 | 4 page_lookup_nonredirect 1 518 | 5 page_lookup_redirect 1 519 | 6 page_lookup 1 520 | 7 filtered_pagecounts 1 521 | 8 normalized_pagecounts 1 522 | \. 523 | 524 | 525 | -- 526 | -- Name: column_lineage_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 527 | -- 528 | 529 | SELECT pg_catalog.setval('public.column_lineage_id_seq', 23, true); 530 | 531 | 532 | -- 533 | -- Name: columns_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 534 | -- 535 | 536 | SELECT pg_catalog.setval('public.columns_id_seq', 33, true); 537 | 538 | 539 | -- 540 | -- Name: job_executions_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 541 | -- 542 | 543 | SELECT pg_catalog.setval('public.job_executions_id_seq', 5, true); 544 | 545 | 546 | -- 547 | -- Name: jobs_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 548 | -- 549 | 550 | SELECT pg_catalog.setval('public.jobs_id_seq', 5, true); 551 | 552 | 553 | -- 554 | -- Name: schemata_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 555 | -- 556 | 557 | SELECT pg_catalog.setval('public.schemata_id_seq', 1, true); 558 | 559 | 560 | -- 561 | -- Name: sources_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 562 | -- 563 | 564 | SELECT pg_catalog.setval('public.sources_id_seq', 1, true); 565 | 566 | 567 | -- 568 | -- Name: tables_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user 569 | -- 570 | 571 | SELECT pg_catalog.setval('public.tables_id_seq', 8, true); 572 | 573 | 574 | -- 575 | -- Name: alembic_version alembic_version_pkc; Type: CONSTRAINT; Schema: public; Owner: catalog_user 576 | -- 577 | 578 | ALTER TABLE ONLY public.alembic_version 579 | ADD CONSTRAINT alembic_version_pkc PRIMARY KEY (version_num); 580 | 581 | 582 | -- 583 | -- Name: column_lineage column_lineage_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 584 | -- 585 | 586 | ALTER TABLE ONLY public.column_lineage 587 | ADD CONSTRAINT column_lineage_pkey PRIMARY KEY (id); 588 | 589 | 590 | -- 591 | -- Name: columns columns_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 592 | -- 593 | 594 | ALTER TABLE ONLY public.columns 595 | ADD CONSTRAINT columns_pkey PRIMARY KEY (id); 596 | 597 | 598 | -- 599 | -- Name: default_schema default_schema_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 600 | -- 601 | 602 | ALTER TABLE ONLY public.default_schema 603 | ADD CONSTRAINT default_schema_pkey PRIMARY KEY (source_id); 604 | 605 | 606 | -- 607 | -- Name: job_executions job_executions_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 608 | -- 609 | 610 | ALTER TABLE ONLY public.job_executions 611 | ADD CONSTRAINT job_executions_pkey PRIMARY KEY (id); 612 | 613 | 614 | -- 615 | -- Name: jobs jobs_name_key; Type: CONSTRAINT; Schema: public; Owner: catalog_user 616 | -- 617 | 618 | ALTER TABLE ONLY public.jobs 619 | ADD CONSTRAINT jobs_name_key UNIQUE (name); 620 | 621 | 622 | -- 623 | -- Name: jobs jobs_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 624 | -- 625 | 626 | ALTER TABLE ONLY public.jobs 627 | ADD CONSTRAINT jobs_pkey PRIMARY KEY (id); 628 | 629 | 630 | -- 631 | -- Name: jobs jobs_source_id_name_key; Type: CONSTRAINT; Schema: public; Owner: catalog_user 632 | -- 633 | 634 | ALTER TABLE ONLY public.jobs 635 | ADD 
CONSTRAINT jobs_source_id_name_key UNIQUE (source_id, name); 636 | 637 | 638 | -- 639 | -- Name: schemata schemata_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 640 | -- 641 | 642 | ALTER TABLE ONLY public.schemata 643 | ADD CONSTRAINT schemata_pkey PRIMARY KEY (id); 644 | 645 | 646 | -- 647 | -- Name: sources sources_name_key; Type: CONSTRAINT; Schema: public; Owner: catalog_user 648 | -- 649 | 650 | ALTER TABLE ONLY public.sources 651 | ADD CONSTRAINT sources_name_key UNIQUE (name); 652 | 653 | 654 | -- 655 | -- Name: sources sources_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 656 | -- 657 | 658 | ALTER TABLE ONLY public.sources 659 | ADD CONSTRAINT sources_pkey PRIMARY KEY (id); 660 | 661 | 662 | -- 663 | -- Name: tables tables_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user 664 | -- 665 | 666 | ALTER TABLE ONLY public.tables 667 | ADD CONSTRAINT tables_pkey PRIMARY KEY (id); 668 | 669 | 670 | -- 671 | -- Name: columns unique_column_name; Type: CONSTRAINT; Schema: public; Owner: catalog_user 672 | -- 673 | 674 | ALTER TABLE ONLY public.columns 675 | ADD CONSTRAINT unique_column_name UNIQUE (table_id, name); 676 | 677 | 678 | -- 679 | -- Name: column_lineage unique_lineage; Type: CONSTRAINT; Schema: public; Owner: catalog_user 680 | -- 681 | 682 | ALTER TABLE ONLY public.column_lineage 683 | ADD CONSTRAINT unique_lineage UNIQUE (source_id, target_id, job_execution_id); 684 | 685 | 686 | -- 687 | -- Name: schemata unique_schema_name; Type: CONSTRAINT; Schema: public; Owner: catalog_user 688 | -- 689 | 690 | ALTER TABLE ONLY public.schemata 691 | ADD CONSTRAINT unique_schema_name UNIQUE (source_id, name); 692 | 693 | 694 | -- 695 | -- Name: tables unique_table_name; Type: CONSTRAINT; Schema: public; Owner: catalog_user 696 | -- 697 | 698 | ALTER TABLE ONLY public.tables 699 | ADD CONSTRAINT unique_table_name UNIQUE (schema_id, name); 700 | 701 | 702 | -- 703 | -- Name: column_lineage column_lineage_job_execution_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 704 | -- 705 | 706 | ALTER TABLE ONLY public.column_lineage 707 | ADD CONSTRAINT column_lineage_job_execution_id_fkey FOREIGN KEY (job_execution_id) REFERENCES public.job_executions(id); 708 | 709 | 710 | -- 711 | -- Name: column_lineage column_lineage_source_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 712 | -- 713 | 714 | ALTER TABLE ONLY public.column_lineage 715 | ADD CONSTRAINT column_lineage_source_id_fkey FOREIGN KEY (source_id) REFERENCES public.columns(id); 716 | 717 | 718 | -- 719 | -- Name: column_lineage column_lineage_target_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 720 | -- 721 | 722 | ALTER TABLE ONLY public.column_lineage 723 | ADD CONSTRAINT column_lineage_target_id_fkey FOREIGN KEY (target_id) REFERENCES public.columns(id); 724 | 725 | 726 | -- 727 | -- Name: columns columns_table_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 728 | -- 729 | 730 | ALTER TABLE ONLY public.columns 731 | ADD CONSTRAINT columns_table_id_fkey FOREIGN KEY (table_id) REFERENCES public.tables(id); 732 | 733 | 734 | -- 735 | -- Name: default_schema default_schema_schema_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 736 | -- 737 | 738 | ALTER TABLE ONLY public.default_schema 739 | ADD CONSTRAINT default_schema_schema_id_fkey FOREIGN KEY (schema_id) REFERENCES public.schemata(id); 740 | 741 | 742 | -- 743 | -- Name: default_schema default_schema_source_id_fkey; Type: FK CONSTRAINT; Schema: public; 
Owner: catalog_user 744 | -- 745 | 746 | ALTER TABLE ONLY public.default_schema 747 | ADD CONSTRAINT default_schema_source_id_fkey FOREIGN KEY (source_id) REFERENCES public.sources(id); 748 | 749 | 750 | -- 751 | -- Name: job_executions job_executions_job_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 752 | -- 753 | 754 | ALTER TABLE ONLY public.job_executions 755 | ADD CONSTRAINT job_executions_job_id_fkey FOREIGN KEY (job_id) REFERENCES public.jobs(id); 756 | 757 | 758 | -- 759 | -- Name: jobs jobs_source_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 760 | -- 761 | 762 | ALTER TABLE ONLY public.jobs 763 | ADD CONSTRAINT jobs_source_id_fkey FOREIGN KEY (source_id) REFERENCES public.sources(id); 764 | 765 | 766 | -- 767 | -- Name: schemata schemata_source_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 768 | -- 769 | 770 | ALTER TABLE ONLY public.schemata 771 | ADD CONSTRAINT schemata_source_id_fkey FOREIGN KEY (source_id) REFERENCES public.sources(id); 772 | 773 | 774 | -- 775 | -- Name: tables tables_schema_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user 776 | -- 777 | 778 | ALTER TABLE ONLY public.tables 779 | ADD CONSTRAINT tables_schema_id_fkey FOREIGN KEY (schema_id) REFERENCES public.schemata(id); 780 | 781 | 782 | -- 783 | -- PostgreSQL database dump complete 784 | -- 785 | 786 | -------------------------------------------------------------------------------- /install-manifests/dockerfiles/demo-wikimedia.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- PostgreSQL database dump 3 | -- 4 | 5 | -- Dumped from database version 13.2 (Debian 13.2-1.pgdg100+1) 6 | -- Dumped by pg_dump version 13.3 (Ubuntu 13.3-1.pgdg20.04+1) 7 | 8 | SET statement_timeout = 0; 9 | SET lock_timeout = 0; 10 | SET idle_in_transaction_session_timeout = 0; 11 | SET client_encoding = 'UTF8'; 12 | SET standard_conforming_strings = on; 13 | SELECT pg_catalog.set_config('search_path', '', false); 14 | SET check_function_bodies = false; 15 | SET xmloption = content; 16 | SET client_min_messages = warning; 17 | SET row_security = off; 18 | 19 | SET default_tablespace = ''; 20 | 21 | SET default_table_access_method = heap; 22 | 23 | -- 24 | -- Name: filtered_pagecounts; Type: TABLE; Schema: public; Owner: etldev 25 | -- 26 | 27 | CREATE TABLE public.filtered_pagecounts ( 28 | "group" character varying, 29 | page_title character varying, 30 | views bigint, 31 | bytes_sent bigint 32 | ); 33 | 34 | 35 | ALTER TABLE public.filtered_pagecounts OWNER TO etldev; 36 | 37 | -- 38 | -- Name: page_lookup; Type: TABLE; Schema: public; Owner: etldev 39 | -- 40 | 41 | CREATE TABLE public.page_lookup ( 42 | redirect_id bigint, 43 | redirect_title bigint, 44 | true_title character varying, 45 | page_id bigint, 46 | page_version bigint 47 | ); 48 | 49 | 50 | ALTER TABLE public.page_lookup OWNER TO etldev; 51 | 52 | -- 53 | -- Name: normalized_pagecounts; Type: TABLE; Schema: public; Owner: etldev 54 | -- 55 | 56 | CREATE TABLE public.normalized_pagecounts ( 57 | page_id bigint, 58 | page_title character varying, 59 | page_url character varying, 60 | views bigint, 61 | bytes_sent bigint 62 | ); 63 | 64 | 65 | ALTER TABLE public.normalized_pagecounts OWNER TO etldev; 66 | 67 | -- 68 | -- Name: page; Type: TABLE; Schema: public; Owner: etldev 69 | -- 70 | 71 | CREATE TABLE public.page ( 72 | page_id bigint, 73 | page_latest bigint, 74 | page_title character varying 75 | ); 76 | 77 | 78 | ALTER TABLE public.page 
OWNER TO etldev; 79 | 80 | -- 81 | -- Name: page_lookup_nonredirect; Type: TABLE; Schema: public; Owner: etldev 82 | -- 83 | 84 | CREATE TABLE public.page_lookup_nonredirect ( 85 | redirect_id bigint, 86 | redirect_title bigint, 87 | true_title character varying, 88 | page_id bigint, 89 | page_version bigint 90 | ); 91 | 92 | 93 | ALTER TABLE public.page_lookup_nonredirect OWNER TO etldev; 94 | 95 | -- 96 | -- Name: page_lookup_redirect; Type: TABLE; Schema: public; Owner: etldev 97 | -- 98 | 99 | CREATE TABLE public.page_lookup_redirect ( 100 | redirect_id bigint, 101 | redirect_title bigint, 102 | true_title character varying, 103 | page_id bigint, 104 | page_version bigint 105 | ); 106 | 107 | 108 | ALTER TABLE public.page_lookup_redirect OWNER TO etldev; 109 | 110 | -- 111 | -- Name: pagecounts; Type: TABLE; Schema: public; Owner: etldev 112 | -- 113 | 114 | CREATE TABLE public.pagecounts ( 115 | "group" character varying, 116 | page_title character varying, 117 | views bigint, 118 | bytes_sent bigint 119 | ); 120 | 121 | 122 | ALTER TABLE public.pagecounts OWNER TO etldev; 123 | 124 | -- 125 | -- Name: redirect; Type: TABLE; Schema: public; Owner: etldev 126 | -- 127 | 128 | CREATE TABLE public.redirect ( 129 | rd_from bigint, 130 | page_title character varying 131 | ); 132 | 133 | 134 | ALTER TABLE public.redirect OWNER TO etldev; 135 | 136 | -- 137 | -- Data for Name: filtered_pagecounts; Type: TABLE DATA; Schema: public; Owner: etldev 138 | -- 139 | 140 | COPY public.filtered_pagecounts ("group", page_title, views, bytes_sent) FROM stdin; 141 | \. 142 | 143 | 144 | -- 145 | -- Data for Name: lookup; Type: TABLE DATA; Schema: public; Owner: etldev 146 | -- 147 | 148 | COPY public.page_lookup (redirect_id, redirect_title, true_title, page_id, page_version) FROM stdin; 149 | \. 150 | 151 | 152 | -- 153 | -- Data for Name: normalized_pagecounts; Type: TABLE DATA; Schema: public; Owner: etldev 154 | -- 155 | 156 | COPY public.normalized_pagecounts ("group", page_title, views, bytes_sent) FROM stdin; 157 | \. 158 | 159 | 160 | -- 161 | -- Data for Name: page; Type: TABLE DATA; Schema: public; Owner: etldev 162 | -- 163 | 164 | COPY public.page (page_id, page_latest, page_title) FROM stdin; 165 | \. 166 | 167 | 168 | -- 169 | -- Data for Name: page_lookup_nonredirect; Type: TABLE DATA; Schema: public; Owner: etldev 170 | -- 171 | 172 | COPY public.page_lookup_nonredirect (redirect_id, redirect_title, true_title, page_id, page_version) FROM stdin; 173 | \. 174 | 175 | 176 | -- 177 | -- Data for Name: page_lookup_redirect; Type: TABLE DATA; Schema: public; Owner: etldev 178 | -- 179 | 180 | COPY public.page_lookup_redirect (redirect_id, redirect_title, true_title, page_id, page_version) FROM stdin; 181 | \. 182 | 183 | 184 | -- 185 | -- Data for Name: pagecounts; Type: TABLE DATA; Schema: public; Owner: etldev 186 | -- 187 | 188 | COPY public.pagecounts ("group", page_title, views, bytes_sent) FROM stdin; 189 | \. 190 | 191 | 192 | -- 193 | -- Data for Name: redirect; Type: TABLE DATA; Schema: public; Owner: etldev 194 | -- 195 | 196 | COPY public.redirect (rd_from, page_title) FROM stdin; 197 | \. 
198 | 199 | 200 | -- 201 | -- PostgreSQL database dump complete 202 | -- 203 | 204 | -------------------------------------------------------------------------------- /one_task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokern/data-lineage/5945542742979fe350d313d906440c93ee3d0f36/one_task.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "data-lineage" 3 | version = "0.9.0" 4 | description = "Open Source Data Lineage Tool for Redshift. Snowflake and many other databases" 5 | authors = ["Tokern "] 6 | license = "MIT" 7 | classifiers = [ 8 | "Development Status :: 3 - Alpha", 9 | "Intended Audience :: Developers", 10 | "Programming Language :: Python", 11 | "Programming Language :: Python :: 3", 12 | "Programming Language :: Python :: 3.7", 13 | "Programming Language :: Python :: 3.8", 14 | "Topic :: Database", 15 | "Topic :: Software Development", 16 | "Topic :: Software Development :: Libraries :: Python Modules", 17 | ] 18 | keywords=["data-lineage","postgres","snowflake","redshift","glue"] 19 | readme="README.md" 20 | homepage="https://tokern.io/" 21 | repository="https://github.com/tokern/data-lineage/" 22 | 23 | [tool.poetry.dependencies] 24 | python = "^3.8" 25 | pglast = "*" 26 | inflection = "*" 27 | networkx = "*" 28 | click = "^7" 29 | PyYAML = "*" 30 | dbcat = "^0.7.1" 31 | gunicorn = "*" 32 | flask = "~=1.1" 33 | flask-restless-ng = "*" 34 | requests = "*" 35 | furl = "*" 36 | flask-restful = "*" 37 | psycopg2 = "^2.9.1" 38 | SQLAlchemy = "^1.3" 39 | botocore = "^1.20" 40 | rq = "^1.10.0" 41 | redis = "^3.5.3" 42 | 43 | [tool.poetry.dev-dependencies] 44 | black = "==19.10b0" 45 | flake8 = "*" 46 | isort = "*" 47 | pre-commit = "*" 48 | pytest = "*" 49 | pytest-cov = "*" 50 | pipenv-setup = "*" 51 | mypy = "*" 52 | jupyter = "*" 53 | pytest-flask = "*" 54 | types-requests = "^0.1.13" 55 | types-Flask = "^1.1.1" 56 | types-PyYAML = "^5.4.3" 57 | types-click = "^7.1.2" 58 | fakeredis = "^1.6.1" 59 | types-redis = "^3.5.15" 60 | 61 | [build-system] 62 | requires = ["poetry-core>=1.0.0"] 63 | build-backend = "poetry.core.masonry.api" 64 | 65 | [tool.poetry.scripts] 66 | data_lineage = "data_lineage.__main__:main" 67 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths= 3 | test 4 | 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503 3 | max-line-length = 88 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4 6 | 7 | [isort] 8 | multi_line_output=3 9 | include_trailing_comma=True 10 | force_grid_wrap=0 11 | use_parentheses=True 12 | line_length=88 13 | 14 | [mypy] 15 | files=data_lineage,test 16 | ignore_missing_imports=true 17 | -------------------------------------------------------------------------------- /test/catalog.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "source_type": "redshift", 4 | "schemata": [ 5 | { 6 | "name": "default", 7 | "tables": [ 8 | { 9 | "name": "pagecounts", 10 | "columns": [ 11 | { 12 | "name": "group", 13 | "data_type": "STRING" 14 
| }, 15 | { 16 | "name": "page_title", 17 | "data_type": "STRING" 18 | }, 19 | { 20 | "name": "views", 21 | "data_type": "BIGINT" 22 | }, 23 | { 24 | "name": "bytes_sent", 25 | "data_type": "BIGINT" 26 | } 27 | ] 28 | }, 29 | { 30 | "name": "page", 31 | "columns": [ 32 | { 33 | "name": "page_id", 34 | "data_type": "BIGINT" 35 | }, 36 | { 37 | "name": "page_latest", 38 | "data_type": "BIGINT" 39 | }, 40 | { 41 | "name": "page_title", 42 | "data_type": "STRING" 43 | } 44 | ] 45 | }, 46 | { 47 | "name": "redirect", 48 | "columns": [ 49 | { 50 | "name": "rd_from", 51 | "data_type": "BIGINT" 52 | }, 53 | { 54 | "name": "page_title", 55 | "data_type": "STRING" 56 | } 57 | 58 | ] 59 | }, 60 | { 61 | "name": "page_lookup_nonredirect", 62 | "columns": [ 63 | { 64 | "name": "redirect_id", 65 | "data_type": "BIGINT" 66 | }, 67 | { 68 | "name": "redirect_title", 69 | "data_type": "STRING" 70 | }, 71 | { 72 | "name": "true_title", 73 | "data_type": "STRING" 74 | }, 75 | { 76 | "name": "page_id", 77 | "data_type": "BIGINT" 78 | }, 79 | { 80 | "name": "page_version", 81 | "data_type": "BIGINT" 82 | } 83 | ] 84 | }, 85 | { 86 | "name": "page_lookup_redirect", 87 | "columns": [ 88 | { 89 | "name": "redirect_id", 90 | "data_type": "BIGINT" 91 | }, 92 | { 93 | "name": "redirect_title", 94 | "data_type": "STRING" 95 | }, 96 | { 97 | "name": "true_title", 98 | "data_type": "STRING" 99 | }, 100 | { 101 | "name": "page_id", 102 | "data_type": "BIGINT" 103 | }, 104 | { 105 | "name": "page_version", 106 | "data_type": "BIGINT" 107 | } 108 | ] 109 | }, 110 | { 111 | "name": "page_lookup", 112 | "columns": [ 113 | { 114 | "name": "redirect_id", 115 | "data_type": "bigint" 116 | }, 117 | { 118 | "name": "redirect_title", 119 | "data_type": "STRING" 120 | }, 121 | { 122 | "name": "true_title", 123 | "data_type": "STRING" 124 | }, 125 | { 126 | "name": "page_id", 127 | "data_type": "BIGINT" 128 | }, 129 | { 130 | "name": "page_version", 131 | "data_type": "BIGINT" 132 | } 133 | ] 134 | }, 135 | { 136 | "name": "filtered_pagecounts", 137 | "columns": [ 138 | { 139 | "name": "group", 140 | "data_type": "STRING" 141 | }, 142 | { 143 | "name": "page_title", 144 | "data_type": "STRING" 145 | }, 146 | { 147 | "name": "views", 148 | "data_type": "BIGINT" 149 | }, 150 | { 151 | "name": "bytes_sent", 152 | "data_type": "BIGINT" 153 | } 154 | ] 155 | }, 156 | { 157 | "name": "normalized_pagecounts", 158 | "columns": [ 159 | { 160 | "name": "page_id", 161 | "data_type": "BIGINT" 162 | }, 163 | { 164 | "name": "page_title", 165 | "data_type": "STRING" 166 | }, 167 | { 168 | "name": "page_url", 169 | "data_type": "STRING" 170 | }, 171 | { 172 | "name": "views", 173 | "data_type": "BIGINT" 174 | }, 175 | { 176 | "name": "bytes_sent", 177 | "data_type": "BIGINT" 178 | } 179 | ] 180 | } 181 | ] 182 | } 183 | ] 184 | } -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | from contextlib import closing 2 | 3 | import pytest 4 | import yaml 5 | from dbcat import PGCatalog as DbCatalog 6 | from dbcat import catalog_connection, init_db 7 | from dbcat.catalog import CatSource 8 | from fakeredis import FakeStrictRedis 9 | 10 | from data_lineage import Analyze, Catalog, Graph, Scan 11 | from data_lineage.parser import parse 12 | from data_lineage.server import create_server 13 | 14 | 15 | @pytest.fixture(scope="session") 16 | def load_queries(): 17 | import json 18 | 19 | with open("test/queries.json", "r") as 
file: 20 | queries = json.load(file) 21 | 22 | yield queries 23 | 24 | 25 | @pytest.fixture(scope="session") 26 | def parse_queries_fixture(load_queries): 27 | parsed = [parse(sql=query["query"], name=query["name"]) for query in load_queries] 28 | yield parsed 29 | 30 | 31 | postgres_conf = """ 32 | catalog: 33 | user: piiuser 34 | password: p11secret 35 | host: 127.0.0.1 36 | port: 5432 37 | database: piidb 38 | """ 39 | 40 | 41 | @pytest.fixture(scope="session") 42 | def root_connection() -> DbCatalog: 43 | config = yaml.safe_load(postgres_conf) 44 | with closing(DbCatalog(**config["catalog"])) as conn: 45 | yield conn 46 | 47 | 48 | @pytest.fixture(scope="session") 49 | def setup_catalog(root_connection): 50 | with root_connection.engine.connect() as conn: 51 | conn.execute("CREATE USER catalog_user PASSWORD 'catal0g_passw0rd'") 52 | conn.execution_options(isolation_level="AUTOCOMMIT").execute( 53 | "CREATE DATABASE tokern" 54 | ) 55 | conn.execution_options(isolation_level="AUTOCOMMIT").execute( 56 | "GRANT ALL PRIVILEGES ON DATABASE tokern TO catalog_user" 57 | ) 58 | 59 | yield root_connection 60 | 61 | with root_connection.engine.connect() as conn: 62 | conn.execution_options(isolation_level="AUTOCOMMIT").execute( 63 | "DROP DATABASE tokern" 64 | ) 65 | 66 | conn.execution_options(isolation_level="AUTOCOMMIT").execute( 67 | "DROP USER catalog_user" 68 | ) 69 | 70 | 71 | catalog_conf = """ 72 | catalog: 73 | user: catalog_user 74 | password: catal0g_passw0rd 75 | host: 127.0.0.1 76 | port: 5432 77 | database: tokern 78 | """ 79 | 80 | 81 | @pytest.fixture(scope="session") 82 | def open_catalog_connection(setup_catalog): 83 | with closing(catalog_connection(catalog_conf)) as conn: 84 | init_db(conn) 85 | yield conn 86 | 87 | 88 | class File: 89 | def __init__(self, name: str, path: str, catalog: DbCatalog): 90 | self.name = name 91 | self._path = path 92 | self._catalog = catalog 93 | 94 | @property 95 | def path(self): 96 | return self._path 97 | 98 | def scan(self): 99 | import json 100 | 101 | with open(self.path, "r") as file: 102 | content = json.load(file) 103 | 104 | with self._catalog.managed_session: 105 | source = self._catalog.add_source( 106 | name=content["name"], source_type=content["source_type"] 107 | ) 108 | for s in content["schemata"]: 109 | schema = self._catalog.add_schema(s["name"], source=source) 110 | 111 | for t in s["tables"]: 112 | table = self._catalog.add_table(t["name"], schema) 113 | 114 | index = 0 115 | for c in t["columns"]: 116 | self._catalog.add_column( 117 | column_name=c["name"], 118 | data_type=c["data_type"], 119 | sort_order=index, 120 | table=table, 121 | ) 122 | index += 1 123 | 124 | 125 | @pytest.fixture(scope="session") 126 | def save_catalog(open_catalog_connection): 127 | scanner = File("test", "test/catalog.json", open_catalog_connection) 128 | scanner.scan() 129 | yield open_catalog_connection 130 | with open_catalog_connection.managed_session as session: 131 | [session.delete(db) for db in session.query(CatSource).all()] 132 | session.commit() 133 | 134 | 135 | @pytest.fixture(scope="function") 136 | def managed_session(save_catalog): 137 | with save_catalog.managed_session: 138 | yield save_catalog 139 | 140 | 141 | @pytest.fixture(scope="session") 142 | def app(setup_catalog): 143 | config = yaml.safe_load(catalog_conf) 144 | app, catalog = create_server( 145 | config["catalog"], connection=FakeStrictRedis(), is_production=False 146 | ) 147 | yield app 148 | catalog.close() 149 | 150 | 151 | @pytest.fixture(scope="session") 152 | 
def rest_catalog(live_server, save_catalog): 153 | yield Catalog("http://{}:{}".format(live_server.host, live_server.port)) 154 | 155 | 156 | @pytest.fixture(scope="session") 157 | def graph_sdk(live_server): 158 | yield Graph("http://{}:{}".format(live_server.host, live_server.port)) 159 | 160 | 161 | @pytest.fixture(scope="session") 162 | def parser_sdk(live_server): 163 | yield Analyze("http://{}:{}".format(live_server.host, live_server.port)) 164 | 165 | 166 | @pytest.fixture(scope="session") 167 | def scan_sdk(live_server): 168 | yield Scan("http://{}:{}".format(live_server.host, live_server.port)) 169 | -------------------------------------------------------------------------------- /test/queries.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "LOAD page_lookup_nonredirect", 4 | "query": "INSERT INTO page_lookup_nonredirect SELECT page.page_id as redircet_id, page.page_title as redirect_title, page.page_title true_title, page.page_id, page.page_latest FROM page LEFT OUTER JOIN redirect ON page.page_id = redirect.rd_from WHERE redirect.rd_from IS NULL " 5 | }, 6 | { 7 | "name": "LOAD page_lookup_redirect", 8 | "query": "insert into page_lookup_redirect select original_page.page_id redirect_id, original_page.page_title redirect_title, final_page.page_title as true_title, final_page.page_id, final_page.page_latest from page final_page join redirect on (redirect.page_title = final_page.page_title) join page original_page on (redirect.rd_from = original_page.page_id)" 9 | }, 10 | { 11 | "name": "LOAD page_lookup", 12 | "query": "INSERT INTO page_lookup SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, plr.page_version FROM page_lookup_redirect plr" 13 | }, 14 | { 15 | "name": "LOAD filtered_pagecounts", 16 | "query": "INSERT INTO filtered_pagecounts(\"group\", page_title, views) SELECT regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\\s*([a-zA-Z0-9]+).*','$1') page_title,SUM (pvs.views) AS total_views, SUM (pvs.bytes_sent) AS total_bytes_sent FROM pagecounts as pvs WHERE not pvs.page_title LIKE '(MEDIA|SPECIAL||Talk|User|User_talk|Project|Project_talk|File|File_talk|MediaWiki|MediaWiki_talk|Template|Template_talk|Help|Help_talk|Category|Category_talk|Portal|Wikipedia|Wikipedia_talk|upload|Special)\\:(.*)' and pvs.page_title LIKE '^([A-Z])(.*)' and not pvs.page_title LIKE '(.*).(jpg|gif|png|JPG|GIF|PNG|txt|ico)$' and pvs.page_title <> '404_error/' and pvs.page_title <> 'Main_Page' and pvs.page_title <> 'Hypertext_Transfer_Protocol' and pvs.page_title <> 'Favicon.ico' and pvs.page_title <> 'Search' and pvs.dt = '2020-01-01' GROUP BY regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\\s*([a-zA-Z0-9]+).*','$1')" 17 | }, 18 | { 19 | "name": "LOAD normalized_pagecounts", 20 | "query": "INSERT INTO normalized_pagecounts SELECT pl.page_id page_id, REGEXP_REPLACE(pl.true_title, '_', ' ') page_title, pl.true_title page_url, fp.views, fp.bytes_sent FROM page_lookup pl JOIN filtered_pagecounts fp ON fp.page_title = pl.redirect_title where fp.dt='2020-01-01'" 21 | } 22 | ] 23 | -------------------------------------------------------------------------------- /test/test_data_lineage.py: -------------------------------------------------------------------------------- 1 | from data_lineage.parser import analyze_dml_query 2 | 3 | 4 | def test_parser(parse_queries_fixture): 5 | assert 
len(parse_queries_fixture) == 5 6 | 7 | 8 | def test_visitor(save_catalog, parse_queries_fixture): 9 | catalog = save_catalog 10 | with catalog.managed_session: 11 | source = catalog.get_source("test") 12 | 13 | dml = [ 14 | analyze_dml_query(catalog, parsed, source) 15 | for parsed in parse_queries_fixture 16 | ] 17 | assert len(dml) == 5 18 | 19 | for d in dml: 20 | assert len(d.source_tables) > 0 and d.target_table is not None 21 | -------------------------------------------------------------------------------- /test/test_db_graph.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | 4 | import pytest 5 | from dbcat.catalog import ColumnLineage 6 | from networkx import edges 7 | 8 | from data_lineage import load_graph 9 | from data_lineage.parser import analyze_dml_query, extract_lineage, parse 10 | from data_lineage.parser.dml_visitor import SelectSourceVisitor 11 | 12 | logging.basicConfig(level=getattr(logging, "DEBUG")) 13 | 14 | 15 | def test_no_insert_column_graph(managed_session, graph_sdk): 16 | catalog = managed_session 17 | query = """ 18 | INSERT INTO page_lookup_nonredirect 19 | SELECT page.page_id as redirect_id, page.page_title as redirect_title, 20 | page.page_title true_title, page.page_id, page.page_latest 21 | FROM page 22 | """ 23 | 24 | parsed = parse( 25 | query, name="LOAD page_lookup_nonredirect-test_no_insert_column_graph" 26 | ) 27 | visitor = SelectSourceVisitor(parsed.name) 28 | visitor(parsed.node) 29 | source = catalog.get_source("test") 30 | visitor.bind(catalog, source) 31 | 32 | job_execution = extract_lineage( 33 | catalog, 34 | visitor, 35 | source, 36 | parsed, 37 | datetime.datetime.now(), 38 | datetime.datetime.now(), 39 | ) 40 | graph = load_graph(graph_sdk, [job_execution.job_id]) 41 | 42 | assert sorted([node[1]["name"] for node in list(graph.graph.nodes(data=True))]) == [ 43 | "LOAD page_lookup_nonredirect-test_no_insert_column_graph", 44 | "test.default.page.page_id", 45 | "test.default.page.page_latest", 46 | "test.default.page.page_title", 47 | "test.default.page_lookup_nonredirect.page_id", 48 | "test.default.page_lookup_nonredirect.page_version", 49 | "test.default.page_lookup_nonredirect.redirect_id", 50 | "test.default.page_lookup_nonredirect.redirect_title", 51 | "test.default.page_lookup_nonredirect.true_title", 52 | ] 53 | 54 | expected_edges = [ 55 | ("column:5", "task:1"), 56 | ("task:1", "column:10"), 57 | ("task:1", "column:11"), 58 | ("task:1", "column:12"), 59 | ("task:1", "column:13"), 60 | ("task:1", "column:14"), 61 | ("column:7", "task:1"), 62 | ("column:6", "task:1"), 63 | ] 64 | 65 | assert [(edge[0], edge[1]) for edge in list(edges(graph.graph))] == expected_edges 66 | 67 | expected_db_edges = [ 68 | ( 69 | ("test", "default", "page", "page_id"), 70 | ("test", "default", "page_lookup_nonredirect", "redirect_id"), 71 | ), 72 | ( 73 | ("test", "default", "page", "page_id"), 74 | ("test", "default", "page_lookup_nonredirect", "page_id"), 75 | ), 76 | ( 77 | ("test", "default", "page", "page_title"), 78 | ("test", "default", "page_lookup_nonredirect", "redirect_title"), 79 | ), 80 | ( 81 | ("test", "default", "page", "page_title"), 82 | ("test", "default", "page_lookup_nonredirect", "true_title"), 83 | ), 84 | ( 85 | ("test", "default", "page", "page_latest"), 86 | ("test", "default", "page_lookup_nonredirect", "page_version"), 87 | ), 88 | ] 89 | with catalog.managed_session as session: 90 | all_edges = session.query(ColumnLineage).all() 91 | assert 
set([(e.source.fqdn, e.target.fqdn) for e in all_edges]) == set( 92 | expected_db_edges 93 | ) 94 | 95 | 96 | def test_basic_column_graph(managed_session, graph_sdk): 97 | catalog = managed_session 98 | 99 | query = "INSERT INTO page_lookup_nonredirect(page_id, page_version) SELECT page.page_id, page.page_latest FROM page" 100 | parsed = parse(query, "basic_column_graph") 101 | visitor = SelectSourceVisitor(parsed.name) 102 | visitor(parsed.node) 103 | source = catalog.get_source("test") 104 | visitor.bind(catalog, source) 105 | 106 | job_execution = extract_lineage( 107 | catalog, 108 | visitor, 109 | source, 110 | parsed, 111 | datetime.datetime.now(), 112 | datetime.datetime.now(), 113 | ) 114 | graph = load_graph(graph_sdk, [job_execution.job_id]) 115 | 116 | assert sorted([node[1]["name"] for node in list(graph.graph.nodes(data=True))]) == [ 117 | "basic_column_graph", 118 | "test.default.page.page_id", 119 | "test.default.page.page_latest", 120 | "test.default.page_lookup_nonredirect.page_id", 121 | "test.default.page_lookup_nonredirect.page_version", 122 | ] 123 | 124 | expected_edges = [ 125 | ("column:5", "task:2"), 126 | ("task:2", "column:13"), 127 | ("task:2", "column:14"), 128 | ("column:6", "task:2"), 129 | ] 130 | 131 | assert [(edge[0], edge[1]) for edge in list(edges(graph.graph))] == expected_edges 132 | 133 | table = catalog.get_table( 134 | source_name="test", schema_name="default", table_name="page_lookup_nonredirect", 135 | ) 136 | columns = catalog.get_columns_for_table( 137 | table, column_names=["page_id", "page_version"] 138 | ) 139 | 140 | assert len(columns) == 2 141 | 142 | expected_db_edges = [ 143 | ( 144 | ("test", "default", "page", "page_id"), 145 | ("test", "default", "page_lookup_nonredirect", "page_id"), 146 | ), 147 | ( 148 | ("test", "default", "page", "page_latest"), 149 | ("test", "default", "page_lookup_nonredirect", "page_version"), 150 | ), 151 | ] 152 | 153 | with catalog.managed_session as session: 154 | all_edges = ( 155 | session.query(ColumnLineage) 156 | .filter(ColumnLineage.target_id.in_([c.id for c in columns])) 157 | .all() 158 | ) 159 | assert set([(e.source.fqdn, e.target.fqdn) for e in all_edges]) == set( 160 | expected_db_edges 161 | ) 162 | 163 | 164 | @pytest.fixture(scope="module") 165 | def get_graph(save_catalog, parse_queries_fixture, graph_sdk): 166 | catalog = save_catalog 167 | job_ids = [] 168 | 169 | with catalog.managed_session: 170 | source = catalog.get_source("test") 171 | for parsed in parse_queries_fixture: 172 | visitor = analyze_dml_query(catalog, parsed, source) 173 | job_execution = extract_lineage( 174 | catalog, 175 | visitor, 176 | source, 177 | parsed, 178 | datetime.datetime.now(), 179 | datetime.datetime.now(), 180 | ) 181 | job_ids.append(job_execution.job_id) 182 | graph = load_graph(graph_sdk, job_ids) 183 | yield graph, catalog 184 | 185 | 186 | def test_column_graph(get_graph): 187 | graph, catalog = get_graph 188 | assert sorted([node[1]["name"] for node in list(graph.graph.nodes(data=True))]) == [ 189 | "LOAD filtered_pagecounts", 190 | "LOAD normalized_pagecounts", 191 | "LOAD page_lookup", 192 | "LOAD page_lookup_nonredirect", 193 | "LOAD page_lookup_redirect", 194 | "test.default.filtered_pagecounts.bytes_sent", 195 | "test.default.filtered_pagecounts.group", 196 | "test.default.filtered_pagecounts.page_title", 197 | "test.default.filtered_pagecounts.views", 198 | "test.default.normalized_pagecounts.bytes_sent", 199 | "test.default.normalized_pagecounts.page_id", 200 | 
"test.default.normalized_pagecounts.page_title", 201 | "test.default.normalized_pagecounts.page_url", 202 | "test.default.normalized_pagecounts.views", 203 | "test.default.page.page_id", 204 | "test.default.page.page_latest", 205 | "test.default.page.page_title", 206 | "test.default.page_lookup.page_id", 207 | "test.default.page_lookup.page_version", 208 | "test.default.page_lookup.redirect_id", 209 | "test.default.page_lookup.redirect_title", 210 | "test.default.page_lookup.true_title", 211 | "test.default.page_lookup_nonredirect.page_id", 212 | "test.default.page_lookup_nonredirect.page_version", 213 | "test.default.page_lookup_nonredirect.redirect_id", 214 | "test.default.page_lookup_nonredirect.redirect_title", 215 | "test.default.page_lookup_nonredirect.true_title", 216 | "test.default.page_lookup_redirect.page_id", 217 | "test.default.page_lookup_redirect.page_version", 218 | "test.default.page_lookup_redirect.redirect_id", 219 | "test.default.page_lookup_redirect.redirect_title", 220 | "test.default.page_lookup_redirect.true_title", 221 | "test.default.pagecounts.bytes_sent", 222 | "test.default.pagecounts.page_title", 223 | "test.default.pagecounts.views", 224 | ] 225 | # expected_edges = [ 226 | # ("column:4", "task:1"), 227 | # ("column:4", "task:3"), 228 | # ("task:1", "column:9"), 229 | # ("task:1", "column:10"), 230 | # ("task:1", "column:11"), 231 | # ("task:1", "column:12"), 232 | # ("task:1", "column:13"), 233 | # ("column:6", "task:1"), 234 | # ("column:6", "task:3"), 235 | # ("column:5", "task:1"), 236 | # ("column:5", "task:3"), 237 | # ("column:14", "task:4"), 238 | # ("task:3", "column:14"), 239 | # ("task:3", "column:15"), 240 | # ("task:3", "column:16"), 241 | # ("task:3", "column:17"), 242 | # ("task:3", "column:18"), 243 | # ("column:15", "task:4"), 244 | # ("column:16", "task:4"), 245 | # ("column:17", "task:4"), 246 | # ("column:18", "task:4"), 247 | # ("task:4", "column:19"), 248 | # ("task:4", "column:20"), 249 | # ("task:4", "column:21"), 250 | # ("task:4", "column:22"), 251 | # ("task:4", "column:23"), 252 | # ("column:21", "task:6"), 253 | # ("column:22", "task:6"), 254 | # ("task:6", "column:28"), 255 | # ("task:6", "column:29"), 256 | # ("task:6", "column:30"), 257 | # ("task:6", "column:31"), 258 | # ("column:26", "task:6"), 259 | # ("column:27", "task:6"), 260 | # ] 261 | 262 | 263 | # assert [ 264 | # (edge[0], edge[1]) for edge in list(edges(graph.graph)) 265 | # ] == expected_edges 266 | -------------------------------------------------------------------------------- /test/test_dml_visitor.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from data_lineage.parser import analyze_dml_query, parse, parse_dml_query, parse_queries 4 | from data_lineage.parser.dml_visitor import ( 5 | CTASVisitor, 6 | SelectIntoVisitor, 7 | SelectSourceVisitor, 8 | ) 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "target, sources, sql", 13 | [ 14 | ((None, "c"), [(None, "a")], "insert into c select x,y from a"), 15 | ( 16 | (None, "c"), 17 | [(None, "a"), (None, "b")], 18 | "insert into c select x,y from a join b on a.id = b.id", 19 | ), 20 | ( 21 | (None, "c"), 22 | [(None, "a"), (None, "b")], 23 | "insert into c select x,y from a join b on a.id = b.id", 24 | ), 25 | ( 26 | (None, "c"), 27 | [(None, "a"), (None, "b")], 28 | "insert into c select x,y from a as aa join b on " "aa.id = b.id", 29 | ), 30 | ], 31 | ) 32 | def test_sanity_insert(target, sources, sql): 33 | parsed = parse(sql) 34 | insert_visitor = 
SelectSourceVisitor("test_sanity_insert") 35 | insert_visitor(parsed.node) 36 | bound_target, bound_tables, bound_cols = insert_visitor.resolve() 37 | 38 | assert bound_target == target 39 | assert bound_tables == sources 40 | 41 | 42 | @pytest.mark.parametrize( 43 | "target, sources, sql", 44 | [ 45 | ((None, "c"), [(None, "a")], "create table c as select x,y from a"), 46 | ( 47 | (None, "c"), 48 | [(None, "a"), (None, "b")], 49 | "create table c as select x,y from a join b on a.id = b.id", 50 | ), 51 | ( 52 | (None, "c"), 53 | [(None, "a"), (None, "b")], 54 | "create table c as select x,y from a join b on a.id = b.id", 55 | ), 56 | ( 57 | (None, "c"), 58 | [(None, "a"), (None, "b")], 59 | "create table c as select x,y from a as aa join b on aa.id = b.id", 60 | ), 61 | ], 62 | ) 63 | def test_sanity_ctas(target, sources, sql): 64 | parsed = parse(sql) 65 | visitor = CTASVisitor("test_sanity_ctas") 66 | visitor(parsed.node) 67 | bound_target, bound_tables, bound_cols = visitor.resolve() 68 | 69 | assert bound_target == target 70 | assert bound_tables == sources 71 | 72 | 73 | @pytest.mark.parametrize( 74 | "target, sources, sql", 75 | [ 76 | ( 77 | (None, "c"), 78 | [(None, "a"), (None, "b")], 79 | "select x,y into c from a join b on a.id = b.id", 80 | ), 81 | ( 82 | (None, "c"), 83 | [(None, "a"), (None, "b")], 84 | "select x,y into c from a join b on a.id = b.id", 85 | ), 86 | ( 87 | (None, "c"), 88 | [(None, "a"), (None, "b")], 89 | "select x,y into c from a as aa join b on aa.id = b.id", 90 | ), 91 | ], 92 | ) 93 | def test_sanity_select_into(target, sources, sql): 94 | parsed = parse(sql) 95 | visitor = SelectIntoVisitor("test_sanity_select_into") 96 | visitor(parsed.node) 97 | bound_target, bound_tables, bound_cols = visitor.resolve() 98 | 99 | assert bound_target == target 100 | assert bound_tables == sources 101 | 102 | 103 | @pytest.mark.parametrize( 104 | "query", 105 | [ 106 | "INSERT INTO page_lookup SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, plr.page_version FROM page_lookup_redirect plr", 107 | "INSERT INTO page_lookup SELECT redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect", 108 | "INSERT INTO page_lookup SELECT page_lookup_redirect.* FROM page_lookup_redirect", 109 | "INSERT INTO page_lookup SELECT * FROM page_lookup_redirect", 110 | 'INSERT INTO "default".page_lookup SELECT * FROM page_lookup_redirect', 111 | "SELECT * INTO page_lookup from page_lookup_redirect", 112 | 'SELECT * INTO "default".page_lookup from page_lookup_redirect', 113 | """ 114 | INSERT INTO page_lookup 115 | SELECT * FROM ( 116 | select redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect 117 | ) plr 118 | """, 119 | """ 120 | INSERT INTO page_lookup 121 | SELECT plr.* FROM ( 122 | select redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect 123 | ) plr 124 | """, 125 | """ 126 | INSERT INTO page_lookup 127 | SELECT redirect_id, redirect_title, true_title, page_id, page_version FROM ( 128 | select redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect 129 | ) plr 130 | """, 131 | """ 132 | INSERT INTO page_lookup 133 | SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, plr.page_version FROM ( 134 | select redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect 135 | ) plr 136 | """, 137 | ], 138 | ) 139 | def test_insert(managed_session, query): 140 | source = 
managed_session.get_source("test") 141 | parsed = parse(query) 142 | visitor = analyze_dml_query(managed_session, parsed, source) 143 | assert visitor is not None 144 | 145 | assert len(visitor.target_columns) == 5 146 | assert visitor.target_table.fqdn == ("test", "default", "page_lookup") 147 | assert len(visitor.source_columns) == 5 148 | assert [table.fqdn for table in visitor.source_tables] == [ 149 | ("test", "default", "page_lookup_redirect") 150 | ] 151 | 152 | 153 | def test_insert_cols(managed_session): 154 | source = managed_session.get_source("test") 155 | query = "INSERT INTO page_lookup_nonredirect(page_id, page_version) SELECT page.page_id, page.page_latest FROM page" 156 | parsed = parse(query) 157 | visitor = analyze_dml_query(managed_session, parsed, source) 158 | assert visitor is not None 159 | 160 | assert len(visitor.target_columns) == 2 161 | assert visitor.target_table.fqdn == ("test", "default", "page_lookup_nonredirect") 162 | assert len(visitor.source_columns) == 2 163 | assert [table.fqdn for table in visitor.source_tables] == [ 164 | ("test", "default", "page") 165 | ] 166 | 167 | 168 | def test_insert_with_join(managed_session): 169 | source = managed_session.get_source("test") 170 | query = "insert into page_lookup_redirect select original_page.page_id redirect_id, original_page.page_title redirect_title, final_page.page_title as true_title, final_page.page_id, final_page.page_latest from page final_page join redirect on (redirect.page_title = final_page.page_title) join page original_page on (redirect.rd_from = original_page.page_id)" 171 | parsed = parse(query) 172 | visitor = analyze_dml_query(managed_session, parsed, source) 173 | assert visitor is not None 174 | 175 | assert len(visitor.target_columns) == 5 176 | assert visitor.target_table.fqdn == ("test", "default", "page_lookup_redirect") 177 | assert len(visitor.source_columns) == 5 178 | assert sorted([table.fqdn for table in visitor.source_tables]) == [ 179 | ("test", "default", "page"), 180 | ("test", "default", "redirect"), 181 | ] 182 | 183 | 184 | @pytest.mark.parametrize( 185 | "query", 186 | [ 187 | "with pln as (select redirect_title, true_title, page_id, page_version from page_lookup_nonredirect) insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) select redirect_title, true_title, page_id, page_version from pln;", 188 | "with pln as (select * from page_lookup_nonredirect) insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) select redirect_title, true_title, page_id, page_version from pln;", 189 | "with pln as (select redirect_title, true_title, page_id, page_version from page_lookup_nonredirect) insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) select * from pln;", 190 | "with pln as (select redirect_title as t1, true_title as t2, page_id as t3, page_version as t4 from page_lookup_nonredirect) insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) select t1, t2, t3, t4 from pln;", 191 | "insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) with pln as (select redirect_title, true_title, page_id, page_version from page_lookup_nonredirect) select redirect_title, true_title, page_id, page_version from pln;", 192 | ], 193 | ) 194 | def test_with_clause(managed_session, query): 195 | source = managed_session.get_source("test") 196 | parsed = parse(query) 197 | visitor = analyze_dml_query(managed_session, parsed, source) 198 | assert 
visitor is not None
199 | 
200 |     assert len(visitor.target_columns) == 4
201 |     assert visitor.target_table.fqdn == ("test", "default", "page_lookup_redirect")
202 |     assert len(visitor.source_columns) == 4
203 |     assert [table.fqdn for table in visitor.source_tables] == [
204 |         ("test", "default", "page_lookup_nonredirect")
205 |     ]
206 | 
207 | 
208 | def test_col_exprs(managed_session):
209 |     query = """
210 |     INSERT INTO page_lookup_redirect(true_title)
211 |     SELECT
212 |     BTRIM(TO_CHAR(DATEADD (MONTH,-1,('20' ||MAX ("redirect_id") || '-01')::DATE)::DATE,'YY-MM')) AS "max_month"
213 |     FROM page_lookup_nonredirect;
214 |     """
215 |     source = managed_session.get_source("test")
216 |     parsed = parse(query)
217 |     visitor = analyze_dml_query(catalog=managed_session, parsed=parsed, source=source)
218 |     assert visitor is not None
219 | 
220 |     assert len(visitor.target_columns) == 1
221 |     assert visitor.target_table.fqdn == ("test", "default", "page_lookup_redirect")
222 |     assert len(visitor.source_columns) == 1
223 |     assert [table.fqdn for table in visitor.source_tables] == [
224 |         ("test", "default", "page_lookup_nonredirect")
225 |     ]
226 | 
227 | 
228 | def test_syntax_errors():
229 |     queries = [
230 |         "INSERT INTO page_lookup_nonredirect(page_id, latest) SELECT page.page_id, page.page_latest FROM page",
231 |         "select a from table(b)",
232 |         "INSERT INTO page_lookup_nonredirect SELECT page.page_id, page.page_latest FROM page",
233 |     ]
234 | 
235 |     parsed = parse_queries(queries)
236 | 
237 |     assert len(parsed) == 2
238 | 
239 | 
240 | def test_parse_query(managed_session):
241 |     query = """
242 |     SELECT BTRIM(TO_CHAR(DATEADD (MONTH,-1,(\'20\' ||MAX ("group") || \'-01\')::DATE)::DATE,\'YY-MM\')) AS "max_month",
243 |     DATEADD(YEAR,-1,DATEADD (MONTH,-3,LAST_DAY (DATEADD (MONTH,-1,(\'20\' ||MAX ("group") || \'-01\')::DATE)::DATE))::DATE)::DATE AS "min_date",
244 |     DATEADD(MONTH,-3,LAST_DAY (DATEADD (MONTH,-1,(\'20\' ||MAX ("group") || \'-01\')::DATE)::DATE))::DATE AS "max_date",
245 |     page_title,
246 |     bytes_sent as mb_sent
247 |     INTO "new_table"
248 |     FROM pagecounts;
249 |     """
250 |     source = managed_session.get_source("test")
251 |     parsed = parse(query)
252 |     binder = parse_dml_query(catalog=managed_session, parsed=parsed, source=source)
253 |     assert [context.alias for context in binder.columns] == [
254 |         "max_month",
255 |         "min_date",
256 |         "max_date",
257 |         "page_title",
258 |         "mb_sent",
259 |     ]
260 | 
261 | 
262 | def test_ctas(managed_session):
263 |     query = """
264 |     CREATE TEMP TABLE temp_table_x(page_title) AS select redirect_title from page_lookup_nonredirect
265 |     where redirect_title is not null
266 |     """
267 |     source = managed_session.get_source("test")
268 |     schema = managed_session.get_schema("test", "default")
269 |     managed_session.update_source(source, schema)
270 |     parsed = parse(query)
271 |     visitor = analyze_dml_query(managed_session, parsed, source)
272 |     assert visitor is not None
273 | 
274 |     assert len(visitor.target_columns) == 1
275 |     assert visitor.target_table.fqdn == ("test", "default", "temp_table_x")
276 |     assert len(visitor.source_columns) == 1
277 |     assert [table.fqdn for table in visitor.source_tables] == [
278 |         ("test", "default", "page_lookup_nonredirect")
279 |     ]
280 | 
--------------------------------------------------------------------------------
/test/test_scan.py:
--------------------------------------------------------------------------------
1 | import psycopg2
2 | import pytest
3 | from fakeredis import FakeStrictRedis
4 | from rq import Queue
5 | 
6 | pii_data_script = """
7 | create table no_pii(a text, b text);
8 | insert into no_pii values ('abc', 'def');
9 | insert into no_pii values ('xsfr', 'asawe');
10 | 
11 | create table partial_pii(a text, b text);
12 | insert into partial_pii values ('917-908-2234', 'plkj');
13 | insert into partial_pii values ('215-099-2234', 'sfrf');
14 | 
15 | create table full_pii(name text, location text);
16 | insert into full_pii values ('Jonathan Smith', 'Virginia');
17 | insert into full_pii values ('Chase Ryan', 'Chennai');
18 | 
19 | """
20 | 
21 | 
22 | pii_data_load = [
23 |     "create table no_pii(a text, b text)",
24 |     "insert into no_pii values ('abc', 'def')",
25 |     "insert into no_pii values ('xsfr', 'asawe')",
26 |     "create table partial_pii(a text, b text)",
27 |     "insert into partial_pii values ('917-908-2234', 'plkj')",
28 |     "insert into partial_pii values ('215-099-2234', 'sfrf')",
29 |     "create table full_pii(name text, location text)",
30 |     "insert into full_pii values ('Jonathan Smith', 'Virginia')",
31 |     "insert into full_pii values ('Chase Ryan', 'Chennai')",
32 | ]
33 | 
34 | pii_data_drop = ["DROP TABLE full_pii", "DROP TABLE partial_pii", "DROP TABLE no_pii"]
35 | 
36 | 
37 | def pg_conn():
38 |     return (
39 |         psycopg2.connect(
40 |             host="127.0.0.1", user="piiuser", password="p11secret", database="piidb"
41 |         ),
42 |         "public",
43 |     )
44 | 
45 | 
46 | @pytest.fixture(scope="module")
47 | def load_all_data():
48 |     params = [pg_conn()]
49 |     for p in params:
50 |         db_conn, expected_schema = p
51 |         with db_conn.cursor() as cursor:
52 |             for statement in pii_data_load:
53 |                 cursor.execute(statement)
54 |             cursor.execute("commit")
55 |     yield params
56 |     for p in params:
57 |         db_conn, expected_schema = p
58 |         with db_conn.cursor() as cursor:
59 |             for statement in pii_data_drop:
60 |                 cursor.execute(statement)
61 |             cursor.execute("commit")
62 | 
63 |     for p in params:
64 |         db_conn, expected_schema = p
65 |         db_conn.close()
66 | 
67 | 
68 | @pytest.fixture(scope="module")
69 | def setup_catalog_and_data(load_all_data, rest_catalog):
70 |     catalog = rest_catalog
71 |     source = catalog.add_source(
72 |         name="pg_scan",
73 |         source_type="postgresql",
74 |         uri="127.0.0.1",
75 |         username="piiuser",
76 |         password="p11secret",
77 |         database="piidb",
78 |         cluster="public",
79 |     )
80 |     yield catalog, source
81 | 
82 | 
83 | @pytest.fixture(scope="module")
84 | def fake_queue():
85 |     yield Queue(is_async=False, connection=FakeStrictRedis())
86 | 
87 | 
88 | def test_scan_source(setup_catalog_and_data, scan_sdk):
89 |     catalog, source = setup_catalog_and_data
90 |     scan_sdk.start(source)
91 | 
92 |     pg_source = catalog.get_source("pg_scan")
93 |     assert pg_source is not None
94 | 
95 |     no_pii = catalog.get_table("pg_scan", "public", "no_pii")
96 |     assert no_pii is not None
97 | 
--------------------------------------------------------------------------------
/test/test_server.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | 
4 | import pytest
5 | from dbcat.catalog.models import ColumnLineage, Job, JobExecution, JobExecutionStatus
6 | 
7 | from data_lineage import (
8 |     ColumnNotFound,
9 |     ParseError,
10 |     SchemaNotFound,
11 |     SourceNotFound,
12 |     TableNotFound,
13 | )
14 | 
15 | 
16 | def test_get_sources(rest_catalog):
17 |     source = rest_catalog.get_source("test")
18 |     assert source.name == "test"
19 |     assert source.id is not None
20 | 
21 | 
22 | def test_get_schemata(rest_catalog):
23 |     schema = rest_catalog.get_schema("test", "default")
24 |     assert schema.name == "default"
25 |     assert schema.id is not None
26 | 
27 | 
28 | def test_get_tables(rest_catalog):
29 |     num = 0
30 |     for table in rest_catalog.get_tables():
31 |         assert table.id is not None
32 |         assert table.name is not None
33 |         num += 1
34 |     assert num == 12
35 | 
36 | 
37 | def test_get_columns(rest_catalog):
38 |     num = 0
39 |     for column in rest_catalog.get_columns():
40 |         assert column.id is not None
41 |         assert column.name is not None
42 |         assert column.data_type is not None
43 |         assert column.sort_order is not None
44 |         num += 1
45 | 
46 |     assert num == 40
47 | 
48 | 
49 | def test_get_source_by_id(rest_catalog):
50 |     source = rest_catalog.get_source_by_id(1)
51 |     print(source.__class__.__name__)
52 |     assert source.name == "test"
53 |     assert source.fqdn == "test"
54 |     assert source.source_type == "redshift"
55 | 
56 | 
57 | def test_get_schema_by_id(rest_catalog):
58 |     schema = rest_catalog.get_schema_by_id(1)
59 |     assert schema.name == "default"
60 |     assert schema.fqdn == ["test", "default"]
61 | 
62 | 
63 | def test_get_table_by_id(rest_catalog):
64 |     table = rest_catalog.get_table_by_id(1)
65 |     assert table.name == "pagecounts"
66 |     assert table.fqdn == ["test", "default", "pagecounts"]
67 | 
68 | 
69 | def test_get_column_by_id(rest_catalog):
70 |     column = rest_catalog.get_column_by_id(1)
71 |     assert column.name == "group"
72 |     assert column.fqdn == ["test", "default", "pagecounts", "group"]
73 | 
74 | 
75 | def test_get_source(rest_catalog):
76 |     source = rest_catalog.get_source("test")
77 |     assert source.name == "test"
78 |     assert source.id is not None
79 | 
80 | 
81 | def test_get_schema(rest_catalog):
82 |     schema = rest_catalog.get_schema("test", "default")
83 |     assert schema.name == "default"
84 |     assert schema.id is not None
85 | 
86 | 
87 | def test_get_table(rest_catalog):
88 |     table = rest_catalog.get_table("test", "default", "normalized_pagecounts")
89 |     assert table.id is not None
90 |     assert table.name == "normalized_pagecounts"
91 | 
92 | 
93 | def test_get_column(rest_catalog):
94 |     column = rest_catalog.get_column("test", "default", "pagecounts", "bytes_sent")
95 |     assert column.id is not None
96 |     assert column.name is not None
97 |     assert column.sort_order is not None
98 | 
99 | 
100 | def test_get_source_exception(rest_catalog):
101 |     with pytest.raises(SourceNotFound):
102 |         rest_catalog.get_source("tes")
103 | 
104 | 
105 | @pytest.mark.parametrize(
106 |     "source_name, schema_name", [("test", "def"), ("tes", "default")]
107 | )
108 | def test_get_schema_exception(rest_catalog, source_name, schema_name):
109 |     with pytest.raises(SchemaNotFound):
110 |         rest_catalog.get_schema(source_name, schema_name)
111 | 
112 | 
113 | def test_add_source_pg(rest_catalog):
114 |     data = {
115 |         "name": "pg",
116 |         "source_type": "postgres",
117 |         "database": "db_database",
118 |         "username": "db_user",
119 |         "password": "db_password",
120 |         "port": "db_port",
121 |         "uri": "db_uri",
122 |     }
123 | 
124 |     pg_connection = rest_catalog.add_source(**data)
125 |     assert pg_connection.name == "pg"
126 |     assert pg_connection.source_type == "postgres"
127 |     assert pg_connection.database == "db_database"
128 |     assert pg_connection.username == "db_user"
129 |     assert pg_connection.password == "db_password"
130 |     assert pg_connection.port == "db_port"
131 |     assert pg_connection.uri == "db_uri"
132 | 
133 | 
134 | def test_add_source_mysql(rest_catalog):
135 |     data = {
136 |         "name": "mys",
137 |         "source_type": "mysql",
138 |         "database": "db_database",
139 |         "username": "db_user",
140 |         "password": "db_password",
141 |         "port": "db_port",
142 |         "uri": "db_uri",
143 |     }
144 | 
145 |     mysql_conn = rest_catalog.add_source(**data)
146 | 
147 |     assert mysql_conn.name == "mys"
148 |     assert mysql_conn.source_type == "mysql"
149 |     assert mysql_conn.database == "db_database"
150 |     assert mysql_conn.username == "db_user"
151 |     assert mysql_conn.password == "db_password"
152 |     assert mysql_conn.port == "db_port"
153 |     assert mysql_conn.uri == "db_uri"
154 | 
155 | 
156 | def test_add_source_bq(rest_catalog):
157 |     bq_conn = rest_catalog.add_source(
158 |         name="bq",
159 |         source_type="bigquery",
160 |         key_path="db_key_path",
161 |         project_credentials="db_creds",
162 |         project_id="db_project_id",
163 |     )
164 |     assert bq_conn.name == "bq"
165 |     assert bq_conn.source_type == "bigquery"
166 |     assert bq_conn.key_path == "db_key_path"
167 |     assert bq_conn.project_credentials == "db_creds"
168 |     assert bq_conn.project_id == "db_project_id"
169 | 
170 | 
171 | def test_add_source_glue(rest_catalog):
172 |     glue_conn = rest_catalog.add_source(name="gl", source_type="glue")
173 |     assert glue_conn.name == "gl"
174 |     assert glue_conn.source_type == "glue"
175 | 
176 | 
177 | def test_add_source_snowflake(rest_catalog):
178 |     sf_conn = rest_catalog.add_source(
179 |         name="sf",
180 |         source_type="snowflake",
181 |         database="db_database",
182 |         username="db_user",
183 |         password="db_password",
184 |         account="db_account",
185 |         role="db_role",
186 |         warehouse="db_warehouse",
187 |     )
188 |     assert sf_conn.name == "sf"
189 |     assert sf_conn.source_type == "snowflake"
190 |     assert sf_conn.database == "db_database"
191 |     assert sf_conn.username == "db_user"
192 |     assert sf_conn.password == "db_password"
193 |     assert sf_conn.account == "db_account"
194 |     assert sf_conn.role == "db_role"
195 |     assert sf_conn.warehouse == "db_warehouse"
196 | 
197 | 
198 | def test_update_source(rest_catalog):
199 |     glue_conn = rest_catalog.add_source(name="gl_2", source_type="glue")
200 |     schema_1 = rest_catalog.add_schema("schema_1", glue_conn)
201 | 
202 |     default_schema = rest_catalog.update_source(glue_conn, schema_1)
203 | 
204 |     assert default_schema.source.id == glue_conn.id
205 |     assert default_schema.schema.id == schema_1.id
206 | 
207 |     schema_2 = rest_catalog.add_schema("schema_2", glue_conn)
208 | 
209 |     default_schema = rest_catalog.update_source(glue_conn, schema_2)
210 | 
211 |     assert default_schema.source.id == glue_conn.id
212 |     assert default_schema.schema.id == schema_2.id
213 | 
214 | 
215 | def load_edges(catalog, expected_edges, job_execution_id):
216 |     column_edge_ids = []
217 |     for edge in expected_edges:
218 |         source = catalog.get_column(
219 |             source_name=edge[0][0],
220 |             schema_name=edge[0][1],
221 |             table_name=edge[0][2],
222 |             column_name=edge[0][3],
223 |         )
224 | 
225 |         target = catalog.get_column(
226 |             source_name=edge[1][0],
227 |             schema_name=edge[1][1],
228 |             table_name=edge[1][2],
229 |             column_name=edge[1][3],
230 |         )
231 | 
232 |         added_edge = catalog.add_column_lineage(source, target, job_execution_id, {})
233 | 
234 |         column_edge_ids.append(added_edge.id)
235 |     return column_edge_ids
236 | 
237 | 
238 | @pytest.fixture(scope="module")
239 | def load_page_lookup_nonredirect_edges(save_catalog):
240 |     catalog = save_catalog
241 | 
242 |     expected_edges = [
243 |         (
244 |             ("test", "default", "page", "page_id"),
245 |             ("test", "default", "page_lookup_nonredirect", "redirect_id"),
246 |         ),
247 |         (
248 |             ("test", "default", "page", "page_id"),
249 |             ("test", "default", "page_lookup_nonredirect", "page_id"),
250 |         ),
251 |         (
252 |             ("test", "default", "page", "page_title"),
253 |             ("test", "default", "page_lookup_nonredirect", "redirect_title"),
254 |         ),
255 |         (
256 |             ("test", "default", "page", "page_title"),
257 |             ("test", "default", "page_lookup_nonredirect", "true_title"),
258 |         ),
259 |         (
260 |             ("test", "default", "page", "page_latest"),
261 |             ("test", "default", "page_lookup_nonredirect", "page_version"),
262 |         ),
263 |     ]
264 | 
265 |     job_id = None
266 | 
267 |     with catalog.managed_session:
268 |         job = catalog.add_job(
269 |             "insert_page_lookup_nonredirect",
270 |             catalog.get_source("test"),
271 |             {"sql": "insert into page_lookup_nonredirect select from page"},
272 |         )
273 |         e1 = catalog.add_job_execution(
274 |             job=job,
275 |             started_at=datetime.datetime.combine(
276 |                 datetime.date(2021, 4, 1), datetime.time(1, 0)
277 |             ),
278 |             ended_at=datetime.datetime.combine(
279 |                 datetime.date(2021, 4, 1), datetime.time(1, 15)
280 |             ),
281 |             status=JobExecutionStatus.SUCCESS,
282 |         )
283 | 
284 |         executions = [e1.id]
285 |         name = job.name
286 |         job_id = job.id
287 | 
288 |         print("Inserted job {}".format(name))
289 |         print("Inserted executions {}".format(",".join(str(v) for v in executions)))
290 | 
291 |         column_edge_ids = load_edges(catalog, expected_edges, executions[0])
292 |         print("Inserted edges {}".format(",".join(str(v) for v in column_edge_ids)))
293 | 
294 |     yield catalog, job_id, expected_edges
295 | 
296 |     with catalog.managed_session as session:
297 |         session.query(ColumnLineage).filter(
298 |             ColumnLineage.id.in_(column_edge_ids)
299 |         ).delete(synchronize_session=False)
300 |         print("DELETED edges {}".format(",".join(str(v) for v in column_edge_ids)))
301 |         session.commit()
302 | 
303 |         session.query(JobExecution).filter(JobExecution.id.in_(executions)).delete(
304 |             synchronize_session=False
305 |         )
306 |         print("DELETED executions {}".format(",".join(str(v) for v in executions)))
307 |         session.commit()
308 | 
309 |         session.query(Job).filter(Job.name == name).delete(synchronize_session=False)
310 |         print("DELETED job {}".format(name))
311 |         session.commit()
312 | 
313 | 
314 | def test_api_main(graph_sdk, load_page_lookup_nonredirect_edges):
315 |     catalog, job_id, expected_edges = load_page_lookup_nonredirect_edges
316 |     graph = graph_sdk.get([job_id])
317 |     assert len(graph["edges"]) == 10
318 |     assert len(graph["nodes"]) == 15
319 | 
320 | 
321 | def test_parser(rest_catalog, parser_sdk, graph_sdk, save_catalog):
322 |     source = rest_catalog.get_source("test")
323 |     data = {
324 |         "name": "LOAD page_lookup",
325 |         "query": "INSERT INTO page_lookup SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, "
326 |         "plr.page_version FROM page_lookup_redirect plr",
327 |         "source": source,
328 |         "start_time": datetime.datetime.now(),
329 |         "end_time": datetime.datetime.now(),
330 |     }
331 | 
332 |     job_execution = parser_sdk.analyze(**data)
333 |     assert job_execution is not None
334 | 
335 |     graph = graph_sdk.get([job_execution.job_id])
336 | 
337 |     assert len(graph["edges"]) == 10
338 |     assert len(graph["nodes"]) == 15
339 | 
340 |     column_lineages = rest_catalog.get_column_lineage([job_execution.job_id])
341 |     assert (len(column_lineages)) == 10
342 | 
343 | 
344 | @pytest.mark.parametrize(
345 |     "query",
346 |     [
347 |         "insert into p_lookup select * from page_lookup_redirect",
348 |         "insert into page_lookup select * from pg_lp_rt",
349 |         "insert into page_lookup select plr.page_id, true_title from page_lookup_redirect",
350 |     ],
351 | )
352 | def test_parser_table_not_found(rest_catalog, parser_sdk, managed_session, query):
353 |     source = rest_catalog.get_source("test")
354 | 
355 |     with pytest.raises(TableNotFound) as exc:
356 |         parser_sdk.analyze(
357 |             query=query,
358 |             source=source,
359 |             start_time=datetime.datetime.now(),
360 |             end_time=datetime.datetime.now(),
361 |         )
362 |     logging.debug(exc)
363 | 
364 | 
365 | @pytest.mark.parametrize(
366 |     "query",
367 |     [
368 |         "insert into page_lookup(title) select true_title from page_lookup_redirect",
369 |         "insert into page_lookup(true_title) select title from page_lookup_redirect",
370 |     ],
371 | )
372 | def test_parser_column_not_found(rest_catalog, parser_sdk, managed_session, query):
373 |     source = rest_catalog.get_source("test")
374 | 
375 |     with pytest.raises(ColumnNotFound) as exc:
376 |         parser_sdk.analyze(
377 |             query=query,
378 |             source=source,
379 |             start_time=datetime.datetime.now(),
380 |             end_time=datetime.datetime.now(),
381 |         )
382 |     logging.debug(exc)
383 | 
384 | 
385 | @pytest.mark.parametrize(
386 |     "query", ["insert page_lookup select * from page_lookup_redirect"]
387 | )
388 | def test_parser_parse_error(rest_catalog, parser_sdk, managed_session, query):
389 |     source = rest_catalog.get_source("test")
390 | 
391 |     with pytest.raises(ParseError) as exc:
392 |         parser_sdk.analyze(
393 |             query=query,
394 |             source=source,
395 |             start_time=datetime.datetime.now(),
396 |             end_time=datetime.datetime.now(),
397 |         )
398 |     logging.debug(exc)
399 | 
--------------------------------------------------------------------------------