├── .chglog
│   ├── CHANGELOG.tpl.md
│   └── config.yml
├── .circleci
│   └── config.yml
├── .coveragerc
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── MANIFEST.in
├── README.md
├── api_example.ipynb
├── data_lineage
│   ├── __init__.py
│   ├── __main__.py
│   ├── assets
│   │   └── favicon.ico
│   ├── graph.py
│   ├── parser
│   │   ├── __init__.py
│   │   ├── binder.py
│   │   ├── dml_visitor.py
│   │   └── visitor.py
│   ├── server.py
│   └── worker.py
├── docker
│   ├── Dockerfile
│   ├── build_image.sh
│   └── docker-entrypoint.sh
├── example.ipynb
├── full_graph.png
├── install-manifests
│   ├── docker-compose
│   │   ├── catalog-demo.yml
│   │   ├── tokern-lineage-engine.yml
│   │   └── wikimedia-demo.yml
│   └── dockerfiles
│       ├── Dockerfile-demo-catalog
│       ├── Dockerfile-demo-wikimedia
│       ├── Makefile
│       ├── demo-catalog.sql
│       └── demo-wikimedia.sql
├── one_task.png
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── setup.cfg
└── test
    ├── catalog.json
    ├── conftest.py
    ├── queries.json
    ├── test_data_lineage.py
    ├── test_db_graph.py
    ├── test_dml_visitor.py
    ├── test_scan.py
    └── test_server.py
/.chglog/CHANGELOG.tpl.md:
--------------------------------------------------------------------------------
1 | {{ range .Versions }}
2 |
3 | ## {{ if .Tag.Previous }}[{{ .Tag.Name }}]({{ $.Info.RepositoryURL }}/compare/{{ .Tag.Previous.Name }}...{{ .Tag.Name }}){{ else }}{{ .Tag.Name }}{{ end }} ({{ datetime "2006-01-02" .Tag.Date }})
4 |
5 | {{ range .CommitGroups -}}
6 | ### {{ .Title }}
7 |
8 | {{ range .Commits -}}
9 | * {{ .Subject }}
10 | {{ end }}
11 | {{ end -}}
12 |
13 | {{- if .RevertCommits -}}
14 | ### Reverts
15 |
16 | {{ range .RevertCommits -}}
17 | * {{ .Revert.Header }}
18 | {{ end }}
19 | {{ end -}}
20 |
21 | {{- if .NoteGroups -}}
22 | {{ range .NoteGroups -}}
23 | ### {{ .Title }}
24 |
25 | {{ range .Notes }}
26 | {{ .Body }}
27 | {{ end }}
28 | {{ end -}}
29 | {{ end -}}
30 | {{ end -}}
--------------------------------------------------------------------------------
/.chglog/config.yml:
--------------------------------------------------------------------------------
1 | style: github
2 | template: CHANGELOG.tpl.md
3 | info:
4 | title: CHANGELOG
5 | repository_url: https://github.com/tokern/data-lineage
6 | options:
7 | commits:
8 | # filters:
9 | # Type:
10 | # - feat
11 | # - fix
12 | # - perf
13 | # - refactor
14 | commit_groups:
15 | # title_maps:
16 | # feat: Features
17 | # fix: Bug Fixes
18 | # perf: Performance Improvements
19 | # refactor: Code Refactoring
20 | header:
21 | pattern: "^(\\w*)\\:\\s(.*)$"
22 | pattern_maps:
23 | - Type
24 | - Subject
25 | notes:
26 | keywords:
27 | - BREAKING CHANGE
--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | # Python CircleCI 2.0 configuration file
2 | #
3 | # Check https://circleci.com/docs/2.0/language-python/ for more details
4 | #
5 | version: 2.1
6 | orbs:
7 | codecov: codecov/codecov@1.0.5
8 | python: circleci/python@1.4.0
9 | workflows:
10 | build_and_deploy:
11 | jobs:
12 | - build:
13 | filters:
14 | tags:
15 | only: /.*/
16 | - deploy:
17 | requires:
18 | - build
19 | filters:
20 | tags:
21 | only: /v[0-9]+(\.[0-9]+)*/
22 | branches:
23 | ignore: /.*/
24 |
25 | jobs:
26 | build: &test-template
27 | docker:
28 | - image: circleci/python:3.8.3
29 | environment:
30 | PIPENV_VENV_IN_PROJECT: true
31 | # Specify service dependencies here if necessary
32 | # CircleCI maintains a library of pre-built images
33 | # documented at https://circleci.com/docs/2.0/circleci-images/
34 | - image: circleci/postgres:12.0-alpine-ram
35 | environment:
36 | POSTGRES_USER: piiuser
37 | POSTGRES_PASSWORD: p11secret
38 | POSTGRES_DB: piidb
39 |
40 | - image: circleci/mysql:8.0.18-ram
41 | environment:
42 | MYSQL_USER: piiuser
43 | MYSQL_PASSWORD: p11secret
44 | MYSQL_DATABASE: piidb
45 | MYSQL_ROOT_PASSWORD: r00tPa33w0rd
46 | environment:
47 | PYVERSION: "3.8.3"
48 | working_directory: ~/repo
49 |
50 | steps:
51 | - checkout
52 |
53 | - run:
54 | name: install dockerize
55 | command: wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz
56 | environment:
57 | DOCKERIZE_VERSION: v0.3.0
58 |
59 | - run:
60 | name: Wait for db
61 | command: |
62 | dockerize -wait tcp://localhost:5432 -timeout 1m
63 | dockerize -wait tcp://localhost:3306 -timeout 1m
64 |
65 | - python/install-packages:
66 | pkg-manager: poetry
67 | include-python-in-cache-key: false
68 | include-branch-in-cache-key: false
69 |
70 | # run tests!
71 | - run:
72 | name: run tests
73 | command: |
74 | poetry run isort --check --diff .
75 | poetry run black --check .
76 | poetry run flake8 data_lineage test
77 | poetry run pytest --junitxml=junit/test-results.xml --cov=data_lineage --cov-report=xml --cov-report=html test/
78 |
79 | - store_test_results: # Upload test results for display in Test Summary: https://circleci.com/docs/2.0/collect-test-data/
80 | path: test-results
81 |
82 | - store_artifacts:
83 | path: test-reports
84 | destination: test-reports
85 |
86 | - codecov/upload:
87 | file: coverage.xml
88 |
89 | deploy:
90 | environment:
91 | PYVERSION: "3.8.11"
92 | docker:
93 | - image: tokern/python:3.8.11-buster
94 | environment:
95 | PYVERSION: "3.8.11"
96 | steps:
97 | - checkout
98 | - python/install-packages:
99 | pkg-manager: poetry
100 | include-python-in-cache-key: false
101 | include-branch-in-cache-key: false
102 |
103 | - run:
104 | name: create packages
105 | command: |
106 | poetry publish --build --username "${PYPI_USERNAME}" --password "${PYPI_PASSWORD}"
107 |
108 | - run:
109 | name: install git release utilities
110 | command: |
111 | go get github.com/aktau/github-release
112 | GO111MODULE=on go get -u github.com/git-chglog/git-chglog/cmd/git-chglog
113 |
114 | - run:
115 | name: release
116 | command: |
117 | ~/go/bin/git-chglog $CIRCLE_TAG | ~/go/bin/github-release release --description - --tag $CIRCLE_TAG
118 |
119 | - setup_remote_docker
120 |
121 | - run:
122 | name: build docker and publish
123 | command: |
124 | ./docker/build_image.sh $CIRCLE_TAG --publish --latest
125 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = True
3 | source = */data_lineage/*
4 |
5 | [report]
6 | exclude_lines =
7 | if self.debug:
8 | pragma: no cover
9 | raise NotImplementedError
10 | if __name__ == .__main__.:
11 | ignore_errors = True
12 | omit =
13 | test/*
14 | setup.py
15 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.gitignore.io/api/python,pycharm
2 | # Edit at https://www.gitignore.io/?templates=python,pycharm
3 |
4 | ### PyCharm ###
5 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
6 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
7 |
8 | .idea
9 |
10 | # User-specific stuff
11 | .idea/**/workspace.xml
12 | .idea/**/tasks.xml
13 | .idea/**/usage.statistics.xml
14 | .idea/**/dictionaries
15 | .idea/**/shelf
16 |
17 | # Generated files
18 | .idea/**/contentModel.xml
19 |
20 | # Sensitive or high-churn files
21 | .idea/**/dataSources/
22 | .idea/**/dataSources.ids
23 | .idea/**/dataSources.local.xml
24 | .idea/**/sqlDataSources.xml
25 | .idea/**/dynamic.xml
26 | .idea/**/uiDesigner.xml
27 | .idea/**/dbnavigator.xml
28 |
29 | # Gradle
30 | .idea/**/gradle.xml
31 | .idea/**/libraries
32 |
33 | # Gradle and Maven with auto-import
34 | # When using Gradle or Maven with auto-import, you should exclude module files,
35 | # since they will be recreated, and may cause churn. Uncomment if using
36 | # auto-import.
37 | # .idea/modules.xml
38 | # .idea/*.iml
39 | # .idea/modules
40 | # *.iml
41 | # *.ipr
42 |
43 | # CMake
44 | cmake-build-*/
45 |
46 | # Mongo Explorer plugin
47 | .idea/**/mongoSettings.xml
48 |
49 | # File-based project format
50 | *.iws
51 |
52 | # IntelliJ
53 | out/
54 |
55 | # mpeltonen/sbt-idea plugin
56 | .idea_modules/
57 |
58 | # JIRA plugin
59 | atlassian-ide-plugin.xml
60 |
61 | # Cursive Clojure plugin
62 | .idea/replstate.xml
63 |
64 | # Crashlytics plugin (for Android Studio and IntelliJ)
65 | com_crashlytics_export_strings.xml
66 | crashlytics.properties
67 | crashlytics-build.properties
68 | fabric.properties
69 |
70 | # Editor-based Rest Client
71 | .idea/httpRequests
72 |
73 | # Android studio 3.1+ serialized cache file
74 | .idea/caches/build_file_checksums.ser
75 |
76 | ### PyCharm Patch ###
77 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
78 |
79 | # *.iml
80 | # modules.xml
81 | # .idea/misc.xml
82 | # *.ipr
83 |
84 | # Sonarlint plugin
85 | .idea/**/sonarlint/
86 |
87 | # SonarQube Plugin
88 | .idea/**/sonarIssues.xml
89 |
90 | # Markdown Navigator plugin
91 | .idea/**/markdown-navigator.xml
92 | .idea/**/markdown-navigator/
93 |
94 | ### Python ###
95 | # Byte-compiled / optimized / DLL files
96 | __pycache__/
97 | *.py[cod]
98 | *$py.class
99 |
100 | # C extensions
101 | *.so
102 |
103 | # Distribution / packaging
104 | .Python
105 | build/
106 | develop-eggs/
107 | dist/
108 | downloads/
109 | eggs/
110 | .eggs/
111 | lib/
112 | lib64/
113 | parts/
114 | sdist/
115 | var/
116 | wheels/
117 | pip-wheel-metadata/
118 | share/python-wheels/
119 | *.egg-info/
120 | .installed.cfg
121 | *.egg
122 | MANIFEST
123 |
124 | # PyInstaller
125 | # Usually these files are written by a python script from a template
126 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
127 | *.manifest
128 | *.spec
129 |
130 | # Installer logs
131 | pip-log.txt
132 | pip-delete-this-directory.txt
133 |
134 | # Unit test / coverage reports
135 | htmlcov/
136 | .tox/
137 | .nox/
138 | .coverage
139 | .coverage.*
140 | .cache
141 | nosetests.xml
142 | coverage.xml
143 | *.cover
144 | .hypothesis/
145 | .pytest_cache/
146 |
147 | # Translations
148 | *.mo
149 | *.pot
150 |
151 | # Scrapy stuff:
152 | .scrapy
153 |
154 | # Sphinx documentation
155 | docs/_build/
156 |
157 | # PyBuilder
158 | target/
159 |
160 | # pyenv
161 | .python-version
162 |
163 | # pipenv
164 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
165 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
166 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
167 | # install all needed dependencies.
168 | #Pipfile.lock
169 |
170 | # celery beat schedule file
171 | celerybeat-schedule
172 |
173 | # SageMath parsed files
174 | *.sage.py
175 |
176 | # Spyder project settings
177 | .spyderproject
178 | .spyproject
179 |
180 | # Rope project settings
181 | .ropeproject
182 |
183 | # Mr Developer
184 | .mr.developer.cfg
185 | .project
186 | .pydevproject
187 |
188 | # mkdocs documentation
189 | /site
190 |
191 | # mypy
192 | .mypy_cache/
193 | .dmypy.json
194 | dmypy.json
195 |
196 | # Pyre type checker
197 | .pyre/
198 |
199 | junit/
200 |
201 | .ipynb_checkpoints/
202 |
203 | # End of https://www.gitignore.io/api/python,pycharm
204 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: local
3 | hooks:
4 | - id: isort
5 | name: isort
6 | stages: [commit]
7 | language: system
8 | entry: poetry run isort
9 | types: [python]
10 |
11 | - id: black
12 | name: black
13 | stages: [commit]
14 | language: system
15 | entry: poetry run black
16 | types: [python]
17 |
18 | - id: mypy
19 | name: mypy
20 | stages: [commit]
21 | language: system
22 | entry: poetry run mypy
23 | types: [python]
24 | pass_filenames: false
25 |
26 |
27 | - id: flake8
28 | name: flake8
29 | stages: [commit]
30 | language: system
31 | entry: poetry run flake8
32 | types: [python]
33 | exclude: setup.py
34 |
35 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Tokern
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include data_lineage/assets/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tokern Lineage Engine
2 |
3 | [](https://circleci.com/gh/tokern/data-lineage)
4 | [](https://codecov.io/gh/tokern/data-lineage)
5 | [](https://pypi.python.org/pypi/data-lineage)
6 | [](https://pypi.org/project/data-lineage/)
7 | [](https://pypi.org/project/data-lineage/)
8 |
9 |
10 | Tokern Lineage Engine is a _fast_ and _easy-to-use_ application to collect, visualize and analyze
11 | column-level data lineage in databases, data warehouses and data lakes in AWS and RDS.
12 |
13 | Tokern Lineage helps you browse column-level data lineage
14 | * visually using [kedro-viz](https://github.com/quantumblacklabs/kedro-viz)
15 | * analyze lineage graphs programmatically using the powerful [networkx graph library](https://networkx.org/)
16 |
17 | ## Resources
18 |
19 | * Demo of Tokern Lineage App
20 |
21 | 
22 |
23 | * Check out an [example data lineage notebook](http://tokern.io/docs/data-lineage/example/).
24 |
25 | * Check out [the post on using data lineage for cost control](https://tokern.io/blog/data-lineage-on-redshift/) for an
26 | example of how data lineage can be used in production.
27 |
28 | ## Quick Start
29 |
30 | ### Install a demo using Docker and Docker Compose
31 |
32 | Download the docker-compose file from the GitHub repository.
33 |
34 |
35 | # in a new directory run
36 | wget https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/catalog-demo.yml
37 | # or run
38 | curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/tokern-lineage-engine.yml -o docker-compose.yml
39 |
40 |
41 | Run docker-compose
42 |
43 |
44 | docker-compose up -d
45 |
46 |
47 | Check that the containers are running.
48 |
49 |
50 | docker ps
51 | CONTAINER ID IMAGE CREATED STATUS PORTS NAMES
52 | 3f4e77845b81 tokern/data-lineage-viz:latest ... 4 hours ago Up 4 hours 0.0.0.0:8000->80/tcp tokern-data-lineage-visualizer
53 | 1e1ce4efd792 tokern/data-lineage:latest ... 5 days ago Up 5 days tokern-data-lineage
54 | 38be15bedd39 tokern/demodb:latest ... 2 weeks ago Up 2 weeks tokern-demodb
55 |
56 | Try out Tokern Lineage App
57 |
58 | Head to `http://localhost:8000/` to open the Tokern Lineage app
59 |
60 | ### Install Tokern Lineage Engine
61 |
62 | # in a new directory run
63 | wget https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/tokern-lineage-engine.yml
64 | # or run
65 | curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/tokern-lineage-engine.yml -o tokern-lineage-engine.yml
66 |
67 | Run docker-compose
68 |
69 |
70 | docker-compose up -d
71 |
72 |
73 | If you want to use an external Postgres database, change the following parameters in `tokern-lineage-engine.yml`:
74 |
75 | * CATALOG_HOST
76 | * CATALOG_USER
77 | * CATALOG_PASSWORD
78 | * CATALOG_DB
79 |
80 | You can also override default values using environment variables.
81 |
82 | CATALOG_HOST=... CATALOG_USER=... CATALOG_PASSWORD=... CATALOG_DB=... docker-compose -f ... up -d
83 |
84 | For more advanced usage of environment variables with docker-compose, [refer to docker-compose docs](https://docs.docker.com/compose/environment-variables/)
85 |
86 | **Pro-tip**
87 |
88 | If you want to connect to a database on the host machine, set
89 |
90 | CATALOG_HOST: host.docker.internal # For mac or windows
91 | #OR
92 | CATALOG_HOST: 172.17.0.1 # Linux
93 |
94 | ## Supported Technologies
95 |
96 | * Postgres
97 | * AWS Redshift
98 | * Snowflake
99 |
100 | ### Coming Soon
101 |
102 | * SparkSQL
103 | * Presto
104 |
105 | ## Documentation
106 |
107 | For advanced usage, please refer to [data-lineage documentation](https://tokern.io/docs/data-lineage/index.html)
108 | ## Survey
109 |
110 | Please take this [survey](https://forms.gle/p2oEQBJnpEguhrp3A) if you are a user or considering using data-lineage. Responses will help us prioritize features better.
111 |
--------------------------------------------------------------------------------
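The Quick Start above is driven from the command line; the same engine is also scriptable through the Python SDK defined in `data_lineage/__init__.py`. A minimal sketch, assuming the demo containers are running and reachable at `http://localhost:8000`, with placeholder database credentials:

```python
# A minimal sketch of the programmatic workflow behind the Quick Start above.
# Assumes the demo containers are running and the API is reachable at
# http://localhost:8000; the database coordinates below are placeholders.
from data_lineage import Catalog, Graph, Scan, load_graph

docker_address = "http://localhost:8000"

catalog = Catalog(docker_address)

# Register a data warehouse with the catalog (credentials are illustrative).
source = catalog.add_source(
    name="warehouse",
    source_type="postgresql",
    username="etl_user",
    password="etl_password",
    uri="warehouse-host",
    port="5432",
    database="warehouse",
)

# Queue a metadata scan; the worker picks it up asynchronously.
scan = Scan(docker_address)
print(scan.start(source))

# Once lineage has been collected, load the graph and analyze it with networkx.
lineage = load_graph(Graph(docker_address))
print(lineage.graph.number_of_nodes(), "nodes in the lineage graph")
```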
/api_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "public-income",
6 | "metadata": {},
7 | "source": [
8 | "# Overview\n",
9 | "\n",
10 | "This example showcases the API exposed by the data lineage package. The API can be used to build\n",
11 | "a lineage graph by adding nodes and edges that represent columns and transformations. \n",
12 | "\n",
13 | "Note that the goal of the example is to explain the building blocks of the lineage graph.\n",
14 | "In practical scenarios, use a pack (e.g. query parser pack) to automate the process.\n",
15 | "\n",
16 | "This example consists of the following sequence of operations:\n",
17 | "* Start docker containers containing a demo. Refer to [docs](https://tokern.io/docs/data-lineage/installation) for detailed instructions on installing demo-wikimedia.\n",
18 | "* Register nodes from columns in the catalog.\n",
19 | "* Register directed edges to represent that a column is the source of data for another column.\n",
20 | "* Visualize the graph by visiting [Tokern UI](http://localhost:8000/).\n",
21 | "* Analyze the graph"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "id": "6a9c9b70",
27 | "metadata": {
28 | "pycharm": {
29 | "name": "#%% md\n"
30 | }
31 | },
32 | "source": [
33 | "# Installation\n",
34 | "\n",
35 | "This demo requires the wikimedia demo to be running. Start the demo using the following instructions:\n",
36 | "\n",
37 | " # in a new directory run\n",
38 | " wget https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/wikimedia-demo.yml\n",
39 | " # or run\n",
40 | " curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/wikimedia-demo.yml -o docker-compose.yml\n",
41 | "\n",
42 | "\n",
43 | "Run docker-compose\n",
44 | "\n",
45 | "\n",
46 | " docker-compose up -d\n",
47 | "\n",
48 | "\n",
49 | "Verify containers are running\n",
50 | "\n",
51 | "\n",
52 | " docker container ls | grep tokern\n"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 1,
58 | "id": "37651618",
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "# Required configuration for API and wikimedia database network address\n",
63 | "\n",
64 | "docker_address = \"http://127.0.0.1:8000\"\n",
65 | "wikimedia_db = {\n",
66 | " \"username\": \"etldev\",\n",
67 | " \"password\": \"3tld3v\",\n",
68 | " \"uri\": \"tokern-demo-wikimedia\",\n",
69 | " \"port\": \"5432\",\n",
70 | " \"database\": \"wikimedia\"\n",
71 | "}"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 2,
77 | "id": "wrong-antigua",
78 | "metadata": {
79 | "scrolled": true
80 | },
81 | "outputs": [],
82 | "source": [
83 | "# Setup a connection to catalog using the SDK.\n",
84 | "from data_lineage import Catalog\n",
85 | "\n",
86 | "catalog = Catalog(docker_address)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 3,
92 | "id": "23ed8c16",
93 | "metadata": {
94 | "pycharm": {
95 | "name": "#%%\n"
96 | },
97 | "scrolled": true
98 | },
99 | "outputs": [],
100 | "source": [
101 | "# Register wikimedia datawarehouse with data-lineage app.\n",
102 | "\n",
103 | "source = catalog.add_source(name=\"wikimedia\", source_type=\"postgresql\", **wikimedia_db)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 4,
109 | "id": "ce6ebf16",
110 | "metadata": {
111 | "scrolled": false
112 | },
113 | "outputs": [
114 | {
115 | "data": {
116 | "text/plain": [
117 | "True"
118 | ]
119 | },
120 | "execution_count": 4,
121 | "metadata": {},
122 | "output_type": "execute_result"
123 | }
124 | ],
125 | "source": [
126 | "# Scan the wikimedia data warehouse and register all schemata, tables and columns.\n",
127 | "\n",
128 | "catalog.scan_source(source)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 5,
134 | "id": "202c6b63",
135 | "metadata": {
136 | "scrolled": false
137 | },
138 | "outputs": [
139 | {
140 | "name": "stdout",
141 | "output_type": "stream",
142 | "text": [
143 | "{'attributes': {'context': {'sql': 'insert into page_lookup_nonredirect(redirect_id) select page_id from page'}, 'name': 'insert_into_page_lookup_nonredirect'}, 'id': '1', 'links': {'self': 'http://tokern-api:4142/api/v1/catalog/jobs/1'}, 'type': 'jobs'}\n"
144 | ]
145 | }
146 | ],
147 | "source": [
148 | "# Create a job and job_execution that inserts data from page to page_lookup_nonredirect\n",
149 | "\n",
150 | "job = catalog.add_job(\"insert_into_page_lookup_nonredirect\",\n",
151 | " {\n",
152 | " \"sql\": \"insert into page_lookup_nonredirect(redirect_id) select page_id from page\"\n",
153 | " })"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 6,
159 | "id": "cf308d97",
160 | "metadata": {
161 | "scrolled": true
162 | },
163 | "outputs": [],
164 | "source": [
165 | "import datetime\n",
166 | "from dbcat.catalog.models import JobExecutionStatus\n",
167 | "\n",
168 | "job_execution = catalog.add_job_execution(\n",
169 | " job=job,\n",
170 | " started_at=datetime.datetime.combine(\n",
171 | " datetime.date(2021, 4, 1), datetime.time(1, 0)\n",
172 | " ),\n",
173 | " ended_at=datetime.datetime.combine(\n",
174 | " datetime.date(2021, 4, 1), datetime.time(1, 15)\n",
175 | " ),\n",
176 | " status=JobExecutionStatus.SUCCESS,\n",
177 | ")\n"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 8,
183 | "id": "b45aaac8",
184 | "metadata": {},
185 | "outputs": [],
186 | "source": [
187 | "# Add an edge between these two columns:\n",
188 | "# (\"test\", \"default\", \"page\", \"page_id\") -> (\"test\", \"default\", \"page_lookup_nonredirect\", \"redirect_id\"),\n",
189 | "\n",
190 | "source_column = catalog.get_column(source_name=\"wikimedia\", \n",
191 | " schema_name=\"public\", \n",
192 | " table_name=\"page\",\n",
193 | " column_name=\"page_id\")\n",
194 | "target_column = catalog.get_column(source_name=\"wikimedia\", \n",
195 | " schema_name=\"public\", \n",
196 | " table_name=\"page_lookup_nonredirect\",\n",
197 | " column_name=\"redirect_id\")\n",
198 | "\n",
199 | "edge = catalog.add_column_lineage(source=source_column,\n",
200 | " target=target_column,\n",
201 | " job_execution_id=job_execution.id,\n",
202 | " context={})"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "id": "254fb735",
208 | "metadata": {},
209 | "source": [
210 | "Visit [Kedro UI](http://localhost:8000/)\n",
211 | "\n",
212 | ""
213 | ]
214 | }
215 | ],
216 | "metadata": {
217 | "kernelspec": {
218 | "display_name": "Python 3",
219 | "language": "python",
220 | "name": "python3"
221 | },
222 | "language_info": {
223 | "codemirror_mode": {
224 | "name": "ipython",
225 | "version": 3
226 | },
227 | "file_extension": ".py",
228 | "mimetype": "text/x-python",
229 | "name": "python",
230 | "nbconvert_exporter": "python",
231 | "pygments_lexer": "ipython3",
232 | "version": "3.8.5"
233 | }
234 | },
235 | "nbformat": 4,
236 | "nbformat_minor": 5
237 | }
--------------------------------------------------------------------------------
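The notebook above registers nodes and edges by hand to illustrate the building blocks. The SDK also exposes an `Analyze` client (see `data_lineage/__init__.py`) that derives the same column-level edges from a SQL query. A minimal sketch, assuming the wikimedia demo from the notebook is running and was registered as a source named `wikimedia`:

```python
# A hedged sketch of the automated alternative to manual edge registration:
# the Analyze endpoint parses a SQL query and records the resulting
# column-level lineage. Assumes the wikimedia demo is running and the source
# was registered as "wikimedia".
import datetime

from data_lineage import Analyze, Catalog

docker_address = "http://127.0.0.1:8000"

catalog = Catalog(docker_address)
source = catalog.get_source("wikimedia")

analyze = Analyze(docker_address)
job_execution = analyze.analyze(
    query="insert into page_lookup_nonredirect(redirect_id) select page_id from page",
    source=source,
    start_time=datetime.datetime.combine(datetime.date(2021, 4, 1), datetime.time(1, 0)),
    end_time=datetime.datetime.combine(datetime.date(2021, 4, 1), datetime.time(1, 15)),
)
print(job_execution.id)
```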
/data_lineage/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | __version__ = "0.9.0"
3 |
4 | import datetime
5 | import json
6 | import logging
7 | from typing import Any, Dict, Generator, List, Optional, Type, TypeVar
8 |
9 | import requests
10 | from dbcat.catalog.models import JobExecutionStatus
11 | from furl import furl
12 | from requests import HTTPError
13 |
14 | from data_lineage.graph import LineageGraph
15 |
16 |
17 | class SourceNotFound(Exception):
18 | """Source not found in catalog"""
19 |
20 |
21 | class SchemaNotFound(Exception):
22 | """Schema not found in catalog"""
23 |
24 |
25 | class TableNotFound(Exception):
26 | """Table not found in catalog"""
27 |
28 |
29 | class ColumnNotFound(Exception):
30 | """Column not found in catalog"""
31 |
32 |
33 | class ParseError(Exception):
34 | """Parser Error"""
35 |
36 |
37 | class SemanticError(Exception):
38 | """Error due to mismatch in catalog data"""
39 |
40 |
41 | class NoResultFound(Exception):
42 | """Raised when function returns no results"""
43 |
44 |
45 | class MultipleResultsFound(Exception):
46 | """Raised when multiple results are found but expected only one or zero results"""
47 |
48 |
49 | class Graph:
50 | def __init__(self, url: str):
51 | self._base_url = furl(url) / "api/main"
52 | self._session = requests.Session()
53 |
54 | def get(self, job_ids: set = None) -> Dict[str, List[Dict[str, str]]]:
55 | if job_ids is not None:
56 | response = self._session.get(
57 | self._base_url, params={"job_ids": list(job_ids)}
58 | )
59 | else:
60 | response = self._session.get(self._base_url)
61 | return response.json()
62 |
63 |
64 | def load_graph(graphSDK: Graph, job_ids: set = None) -> LineageGraph:
65 | data = graphSDK.get(job_ids)
66 | return LineageGraph(nodes=data["nodes"], edges=data["edges"])
67 |
68 |
69 | class BaseModel:
70 | def __init__(self, session, attributes, obj_id, relationships):
71 | self._session = session
72 | self._attributes = attributes
73 | self._obj_id = obj_id
74 | self._relationships = relationships
75 |
76 | def __getattr__(self, item):
77 | logging.debug("Attributes: {}".format(self._attributes))
78 | if item == "id":
79 | return self._obj_id
80 | elif self._attributes and item in self._attributes.keys():
81 | return self._attributes[item]
82 | elif self._relationships and item in self._relationships.keys():
83 | return self._relationships[item]
84 | raise AttributeError
85 |
86 |
87 | class Source(BaseModel):
88 | def __init__(self, session, attributes, obj_id, relationships):
89 | super().__init__(session, attributes, obj_id, relationships)
90 |
91 |
92 | class Schema(BaseModel):
93 | def __init__(self, session, attributes, obj_id, relationships):
94 | super().__init__(session, attributes, obj_id, relationships)
95 |
96 |
97 | class Table(BaseModel):
98 | def __init__(self, session, attributes, obj_id, relationships):
99 | super().__init__(session, attributes, obj_id, relationships)
100 |
101 |
102 | class Column(BaseModel):
103 | def __init__(self, session, attributes, obj_id, relationships):
104 | super().__init__(session, attributes, obj_id, relationships)
105 |
106 |
107 | class Job(BaseModel):
108 | def __init__(self, session, attributes, obj_id, relationships):
109 | super().__init__(session, attributes, obj_id, relationships)
110 |
111 |
112 | class JobExecution(BaseModel):
113 | def __init__(self, session, attributes, obj_id, relationships):
114 | super().__init__(session, attributes, obj_id, relationships)
115 |
116 |
117 | class ColumnLineage(BaseModel):
118 | def __init__(self, session, attributes, obj_id, relationships):
119 | super().__init__(session, attributes, obj_id, relationships)
120 |
121 |
122 | class DefaultSchema(BaseModel):
123 | def __init__(self, session, attributes, obj_id, relationships):
124 | super().__init__(session, attributes, obj_id, relationships)
125 |
126 |
127 | ModelType = TypeVar("ModelType", bound=BaseModel)
128 |
129 |
130 | class Catalog:
131 | def __init__(self, url: str):
132 | self._base_url = furl(url) / "api/v1/catalog"
133 | self._session = requests.Session()
134 | self._session.headers.update({"Accept": "application/vnd.api+json"})
135 | self._session.headers.update({"Content-Type": "application/vnd.api+json"})
136 |
137 | def _build_url(self, *urls) -> str:
138 | built_url = self._base_url
139 | for url in urls:
140 | built_url = furl(built_url) / url
141 | logging.debug(built_url)
142 | return built_url
143 |
144 | str_to_type = {
145 | "sources": Source,
146 | "schemata": Schema,
147 | }
148 |
149 | def _resolve_relationships(self, relationships) -> Dict[str, BaseModel]:
150 | resolved: Dict[str, BaseModel] = {}
151 | for key, value in relationships.items():
152 | logging.debug("Resolving {}:{}".format(key, value))
153 | if value["data"]:
154 | resolved[key] = self._obj_factory(
155 | value["data"],
156 | Catalog.str_to_type[value["data"]["type"]],
157 | resolve_relationships=False,
158 | )
159 |
160 | return resolved
161 |
162 | def _obj_factory(
163 | self,
164 | payload: Dict[str, Any],
165 | clazz: Type[ModelType],
166 | resolve_relationships=False,
167 | ) -> ModelType:
168 | resolved = None
169 | if resolve_relationships and payload.get("relationships"):
170 | resolved = self._resolve_relationships(payload.get("relationships"))
171 |
172 | return clazz(
173 | session=self._session,
174 | attributes=payload.get("attributes"),
175 | obj_id=payload.get("id"),
176 | relationships=resolved,
177 | )
178 |
179 | def _iterate(self, payload: Dict[str, Any], clazz: Type[BaseModel]):
180 | res: Optional[Dict[str, Any]] = payload
181 | while res is not None:
182 | for item in res["data"]:
183 | yield self._obj_factory(payload=item, clazz=clazz)
184 |
185 | if res["links"]["next"] is not None:
186 | response = self._session.get(res["links"]["next"])
187 | res = response.json()
188 | else:
189 | res = None
190 |
191 | def _index(self, path: str, clazz: Type[BaseModel]):
192 | response = self._session.get(self._build_url(path))
193 | logging.debug(response.json())
194 | return self._iterate(response.json(), clazz)
195 |
196 | def _get(
197 | self,
198 | path: str,
199 | obj_id: int,
200 | clazz: Type[ModelType],
201 | resolve_relationships=False,
202 | ) -> ModelType:
203 | response = self._session.get(self._build_url(path, str(obj_id)))
204 | json_response = response.json()
205 | logging.debug(json_response)
206 | response.raise_for_status()
207 | return self._obj_factory(
208 | json_response["data"], clazz, resolve_relationships=resolve_relationships
209 | )
210 |
211 | @staticmethod
212 | def _one(response):
213 | json_response = response.json()
214 | logging.debug(json_response)
215 | num_results = json_response["meta"]["total"]
216 | if num_results == 0:
217 | raise NoResultFound
218 | elif num_results > 1:
219 | raise MultipleResultsFound
220 |
221 | return json_response["data"][0]
222 |
223 | def _search_one(self, path: str, filters):
224 | params = {"filter[objects]": json.dumps(filters)}
225 | response = self._session.get(self._build_url(path), params=params)
226 | response.raise_for_status()
227 | return Catalog._one(response)
228 |
229 | def _search(self, path: str, search_string: str, clazz: Type[BaseModel]):
230 | filters = [dict(name="name", op="like", val="%{}%".format(search_string))]
231 | params = {"filter[objects]": json.dumps(filters)}
232 | response = self._session.get(self._build_url(path), params=params)
233 | return self._iterate(response.json(), clazz)
234 |
235 | def _post(self, path: str, data: Dict[str, Any], type: str) -> Dict[Any, Any]:
236 | payload = {"data": {"type": type, "attributes": data}}
237 | response = self._session.post(
238 | url=self._build_url(path), data=json.dumps(payload, default=str)
239 | )
240 | response.raise_for_status()
241 | logging.debug(response.text)
242 | json_response = response.json()
243 | return json_response["data"]
244 |
245 | def _patch(self, path: str, obj_id: int, data: Dict[str, Any], type: str):
246 | payload = {"data": {"type": type, "attributes": data, "id": obj_id}}
247 | response = self._session.patch(
248 | url=self._build_url(path, str(obj_id)),
249 | data=json.dumps(payload, default=str),
250 | )
251 | response.raise_for_status()
252 | return
253 |
254 | def get_sources(self) -> Generator[Any, Any, None]:
255 | return self._index("sources", Source)
256 |
257 | def get_schemata(self):
258 | return self._index("schemata", Schema)
259 |
260 | def get_tables(self):
261 | return self._index("tables", Table)
262 |
263 | def get_columns(self):
264 | return self._index("columns", Column)
265 |
266 | def get_jobs(self):
267 | return self._index("jobs", Job)
268 |
269 | def get_job_executions(self):
270 | return self._index("job_executions", JobExecution)
271 |
272 | def get_column_lineages(self):
273 | return self._index("column_lineages", ColumnLineage)
274 |
275 | def get_source_by_id(self, obj_id) -> Source:
276 | return self._get("sources", obj_id, Source)
277 |
278 | def get_schema_by_id(self, obj_id) -> Schema:
279 | return self._get("schemata", obj_id, Schema)
280 |
281 | def get_table_by_id(self, obj_id) -> Table:
282 | return self._get("tables", obj_id, Table)
283 |
284 | def get_column_by_id(self, obj_id) -> Column:
285 | return self._get("columns", obj_id, Column)
286 |
287 | def get_job_by_id(self, obj_id) -> Job:
288 | return self._get("jobs", obj_id, Job)
289 |
290 | def get_job_execution_by_id(self, obj_id) -> JobExecution:
291 | return self._get("job_executions", obj_id, JobExecution)
292 |
293 | def get_column_lineage(self, job_ids: List[int]) -> List[ColumnLineage]:
294 | params = {"job_ids": job_ids}
295 | response = self._session.get(self._build_url("column_lineage"), params=params)
296 | logging.debug(response.json())
297 | response.raise_for_status()
298 | return [
299 | ColumnLineage(
300 | session=self._session,
301 | attributes=item["attributes"],
302 | obj_id=item["id"],
303 | relationships=item["relationships"],
304 | )
305 | for item in response.json()["data"]
306 | ]
307 |
308 | def get_source(self, name) -> Source:
309 | filters = [dict(name="name", op="eq", val="{}".format(name))]
310 | try:
311 | payload = self._search_one("sources", filters)
312 | except NoResultFound:
313 | raise SourceNotFound("Source not found: source_name={}".format(name))
314 |
315 | return self._obj_factory(payload, Source)
316 |
317 | def get_schema(self, source_name: str, schema_name: str) -> Schema:
318 | name_filter = dict(name="name", op="eq", val=schema_name)
319 | source_filter = dict(
320 | name="source", op="has", val=dict(name="name", op="eq", val=source_name)
321 | )
322 | filters = {"and": [name_filter, source_filter]}
323 | logging.debug(filters)
324 | try:
325 | payload = self._search_one("schemata", [filters])
326 | except NoResultFound:
327 | raise SchemaNotFound(
328 | "Schema not found, (source_name={}, schema_name={})".format(
329 | source_name, schema_name
330 | )
331 | )
332 | return self._obj_factory(payload, Schema)
333 |
334 | def get_table(self, source_name: str, schema_name: str, table_name: str) -> Table:
335 | schema = self.get_schema(source_name, schema_name)
336 |
337 | name_filter = dict(name="name", op="eq", val=table_name)
338 | schema_id_filter = dict(name="schema_id", op="eq", val=str(schema.id))
339 | filters = {"and": [name_filter, schema_id_filter]}
340 | logging.debug(filters)
341 | try:
342 | payload = self._search_one("tables", [filters])
343 | except NoResultFound:
344 | raise TableNotFound(
345 | "Table not found, (source_name={}, schema_name={}, table_name={})".format(
346 | source_name, schema_name, table_name
347 | )
348 | )
349 | return self._obj_factory(payload, Table)
350 |
351 | def get_columns_for_table(self, table: Table):
352 | return self._index("tables/{}/columns".format(table.id), Column)
353 |
354 | def get_column(self, source_name, schema_name, table_name, column_name) -> Column:
355 | table = self.get_table(source_name, schema_name, table_name)
356 | name_filter = dict(name="name", op="eq", val=column_name)
357 | table_filter = dict(name="table_id", op="eq", val=str(table.id))
358 | filters = {"and": [name_filter, table_filter]}
359 | logging.debug(filters)
360 | try:
361 | payload = self._search_one("columns", [filters])
362 | except NoResultFound:
363 | raise ColumnNotFound(
364 | "Column not found, (source_name={}, schema_name={}, table_name={}, column_name={})".format(
365 | source_name, schema_name, table_name, column_name
366 | )
367 | )
368 | return self._obj_factory(payload, Column)
369 |
370 | def add_source(self, name: str, source_type: str, **kwargs) -> Source:
371 | data = {"name": name, "source_type": source_type, **kwargs}
372 | payload = self._post(path="sources", data=data, type="sources")
373 | return self._obj_factory(payload, Source)
374 |
375 | def add_schema(self, name: str, source: Source) -> Schema:
376 | data = {"name": name, "source_id": source.id}
377 | payload = self._post(path="schemata", data=data, type="schemata")
378 | return self._obj_factory(payload, Schema)
379 |
380 | def add_table(self, name: str, schema: Schema) -> Table:
381 | data = {"name": name, "schema_id": schema.id}
382 | payload = self._post(path="tables", data=data, type="tables")
383 | return self._obj_factory(payload, Table)
384 |
385 | def add_column(
386 | self, name: str, data_type: str, sort_order: int, table: Table
387 | ) -> Column:
388 | data = {
389 | "name": name,
390 | "table_id": table.id,
391 | "data_type": data_type,
392 | "sort_order": sort_order,
393 | }
394 | payload = self._post(path="columns", data=data, type="columns")
395 | return self._obj_factory(payload, Column)
396 |
397 | def add_job(self, name: str, context: Dict[Any, Any]) -> Job:
398 | data = {"name": name, "context": context}
399 | payload = self._post(path="jobs", data=data, type="jobs")
400 | return self._obj_factory(payload, Job)
401 |
402 | def add_job_execution(
403 | self,
404 | job: Job,
405 | started_at: datetime.datetime,
406 | ended_at: datetime.datetime,
407 | status: JobExecutionStatus,
408 | ) -> JobExecution:
409 | data = {
410 | "job_id": job.id,
411 | "started_at": started_at,
412 | "ended_at": ended_at,
413 | "status": status.name,
414 | }
415 | payload = self._post(path="job_executions", data=data, type="job_executions")
416 | return self._obj_factory(payload, JobExecution)
417 |
418 | def add_column_lineage(
419 | self,
420 | source: Column,
421 | target: Column,
422 | job_execution_id: int,
423 | context: Dict[Any, Any],
424 | ) -> ColumnLineage:
425 | data = {
426 | "source_id": source.id,
427 | "target_id": target.id,
428 | "job_execution_id": job_execution_id,
429 | "context": context,
430 | }
431 | payload = self._post(path="column_lineage", data=data, type="column_lineage")
432 | return self._obj_factory(payload, ColumnLineage)
433 |
434 | def update_source(self, source: Source, schema: Schema) -> DefaultSchema:
435 | try:
436 | current_obj = self._get(
437 | path="default_schema",
438 | obj_id=source.id,
439 | clazz=DefaultSchema,
440 | resolve_relationships=True,
441 | )
442 | if current_obj.schema.id == schema.id:
443 | return current_obj
444 | except HTTPError as error:
445 | if error.response.status_code == 404:
446 | data = {"source_id": source.id, "schema_id": schema.id}
447 | payload = self._post(
448 | path="default_schema", data=data, type="default_schema"
449 | )
450 | return self._obj_factory(
451 | payload, DefaultSchema, resolve_relationships=True
452 | )
453 |
454 | # Patch
455 | data = {"schema_id": schema.id}
456 | self._patch(
457 | path="default_schema", data=data, type="default_schema", obj_id=source.id
458 | )
459 | return self._get(
460 | path="default_schema",
461 | obj_id=source.id,
462 | clazz=DefaultSchema,
463 | resolve_relationships=True,
464 | )
465 |
466 |
467 | class Analyze:
468 | def __init__(self, url: str):
469 | self._base_url = furl(url) / "api/v1/analyze"
470 | self._session = requests.Session()
471 |
472 | def analyze(
473 | self,
474 | query: str,
475 | source: Source,
476 | start_time: datetime.datetime,
477 | end_time: datetime.datetime,
478 | name: str = None,
479 | ) -> JobExecution:
480 | payload = {
481 | "query": query,
482 | "name": name,
483 | "source_id": source.id,
484 | "start_time": start_time.isoformat(),
485 | "end_time": end_time.isoformat(),
486 | }
487 |
488 | response = self._session.post(self._base_url, json=payload,)
489 | if response.status_code == 441:
490 | raise TableNotFound(response.json()["message"])
491 | elif response.status_code == 442:
492 | raise ColumnNotFound(response.json()["message"])
493 | elif response.status_code == 422:
494 | raise ParseError(response.json()["message"])
495 | elif response.status_code == 443:
496 | raise SemanticError(response.json()["message"])
497 |
498 | logging.debug(response.text)
499 | response.raise_for_status()
500 | payload = response.json()["data"]
501 | return JobExecution(
502 | session=self._session,
503 | attributes=payload.get("attributes"),
504 | obj_id=payload.get("id"),
505 | relationships=None,
506 | )
507 |
508 |
509 | class Parse:
510 | def __init__(self, url: str):
511 | self._base_url = furl(url) / "api/v1/parse"
512 | self._session = requests.Session()
513 |
514 | def parse(self, query: str, source: Source):
515 | response = self._session.post(
516 | self._base_url, json={"query": query, "source_id": source.id},
517 | )
518 | logging.debug(response.text)
519 | response.raise_for_status()
520 | return response.json()
521 |
522 |
523 | class Scan:
524 | def __init__(self, url: str):
525 | self._base_url = furl(url) / "api/v1/scan"
526 | self._session = requests.Session()
527 |
528 | def start(self, source: Source) -> Dict[str, str]:
529 | payload = {"id": source.id}
530 | response = self._session.post(url=self._base_url, json=payload)
531 | response.raise_for_status()
532 | return response.json()
533 |
534 | def list(self) -> List[Dict[str, str]]:
535 | response = self._session.post(url=self._base_url)
536 | response.raise_for_status()
537 | return response.json()
538 |
539 | def get(self, job_id: str) -> Dict[str, str]:
540 | response = self._session.get(url=furl(self._base_url) / job_id)
541 | response.raise_for_status()
542 | return response.json()
543 |
544 | def cancel(self, job_id: str) -> Dict[str, str]:
545 | response = self._session.put(url=furl(self._base_url) / job_id)
546 | response.raise_for_status()
547 | return response.json()
548 |
--------------------------------------------------------------------------------
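A short usage sketch of the `Catalog` client above, assuming a running lineage server and an already scanned source named `wikimedia`; lookups that miss raise the `*NotFound` exceptions defined at the top of the module:

```python
# Illustrative only: navigate the catalog for a previously scanned source.
from data_lineage import Catalog, SourceNotFound, TableNotFound

catalog = Catalog("http://localhost:8000")

try:
    source = catalog.get_source("wikimedia")
    table = catalog.get_table(
        source_name="wikimedia", schema_name="public", table_name="page"
    )
    # get_columns_for_table() pages through the JSON:API results lazily.
    for column in catalog.get_columns_for_table(table):
        print(column.name, column.data_type)
except (SourceNotFound, TableNotFound) as exc:
    print("Catalog lookup failed:", exc)
```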
/data_lineage/__main__.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import click
4 | from redis import Redis
5 |
6 | from data_lineage import __version__
7 | from data_lineage.server import create_server
8 |
9 |
10 | @click.command()
11 | @click.version_option(__version__)
12 | @click.option(
13 | "-l", "--log-level", envvar="LOG_LEVEL", help="Logging Level", default="INFO"
14 | )
15 | @click.option(
16 | "--catalog-user", help="Database user name", envvar="CATALOG_USER", required=True
17 | )
18 | @click.option(
19 | "--catalog-password",
20 | help="Database Password",
21 | envvar="CATALOG_PASSWORD",
22 | required=True,
23 | )
24 | @click.option(
25 | "--catalog-host", help="Database Host", envvar="CATALOG_HOST", default="localhost"
26 | )
27 | @click.option(
28 | "--catalog-port", help="Database Port", envvar="CATALOG_PORT", default=5432
29 | )
30 | @click.option(
31 | "--catalog-db", help="Postgres Database", envvar="CATALOG_DB", default="tokern"
32 | )
33 | @click.option(
34 | "--redis-host",
35 | help="Redis host for queueing scans",
36 | envvar="REDIS_HOST",
37 | default="localhost",
38 | )
39 | @click.option(
40 | "--redis-port",
41 | help="Redis port for queueing scans",
42 | envvar="REDIS_PORT",
43 | default="6379",
44 | )
45 | @click.option(
46 | "--is-production/--not-production",
47 | help="Run server in production mode",
48 | default=True,
49 | )
50 | def main(
51 | log_level,
52 | catalog_user,
53 | catalog_password,
54 | catalog_host,
55 | catalog_port,
56 | catalog_db,
57 | redis_host,
58 | redis_port,
59 | is_production,
60 | ):
61 | logging.basicConfig(level=getattr(logging, log_level.upper()))
62 | catalog = {
63 | "user": catalog_user,
64 | "password": catalog_password,
65 | "host": catalog_host,
66 | "port": catalog_port,
67 | "database": catalog_db,
68 | }
69 | connection = Redis(redis_host, redis_port)
70 | app, catalog = create_server(
71 | catalog, connection=connection, is_production=is_production
72 | )
73 | if is_production:
74 | app.run()
75 | else:
76 | app.run(debug=True)
77 |
78 |
79 | if __name__ == "__main__":
80 | main()
81 |
--------------------------------------------------------------------------------
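A hedged sketch of what the CLI above wires together: build the catalog connection settings, hand them to `create_server()` along with a Redis connection, and run the app object it returns (the server module itself is defined in `data_lineage/server.py`):

```python
# Mirrors main() in data_lineage/__main__.py; connection values are placeholders.
from redis import Redis

from data_lineage.server import create_server

catalog = {
    "user": "catalog_user",
    "password": "catalog_password",
    "host": "localhost",
    "port": 5432,
    "database": "tokern",
}

app, catalog_obj = create_server(
    catalog, connection=Redis("localhost", 6379), is_production=False
)
app.run(debug=True)
```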
/data_lineage/assets/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokern/data-lineage/5945542742979fe350d313d906440c93ee3d0f36/data_lineage/assets/favicon.ico
--------------------------------------------------------------------------------
/data_lineage/graph.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import Dict, List
3 |
4 | import networkx as nx
5 |
6 |
7 | class LineageGraph:
8 | def __init__(
9 | self,
10 | nodes: List[Dict[str, str]],
11 | edges: List[Dict[str, str]],
12 | name: str = "Lineage",
13 | ):
14 | self.name = name
15 | self._graph = nx.DiGraph()
16 | for node in nodes:
17 | node_id = node["id"]
18 | node_attributes = {"name": node["name"], "type": node["type"]}
19 | logging.debug("Add Node: {}, {}".format(node_id, node_attributes))
20 | self._graph.add_node(node_id, **node_attributes)
21 |
22 | for edge in edges:
23 | logging.debug("Edge: <{}>, <{}>".format(edge["source"], edge["target"]))
24 | self._graph.add_edge(edge["source"], edge["target"])
25 |
26 | @property
27 | def graph(self):
28 | return self._graph
29 |
30 | @graph.setter
31 | def graph(self, new_graph):
32 | self._graph = new_graph
33 |
--------------------------------------------------------------------------------
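A minimal sketch of `LineageGraph` in isolation; the node and edge dictionaries are hand-written here for illustration (in practice they come from the server via `load_graph`), and the `column:1` identifiers are made up:

```python
# Build a LineageGraph from hand-written node and edge dictionaries and
# walk it with networkx.
import networkx as nx

from data_lineage.graph import LineageGraph

nodes = [
    {"id": "column:1", "name": "page.page_id", "type": "column"},
    {"id": "column:2", "name": "page_lookup_nonredirect.redirect_id", "type": "column"},
]
edges = [{"source": "column:1", "target": "column:2"}]

lineage = LineageGraph(nodes=nodes, edges=edges)

# Downstream columns of page.page_id
print(nx.descendants(lineage.graph, "column:1"))
```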
/data_lineage/parser/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import List
3 |
4 | from dbcat.catalog import Catalog
5 | from dbcat.catalog.models import CatSource, JobExecution, JobExecutionStatus
6 | from pglast import Node, parse_sql
7 | from pglast.parser import ParseError
8 |
9 | from data_lineage import SemanticError
10 | from data_lineage.parser.binder import SelectBinder
11 | from data_lineage.parser.dml_visitor import (
12 | CTASVisitor,
13 | DmlVisitor,
14 | SelectIntoVisitor,
15 | SelectSourceVisitor,
16 | )
17 | from data_lineage.parser.visitor import ExprVisitor, RedshiftExprVisitor
18 |
19 |
20 | class Parsed:
21 | def __init__(self, name: str, query: str, node: Node):
22 | self._name = name
23 | self._node = node
24 | self._query = query
25 |
26 | @property
27 | def name(self):
28 | return self._name
29 |
30 | @property
31 | def node(self):
32 | return self._node
33 |
34 | @property
35 | def query(self):
36 | return self._query
37 |
38 |
39 | def parse_queries(queries: List[str]) -> List[Parsed]:
40 | parsed: List[Parsed] = []
41 |
42 | for query in queries:
43 | try:
44 | parsed.append(parse(query))
45 | except ParseError as e:
46 | logging.warning("Syntax error while parsing {}.\n{}".format(query, e))
47 |
48 | return parsed
49 |
50 |
51 | def analyze_dml_query(
52 | catalog: Catalog, parsed: Parsed, source: CatSource,
53 | ) -> DmlVisitor:
54 | chosen_visitor = visit_dml_query(parsed, source)
55 | chosen_visitor.bind(catalog=catalog, source=source)
56 | return chosen_visitor
57 |
58 |
59 | def parse_dml_query(
60 | catalog: Catalog, parsed: Parsed, source: CatSource,
61 | ) -> SelectBinder:
62 | chosen_visitor = visit_dml_query(parsed, source)
63 |
64 | select_binder = SelectBinder(
65 | catalog=catalog,
66 | source=source,
67 | tables=chosen_visitor.select_tables,
68 | columns=chosen_visitor.select_columns,
69 | expr_visitor_clazz=chosen_visitor.expr_visitor_clazz,
70 | alias_generator=("_U{}".format(i) for i in range(0, 1000)),
71 | )
72 | select_binder.bind()
73 | return select_binder
74 |
75 |
76 | def visit_dml_query(parsed: Parsed, source: CatSource,) -> DmlVisitor:
77 |
78 | expr_visitor_clazz = ExprVisitor
79 | if source.source_type == "redshift":
80 | expr_visitor_clazz = RedshiftExprVisitor
81 |
82 | select_source_visitor: DmlVisitor = SelectSourceVisitor(
83 | parsed.name, expr_visitor_clazz
84 | )
85 | select_into_visitor: DmlVisitor = SelectIntoVisitor(parsed.name, expr_visitor_clazz)
86 | ctas_visitor: DmlVisitor = CTASVisitor(parsed.name, expr_visitor_clazz)
87 |
88 | for v in [select_source_visitor, select_into_visitor, ctas_visitor]:
89 | v(parsed.node)
90 | if len(v.select_tables) > 0 and v.insert_table is not None:
91 | return v
92 | raise SemanticError("Query is not a DML Query")
93 |
94 |
95 | def extract_lineage(
96 | catalog: Catalog,
97 | visited_query: DmlVisitor,
98 | source: CatSource,
99 | parsed: Parsed,
100 | start_time,
101 | end_time,
102 | ) -> JobExecution:
103 | job = catalog.add_job(
104 | name=parsed.name, source=source, context={"query": parsed.query}
105 | )
106 | job_execution = catalog.add_job_execution(
107 | job=job,
108 | started_at=start_time,
109 | ended_at=end_time,
110 | status=JobExecutionStatus.SUCCESS,
111 | )
112 | for source, target in zip(
113 | visited_query.source_columns, visited_query.target_columns
114 | ):
115 | for column in source.columns:
116 | edge = catalog.add_column_lineage(column, target, job_execution.id, {})
117 | logging.debug("Added {}".format(edge))
118 |
119 | return job_execution
120 |
121 |
122 | def parse(sql: str, name: str = None) -> Parsed:
123 | if name is None:
124 | name = str(hash(sql))
125 | node = parse_sql(sql)
126 |
127 | return Parsed(name, sql, node)
128 |
--------------------------------------------------------------------------------
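A small usage sketch of the parsing helpers above; the INSERT statement is taken from the demo notebook, the other queries are made up, and `parse_queries` simply drops statements that fail to parse:

```python
# parse() wraps pglast's parse_sql() and returns a Parsed object;
# parse_queries() logs and skips statements with syntax errors.
from data_lineage.parser import parse, parse_queries

parsed = parse(
    "insert into page_lookup_nonredirect(redirect_id) select page_id from page",
    name="insert_into_page_lookup_nonredirect",
)
print(parsed.name)

# Only the syntactically valid statement is returned.
batch = parse_queries(["select 1", "select from where"])
print(len(batch))
```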
/data_lineage/parser/binder.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | from abc import ABC, abstractmethod
4 | from json import JSONEncoder
5 | from typing import List, Mapping, Set, Type
6 |
7 | from dbcat.catalog import Catalog, CatColumn, CatSource, CatTable
8 | from pglast import Node
9 | from pglast.ast import RangeSubselect, RangeVar
10 |
11 | from data_lineage import ColumnNotFound, SemanticError, TableNotFound
12 | from data_lineage.parser.visitor import (
13 | ColumnRefVisitor,
14 | ExprVisitor,
15 | RangeSubselectVisitor,
16 | RangeVarVisitor,
17 | )
18 |
19 |
20 | class ColumnContext:
21 | def __init__(self, alias: str, columns: Set[CatColumn]):
22 | self._alias = alias.lower()
23 | self._columns = columns
24 |
25 | @property
26 | def alias(self):
27 | return self._alias
28 |
29 | @property
30 | def columns(self) -> Set[CatColumn]:
31 | return self._columns
32 |
33 |
34 | class AliasContext:
35 | def __init__(self, catalog: Catalog, alias: str, tables: Set[CatTable]):
36 | self._catalog = catalog
37 | self._alias = alias.lower()
38 | self._tables = tables
39 |
40 | @property
41 | def alias(self):
42 | return self._alias
43 |
44 | @property
45 | def tables(self):
46 | return self._tables
47 |
48 | def get_columns(self, column_names: List[str] = None) -> List[ColumnContext]:
49 | columns: List[CatColumn] = []
50 | for table in self._tables:
51 | logging.debug("Searching in {}".format(table.fqdn))
52 | columns = columns + self._catalog.get_columns_for_table(table, column_names)
53 |
54 | return [
55 | ColumnContext(alias=column.name, columns={column}) for column in columns
56 | ]
57 |
58 |
59 | class WithContext(AliasContext):
60 | def __init__(
61 | self,
62 | catalog: Catalog,
63 | alias: str,
64 | tables: Set[CatTable],
65 | columns: List[ColumnContext],
66 | ):
67 | super(WithContext, self).__init__(catalog, alias, tables)
68 | self._columns = columns
69 |
70 | def get_columns(self, column_names: List[str] = None) -> List[ColumnContext]:
71 | if column_names is not None:
72 | filtered = []
73 | for column in self._columns:
74 | logging.debug(
75 | "Comparing with alias: {} - contains columns: {}".format(
76 | column.alias,
77 | json.dumps(list(column.columns), cls=CatColumnEncoder),
78 | )
79 | )
80 | if column.alias in column_names:
81 | filtered.append(column)
82 |
83 | return filtered
84 | else:
85 | return self._columns
86 |
87 |
88 | class CatTableEncoder(JSONEncoder):
89 | def default(self, obj):
90 | if isinstance(obj, CatTable):
91 | return {
92 | "name": obj.name,
93 | "schema": obj.schema.name,
94 | "source": obj.schema.source.name,
95 | }
96 |
97 | # Let the base class default method raise the TypeError
98 | return json.JSONEncoder.default(self, obj)
99 |
100 |
101 | class CatColumnEncoder(JSONEncoder):
102 | def default(self, obj):
103 | if isinstance(obj, CatColumn):
104 | return {
105 | "name": obj.name,
106 | "table": obj.table.name,
107 | "schema": obj.table.schema.name,
108 | "source": obj.table.schema.source.name,
109 | }
110 |
111 | # Let the base class default method raise the TypeError
112 | return json.JSONEncoder.default(self, obj)
113 |
114 |
115 | class Binder(ABC):
116 | @property
117 | @abstractmethod
118 | def _visited_tables(self) -> List[Node]:
119 | pass
120 |
121 | @property
122 | @abstractmethod
123 | def _visited_columns(self) -> List[ExprVisitor]:
124 | pass
125 |
126 | @property
127 | def tables(self) -> Set[CatTable]:
128 | return self._tables
129 |
130 | @property
131 | def columns(self) -> List[ColumnContext]:
132 | return self._columns
133 |
134 | def __init__(
135 | self,
136 | catalog: Catalog,
137 | source: CatSource,
138 | alias_generator,
139 | expr_visitor_clazz: Type[ExprVisitor],
140 | alias_map: Mapping[str, AliasContext] = None,
141 | ):
142 | self._catalog = catalog
143 | self._source = source
144 | self._tables: Set[CatTable] = set()
145 | self._columns: List[ColumnContext] = []
146 | self._alias_map: Mapping[str, AliasContext] = alias_map or {}
147 | self._alias_generator = alias_generator
148 | self._expr_visitor_clazz = expr_visitor_clazz
149 |
150 | def bind(self):
151 | bound_tables = self._bind_tables()
152 |
153 | self._tables = set(bound_tables)
154 | self._columns = self._bind_columns()
155 |
156 | def _bind_tables(self):
157 | bound_tables = []
158 | for table in self._visited_tables:
159 | if isinstance(table, RangeVar):
160 | visitor = RangeVarVisitor()
161 | visitor(table)
162 |
163 | logging.debug("Searching for: {}".format(visitor.search_string))
164 |
165 | if not visitor.is_qualified and visitor.name in self._alias_map:
166 | bound_tables = bound_tables + list(
167 | self._alias_map[visitor.name].tables
168 | )
169 | logging.debug("Added tables for alias {}".format(visitor.name))
170 | else:
171 | try:
172 | candidate_table = self._catalog.search_table(
173 | source_like=self._source.name, **visitor.search_string
174 | )
175 | except RuntimeError as err:
176 | logging.debug(str(err))
177 | raise TableNotFound(
178 | '"{schema_like}"."{table_like}" is not found'.format(
179 | **visitor.search_string
180 | )
181 | )
182 | logging.debug("Bound source table: {}".format(candidate_table))
183 |
184 | self._alias_map[visitor.alias] = AliasContext(
185 | catalog=self._catalog,
186 | alias=visitor.alias,
187 | tables={candidate_table},
188 | )
189 | bound_tables.append(candidate_table)
190 | elif isinstance(table, RangeSubselect):
191 | visitor = RangeSubselectVisitor(self._expr_visitor_clazz)
192 | visitor(table)
193 | binder = SelectBinder(
194 | self._catalog,
195 | self._source,
196 | visitor.sources,
197 | visitor.columns,
198 | self._alias_generator,
199 | self._expr_visitor_clazz,
200 | )
201 | binder.bind()
202 | self._alias_map[visitor.alias] = WithContext(
203 | catalog=self._catalog,
204 | alias=visitor.alias,
205 | tables=binder.tables,
206 | columns=binder.columns,
207 | )
208 | bound_tables = bound_tables + list(binder.tables)
209 | else:
210 | raise SemanticError("Unknown parser state. Please contact Support")
211 | return bound_tables
212 |
213 | def _bind_columns(self) -> List[ColumnContext]:
214 | bound_cols: List[ColumnContext] = []
215 | for expr_visitor in self._visited_columns:
216 | target_cols: Set[ColumnContext] = set()
217 | is_a_star = False
218 | for column in expr_visitor.columns:
219 | column_ref_visitor = ColumnRefVisitor()
220 | column_ref_visitor(column)
221 | is_a_star = column_ref_visitor.is_a_star
222 | alias_list = list(self._alias_map.values())
223 | if column_ref_visitor.is_qualified:
224 | if column_ref_visitor.table_name not in self._alias_map:
225 | raise TableNotFound(
226 | "Table {} not found for column ({}).".format(
227 | column_ref_visitor.name[0], column_ref_visitor.name
228 | )
229 | )
230 | assert column_ref_visitor.table_name is not None
231 | alias_list = [self._alias_map[column_ref_visitor.table_name]]
232 | target_cols.update(
233 | Binder._search_column_in_tables(column_ref_visitor, alias_list)
234 | )
235 |
236 | if is_a_star:
237 | for col in target_cols:
238 | bound_cols.append(
239 | ColumnContext(alias=col.alias, columns=col.columns)
240 | )
241 | else:
242 | if expr_visitor.alias is not None:
243 | alias = expr_visitor.alias
244 | elif len(target_cols) == 1:
245 | alias = list(target_cols)[0].alias
246 | else:
247 | alias = next(self._alias_generator)
248 | cols: Set[CatColumn] = set()
249 | for tgt in target_cols:
250 | for c in tgt.columns:
251 | cols.add(c)
252 | bound_cols.append(ColumnContext(alias=alias, columns=cols))
253 |
254 | if len(bound_cols) == 0:
255 | raise ColumnNotFound("No source columns found.")
256 | return bound_cols
257 |
258 | @staticmethod
259 | def _search_column_in_tables(
260 | column_ref_visitor, alias_list: List[AliasContext]
261 | ) -> List[ColumnContext]:
262 | found_cols: List[ColumnContext] = []
263 | if column_ref_visitor.is_a_star:
264 | for alias_context in alias_list:
265 | found_cols = alias_context.get_columns()
266 | logging.debug(
267 | "Bound all source columns in {}".format(alias_context.tables)
268 | )
269 | else:
270 | candidate_columns: List[ColumnContext] = []
271 | global_table_list: List[CatTable] = []
272 | logging.debug("Searching for {}".format(column_ref_visitor.column_name))
273 | for alias_context in alias_list:
274 | logging.debug("Searching in {}".format(alias_context.alias))
275 | candidate_columns = candidate_columns + alias_context.get_columns(
276 | [column_ref_visitor.column_name]
277 | )
278 | global_table_list = global_table_list + list(alias_context.tables)
279 |
280 | if len(candidate_columns) == 0:
281 | raise ColumnNotFound(
282 | '"{}" not found in the following tables: {}'.format(
283 | column_ref_visitor.column_name,
284 | json.dumps(global_table_list, cls=CatTableEncoder),
285 | )
286 | )
287 | elif len(candidate_columns) > 1:
288 | column_list = []
289 | for candidate in candidate_columns:
290 | for col in candidate.columns:
291 | column_list.append(col)
292 | raise ColumnNotFound(
293 | "{} Ambiguous column name. Multiple matches found: {}".format(
294 | column_ref_visitor.name,
295 | json.dumps(column_list, cls=CatColumnEncoder),
296 | )
297 | )
298 | logging.debug("Bound source column: {}".format(candidate_columns[0]))
299 | found_cols.append(candidate_columns[0])
300 | return found_cols
301 |
302 |
303 | class SelectBinder(Binder):
304 | def __init__(
305 | self,
306 | catalog: Catalog,
307 | source: CatSource,
308 | tables: List[Node],
309 | columns: List[ExprVisitor],
310 | alias_generator,
311 | expr_visitor_clazz: Type[ExprVisitor],
312 | alias_map: Mapping[str, AliasContext] = None,
313 | ):
314 | super(SelectBinder, self).__init__(
315 | catalog, source, alias_generator, expr_visitor_clazz, alias_map
316 | )
317 | self._table_nodes: List[Node] = tables
318 | self._column_nodes: List[ExprVisitor] = columns
319 |
320 | @property
321 | def _visited_tables(self) -> List[Node]:
322 | return self._table_nodes
323 |
324 | @property
325 | def _visited_columns(self) -> List[ExprVisitor]:
326 | return self._column_nodes
327 |
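328 | 
329 | # Usage sketch (illustrative; variable names such as `select_stmt_node`, `catalog` and
330 | # `source` are hypothetical): bind the tables and columns collected by a TableVisitor
331 | # against the catalog for a given source.
332 | #
333 | #   table_visitor = TableVisitor(ExprVisitor)
334 | #   table_visitor(select_stmt_node)  # a pglast SELECT parse node
335 | #   binder = SelectBinder(
336 | #       catalog, source, table_visitor.sources, table_visitor.columns,
337 | #       ("_U{}".format(i) for i in range(1000)), ExprVisitor,
338 | #   )
339 | #   binder.bind()
340 | #   bound_tables, bound_columns = binder.tables, binder.columns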
--------------------------------------------------------------------------------
/data_lineage/parser/dml_visitor.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | from typing import Any, Dict, List, Optional, Set, Tuple, Type
4 |
5 | from dbcat.catalog import Catalog, CatColumn, CatSource, CatTable
6 | from pglast import Node
7 | from pglast.ast import IntoClause
8 | from pglast.visitors import Ancestor, Continue, Skip, Visitor
9 |
10 | from data_lineage import ColumnNotFound, SemanticError, TableNotFound
11 | from data_lineage.parser.binder import (
12 | CatTableEncoder,
13 | ColumnContext,
14 | SelectBinder,
15 | WithContext,
16 | )
17 | from data_lineage.parser.visitor import (
18 | ColumnRefVisitor,
19 | ExprVisitor,
20 | RangeVarVisitor,
21 | TableVisitor,
22 | )
23 |
24 |
25 | class DmlVisitor(Visitor):
26 | def __init__(self, name: str, expr_visitor_clazz: Type[ExprVisitor]):
27 | self._name = name
28 | self._insert_table: Optional[Node] = None
29 | self._insert_columns: List[str] = []
30 | self._target_table: Optional[CatTable] = None
31 | self._target_columns: List[CatColumn] = []
32 | self._source_tables: Set[CatTable] = set()
33 | self._source_columns: List[ColumnContext] = []
34 | self._select_tables: List[Node] = []
35 | self._select_columns: List[ExprVisitor] = []
36 | self._with_aliases: Dict[str, Dict[str, Any]] = {}
37 | self._alias_map: Dict[str, WithContext] = {}
38 | self._column_alias_generator = ("_U{}".format(i) for i in range(0, 1000))
39 | self.expr_visitor_clazz = expr_visitor_clazz
40 |
41 | @property
42 | def name(self) -> str:
43 | return self._name
44 |
45 | @property
46 | def insert_table(self) -> Optional[Node]:
47 | return self._insert_table
48 |
49 | @property
50 | def target_table(self) -> CatTable:
51 | return self._target_table
52 |
53 | @property
54 | def target_columns(self) -> List[CatColumn]:
55 | return self._target_columns
56 |
57 | @property
58 | def source_tables(self) -> Set[CatTable]:
59 | return self._source_tables
60 |
61 | @property
62 | def source_columns(self) -> List[ColumnContext]:
63 | return self._source_columns
64 |
65 | @property
66 | def select_tables(self) -> List[Node]:
67 | return self._select_tables
68 |
69 | @property
70 | def select_columns(self) -> List[ExprVisitor]:
71 | return self._select_columns
72 |
73 | def visit_RangeVar(self, ancestors, node):
74 | self._insert_table = node
75 | return Skip
76 |
77 | def visit_ResTarget(self, ancestors, node):
78 | self._insert_columns.append(node.name)
79 | return Skip
80 |
81 | def visit_CommonTableExpr(self, ancestors, node):
82 | with_alias = node.ctename
83 | table_visitor = TableVisitor(self.expr_visitor_clazz)
84 | table_visitor(node.ctequery)
85 |
86 | self._with_aliases[with_alias] = {
87 | "tables": table_visitor.sources,
88 | "columns": table_visitor.columns,
89 | }
90 | return Skip
91 |
92 | def visit_CreateTableAsStmt(self, ancestors, node):
93 | """
94 | Do not process CTAS statements by default; subclasses such as CTASVisitor override this.
95 | :param ancestors: chain of ancestor nodes of the visited statement
96 | :type ancestors: Ancestor
97 | :param node: the CreateTableAsStmt parse node
98 | :type node: Node
99 | :return: Skip, so that the statement's children are not visited
100 | :rtype: Skip
101 | """
102 | return Skip
103 |
104 | def bind(self, catalog: Catalog, source: CatSource):
105 | self._bind_target(catalog, source)
106 |
107 | self._bind_with(catalog, source)
108 | binder = SelectBinder(
109 | catalog,
110 | source,
111 | self._select_tables,
112 | self._select_columns,
113 | self._column_alias_generator,
114 | self.expr_visitor_clazz,
115 | self._alias_map,
116 | )
117 | binder.bind()
118 |
119 | if len(binder.tables) == 0:
120 | raise SemanticError("No source tables found")
121 |
122 | if len(binder.columns) == 0:
123 | raise SemanticError("No source columns found")
124 |
125 | if self.target_table is None:
126 | raise SemanticError("No target table found")
127 |
128 | if len(self.target_columns) == 0:
129 | raise SemanticError(
130 | "No target columns found in {}".format(
131 | json.dumps(self.target_table, cls=CatTableEncoder)
132 | )
133 | )
134 |
135 | if len(self.target_columns) != len(binder.columns):
136 | raise SemanticError(
137 | "No. of target columns({}) does not match no. of source columns({})".format(
138 | len(self.target_columns), len(binder.columns)
139 | )
140 | )
141 |
142 | self._source_tables = binder.tables
143 | self._source_columns = binder.columns
144 |
145 | def _bind_target(self, catalog: Catalog, source: CatSource):
146 | target_table_visitor = RangeVarVisitor()
147 | target_table_visitor(self._insert_table)
148 | logging.debug("Searching for: {}".format(target_table_visitor.search_string))
149 | try:
150 | self._target_table = catalog.search_table(
151 | source_like=source.name, **target_table_visitor.search_string
152 | )
153 | except RuntimeError as error:
154 | logging.debug(str(error))
155 | raise TableNotFound(
156 | '"{schema_like}"."{table_like}" is not found'.format(
157 | **target_table_visitor.search_string
158 | )
159 | )
160 | logging.debug("Bound target table: {}".format(self._target_table))
161 | if len(self._insert_columns) == 0:
162 | self._target_columns = catalog.get_columns_for_table(self._target_table)
163 | logging.debug("Bound all columns in {}".format(self._target_table))
164 | else:
165 | bound_cols = catalog.get_columns_for_table(
166 | self._target_table, column_names=self._insert_columns
167 | )
168 | # Handle error case
169 | if len(bound_cols) != len(self._insert_columns):
170 | for column in self._insert_columns:
171 | found = False
172 | for bound in bound_cols:
173 | if column == bound.name:
174 | found = True
175 | break
176 |
177 | if not found:
178 | raise ColumnNotFound(
179 | '"{}" not found in the following tables: {}'.format(
180 | column,
181 | json.dumps([self._target_table], cls=CatTableEncoder),
182 | )
183 | )
184 |
185 | self._target_columns = bound_cols
186 | logging.debug("Bound {} target columns".format(len(bound_cols)))
187 |
188 | def _bind_with(self, catalog: Catalog, source: CatSource):
189 | if self._with_aliases:
190 | # Bind all the WITH expressions
191 | for key in self._with_aliases.keys():
192 | binder = SelectBinder(
193 | catalog,
194 | source,
195 | self._with_aliases[key]["tables"],
196 | self._with_aliases[key]["columns"],
197 | self._column_alias_generator,
198 | self.expr_visitor_clazz,
199 | )
200 | binder.bind()
201 | self._alias_map[key] = WithContext(
202 | catalog=catalog,
203 | alias=key,
204 | tables=binder.tables,
205 | columns=binder.columns,
206 | )
207 |
208 | def resolve(
209 | self,
210 | ) -> Tuple[
211 | Tuple[Optional[str], str],
212 | List[Tuple[Optional[str], str]],
213 | List[Tuple[Optional[str], str]],
214 | ]:
215 | target_table_visitor = RangeVarVisitor()
216 | target_table_visitor(self._insert_table)
217 |
218 | bound_tables = []
219 | for table in self._select_tables:
220 | visitor = RangeVarVisitor()
221 | visitor(table)
222 | bound_tables.append(visitor.fqdn)
223 |
224 | bound_cols = []
225 | for expr_visitor in self._select_columns:
226 | for column in expr_visitor.columns:
227 | column_ref_visitor = ColumnRefVisitor()
228 | column_ref_visitor(column)
229 | bound_cols.append(column_ref_visitor.name[0])
230 |
231 | return target_table_visitor.fqdn, bound_tables, bound_cols
232 |
233 |
234 | class SelectSourceVisitor(DmlVisitor):
235 | def __init__(self, name: str, expr_visitor_clazz: Type[ExprVisitor] = ExprVisitor):
236 | super(SelectSourceVisitor, self).__init__(name, expr_visitor_clazz)
237 |
238 | def visit_SelectStmt(self, ancestors, node):
239 | table_visitor = TableVisitor(self.expr_visitor_clazz)
240 | table_visitor(node)
241 | self._select_tables = table_visitor.sources
242 | self._select_columns = table_visitor.columns
243 | for key in table_visitor.with_aliases.keys():
244 | self._with_aliases[key] = table_visitor.with_aliases[key]
245 |
246 | return Skip
247 |
248 |
249 | class SelectIntoVisitor(DmlVisitor):
250 | def __init__(self, name: str, expr_visitor_clazz: Type[ExprVisitor] = ExprVisitor):
251 | super(SelectIntoVisitor, self).__init__(name, expr_visitor_clazz)
252 |
253 | def visit_SelectStmt(self, ancestors, node):
254 | super().__call__(node.intoClause)
255 | table_visitor = TableVisitor(self.expr_visitor_clazz)
256 | table_visitor(node.targetList)
257 | table_visitor(node.fromClause)
258 | self._select_tables = table_visitor.sources
259 | self._select_columns = table_visitor.columns
260 | for key in table_visitor.with_aliases.keys():
261 | self._with_aliases[key] = table_visitor.with_aliases[key]
262 |
263 | return Skip
264 |
265 |
266 | class CTASVisitor(SelectSourceVisitor):
267 | def __init__(self, name: str, expr_visitor_clazz: Type[ExprVisitor] = ExprVisitor):
268 | super(CTASVisitor, self).__init__(name, expr_visitor_clazz)
269 |
270 | def visit_CreateTableAsStmt(self, ancestors, node):
271 | return Continue
272 |
273 | def visit_String(self, ancestors: Ancestor, node):
274 | # Check if parent is IntoClause
275 | parent = ancestors
276 | in_into_clause = False
277 | while parent is not None and not in_into_clause:
278 | in_into_clause = isinstance(parent.node, IntoClause)
279 | parent = parent.parent
280 |
281 | if in_into_clause:
282 | self._insert_columns.append(node.val)
283 |
284 | def _bind_target(self, catalog: Catalog, source: CatSource):
285 | target_table_visitor = RangeVarVisitor()
286 | target_table_visitor(self._insert_table)
287 |
288 | if target_table_visitor.is_qualified:
289 | schema = catalog.get_schema(
290 | source_name=source.name, schema_name=target_table_visitor.schema_name
291 | )
292 | elif source.default_schema is not None:
293 | schema = source.default_schema.schema
294 | else:
295 | raise SemanticError(
296 | "No default schema set for source {}".format(source.fqdn)
297 | )
298 |
299 | self._target_table = catalog.add_table(
300 | table_name=target_table_visitor.name, schema=schema
301 | )
302 |
303 | sort_order = 1
304 | for col in self._insert_columns:
305 | self._target_columns.append(
306 | catalog.add_column(
307 | column_name=col,
308 | data_type="varchar",
309 | sort_order=sort_order,
310 | table=self._target_table,
311 | )
312 | )
313 |
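314 | 
315 | # Usage sketch (illustrative; `parse_tree`, `catalog` and `source` are assumed to exist):
316 | # extract lineage from an INSERT ... SELECT statement.
317 | #
318 | #   visitor = SelectSourceVisitor("daily_etl_job")
319 | #   visitor(parse_tree)              # walk the pglast parse tree
320 | #   visitor.bind(catalog, source)    # resolve tables/columns against the catalog
321 | #   visitor.target_table, visitor.source_tables, visitor.source_columns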
--------------------------------------------------------------------------------
/data_lineage/parser/visitor.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, List, Optional, Tuple, Type
2 |
3 | from pglast import Node
4 | from pglast.visitors import Skip, Visitor
5 |
6 |
7 | class ExprVisitor(Visitor):
8 | def __init__(self, alias: str = None):
9 | self._alias: Optional[str] = alias
10 | self._columns: List[Node] = []
11 |
12 | @property
13 | def alias(self) -> Optional[str]:
14 | return self._alias
15 |
16 | @property
17 | def columns(self) -> List[Node]:
18 | return self._columns
19 |
20 | def visit_FuncCall(self, ancestors, node):
21 | super().__call__(node.args)
22 |
23 | def visit_TypeCast(self, ancestors, node):
24 | super().__call__(node.arg)
25 |
26 | def visit_A_Expr(self, ancestors, node):
27 | super().__call__(node.lexpr)
28 | super().__call__(node.rexpr)
29 |
30 | def visit_ColumnRef(self, ancestors, node):
31 | self._columns.append(node)
32 |
33 |
34 | class RedshiftExprVisitor(ExprVisitor):
35 | class FuncNameVisitor(Visitor):
36 | def __init__(self):
37 | self._name = None
38 |
39 | @property
40 | def name(self):
41 | return self._name
42 |
43 | def visit_String(self, ancestors, obj):
44 | self._name = obj.val
45 |
46 | def visit_FuncCall(self, ancestors, node):
47 | name_visitor = RedshiftExprVisitor.FuncNameVisitor()
48 | name_visitor(node.funcname)
49 | if name_visitor.name == "dateadd":
50 | super().__call__(node.args[2])
51 | return Skip
52 |
53 |
54 | class TableVisitor(Visitor):
55 | def __init__(self, expr_visitor_clazz: Type[ExprVisitor]):
56 | self._sources: List[Node] = []
57 | self._columns: List[ExprVisitor] = []
58 | self._expr_visitor_clazz = expr_visitor_clazz
59 | self._with_aliases: Dict[str, Dict[str, Any]] = {}
60 |
61 | @property
62 | def sources(self) -> List[Node]:
63 | return self._sources
64 |
65 | @property
66 | def columns(self) -> List[ExprVisitor]:
67 | return self._columns
68 |
69 | @property
70 | def with_aliases(self) -> Dict[str, Dict[str, Any]]:
71 | return self._with_aliases
72 |
73 | def visit_ResTarget(self, ancestors, node):
74 | name = None
75 | if node.name is not None:
76 | name = node.name
77 |
78 | expr_visitor = self._expr_visitor_clazz(name)
79 | expr_visitor(node.val)
80 | self._columns.append(expr_visitor)
81 | return Skip
82 |
83 | def visit_RangeVar(self, ancestors, node):
84 | self._sources.append(node)
85 | return Skip
86 |
87 | def visit_RangeSubselect(self, ancestors, node):
88 | self._sources.append(node)
89 | return Skip
90 |
91 | def visit_CommonTableExpr(self, ancestors, node):
92 | with_alias = node.ctename
93 | table_visitor = TableVisitor(self._expr_visitor_clazz)
94 | table_visitor(node.ctequery)
95 |
96 | self._with_aliases[with_alias] = {
97 | "tables": table_visitor.sources,
98 | "columns": table_visitor.columns,
99 | }
100 | return Skip
101 |
102 |
103 | class ColumnRefVisitor(Visitor):
104 | def __init__(self):
105 | self._name: List[str] = []
106 | self._is_a_star: bool = False
107 |
108 | @property
109 | def name(self) -> Tuple:
110 | return tuple(self._name)
111 |
112 | @property
113 | def is_a_star(self) -> bool:
114 | return self._is_a_star
115 |
116 | @property
117 | def is_qualified(self) -> bool:
118 | return len(self._name) == 2 or (len(self._name) == 1 and self._is_a_star)
119 |
120 | @property
121 | def column_name(self) -> Optional[str]:
122 | if len(self._name) == 2:
123 | return self._name[1]
124 | elif len(self._name) == 1:
125 | return self._name[0]
126 | return None
127 |
128 | @property
129 | def table_name(self) -> Optional[str]:
130 | if len(self._name) == 2 or (self._is_a_star and len(self._name) == 1):
131 | return self._name[0]
132 |
133 | return None
134 |
135 | def visit_String(self, ancestors, node):
136 | self._name.append(node.val)
137 |
138 | def visit_A_Star(self, ancestors, node):
139 | self._is_a_star = True
140 |
141 |
142 | class RangeVarVisitor(Visitor):
143 | def __init__(self):
144 | self._schema_name = None
145 | self._name = None
146 | self._alias = None
147 |
148 | @property
149 | def alias(self) -> Optional[str]:
150 | if self._alias is not None:
151 | return self._alias
152 | elif self._schema_name is not None and self._name is not None:
153 | return "{}.{}".format(self._schema_name, self._name)
154 | elif self._name is not None:
155 | return self._name
156 | return None
157 |
158 | @property
159 | def fqdn(self):
160 | return self._schema_name, self._name
161 |
162 | @property
163 | def search_string(self):
164 | return {"schema_like": self._schema_name, "table_like": self._name}
165 |
166 | @property
167 | def is_qualified(self) -> bool:
168 | return self._schema_name is not None
169 |
170 | @property
171 | def schema_name(self) -> Optional[str]:
172 | return self._schema_name
173 |
174 | @property
175 | def name(self) -> str:
176 | return self._name
177 |
178 | def visit_Alias(self, ancestors, node):
179 | self._alias = node.aliasname.lower()
180 |
181 | def visit_RangeVar(self, ancestors, node):
182 | if node.schemaname:
183 | self._schema_name = node.schemaname.lower()
184 | self._name = node.relname.lower()
185 |
186 |
187 | class RangeSubselectVisitor(Visitor):
188 | def __init__(self, expr_visitor_clazz: Type[ExprVisitor]):
189 | self._alias: Optional[str] = None
190 | self._table_visitor: TableVisitor = TableVisitor(expr_visitor_clazz)
191 |
192 | @property
193 | def alias(self) -> Optional[str]:
194 | if self._alias is not None:
195 | return self._alias
196 | return None
197 |
198 | @property
199 | def sources(self) -> List[Node]:
200 | return self._table_visitor.sources
201 |
202 | @property
203 | def columns(self) -> List[ExprVisitor]:
204 | return self._table_visitor.columns
205 |
206 | def visit_Alias(self, ancestors, node):
207 | self._alias = node.aliasname
208 |
209 | def visit_RangeSubselect(self, ancestors, node):
210 | super().__call__(node.alias)
211 | self._table_visitor(node.subquery)
212 | return Skip
213 |
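214 | 
215 | # Usage sketch (illustrative): inspect a single column reference such as "page.page_id" or "p.*".
216 | #
217 | #   ref_visitor = ColumnRefVisitor()
218 | #   ref_visitor(column_ref_node)     # a pglast ColumnRef node
219 | #   ref_visitor.table_name, ref_visitor.column_name, ref_visitor.is_a_star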
--------------------------------------------------------------------------------
/data_lineage/server.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | from typing import Any, Dict, List, Tuple
4 |
5 | import flask_restless
6 | import gunicorn.app.base
7 | from dbcat import Catalog, PGCatalog, init_db
8 | from dbcat.catalog import CatColumn
9 | from dbcat.catalog.models import (
10 | CatSchema,
11 | CatSource,
12 | CatTable,
13 | ColumnLineage,
14 | DefaultSchema,
15 | Job,
16 | JobExecution,
17 | JobExecutionStatus,
18 | )
19 | from flask import Flask
20 | from flask_restful import Api, Resource, reqparse
21 | from pglast.parser import ParseError
22 | from rq import Queue
23 | from rq import job as RqJob
24 | from werkzeug.exceptions import NotFound, UnprocessableEntity
25 |
26 | from data_lineage import ColumnNotFound, SemanticError, TableNotFound
27 | from data_lineage.parser import (
28 | analyze_dml_query,
29 | extract_lineage,
30 | parse,
31 | parse_dml_query,
32 | )
33 | from data_lineage.worker import scan
34 |
35 |
36 | class TableNotFoundHTTP(NotFound):
37 | """Table not found in catalog"""
38 |
39 | code = 441
40 |
41 |
42 | class ColumnNotFoundHTTP(NotFound):
43 | """Column not found in catalog"""
44 |
45 | code = 442
46 |
47 |
48 | class ParseErrorHTTP(UnprocessableEntity):
49 | """Parser Error"""
50 |
51 |
52 | class SemanticErrorHTTP(UnprocessableEntity):
53 | """Semantic Error"""
54 |
55 | code = 443
56 |
57 |
58 | class Kedro(Resource):
59 | def __init__(self, catalog: Catalog):
60 | self._catalog = catalog
61 | self._parser = reqparse.RequestParser()
62 | self._parser.add_argument(
63 | "job_ids", action="append", help="List of job ids for a sub graph"
64 | )
65 |
66 | def get(self):
67 | nodes = []
68 | edges = []
69 |
70 | args = self._parser.parse_args()
71 | with self._catalog.managed_session:
72 | column_edges = self._catalog.get_column_lineages(args["job_ids"])
73 | for edge in column_edges:
74 | nodes.append(self._column_info(edge.source))
75 | nodes.append(self._column_info(edge.target))
76 | nodes.append(self._job_info(edge.job_execution.job))
77 | edges.append(
78 | {
79 | "source": "column:{}".format(edge.source_id),
80 | "target": "task:{}".format(edge.job_execution.job_id),
81 | }
82 | )
83 | edges.append(
84 | {
85 | "source": "task:{}".format(edge.job_execution.job_id),
86 | "target": "column:{}".format(edge.target_id),
87 | }
88 | )
89 |
90 | return {"nodes": nodes, "edges": edges}
91 |
92 | @staticmethod
93 | def _column_info(node: CatColumn):
94 | return {
95 | "id": "column:{}".format(node.id),
96 | "name": ".".join(node.fqdn),
97 | "type": "data",
98 | }
99 |
100 | @staticmethod
101 | def _job_info(node: Job):
102 | return {"id": "task:{}".format(node.id), "name": node.name, "type": "task"}
103 |
104 |
105 | class ScanList(Resource):
106 | def __init__(self, catalog: PGCatalog, queue: Queue):
107 | self._catalog = catalog
108 | self._queue = queue
109 | self._parser = reqparse.RequestParser()
110 | self._parser.add_argument("id", required=True, help="ID of the resource")
111 |
112 | def post(self):
113 | args = self._parser.parse_args()
114 | logging.info("Args for scanning: {}".format(args))
115 | job = self._queue.enqueue(
116 | scan,
117 | {
118 | "user": self._catalog.user,
119 | "password": self._catalog.password,
120 | "database": self._catalog.database,
121 | "host": self._catalog.host,
122 | "port": self._catalog.port,
123 | },
124 | int(args["id"]),
125 | )
126 |
127 | return {"id": job.id, "status": "queued"}, 200
128 |
129 | def get(self):
130 | job_list = []
131 | for job in self._queue.started_job_registry.get_job_ids():
132 | job_list.append({"id": job, "status": "started"})
133 |
134 | for job in self._queue.finished_job_registry.get_job_ids():
135 | job_list.append({"id": job, "status": "finished"})
136 |
137 | for job in self._queue.failed_job_registry.get_job_ids():
138 | job_list.append({"id": job, "status": "failed"})
139 |
140 | return job_list, 200
141 |
142 |
143 | class Scan(Resource):
144 | def __init__(self, catalog: PGCatalog, queue: Queue):
145 | self._catalog = catalog
146 | self._queue = queue
147 | self._parser = reqparse.RequestParser()
148 | self._parser.add_argument("id", required=True, help="ID of the resource")
149 |
150 | def get(self, job_id):
151 | status = RqJob.Job.fetch(job_id, connection=self._queue.connection).get_status()
152 | return {"id": job_id, "status": status}, 200
153 |
154 | def put(self, job_id):
155 | RqJob.Job.fetch(job_id, connection=self._queue.connection).cancel()
156 | return {"message": "Job {} cancelled".format(job_id)}, 200
157 |
158 |
159 | class Parse(Resource):
160 | def __init__(self, catalog: Catalog):
161 | self._catalog = catalog
162 | self._parser = reqparse.RequestParser()
163 | self._parser.add_argument("query", required=True, help="Query to parse")
164 | self._parser.add_argument(
165 | "source_id", help="Source database of the query", required=True
166 | )
167 |
168 | def post(self):
169 | args = self._parser.parse_args()
170 | logging.debug("Parse query: {}".format(args["query"]))
171 | try:
172 | parsed = parse(args["query"], "parse_api")
173 | except ParseError as error:
174 | raise ParseErrorHTTP(description=str(error))
175 |
176 | try:
177 | with self._catalog.managed_session:
178 | source = self._catalog.get_source_by_id(args["source_id"])
179 | logging.debug("Parsing query for source {}".format(source))
180 | binder = parse_dml_query(
181 | catalog=self._catalog, parsed=parsed, source=source
182 | )
183 |
184 | return (
185 | {
186 | "select_tables": [table.name for table in binder.tables],
187 | "select_columns": [context.alias for context in binder.columns],
188 | },
189 | 200,
190 | )
191 | except TableNotFound as table_error:
192 | raise TableNotFoundHTTP(description=str(table_error))
193 | except ColumnNotFound as column_error:
194 | raise ColumnNotFoundHTTP(description=str(column_error))
195 | except SemanticError as semantic_error:
196 | raise SemanticErrorHTTP(description=str(semantic_error))
197 |
198 |
199 | class Analyze(Resource):
200 | def __init__(self, catalog: Catalog):
201 | self._catalog = catalog
202 | self._parser = reqparse.RequestParser()
203 | self._parser.add_argument("query", required=True, help="Query to parse")
204 | self._parser.add_argument("name", help="Name of the ETL job")
205 | self._parser.add_argument(
206 | "start_time", required=True, help="Start time of the task"
207 | )
208 | self._parser.add_argument(
209 | "end_time", required=True, help="End time of the task"
210 | )
211 | self._parser.add_argument(
212 | "source_id", help="Source database of the query", required=True
213 | )
214 |
215 | def post(self):
216 | args = self._parser.parse_args()
217 | logging.debug("Parse query: {}".format(args["query"]))
218 | try:
219 | parsed = parse(args["query"], args["name"])
220 | except ParseError as error:
221 | raise ParseErrorHTTP(description=str(error))
222 |
223 | try:
224 | with self._catalog.managed_session:
225 | source = self._catalog.get_source_by_id(args["source_id"])
226 | logging.debug("Parsing query for source {}".format(source))
227 | chosen_visitor = analyze_dml_query(self._catalog, parsed, source)
228 | job_execution = extract_lineage(
229 | catalog=self._catalog,
230 | visited_query=chosen_visitor,
231 | source=source,
232 | parsed=parsed,
233 | start_time=datetime.datetime.fromisoformat(args["start_time"]),
234 | end_time=datetime.datetime.fromisoformat(args["end_time"]),
235 | )
236 |
237 | return (
238 | {
239 | "data": {
240 | "id": job_execution.id,
241 | "type": "job_executions",
242 | "attributes": {
243 | "job_id": job_execution.job_id,
244 | "started_at": job_execution.started_at.strftime(
245 | "%Y-%m-%d %H:%M:%S"
246 | ),
247 | "ended_at": job_execution.ended_at.strftime(
248 | "%Y-%m-%d %H:%M:%S"
249 | ),
250 | "status": job_execution.status.name,
251 | },
252 | }
253 | },
254 | 200,
255 | )
256 | except TableNotFound as table_error:
257 | raise TableNotFoundHTTP(description=str(table_error))
258 | except ColumnNotFound as column_error:
259 | raise ColumnNotFoundHTTP(description=str(column_error))
260 | except SemanticError as semantic_error:
261 | raise SemanticErrorHTTP(description=str(semantic_error))
262 |
263 |
264 | class Server(gunicorn.app.base.BaseApplication):
265 | def __init__(self, app):
266 | self.application = app
267 | super().__init__()
268 |
269 | def load_config(self):
270 | # parse console args
271 | parser = self.cfg.parser()
272 | env_args = parser.parse_args(self.cfg.get_cmd_args_from_env())
273 |
274 | # Load up environment configuration
275 | for k, v in vars(env_args).items():
276 | if v is None:
277 | continue
278 | if k == "args":
279 | continue
280 | self.cfg.set(k.lower(), v)
281 |
282 | def load(self):
283 | return self.application
284 |
285 |
286 | def job_execution_serializer(instance: JobExecution, only: List[str]):
287 | return {
288 | "id": instance.id,
289 | "type": "job_executions",
290 | "attributes": {
291 | "job_id": instance.job_id,
292 | "started_at": instance.started_at.strftime("%Y-%m-%d %H:%M:%S"),
293 | "ended_at": instance.ended_at.strftime("%Y-%m-%d %H:%M:%S"),
294 | "status": instance.status.name,
295 | },
296 | }
297 |
298 |
299 | def job_execution_deserializer(data: Dict[str, Any]):
300 | attributes = data["data"]["attributes"]
301 | logging.debug(attributes)
302 | job_execution = JobExecution()
303 | job_execution.job_id = int(attributes["job_id"])
304 | job_execution.started_at = datetime.datetime.strptime(
305 | attributes["started_at"], "%Y-%m-%d %H:%M:%S"
306 | )
307 | job_execution.ended_at = datetime.datetime.strptime(
308 | attributes["ended_at"], "%Y-%m-%d %H:%M:%S"
309 | )
310 | job_execution.status = (
311 | JobExecutionStatus.SUCCESS
312 | if attributes["status"] == "SUCCESS"
313 | else JobExecutionStatus.FAILURE
314 | )
315 |
316 | logging.debug(job_execution)
317 | logging.debug(job_execution.status == JobExecutionStatus.SUCCESS)
318 | return job_execution
319 |
320 |
321 | def create_server(
322 | catalog_options: Dict[str, str], connection, is_production=True
323 | ) -> Tuple[Any, PGCatalog]:
324 | logging.debug(catalog_options)
325 | catalog = PGCatalog(
326 | **catalog_options,
327 | connect_args={"application_name": "data-lineage:flask-restless"},
328 | max_overflow=40,
329 | pool_size=20,
330 | pool_pre_ping=True
331 | )
332 |
333 | init_db(catalog)
334 |
335 | restful_catalog = PGCatalog(
336 | **catalog_options,
337 | connect_args={"application_name": "data-lineage:restful"},
338 | pool_pre_ping=True
339 | )
340 |
341 | app = Flask(__name__)
342 | queue = Queue(is_async=is_production, connection=connection)
343 |
344 | # Create CRUD APIs
345 | methods = ["DELETE", "GET", "PATCH", "POST"]
346 | url_prefix = "/api/v1/catalog"
347 | api_manager = flask_restless.APIManager(app, catalog.get_scoped_session())
348 | api_manager.create_api(
349 | CatSource,
350 | methods=methods,
351 | url_prefix=url_prefix,
352 | additional_attributes=["fqdn"],
353 | )
354 | api_manager.create_api(
355 | CatSchema,
356 | methods=methods,
357 | url_prefix=url_prefix,
358 | additional_attributes=["fqdn"],
359 | )
360 | api_manager.create_api(
361 | CatTable,
362 | methods=methods,
363 | url_prefix=url_prefix,
364 | additional_attributes=["fqdn"],
365 | )
366 | api_manager.create_api(
367 | CatColumn,
368 | methods=methods,
369 | url_prefix=url_prefix,
370 | additional_attributes=["fqdn"],
371 | )
372 | api_manager.create_api(Job, methods=methods, url_prefix=url_prefix)
373 | api_manager.create_api(
374 | JobExecution,
375 | methods=methods,
376 | url_prefix=url_prefix,
377 | serializer=job_execution_serializer,
378 | deserializer=job_execution_deserializer,
379 | )
380 | api_manager.create_api(
381 | ColumnLineage,
382 | methods=methods,
383 | url_prefix=url_prefix,
384 | collection_name="column_lineage",
385 | )
386 |
387 | api_manager.create_api(
388 | DefaultSchema,
389 | methods=methods,
390 | url_prefix=url_prefix,
391 | collection_name="default_schema",
392 | primary_key="source_id",
393 | )
394 |
395 | restful_manager = Api(app)
396 | restful_manager.add_resource(
397 | Kedro, "/api/main", resource_class_kwargs={"catalog": restful_catalog}
398 | )
399 | restful_manager.add_resource(
400 | ScanList,
401 | "/api/v1/scan",
402 | resource_class_kwargs={"catalog": restful_catalog, "queue": queue},
403 | )
404 |
405 | restful_manager.add_resource(
406 | Scan,
407 | "/api/v1/scan/<job_id>",
408 | resource_class_kwargs={"catalog": restful_catalog, "queue": queue},
409 | )
410 |
411 | restful_manager.add_resource(
412 | Analyze, "/api/v1/analyze", resource_class_kwargs={"catalog": restful_catalog}
413 | )
414 |
415 | restful_manager.add_resource(
416 | Parse, "/api/v1/parse", resource_class_kwargs={"catalog": restful_catalog}
417 | )
418 |
419 | for rule in app.url_map.iter_rules():
420 | rule_methods = ",".join(rule.methods)
421 | logging.debug("{:50s} {:20s} {}".format(rule.endpoint, rule_methods, rule))
422 |
423 | if is_production:
424 | return Server(app=app), catalog
425 | else:
426 | return app, catalog
427 |
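428 | 
429 | # Example requests (illustrative; assumes the API is bound to 0.0.0.0:4142 as in the
430 | # docker-compose manifests and that a source with id 1 has been registered):
431 | #
432 | #   curl -X POST http://localhost:4142/api/v1/parse -d "query=SELECT * FROM page" -d "source_id=1"
433 | #   curl -X POST http://localhost:4142/api/v1/scan -d "id=1"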
--------------------------------------------------------------------------------
/data_lineage/worker.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from contextlib import closing
3 |
4 | from dbcat import DbScanner, PGCatalog
5 |
6 |
7 | def scan(connection_args, source_id):
8 | logging.info("{}".format(connection_args))
9 | catalog = PGCatalog(
10 | **connection_args,
11 | connect_args={"application_name": "data-lineage:worker"},
12 | max_overflow=40,
13 | pool_size=20,
14 | pool_pre_ping=True
15 | )
16 |
17 | with closing(catalog):
18 | with catalog.managed_session:
19 | source = catalog.get_source_by_id(source_id)
20 | DbScanner(catalog, source).scan()
21 |
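22 | 
23 | # The scan task is enqueued by the ScanList resource in server.py and executed by an RQ
24 | # worker, e.g. started with: rq worker --url redis://tokern-redis:6379 (see the
25 | # docker-compose manifests in install-manifests/docker-compose).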
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | # Dockerfile
2 | # Uses multi-stage builds requiring Docker 17.05 or higher
3 | # See https://docs.docker.com/develop/develop-images/multistage-build/
4 |
5 | # Creating a python base with shared environment variables
6 | FROM python:3.8.1-slim as python-base
7 | ENV PYTHONUNBUFFERED=1 \
8 | PYTHONDONTWRITEBYTECODE=1 \
9 | PIP_NO_CACHE_DIR=off \
10 | PIP_DISABLE_PIP_VERSION_CHECK=on \
11 | PIP_DEFAULT_TIMEOUT=100 \
12 | POETRY_HOME="/opt/poetry" \
13 | POETRY_VIRTUALENVS_IN_PROJECT=true \
14 | POETRY_NO_INTERACTION=1 \
15 | PYSETUP_PATH="/opt/pysetup" \
16 | VENV_PATH="/opt/pysetup/.venv"
17 |
18 | ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH"
19 |
20 |
21 | # builder-base is used to build dependencies
22 | FROM python-base as builder-base
23 | RUN apt-get update \
24 | && apt-get install --no-install-recommends -y \
25 | curl gcc python3-dev default-libmysqlclient-dev \
26 | build-essential libpq-dev musl-dev
27 |
28 | # Install Poetry - respects $POETRY_VERSION & $POETRY_HOME
29 | ENV POETRY_VERSION=1.1.6
30 | RUN curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py | python
31 |
32 | # We copy our Python requirements here to cache them
33 | # and install only runtime deps using poetry
34 | WORKDIR $PYSETUP_PATH
35 | COPY ./poetry.lock ./pyproject.toml ./
36 | RUN poetry install --no-dev # respects
37 |
38 | WORKDIR /src
39 | COPY . .
40 | RUN poetry build
41 | ENV PATH="${VENV_PATH}/bin:$PATH"
42 | RUN pip install dist/data_lineage-*.whl
43 |
44 | # 'production' stage uses the clean 'python-base' stage and copies
45 | # in only our runtime deps that were installed in the 'builder-base'
46 | FROM python-base as production
47 |
48 | RUN apt-get update \
49 | && apt-get install --no-install-recommends -y \
50 | libpq5
51 |
52 | COPY --from=builder-base $VENV_PATH $VENV_PATH
53 | COPY ./docker/docker-entrypoint.sh /docker-entrypoint.sh
54 | RUN chmod +x /docker-entrypoint.sh
55 |
56 | ENTRYPOINT /docker-entrypoint.sh $0 $@
57 | CMD [ "data_lineage"]
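58 | 
59 | # Build locally (illustrative): docker build -t tokern/data-lineage:dev -f docker/Dockerfile .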
--------------------------------------------------------------------------------
/docker/build_image.sh:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env bash
2 |
3 | set -e
4 |
5 | PROJECT_ROOT=$(dirname $(dirname $0))
6 |
7 | echo "$PROJECT_ROOT"
8 |
9 | DOCKERHUB_NAMESPACE=tokern
10 |
11 |
12 | TAG=$1
13 | if [ -z $TAG ]; then
14 | echo "usage: $0 TAG [--publish] [--latest]"
15 | exit 1
16 | fi
17 |
18 | if [ "$2" == "--publish" ]; then
19 | PUBLISH="YES"
20 | fi
21 |
22 | if [ "$3" == "--latest" ]; then
23 | LATEST="YES"
24 | fi
25 |
26 | if [ "$PUBLISH" == "YES" ] && [ -z "$DOCKERHUB_USERNAME" -o -z "$DOCKERHUB_PASSWORD" ]; then
27 | echo "In order to publish an image to Dockerhub you must set \$DOCKERHUB_USERNAME and \$DOCKERHUB_PASSWORD before running."
28 | exit 1
29 | fi
30 |
31 | DOCKERHUB_REPOSITORY=data-lineage
32 | DOCKER_IMAGE="${DOCKERHUB_NAMESPACE}/${DOCKERHUB_REPOSITORY}:${TAG}"
33 |
34 | echo "Building Docker image ${DOCKER_IMAGE} from official Tokern release ${TAG}"
35 |
36 | # now tell docker to build our image
37 |
38 | docker build -t "${DOCKER_IMAGE}" -f "$PROJECT_ROOT"/docker/Dockerfile "${PROJECT_ROOT}"
39 |
40 | if [ "$PUBLISH" == "YES" ]; then
41 | echo "Publishing image ${DOCKER_IMAGE} to Dockerhub"
42 |
43 | # make sure that we are logged into dockerhub
44 | docker login --username="${DOCKERHUB_USERNAME}" --password="${DOCKERHUB_PASSWORD}"
45 |
46 | # push the built image to dockerhub
47 | docker push "${DOCKER_IMAGE}"
48 |
49 | # TODO: quick check against dockerhub to see that our new image made it
50 |
51 | if [ "$LATEST" == "YES" ]; then
52 | # tag our recent versioned image as "latest"
53 | docker tag "${DOCKER_IMAGE}" ${DOCKERHUB_NAMESPACE}/${DOCKERHUB_REPOSITORY}:latest
54 |
55 | # then push it as well
56 | docker push ${DOCKERHUB_NAMESPACE}/${DOCKERHUB_REPOSITORY}:latest
57 |
58 | # TODO: validate push succeeded
59 | fi
60 | fi
61 |
62 | echo "Done"
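63 | 
64 | # Example (illustrative): ./docker/build_image.sh v0.8.0 --publish --latest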
--------------------------------------------------------------------------------
/docker/docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | set -e
4 |
5 | # activate our virtual environment here
6 | . /opt/pysetup/.venv/bin/activate
7 |
8 | # You can put other setup logic here
9 |
10 | # Evaluating passed command:
11 | exec "$@"
--------------------------------------------------------------------------------
/example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Overview\n",
8 | "\n",
9 | "This example demonstrates how to scan query history from a data warehouse and save it in the data lineage app. The app automatically parses and extracts data lineage from the queries.\n",
10 | "\n",
11 | "The example consists of the following sequence of operations:\n",
12 | "\n",
13 | "* Start the docker containers for the demo. Refer to the [docs](https://tokern.io/docs/data-lineage/installation) for detailed instructions on installing demo-wikimedia.\n",
14 | "* Scan and send queries from the query history to the data lineage app.\n",
15 | "* Visualize the graph by visiting the Tokern UI.\n",
16 | "* Analyze the graph."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "# Installation\n",
24 | "\n",
25 | "This demo requires the wikimedia demo to be running. Start it using the following instructions:\n",
26 | "\n",
27 | " # in a new directory run\n",
28 | " wget https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/wikimedia-demo.yml\n",
29 | " # or run\n",
30 | " curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/wikimedia-demo.yml -o docker-compose.yml\n",
31 | "\n",
32 | "\n",
33 | "Run docker-compose\n",
34 | "\n",
35 | "\n",
36 | " docker-compose up -d\n",
37 | "\n",
38 | "\n",
39 | "Verify the containers are running\n",
40 | "\n",
41 | "\n",
42 | " docker container ls | grep tokern\n"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "# Required configuration for API and wikimedia database network address\n",
52 | "\n",
53 | "docker_address = \"http://127.0.0.1:8000\"\n",
54 | "wikimedia_db = {\n",
55 | " \"username\": \"etldev\",\n",
56 | " \"password\": \"3tld3v\",\n",
57 | " \"uri\": \"tokern-demo-wikimedia\",\n",
58 | " \"port\": \"5432\",\n",
59 | " \"database\": \"wikimedia\"\n",
60 | "}"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "import time\n",
70 | "# Setup a connection to catalog using the SDK.\n",
71 | "from data_lineage import Catalog, Scan\n",
72 | "\n",
73 | "catalog = Catalog(docker_address)\n",
74 | "\n",
75 | "# Register wikimedia datawarehouse with data-lineage app.\n",
76 | "\n",
77 | "source = catalog.add_source(name=\"wikimedia\", source_type=\"postgresql\", **wikimedia_db)\n",
78 | "\n",
79 | "# Scan the wikimedia data warehouse and register all schemata, tables and columns.\n",
80 | "scan = Scan(docker_address)\n",
81 | "job = scan.start(source)\n",
82 | "\n",
83 | "# Wait for scan to complete\n",
84 | "\n",
85 | "status = \"\"\n",
86 | "\n",
87 | "while (status != \"finished\" and status != \"failed\"):\n",
88 | " time.sleep(5)\n",
89 | " status = scan.get(job[\"id\"])[\"status\"]\n",
90 | " print(\"Status is {}\".format(status))"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "import json\n",
100 | "\n",
101 | "with open(\"test/queries.json\", \"r\") as file:\n",
102 | " queries = json.load(file)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {
109 | "scrolled": true
110 | },
111 | "outputs": [],
112 | "source": [
113 | "from datetime import datetime\n",
114 | "from data_lineage import Analyze\n",
115 | "\n",
116 | "analyze = Analyze(docker_address)\n",
117 | "\n",
118 | "for query in queries:\n",
119 | " print(query)\n",
120 | " analyze.analyze(**query, source=source, start_time=datetime.now(), end_time=datetime.now())"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "Visit [Kedro UI](http://localhost:8000/)\n",
128 | "\n",
129 | ""
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": []
138 | }
139 | ],
140 | "metadata": {
141 | "kernelspec": {
142 | "display_name": "Python 3 (ipykernel)",
143 | "language": "python",
144 | "name": "python3"
145 | },
146 | "language_info": {
147 | "codemirror_mode": {
148 | "name": "ipython",
149 | "version": 3
150 | },
151 | "file_extension": ".py",
152 | "mimetype": "text/x-python",
153 | "name": "python",
154 | "nbconvert_exporter": "python",
155 | "pygments_lexer": "ipython3",
156 | "version": "3.8.10"
157 | }
158 | },
159 | "nbformat": 4,
160 | "nbformat_minor": 4
161 | }
--------------------------------------------------------------------------------
/full_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokern/data-lineage/5945542742979fe350d313d906440c93ee3d0f36/full_graph.png
--------------------------------------------------------------------------------
/install-manifests/docker-compose/catalog-demo.yml:
--------------------------------------------------------------------------------
1 | version: '3.6'
2 | services:
3 | tokern-demo-catalog:
4 | image: tokern/demo-catalog:latest
5 | container_name: tokern-demo-catalog
6 | restart: unless-stopped
7 | networks:
8 | - tokern-internal
9 | volumes:
10 | - tokern_demo_catalog_data:/var/lib/postgresql/data
11 | environment:
12 | POSTGRES_PASSWORD: catal0g_passw0rd
13 | POSTGRES_USER: catalog_user
14 | POSTGRES_DB: tokern
15 | tokern-api:
16 | image: tokern/data-lineage:latest
17 | container_name: tokern-data-lineage
18 | restart: unless-stopped
19 | networks:
20 | - tokern-internal
21 | environment:
22 | CATALOG_PASSWORD: catal0g_passw0rd
23 | CATALOG_USER: catalog_user
24 | CATALOG_DB: tokern
25 | CATALOG_HOST: tokern-demo-catalog
26 | GUNICORN_CMD_ARGS: "--bind 0.0.0.0:4142"
27 | toker-viz:
28 | image: tokern/data-lineage-viz:latest
29 | container_name: tokern-data-lineage-visualizer
30 | restart: unless-stopped
31 | networks:
32 | - tokern-internal
33 | - tokern-net
34 | ports:
35 | - "8000:80"
36 | networks:
37 | tokern-net: # Exposed by your host.
38 | # external: true
39 | name: "tokern-net"
40 | driver: bridge
41 | ipam:
42 | driver: default
43 | config:
44 | - subnet: 10.10.0.0/24
45 | tokern-internal:
46 | name: "tokern-internal"
47 | driver: bridge
48 | internal: true
49 | ipam:
50 | driver: default
51 | config:
52 | - subnet: 10.11.0.0/24
53 |
54 | volumes:
55 | tokern_demo_catalog_data:
56 |
--------------------------------------------------------------------------------
/install-manifests/docker-compose/tokern-lineage-engine.yml:
--------------------------------------------------------------------------------
1 | version: '3.6'
2 | services:
3 | tokern-catalog:
4 | image: postgres:13.2-alpine
5 | container_name: tokern-catalog
6 | restart: unless-stopped
7 | networks:
8 | - tokern-internal
9 | volumes:
10 | - tokern_catalog_data:/var/lib/postgresql/data
11 | environment:
12 | POSTGRES_PASSWORD: catal0g_passw0rd
13 | POSTGRES_USER: catalog_user
14 | POSTGRES_DB: tokern
15 | tokern-redis:
16 | image: redis:6.2.6-alpine
17 | container_name: tokern-redis
18 | restart: unless-stopped
19 | networks:
20 | - tokern-internal
21 | tokern-api:
22 | image: tokern/data-lineage:latest
23 | container_name: tokern-data-lineage
24 | restart: unless-stopped
25 | depends_on:
26 | - tokern-redis
27 | networks:
28 | - tokern-internal
29 | - tokern-net
30 | environment:
31 | CATALOG_PASSWORD: ${CATALOG_PASSWORD:-catal0g_passw0rd}
32 | CATALOG_USER: ${CATALOG_USER:-catalog_user}
33 | CATALOG_DB: ${CATALOG_DB:-tokern}
34 | CATALOG_HOST: ${CATALOG_HOST:-tokern-catalog}
35 | CATALOG_PORT: ${CATALOG_PORT:-5432}
36 | GUNICORN_CMD_ARGS: "--bind 0.0.0.0:4142"
37 | LOG_LEVEL: ${LOG_LEVEL:-INFO}
38 | REDIS_HOST: ${REDIS_HOST:-tokern-redis}
39 | REDIS_PORT: ${REDIS_PORT:-6379}
41 | tokern-worker:
42 | image: tokern/data-lineage:latest
43 | container_name: tokern_worker
44 | restart: unless-stopped
45 | depends_on:
46 | - tokern-redis
47 | networks:
48 | - tokern-internal
49 | command: rq worker --url redis://tokern-redis:6379
50 | tokern-viz:
51 | image: tokern/data-lineage-viz:latest
52 | container_name: tokern-data-lineage-visualizer
53 | restart: unless-stopped
54 | networks:
55 | - tokern-internal
56 | - tokern-net
57 | ports:
58 | - "8000:80"
59 | networks:
60 | tokern-net: # Exposed by your host.
61 | # external: true
62 | name: "tokern-net"
63 | driver: bridge
64 | ipam:
65 | driver: default
66 | config:
67 | - subnet: 10.10.0.0/24
68 | tokern-internal:
69 | name: "tokern-internal"
70 | driver: bridge
71 | internal: true
72 | ipam:
73 | driver: default
74 | config:
75 | - subnet: 10.11.0.0/24
76 |
77 | volumes:
78 | tokern_catalog_data:
79 |
--------------------------------------------------------------------------------
/install-manifests/docker-compose/wikimedia-demo.yml:
--------------------------------------------------------------------------------
1 | version: '3.6'
2 | services:
3 | tokern-catalog:
4 | image: postgres:13.2-alpine
5 | container_name: tokern-catalog
6 | restart: unless-stopped
7 | networks:
8 | - tokern-internal
9 | volumes:
10 | - tokern_wikimedia_catalog_data:/var/lib/postgresql/data
11 | environment:
12 | POSTGRES_PASSWORD: catal0g_passw0rd
13 | POSTGRES_USER: catalog_user
14 | POSTGRES_DB: tokern
15 | tokern-redis:
16 | image: redis:6.2.6-alpine
17 | container_name: tokern-redis
18 | restart: unless-stopped
19 | networks:
20 | - tokern-internal
21 | tokern-wikimedia:
22 | image: tokern/demo-wikimedia:latest
23 | container_name: tokern-demo-wikimedia
24 | restart: unless-stopped
25 | networks:
26 | - tokern-internal
27 | volumes:
28 | - tokern_wikimedia_data:/var/lib/postgresql/data
29 | environment:
30 | POSTGRES_PASSWORD: 3tld3v
31 | POSTGRES_USER: etldev
32 | POSTGRES_DB: wikimedia
33 | tokern-api:
34 | image: tokern/data-lineage:latest
35 | container_name: tokern-data-lineage
36 | restart: unless-stopped
37 | depends_on:
38 | - tokern-redis
39 | networks:
40 | - tokern-internal
41 | environment:
42 | CATALOG_PASSWORD: catal0g_passw0rd
43 | CATALOG_USER: catalog_user
44 | CATALOG_DB: tokern
45 | CATALOG_HOST: tokern-catalog
46 | GUNICORN_CMD_ARGS: "--bind 0.0.0.0:4142"
47 | REDIS_HOST: "tokern-redis"
48 | tokern-worker:
49 | image: tokern/data-lineage:latest
50 | container_name: tokern_worker
51 | restart: unless-stopped
52 | depends_on:
53 | - tokern-redis
54 | networks:
55 | - tokern-internal
56 | command: rq worker --url redis://tokern-redis:6379
57 | toker-viz:
58 | image: tokern/data-lineage-viz:latest
59 | container_name: tokern-data-lineage-visualizer
60 | restart: unless-stopped
61 | networks:
62 | - tokern-internal
63 | - tokern-net
64 | ports:
65 | - "8000:80"
66 | networks:
67 | tokern-net: # Exposed by your host.
68 | # external: true
69 | name: "tokern-net"
70 | driver: bridge
71 | ipam:
72 | driver: default
73 | config:
74 | - subnet: 10.10.0.0/24
75 | tokern-internal:
76 | name: "tokern-internal"
77 | driver: bridge
78 | internal: true
79 | ipam:
80 | driver: default
81 | config:
82 | - subnet: 10.11.0.0/24
83 |
84 | volumes:
85 | tokern_wikimedia_catalog_data:
86 | tokern_wikimedia_data:
--------------------------------------------------------------------------------
/install-manifests/dockerfiles/Dockerfile-demo-catalog:
--------------------------------------------------------------------------------
1 | FROM postgres:13.2-alpine
2 | COPY demo-catalog.sql /docker-entrypoint-initdb.d/
--------------------------------------------------------------------------------
/install-manifests/dockerfiles/Dockerfile-demo-wikimedia:
--------------------------------------------------------------------------------
1 | FROM postgres:13.2-alpine
2 | COPY demo-wikimedia.sql /docker-entrypoint-initdb.d/
--------------------------------------------------------------------------------
/install-manifests/dockerfiles/Makefile:
--------------------------------------------------------------------------------
1 | default: all
2 |
3 | .PHONY: default all fetch_dump
4 |
5 | date := `date '+%Y-%m-%d'`
6 | TARGET_IMAGE ?= demo-catalog
7 | VERSION ?= "0.2.0"
8 | DESTINATION_REPOSITORY ?= "tokern"
9 |
10 | all: generate_image push_to_registry finished
11 |
12 | check_vars:
13 | @test -n "$(DESTINATION_REPOSITORY)" || (echo "You need to set DESTINATION_REPOSITORY environment variable" >&2 && exit 1)
14 |
15 | generate_image:
16 | @docker build . -f Dockerfile-$(TARGET_IMAGE) -t $(TARGET_IMAGE)\:latest -t $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE)\:latest -t $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE)\:$(VERSION)
17 |
18 | push_to_registry:
19 | @echo ""
20 | @echo "====== Pushing image to repository ======"
21 | @docker push $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE):latest
22 | @docker push $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE):$(VERSION)
23 |
24 | finished:
25 | @echo ""
26 | @echo "Finished with success. Pushed image to $(DESTINATION_REPOSITORY)/$(TARGET_IMAGE)"
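27 | 
28 | # Example (illustrative): make TARGET_IMAGE=demo-wikimedia VERSION=0.2.1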
--------------------------------------------------------------------------------
/install-manifests/dockerfiles/demo-catalog.sql:
--------------------------------------------------------------------------------
1 | --
2 | -- PostgreSQL database dump
3 | --
4 |
5 | -- Dumped from database version 13.2 (Debian 13.2-1.pgdg100+1)
6 | -- Dumped by pg_dump version 13.3 (Ubuntu 13.3-1.pgdg20.04+1)
7 |
8 | SET statement_timeout = 0;
9 | SET lock_timeout = 0;
10 | SET idle_in_transaction_session_timeout = 0;
11 | SET client_encoding = 'UTF8';
12 | SET standard_conforming_strings = on;
13 | SELECT pg_catalog.set_config('search_path', '', false);
14 | SET check_function_bodies = false;
15 | SET xmloption = content;
16 | SET client_min_messages = warning;
17 | SET row_security = off;
18 |
19 | --
20 | -- Name: jobexecutionstatus; Type: TYPE; Schema: public; Owner: catalog_user
21 | --
22 |
23 | CREATE TYPE public.jobexecutionstatus AS ENUM (
24 | 'SUCCESS',
25 | 'FAILURE'
26 | );
27 |
28 |
29 | ALTER TYPE public.jobexecutionstatus OWNER TO catalog_user;
30 |
31 | SET default_tablespace = '';
32 |
33 | SET default_table_access_method = heap;
34 |
35 | --
36 | -- Name: alembic_version; Type: TABLE; Schema: public; Owner: catalog_user
37 | --
38 |
39 | CREATE TABLE public.alembic_version (
40 | version_num character varying(32) NOT NULL
41 | );
42 |
43 |
44 | ALTER TABLE public.alembic_version OWNER TO catalog_user;
45 |
46 | --
47 | -- Name: column_lineage; Type: TABLE; Schema: public; Owner: catalog_user
48 | --
49 |
50 | CREATE TABLE public.column_lineage (
51 | id integer NOT NULL,
52 | context jsonb,
53 | source_id integer,
54 | target_id integer,
55 | job_execution_id integer
56 | );
57 |
58 |
59 | ALTER TABLE public.column_lineage OWNER TO catalog_user;
60 |
61 | --
62 | -- Name: column_lineage_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user
63 | --
64 |
65 | CREATE SEQUENCE public.column_lineage_id_seq
66 | AS integer
67 | START WITH 1
68 | INCREMENT BY 1
69 | NO MINVALUE
70 | NO MAXVALUE
71 | CACHE 1;
72 |
73 |
74 | ALTER TABLE public.column_lineage_id_seq OWNER TO catalog_user;
75 |
76 | --
77 | -- Name: column_lineage_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user
78 | --
79 |
80 | ALTER SEQUENCE public.column_lineage_id_seq OWNED BY public.column_lineage.id;
81 |
82 |
83 | --
84 | -- Name: columns; Type: TABLE; Schema: public; Owner: catalog_user
85 | --
86 |
87 | CREATE TABLE public.columns (
88 | id integer NOT NULL,
89 | name character varying,
90 | data_type character varying,
91 | sort_order integer,
92 | table_id integer
93 | );
94 |
95 |
96 | ALTER TABLE public.columns OWNER TO catalog_user;
97 |
98 | --
99 | -- Name: columns_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user
100 | --
101 |
102 | CREATE SEQUENCE public.columns_id_seq
103 | AS integer
104 | START WITH 1
105 | INCREMENT BY 1
106 | NO MINVALUE
107 | NO MAXVALUE
108 | CACHE 1;
109 |
110 |
111 | ALTER TABLE public.columns_id_seq OWNER TO catalog_user;
112 |
113 | --
114 | -- Name: columns_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user
115 | --
116 |
117 | ALTER SEQUENCE public.columns_id_seq OWNED BY public.columns.id;
118 |
119 |
120 | --
121 | -- Name: default_schema; Type: TABLE; Schema: public; Owner: catalog_user
122 | --
123 |
124 | CREATE TABLE public.default_schema (
125 | source_id integer NOT NULL,
126 | schema_id integer
127 | );
128 |
129 |
130 | ALTER TABLE public.default_schema OWNER TO catalog_user;
131 |
132 | --
133 | -- Name: job_executions; Type: TABLE; Schema: public; Owner: catalog_user
134 | --
135 |
136 | CREATE TABLE public.job_executions (
137 | id integer NOT NULL,
138 | job_id integer,
139 | started_at timestamp without time zone,
140 | ended_at timestamp without time zone,
141 | status public.jobexecutionstatus
142 | );
143 |
144 |
145 | ALTER TABLE public.job_executions OWNER TO catalog_user;
146 |
147 | --
148 | -- Name: job_executions_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user
149 | --
150 |
151 | CREATE SEQUENCE public.job_executions_id_seq
152 | AS integer
153 | START WITH 1
154 | INCREMENT BY 1
155 | NO MINVALUE
156 | NO MAXVALUE
157 | CACHE 1;
158 |
159 |
160 | ALTER TABLE public.job_executions_id_seq OWNER TO catalog_user;
161 |
162 | --
163 | -- Name: job_executions_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user
164 | --
165 |
166 | ALTER SEQUENCE public.job_executions_id_seq OWNED BY public.job_executions.id;
167 |
168 |
169 | --
170 | -- Name: jobs; Type: TABLE; Schema: public; Owner: catalog_user
171 | --
172 |
173 | CREATE TABLE public.jobs (
174 | id integer NOT NULL,
175 | name character varying,
176 | context jsonb,
177 | source_id integer
178 | );
179 |
180 |
181 | ALTER TABLE public.jobs OWNER TO catalog_user;
182 |
183 | --
184 | -- Name: jobs_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user
185 | --
186 |
187 | CREATE SEQUENCE public.jobs_id_seq
188 | AS integer
189 | START WITH 1
190 | INCREMENT BY 1
191 | NO MINVALUE
192 | NO MAXVALUE
193 | CACHE 1;
194 |
195 |
196 | ALTER TABLE public.jobs_id_seq OWNER TO catalog_user;
197 |
198 | --
199 | -- Name: jobs_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user
200 | --
201 |
202 | ALTER SEQUENCE public.jobs_id_seq OWNED BY public.jobs.id;
203 |
204 |
205 | --
206 | -- Name: schemata; Type: TABLE; Schema: public; Owner: catalog_user
207 | --
208 |
209 | CREATE TABLE public.schemata (
210 | id integer NOT NULL,
211 | name character varying,
212 | source_id integer
213 | );
214 |
215 |
216 | ALTER TABLE public.schemata OWNER TO catalog_user;
217 |
218 | --
219 | -- Name: schemata_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user
220 | --
221 |
222 | CREATE SEQUENCE public.schemata_id_seq
223 | AS integer
224 | START WITH 1
225 | INCREMENT BY 1
226 | NO MINVALUE
227 | NO MAXVALUE
228 | CACHE 1;
229 |
230 |
231 | ALTER TABLE public.schemata_id_seq OWNER TO catalog_user;
232 |
233 | --
234 | -- Name: schemata_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user
235 | --
236 |
237 | ALTER SEQUENCE public.schemata_id_seq OWNED BY public.schemata.id;
238 |
239 |
240 | --
241 | -- Name: sources; Type: TABLE; Schema: public; Owner: catalog_user
242 | --
243 |
244 | CREATE TABLE public.sources (
245 | id integer NOT NULL,
246 | source_type character varying,
247 | name character varying,
248 | dialect character varying,
249 | uri character varying,
250 | port character varying,
251 | username character varying,
252 | password character varying,
253 | database character varying,
254 | instance character varying,
255 | cluster character varying,
256 | project_id character varying,
257 | project_credentials character varying,
258 | page_size character varying,
259 | filter_key character varying,
260 | included_tables_regex character varying,
261 | key_path character varying,
262 | account character varying,
263 | role character varying,
264 | warehouse character varying
265 | );
266 |
267 |
268 | ALTER TABLE public.sources OWNER TO catalog_user;
269 |
270 | --
271 | -- Name: sources_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user
272 | --
273 |
274 | CREATE SEQUENCE public.sources_id_seq
275 | AS integer
276 | START WITH 1
277 | INCREMENT BY 1
278 | NO MINVALUE
279 | NO MAXVALUE
280 | CACHE 1;
281 |
282 |
283 | ALTER TABLE public.sources_id_seq OWNER TO catalog_user;
284 |
285 | --
286 | -- Name: sources_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user
287 | --
288 |
289 | ALTER SEQUENCE public.sources_id_seq OWNED BY public.sources.id;
290 |
291 |
292 | --
293 | -- Name: tables; Type: TABLE; Schema: public; Owner: catalog_user
294 | --
295 |
296 | CREATE TABLE public.tables (
297 | id integer NOT NULL,
298 | name character varying,
299 | schema_id integer
300 | );
301 |
302 |
303 | ALTER TABLE public.tables OWNER TO catalog_user;
304 |
305 | --
306 | -- Name: tables_id_seq; Type: SEQUENCE; Schema: public; Owner: catalog_user
307 | --
308 |
309 | CREATE SEQUENCE public.tables_id_seq
310 | AS integer
311 | START WITH 1
312 | INCREMENT BY 1
313 | NO MINVALUE
314 | NO MAXVALUE
315 | CACHE 1;
316 |
317 |
318 | ALTER TABLE public.tables_id_seq OWNER TO catalog_user;
319 |
320 | --
321 | -- Name: tables_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: catalog_user
322 | --
323 |
324 | ALTER SEQUENCE public.tables_id_seq OWNED BY public.tables.id;
325 |
326 |
327 | --
328 | -- Name: column_lineage id; Type: DEFAULT; Schema: public; Owner: catalog_user
329 | --
330 |
331 | ALTER TABLE ONLY public.column_lineage ALTER COLUMN id SET DEFAULT nextval('public.column_lineage_id_seq'::regclass);
332 |
333 |
334 | --
335 | -- Name: columns id; Type: DEFAULT; Schema: public; Owner: catalog_user
336 | --
337 |
338 | ALTER TABLE ONLY public.columns ALTER COLUMN id SET DEFAULT nextval('public.columns_id_seq'::regclass);
339 |
340 |
341 | --
342 | -- Name: job_executions id; Type: DEFAULT; Schema: public; Owner: catalog_user
343 | --
344 |
345 | ALTER TABLE ONLY public.job_executions ALTER COLUMN id SET DEFAULT nextval('public.job_executions_id_seq'::regclass);
346 |
347 |
348 | --
349 | -- Name: jobs id; Type: DEFAULT; Schema: public; Owner: catalog_user
350 | --
351 |
352 | ALTER TABLE ONLY public.jobs ALTER COLUMN id SET DEFAULT nextval('public.jobs_id_seq'::regclass);
353 |
354 |
355 | --
356 | -- Name: schemata id; Type: DEFAULT; Schema: public; Owner: catalog_user
357 | --
358 |
359 | ALTER TABLE ONLY public.schemata ALTER COLUMN id SET DEFAULT nextval('public.schemata_id_seq'::regclass);
360 |
361 |
362 | --
363 | -- Name: sources id; Type: DEFAULT; Schema: public; Owner: catalog_user
364 | --
365 |
366 | ALTER TABLE ONLY public.sources ALTER COLUMN id SET DEFAULT nextval('public.sources_id_seq'::regclass);
367 |
368 |
369 | --
370 | -- Name: tables id; Type: DEFAULT; Schema: public; Owner: catalog_user
371 | --
372 |
373 | ALTER TABLE ONLY public.tables ALTER COLUMN id SET DEFAULT nextval('public.tables_id_seq'::regclass);
374 |
375 |
376 | --
377 | -- Data for Name: alembic_version; Type: TABLE DATA; Schema: public; Owner: catalog_user
378 | --
379 |
380 | COPY public.alembic_version (version_num) FROM stdin;
381 | d1daff1715f7
382 | \.
383 |
384 |
385 | --
386 | -- Data for Name: column_lineage; Type: TABLE DATA; Schema: public; Owner: catalog_user
387 | --
388 |
389 | COPY public.column_lineage (id, context, source_id, target_id, job_execution_id) FROM stdin;
390 | 1 {} 5 10 1
391 | 2 {} 7 11 1
392 | 3 {} 7 12 1
393 | 4 {} 5 13 1
394 | 5 {} 6 14 1
395 | 6 {} 5 15 2
396 | 7 {} 7 16 2
397 | 8 {} 7 17 2
398 | 9 {} 5 18 2
399 | 10 {} 6 19 2
400 | 11 {} 15 20 3
401 | 12 {} 16 21 3
402 | 13 {} 17 22 3
403 | 14 {} 18 23 3
404 | 15 {} 19 24 3
405 | 16 {} 2 25 4
406 | 17 {} 3 26 4
407 | 18 {} 4 27 4
408 | 19 {} 23 29 5
409 | 20 {} 22 30 5
410 | 21 {} 22 31 5
411 | 22 {} 27 32 5
412 | 23 {} 28 33 5
413 | \.
414 |
415 |
416 | --
417 | -- Data for Name: columns; Type: TABLE DATA; Schema: public; Owner: catalog_user
418 | --
419 |
420 | COPY public.columns (id, name, data_type, sort_order, table_id) FROM stdin;
421 | 1 group STRING 0 1
422 | 2 page_title STRING 1 1
423 | 3 views BIGINT 2 1
424 | 4 bytes_sent BIGINT 3 1
425 | 5 page_id BIGINT 0 2
426 | 6 page_latest BIGINT 1 2
427 | 7 page_title STRING 2 2
428 | 8 rd_from BIGINT 0 3
429 | 9 page_title STRING 1 3
430 | 10 redirect_id BIGINT 0 4
431 | 11 redirect_title STRING 1 4
432 | 12 true_title STRING 2 4
433 | 13 page_id BIGINT 3 4
434 | 14 page_version BIGINT 4 4
435 | 15 redirect_id BIGINT 0 5
436 | 16 redirect_title STRING 1 5
437 | 17 true_title STRING 2 5
438 | 18 page_id BIGINT 3 5
439 | 19 page_version BIGINT 4 5
440 | 20 redirect_id bigint 0 6
441 | 21 redirect_title STRING 1 6
442 | 22 true_title STRING 2 6
443 | 23 page_id BIGINT 3 6
444 | 24 page_version BIGINT 4 6
445 | 25 group STRING 0 7
446 | 26 page_title STRING 1 7
447 | 27 views BIGINT 2 7
448 | 28 bytes_sent BIGINT 3 7
449 | 29 page_id BIGINT 0 8
450 | 30 page_title STRING 1 8
451 | 31 page_url STRING 2 8
452 | 32 views BIGINT 3 8
453 | 33 bytes_sent BIGINT 4 8
454 | \.
455 |
456 |
457 | --
458 | -- Data for Name: default_schema; Type: TABLE DATA; Schema: public; Owner: catalog_user
459 | --
460 |
461 | COPY public.default_schema (source_id, schema_id) FROM stdin;
462 | \.
463 |
464 |
465 | --
466 | -- Data for Name: job_executions; Type: TABLE DATA; Schema: public; Owner: catalog_user
467 | --
468 |
469 | COPY public.job_executions (id, job_id, started_at, ended_at, status) FROM stdin;
470 | 1 1 2021-07-29 23:11:44.470984 2021-07-29 23:11:44.470993 SUCCESS
471 | 2 2 2021-07-29 23:11:44.61084 2021-07-29 23:11:44.610849 SUCCESS
472 | 3 3 2021-07-29 23:11:44.717093 2021-07-29 23:11:44.717101 SUCCESS
473 | 4 4 2021-07-29 23:11:44.842395 2021-07-29 23:11:44.84241 SUCCESS
474 | 5 5 2021-07-29 23:11:44.949858 2021-07-29 23:11:44.949867 SUCCESS
475 | \.
476 |
477 |
478 | --
479 | -- Data for Name: jobs; Type: TABLE DATA; Schema: public; Owner: catalog_user
480 | --
481 |
482 | COPY public.jobs (id, name, context, source_id) FROM stdin;
483 | 1 LOAD page_lookup_nonredirect {"query": "INSERT INTO page_lookup_nonredirect SELECT page.page_id as redircet_id, page.page_title as redirect_title, page.page_title true_title, page.page_id, page.page_latest FROM page LEFT OUTER JOIN redirect ON page.page_id = redirect.rd_from WHERE redirect.rd_from IS NULL "} 1
484 | 2 LOAD page_lookup_redirect {"query": "insert into page_lookup_redirect select original_page.page_id redirect_id, original_page.page_title redirect_title, final_page.page_title as true_title, final_page.page_id, final_page.page_latest from page final_page join redirect on (redirect.page_title = final_page.page_title) join page original_page on (redirect.rd_from = original_page.page_id)"} 1
485 | 3 LOAD page_lookup {"query": "INSERT INTO page_lookup SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, plr.page_version FROM page_lookup_redirect plr"} 1
486 | 4 LOAD filtered_pagecounts {"query": "INSERT INTO filtered_pagecounts(\\"group\\", page_title, views) SELECT regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\\\\s*([a-zA-Z0-9]+).*','$1') page_title,SUM (pvs.views) AS total_views, SUM (pvs.bytes_sent) AS total_bytes_sent FROM pagecounts as pvs WHERE not pvs.page_title LIKE '(MEDIA|SPECIAL||Talk|User|User_talk|Project|Project_talk|File|File_talk|MediaWiki|MediaWiki_talk|Template|Template_talk|Help|Help_talk|Category|Category_talk|Portal|Wikipedia|Wikipedia_talk|upload|Special)\\\\:(.*)' and pvs.page_title LIKE '^([A-Z])(.*)' and not pvs.page_title LIKE '(.*).(jpg|gif|png|JPG|GIF|PNG|txt|ico)$' and pvs.page_title <> '404_error/' and pvs.page_title <> 'Main_Page' and pvs.page_title <> 'Hypertext_Transfer_Protocol' and pvs.page_title <> 'Favicon.ico' and pvs.page_title <> 'Search' and pvs.dt = '2020-01-01' GROUP BY regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\\\\s*([a-zA-Z0-9]+).*','$1')"} 1
487 | 5 LOAD normalized_pagecounts {"query": "INSERT INTO normalized_pagecounts SELECT pl.page_id page_id, REGEXP_REPLACE(pl.true_title, '_', ' ') page_title, pl.true_title page_url, fp.views, fp.bytes_sent FROM page_lookup pl JOIN filtered_pagecounts fp ON fp.page_title = pl.redirect_title where fp.dt='2020-01-01'"} 1
488 | \.
489 |
490 |
491 | --
492 | -- Data for Name: schemata; Type: TABLE DATA; Schema: public; Owner: catalog_user
493 | --
494 |
495 | COPY public.schemata (id, name, source_id) FROM stdin;
496 | 1 default 1
497 | \.
498 |
499 |
500 | --
501 | -- Data for Name: sources; Type: TABLE DATA; Schema: public; Owner: catalog_user
502 | --
503 |
504 | COPY public.sources (id, source_type, name, dialect, uri, port, username, password, database, instance, cluster, project_id, project_credentials, page_size, filter_key, included_tables_regex, key_path, account, role, warehouse) FROM stdin;
505 | 1 redshift test \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N
506 | \.
507 |
508 |
509 | --
510 | -- Data for Name: tables; Type: TABLE DATA; Schema: public; Owner: catalog_user
511 | --
512 |
513 | COPY public.tables (id, name, schema_id) FROM stdin;
514 | 1 pagecounts 1
515 | 2 page 1
516 | 3 redirect 1
517 | 4 page_lookup_nonredirect 1
518 | 5 page_lookup_redirect 1
519 | 6 page_lookup 1
520 | 7 filtered_pagecounts 1
521 | 8 normalized_pagecounts 1
522 | \.
523 |
524 |
525 | --
526 | -- Name: column_lineage_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user
527 | --
528 |
529 | SELECT pg_catalog.setval('public.column_lineage_id_seq', 23, true);
530 |
531 |
532 | --
533 | -- Name: columns_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user
534 | --
535 |
536 | SELECT pg_catalog.setval('public.columns_id_seq', 33, true);
537 |
538 |
539 | --
540 | -- Name: job_executions_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user
541 | --
542 |
543 | SELECT pg_catalog.setval('public.job_executions_id_seq', 5, true);
544 |
545 |
546 | --
547 | -- Name: jobs_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user
548 | --
549 |
550 | SELECT pg_catalog.setval('public.jobs_id_seq', 5, true);
551 |
552 |
553 | --
554 | -- Name: schemata_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user
555 | --
556 |
557 | SELECT pg_catalog.setval('public.schemata_id_seq', 1, true);
558 |
559 |
560 | --
561 | -- Name: sources_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user
562 | --
563 |
564 | SELECT pg_catalog.setval('public.sources_id_seq', 1, true);
565 |
566 |
567 | --
568 | -- Name: tables_id_seq; Type: SEQUENCE SET; Schema: public; Owner: catalog_user
569 | --
570 |
571 | SELECT pg_catalog.setval('public.tables_id_seq', 8, true);
572 |
573 |
574 | --
575 | -- Name: alembic_version alembic_version_pkc; Type: CONSTRAINT; Schema: public; Owner: catalog_user
576 | --
577 |
578 | ALTER TABLE ONLY public.alembic_version
579 | ADD CONSTRAINT alembic_version_pkc PRIMARY KEY (version_num);
580 |
581 |
582 | --
583 | -- Name: column_lineage column_lineage_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user
584 | --
585 |
586 | ALTER TABLE ONLY public.column_lineage
587 | ADD CONSTRAINT column_lineage_pkey PRIMARY KEY (id);
588 |
589 |
590 | --
591 | -- Name: columns columns_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user
592 | --
593 |
594 | ALTER TABLE ONLY public.columns
595 | ADD CONSTRAINT columns_pkey PRIMARY KEY (id);
596 |
597 |
598 | --
599 | -- Name: default_schema default_schema_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user
600 | --
601 |
602 | ALTER TABLE ONLY public.default_schema
603 | ADD CONSTRAINT default_schema_pkey PRIMARY KEY (source_id);
604 |
605 |
606 | --
607 | -- Name: job_executions job_executions_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user
608 | --
609 |
610 | ALTER TABLE ONLY public.job_executions
611 | ADD CONSTRAINT job_executions_pkey PRIMARY KEY (id);
612 |
613 |
614 | --
615 | -- Name: jobs jobs_name_key; Type: CONSTRAINT; Schema: public; Owner: catalog_user
616 | --
617 |
618 | ALTER TABLE ONLY public.jobs
619 | ADD CONSTRAINT jobs_name_key UNIQUE (name);
620 |
621 |
622 | --
623 | -- Name: jobs jobs_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user
624 | --
625 |
626 | ALTER TABLE ONLY public.jobs
627 | ADD CONSTRAINT jobs_pkey PRIMARY KEY (id);
628 |
629 |
630 | --
631 | -- Name: jobs jobs_source_id_name_key; Type: CONSTRAINT; Schema: public; Owner: catalog_user
632 | --
633 |
634 | ALTER TABLE ONLY public.jobs
635 | ADD CONSTRAINT jobs_source_id_name_key UNIQUE (source_id, name);
636 |
637 |
638 | --
639 | -- Name: schemata schemata_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user
640 | --
641 |
642 | ALTER TABLE ONLY public.schemata
643 | ADD CONSTRAINT schemata_pkey PRIMARY KEY (id);
644 |
645 |
646 | --
647 | -- Name: sources sources_name_key; Type: CONSTRAINT; Schema: public; Owner: catalog_user
648 | --
649 |
650 | ALTER TABLE ONLY public.sources
651 | ADD CONSTRAINT sources_name_key UNIQUE (name);
652 |
653 |
654 | --
655 | -- Name: sources sources_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user
656 | --
657 |
658 | ALTER TABLE ONLY public.sources
659 | ADD CONSTRAINT sources_pkey PRIMARY KEY (id);
660 |
661 |
662 | --
663 | -- Name: tables tables_pkey; Type: CONSTRAINT; Schema: public; Owner: catalog_user
664 | --
665 |
666 | ALTER TABLE ONLY public.tables
667 | ADD CONSTRAINT tables_pkey PRIMARY KEY (id);
668 |
669 |
670 | --
671 | -- Name: columns unique_column_name; Type: CONSTRAINT; Schema: public; Owner: catalog_user
672 | --
673 |
674 | ALTER TABLE ONLY public.columns
675 | ADD CONSTRAINT unique_column_name UNIQUE (table_id, name);
676 |
677 |
678 | --
679 | -- Name: column_lineage unique_lineage; Type: CONSTRAINT; Schema: public; Owner: catalog_user
680 | --
681 |
682 | ALTER TABLE ONLY public.column_lineage
683 | ADD CONSTRAINT unique_lineage UNIQUE (source_id, target_id, job_execution_id);
684 |
685 |
686 | --
687 | -- Name: schemata unique_schema_name; Type: CONSTRAINT; Schema: public; Owner: catalog_user
688 | --
689 |
690 | ALTER TABLE ONLY public.schemata
691 | ADD CONSTRAINT unique_schema_name UNIQUE (source_id, name);
692 |
693 |
694 | --
695 | -- Name: tables unique_table_name; Type: CONSTRAINT; Schema: public; Owner: catalog_user
696 | --
697 |
698 | ALTER TABLE ONLY public.tables
699 | ADD CONSTRAINT unique_table_name UNIQUE (schema_id, name);
700 |
701 |
702 | --
703 | -- Name: column_lineage column_lineage_job_execution_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user
704 | --
705 |
706 | ALTER TABLE ONLY public.column_lineage
707 | ADD CONSTRAINT column_lineage_job_execution_id_fkey FOREIGN KEY (job_execution_id) REFERENCES public.job_executions(id);
708 |
709 |
710 | --
711 | -- Name: column_lineage column_lineage_source_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user
712 | --
713 |
714 | ALTER TABLE ONLY public.column_lineage
715 | ADD CONSTRAINT column_lineage_source_id_fkey FOREIGN KEY (source_id) REFERENCES public.columns(id);
716 |
717 |
718 | --
719 | -- Name: column_lineage column_lineage_target_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user
720 | --
721 |
722 | ALTER TABLE ONLY public.column_lineage
723 | ADD CONSTRAINT column_lineage_target_id_fkey FOREIGN KEY (target_id) REFERENCES public.columns(id);
724 |
725 |
726 | --
727 | -- Name: columns columns_table_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user
728 | --
729 |
730 | ALTER TABLE ONLY public.columns
731 | ADD CONSTRAINT columns_table_id_fkey FOREIGN KEY (table_id) REFERENCES public.tables(id);
732 |
733 |
734 | --
735 | -- Name: default_schema default_schema_schema_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user
736 | --
737 |
738 | ALTER TABLE ONLY public.default_schema
739 | ADD CONSTRAINT default_schema_schema_id_fkey FOREIGN KEY (schema_id) REFERENCES public.schemata(id);
740 |
741 |
742 | --
743 | -- Name: default_schema default_schema_source_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user
744 | --
745 |
746 | ALTER TABLE ONLY public.default_schema
747 | ADD CONSTRAINT default_schema_source_id_fkey FOREIGN KEY (source_id) REFERENCES public.sources(id);
748 |
749 |
750 | --
751 | -- Name: job_executions job_executions_job_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user
752 | --
753 |
754 | ALTER TABLE ONLY public.job_executions
755 | ADD CONSTRAINT job_executions_job_id_fkey FOREIGN KEY (job_id) REFERENCES public.jobs(id);
756 |
757 |
758 | --
759 | -- Name: jobs jobs_source_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user
760 | --
761 |
762 | ALTER TABLE ONLY public.jobs
763 | ADD CONSTRAINT jobs_source_id_fkey FOREIGN KEY (source_id) REFERENCES public.sources(id);
764 |
765 |
766 | --
767 | -- Name: schemata schemata_source_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user
768 | --
769 |
770 | ALTER TABLE ONLY public.schemata
771 | ADD CONSTRAINT schemata_source_id_fkey FOREIGN KEY (source_id) REFERENCES public.sources(id);
772 |
773 |
774 | --
775 | -- Name: tables tables_schema_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: catalog_user
776 | --
777 |
778 | ALTER TABLE ONLY public.tables
779 | ADD CONSTRAINT tables_schema_id_fkey FOREIGN KEY (schema_id) REFERENCES public.schemata(id);
780 |
781 |
782 | --
783 | -- PostgreSQL database dump complete
784 | --
785 |
786 |
--------------------------------------------------------------------------------
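A note on the schema above: column-level lineage is stored as rows in column_lineage that point at source and target ids in columns, tied to a job_execution and, through it, to the job that ran the query. A minimal Python sketch that resolves those ids back into table and column names; it assumes demo-catalog.sql has been loaded into a reachable PostgreSQL database, and the connection settings below (reused from the test configuration) are assumptions, not fixed defaults:

# Minimal sketch: list the column-level lineage edges stored by the demo catalog.
# Assumes demo-catalog.sql has been loaded into a reachable PostgreSQL database;
# host, database and credentials below are assumptions; adjust to your setup.
import psycopg2

LINEAGE_SQL = """
SELECT j.name, src_t.name, src_c.name, tgt_t.name, tgt_c.name
FROM column_lineage cl
JOIN columns src_c ON src_c.id = cl.source_id
JOIN tables  src_t ON src_t.id = src_c.table_id
JOIN columns tgt_c ON tgt_c.id = cl.target_id
JOIN tables  tgt_t ON tgt_t.id = tgt_c.table_id
JOIN job_executions je ON je.id = cl.job_execution_id
JOIN jobs j ON j.id = je.job_id
ORDER BY cl.id
"""

conn = psycopg2.connect(
    host="127.0.0.1", dbname="tokern",
    user="catalog_user", password="catal0g_passw0rd",  # assumed credentials
)
with conn, conn.cursor() as cur:
    cur.execute(LINEAGE_SQL)
    for job, src_table, src_col, tgt_table, tgt_col in cur:
        print(f"{job}: {src_table}.{src_col} -> {tgt_table}.{tgt_col}")
conn.close()

Against the demo data this prints the 23 edges recorded for the five LOAD jobs.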
/install-manifests/dockerfiles/demo-wikimedia.sql:
--------------------------------------------------------------------------------
1 | --
2 | -- PostgreSQL database dump
3 | --
4 |
5 | -- Dumped from database version 13.2 (Debian 13.2-1.pgdg100+1)
6 | -- Dumped by pg_dump version 13.3 (Ubuntu 13.3-1.pgdg20.04+1)
7 |
8 | SET statement_timeout = 0;
9 | SET lock_timeout = 0;
10 | SET idle_in_transaction_session_timeout = 0;
11 | SET client_encoding = 'UTF8';
12 | SET standard_conforming_strings = on;
13 | SELECT pg_catalog.set_config('search_path', '', false);
14 | SET check_function_bodies = false;
15 | SET xmloption = content;
16 | SET client_min_messages = warning;
17 | SET row_security = off;
18 |
19 | SET default_tablespace = '';
20 |
21 | SET default_table_access_method = heap;
22 |
23 | --
24 | -- Name: filtered_pagecounts; Type: TABLE; Schema: public; Owner: etldev
25 | --
26 |
27 | CREATE TABLE public.filtered_pagecounts (
28 | "group" character varying,
29 | page_title character varying,
30 | views bigint,
31 | bytes_sent bigint
32 | );
33 |
34 |
35 | ALTER TABLE public.filtered_pagecounts OWNER TO etldev;
36 |
37 | --
38 | -- Name: page_lookup; Type: TABLE; Schema: public; Owner: etldev
39 | --
40 |
41 | CREATE TABLE public.page_lookup (
42 | redirect_id bigint,
43 |     redirect_title character varying,
44 | true_title character varying,
45 | page_id bigint,
46 | page_version bigint
47 | );
48 |
49 |
50 | ALTER TABLE public.page_lookup OWNER TO etldev;
51 |
52 | --
53 | -- Name: normalized_pagecounts; Type: TABLE; Schema: public; Owner: etldev
54 | --
55 |
56 | CREATE TABLE public.normalized_pagecounts (
57 | page_id bigint,
58 | page_title character varying,
59 | page_url character varying,
60 | views bigint,
61 | bytes_sent bigint
62 | );
63 |
64 |
65 | ALTER TABLE public.normalized_pagecounts OWNER TO etldev;
66 |
67 | --
68 | -- Name: page; Type: TABLE; Schema: public; Owner: etldev
69 | --
70 |
71 | CREATE TABLE public.page (
72 | page_id bigint,
73 | page_latest bigint,
74 | page_title character varying
75 | );
76 |
77 |
78 | ALTER TABLE public.page OWNER TO etldev;
79 |
80 | --
81 | -- Name: page_lookup_nonredirect; Type: TABLE; Schema: public; Owner: etldev
82 | --
83 |
84 | CREATE TABLE public.page_lookup_nonredirect (
85 | redirect_id bigint,
86 |     redirect_title character varying,
87 | true_title character varying,
88 | page_id bigint,
89 | page_version bigint
90 | );
91 |
92 |
93 | ALTER TABLE public.page_lookup_nonredirect OWNER TO etldev;
94 |
95 | --
96 | -- Name: page_lookup_redirect; Type: TABLE; Schema: public; Owner: etldev
97 | --
98 |
99 | CREATE TABLE public.page_lookup_redirect (
100 | redirect_id bigint,
101 |     redirect_title character varying,
102 | true_title character varying,
103 | page_id bigint,
104 | page_version bigint
105 | );
106 |
107 |
108 | ALTER TABLE public.page_lookup_redirect OWNER TO etldev;
109 |
110 | --
111 | -- Name: pagecounts; Type: TABLE; Schema: public; Owner: etldev
112 | --
113 |
114 | CREATE TABLE public.pagecounts (
115 | "group" character varying,
116 | page_title character varying,
117 | views bigint,
118 | bytes_sent bigint
119 | );
120 |
121 |
122 | ALTER TABLE public.pagecounts OWNER TO etldev;
123 |
124 | --
125 | -- Name: redirect; Type: TABLE; Schema: public; Owner: etldev
126 | --
127 |
128 | CREATE TABLE public.redirect (
129 | rd_from bigint,
130 | page_title character varying
131 | );
132 |
133 |
134 | ALTER TABLE public.redirect OWNER TO etldev;
135 |
136 | --
137 | -- Data for Name: filtered_pagecounts; Type: TABLE DATA; Schema: public; Owner: etldev
138 | --
139 |
140 | COPY public.filtered_pagecounts ("group", page_title, views, bytes_sent) FROM stdin;
141 | \.
142 |
143 |
144 | --
145 | -- Data for Name: page_lookup; Type: TABLE DATA; Schema: public; Owner: etldev
146 | --
147 |
148 | COPY public.page_lookup (redirect_id, redirect_title, true_title, page_id, page_version) FROM stdin;
149 | \.
150 |
151 |
152 | --
153 | -- Data for Name: normalized_pagecounts; Type: TABLE DATA; Schema: public; Owner: etldev
154 | --
155 |
156 | COPY public.normalized_pagecounts (page_id, page_title, page_url, views, bytes_sent) FROM stdin;
157 | \.
158 |
159 |
160 | --
161 | -- Data for Name: page; Type: TABLE DATA; Schema: public; Owner: etldev
162 | --
163 |
164 | COPY public.page (page_id, page_latest, page_title) FROM stdin;
165 | \.
166 |
167 |
168 | --
169 | -- Data for Name: page_lookup_nonredirect; Type: TABLE DATA; Schema: public; Owner: etldev
170 | --
171 |
172 | COPY public.page_lookup_nonredirect (redirect_id, redirect_title, true_title, page_id, page_version) FROM stdin;
173 | \.
174 |
175 |
176 | --
177 | -- Data for Name: page_lookup_redirect; Type: TABLE DATA; Schema: public; Owner: etldev
178 | --
179 |
180 | COPY public.page_lookup_redirect (redirect_id, redirect_title, true_title, page_id, page_version) FROM stdin;
181 | \.
182 |
183 |
184 | --
185 | -- Data for Name: pagecounts; Type: TABLE DATA; Schema: public; Owner: etldev
186 | --
187 |
188 | COPY public.pagecounts ("group", page_title, views, bytes_sent) FROM stdin;
189 | \.
190 |
191 |
192 | --
193 | -- Data for Name: redirect; Type: TABLE DATA; Schema: public; Owner: etldev
194 | --
195 |
196 | COPY public.redirect (rd_from, page_title) FROM stdin;
197 | \.
198 |
199 |
200 | --
201 | -- PostgreSQL database dump complete
202 | --
203 |
204 |
--------------------------------------------------------------------------------
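These tables are the sources and targets of the five ETL statements kept in the demo catalog's jobs table (and in test/queries.json further down). A minimal sketch of how one of those statements is parsed and its tables resolved, using the same calls as test/test_dml_visitor.py; no catalog or database connection is needed for this purely syntactic pass, and the visitor name is just an arbitrary label:

# Minimal sketch: resolve the source and target tables of one demo ETL query.
# Mirrors the parse/visitor usage in test/test_dml_visitor.py; purely syntactic,
# so no catalog or database connection is required.
from data_lineage.parser import parse
from data_lineage.parser.dml_visitor import SelectSourceVisitor

query = (
    "INSERT INTO page_lookup "
    "SELECT plr.redirect_id, plr.redirect_title, plr.true_title, "
    "plr.page_id, plr.page_version "
    "FROM page_lookup_redirect plr"
)

parsed = parse(query, name="LOAD page_lookup")   # the name is an arbitrary label
visitor = SelectSourceVisitor(parsed.name)
visitor(parsed.node)                             # walk the parse tree
target, sources, columns = visitor.resolve()

print("target :", target)     # (None, 'page_lookup')
print("sources:", sources)    # [(None, 'page_lookup_redirect')]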
/one_task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokern/data-lineage/5945542742979fe350d313d906440c93ee3d0f36/one_task.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "data-lineage"
3 | version = "0.9.0"
4 | description = "Open Source Data Lineage Tool for Redshift, Snowflake and many other databases"
5 | authors = ["Tokern "]
6 | license = "MIT"
7 | classifiers = [
8 | "Development Status :: 3 - Alpha",
9 | "Intended Audience :: Developers",
10 | "Programming Language :: Python",
11 | "Programming Language :: Python :: 3",
12 | "Programming Language :: Python :: 3.7",
13 | "Programming Language :: Python :: 3.8",
14 | "Topic :: Database",
15 | "Topic :: Software Development",
16 | "Topic :: Software Development :: Libraries :: Python Modules",
17 | ]
18 | keywords=["data-lineage","postgres","snowflake","redshift","glue"]
19 | readme="README.md"
20 | homepage="https://tokern.io/"
21 | repository="https://github.com/tokern/data-lineage/"
22 |
23 | [tool.poetry.dependencies]
24 | python = "^3.8"
25 | pglast = "*"
26 | inflection = "*"
27 | networkx = "*"
28 | click = "^7"
29 | PyYAML = "*"
30 | dbcat = "^0.7.1"
31 | gunicorn = "*"
32 | flask = "~=1.1"
33 | flask-restless-ng = "*"
34 | requests = "*"
35 | furl = "*"
36 | flask-restful = "*"
37 | psycopg2 = "^2.9.1"
38 | SQLAlchemy = "^1.3"
39 | botocore = "^1.20"
40 | rq = "^1.10.0"
41 | redis = "^3.5.3"
42 |
43 | [tool.poetry.dev-dependencies]
44 | black = "==19.10b0"
45 | flake8 = "*"
46 | isort = "*"
47 | pre-commit = "*"
48 | pytest = "*"
49 | pytest-cov = "*"
50 | pipenv-setup = "*"
51 | mypy = "*"
52 | jupyter = "*"
53 | pytest-flask = "*"
54 | types-requests = "^0.1.13"
55 | types-Flask = "^1.1.1"
56 | types-PyYAML = "^5.4.3"
57 | types-click = "^7.1.2"
58 | fakeredis = "^1.6.1"
59 | types-redis = "^3.5.15"
60 |
61 | [build-system]
62 | requires = ["poetry-core>=1.0.0"]
63 | build-backend = "poetry.core.masonry.api"
64 |
65 | [tool.poetry.scripts]
66 | data_lineage = "data_lineage.__main__:main"
67 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | testpaths=
3 | test
4 |
5 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E203, E266, E501, W503
3 | max-line-length = 88
4 | max-complexity = 18
5 | select = B,C,E,F,W,T4
6 |
7 | [isort]
8 | multi_line_output=3
9 | include_trailing_comma=True
10 | force_grid_wrap=0
11 | use_parentheses=True
12 | line_length=88
13 |
14 | [mypy]
15 | files=data_lineage,test
16 | ignore_missing_imports=true
17 |
--------------------------------------------------------------------------------
/test/catalog.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "test",
3 | "source_type": "redshift",
4 | "schemata": [
5 | {
6 | "name": "default",
7 | "tables": [
8 | {
9 | "name": "pagecounts",
10 | "columns": [
11 | {
12 | "name": "group",
13 | "data_type": "STRING"
14 | },
15 | {
16 | "name": "page_title",
17 | "data_type": "STRING"
18 | },
19 | {
20 | "name": "views",
21 | "data_type": "BIGINT"
22 | },
23 | {
24 | "name": "bytes_sent",
25 | "data_type": "BIGINT"
26 | }
27 | ]
28 | },
29 | {
30 | "name": "page",
31 | "columns": [
32 | {
33 | "name": "page_id",
34 | "data_type": "BIGINT"
35 | },
36 | {
37 | "name": "page_latest",
38 | "data_type": "BIGINT"
39 | },
40 | {
41 | "name": "page_title",
42 | "data_type": "STRING"
43 | }
44 | ]
45 | },
46 | {
47 | "name": "redirect",
48 | "columns": [
49 | {
50 | "name": "rd_from",
51 | "data_type": "BIGINT"
52 | },
53 | {
54 | "name": "page_title",
55 | "data_type": "STRING"
56 | }
57 |
58 | ]
59 | },
60 | {
61 | "name": "page_lookup_nonredirect",
62 | "columns": [
63 | {
64 | "name": "redirect_id",
65 | "data_type": "BIGINT"
66 | },
67 | {
68 | "name": "redirect_title",
69 | "data_type": "STRING"
70 | },
71 | {
72 | "name": "true_title",
73 | "data_type": "STRING"
74 | },
75 | {
76 | "name": "page_id",
77 | "data_type": "BIGINT"
78 | },
79 | {
80 | "name": "page_version",
81 | "data_type": "BIGINT"
82 | }
83 | ]
84 | },
85 | {
86 | "name": "page_lookup_redirect",
87 | "columns": [
88 | {
89 | "name": "redirect_id",
90 | "data_type": "BIGINT"
91 | },
92 | {
93 | "name": "redirect_title",
94 | "data_type": "STRING"
95 | },
96 | {
97 | "name": "true_title",
98 | "data_type": "STRING"
99 | },
100 | {
101 | "name": "page_id",
102 | "data_type": "BIGINT"
103 | },
104 | {
105 | "name": "page_version",
106 | "data_type": "BIGINT"
107 | }
108 | ]
109 | },
110 | {
111 | "name": "page_lookup",
112 | "columns": [
113 | {
114 | "name": "redirect_id",
115 | "data_type": "bigint"
116 | },
117 | {
118 | "name": "redirect_title",
119 | "data_type": "STRING"
120 | },
121 | {
122 | "name": "true_title",
123 | "data_type": "STRING"
124 | },
125 | {
126 | "name": "page_id",
127 | "data_type": "BIGINT"
128 | },
129 | {
130 | "name": "page_version",
131 | "data_type": "BIGINT"
132 | }
133 | ]
134 | },
135 | {
136 | "name": "filtered_pagecounts",
137 | "columns": [
138 | {
139 | "name": "group",
140 | "data_type": "STRING"
141 | },
142 | {
143 | "name": "page_title",
144 | "data_type": "STRING"
145 | },
146 | {
147 | "name": "views",
148 | "data_type": "BIGINT"
149 | },
150 | {
151 | "name": "bytes_sent",
152 | "data_type": "BIGINT"
153 | }
154 | ]
155 | },
156 | {
157 | "name": "normalized_pagecounts",
158 | "columns": [
159 | {
160 | "name": "page_id",
161 | "data_type": "BIGINT"
162 | },
163 | {
164 | "name": "page_title",
165 | "data_type": "STRING"
166 | },
167 | {
168 | "name": "page_url",
169 | "data_type": "STRING"
170 | },
171 | {
172 | "name": "views",
173 | "data_type": "BIGINT"
174 | },
175 | {
176 | "name": "bytes_sent",
177 | "data_type": "BIGINT"
178 | }
179 | ]
180 | }
181 | ]
182 | }
183 | ]
184 | }
--------------------------------------------------------------------------------
/test/conftest.py:
--------------------------------------------------------------------------------
1 | from contextlib import closing
2 |
3 | import pytest
4 | import yaml
5 | from dbcat import PGCatalog as DbCatalog
6 | from dbcat import catalog_connection, init_db
7 | from dbcat.catalog import CatSource
8 | from fakeredis import FakeStrictRedis
9 |
10 | from data_lineage import Analyze, Catalog, Graph, Scan
11 | from data_lineage.parser import parse
12 | from data_lineage.server import create_server
13 |
14 |
15 | @pytest.fixture(scope="session")
16 | def load_queries():
17 | import json
18 |
19 | with open("test/queries.json", "r") as file:
20 | queries = json.load(file)
21 |
22 | yield queries
23 |
24 |
25 | @pytest.fixture(scope="session")
26 | def parse_queries_fixture(load_queries):
27 | parsed = [parse(sql=query["query"], name=query["name"]) for query in load_queries]
28 | yield parsed
29 |
30 |
31 | postgres_conf = """
32 | catalog:
33 | user: piiuser
34 | password: p11secret
35 | host: 127.0.0.1
36 | port: 5432
37 | database: piidb
38 | """
39 |
40 |
41 | @pytest.fixture(scope="session")
42 | def root_connection() -> DbCatalog:
43 | config = yaml.safe_load(postgres_conf)
44 | with closing(DbCatalog(**config["catalog"])) as conn:
45 | yield conn
46 |
47 |
48 | @pytest.fixture(scope="session")
49 | def setup_catalog(root_connection):
50 | with root_connection.engine.connect() as conn:
51 | conn.execute("CREATE USER catalog_user PASSWORD 'catal0g_passw0rd'")
52 | conn.execution_options(isolation_level="AUTOCOMMIT").execute(
53 | "CREATE DATABASE tokern"
54 | )
55 | conn.execution_options(isolation_level="AUTOCOMMIT").execute(
56 | "GRANT ALL PRIVILEGES ON DATABASE tokern TO catalog_user"
57 | )
58 |
59 | yield root_connection
60 |
61 | with root_connection.engine.connect() as conn:
62 | conn.execution_options(isolation_level="AUTOCOMMIT").execute(
63 | "DROP DATABASE tokern"
64 | )
65 |
66 | conn.execution_options(isolation_level="AUTOCOMMIT").execute(
67 | "DROP USER catalog_user"
68 | )
69 |
70 |
71 | catalog_conf = """
72 | catalog:
73 | user: catalog_user
74 | password: catal0g_passw0rd
75 | host: 127.0.0.1
76 | port: 5432
77 | database: tokern
78 | """
79 |
80 |
81 | @pytest.fixture(scope="session")
82 | def open_catalog_connection(setup_catalog):
83 | with closing(catalog_connection(catalog_conf)) as conn:
84 | init_db(conn)
85 | yield conn
86 |
87 |
88 | class File:
89 | def __init__(self, name: str, path: str, catalog: DbCatalog):
90 | self.name = name
91 | self._path = path
92 | self._catalog = catalog
93 |
94 | @property
95 | def path(self):
96 | return self._path
97 |
98 | def scan(self):
99 | import json
100 |
101 | with open(self.path, "r") as file:
102 | content = json.load(file)
103 |
104 | with self._catalog.managed_session:
105 | source = self._catalog.add_source(
106 | name=content["name"], source_type=content["source_type"]
107 | )
108 | for s in content["schemata"]:
109 | schema = self._catalog.add_schema(s["name"], source=source)
110 |
111 | for t in s["tables"]:
112 | table = self._catalog.add_table(t["name"], schema)
113 |
114 | index = 0
115 | for c in t["columns"]:
116 | self._catalog.add_column(
117 | column_name=c["name"],
118 | data_type=c["data_type"],
119 | sort_order=index,
120 | table=table,
121 | )
122 | index += 1
123 |
124 |
125 | @pytest.fixture(scope="session")
126 | def save_catalog(open_catalog_connection):
127 | scanner = File("test", "test/catalog.json", open_catalog_connection)
128 | scanner.scan()
129 | yield open_catalog_connection
130 | with open_catalog_connection.managed_session as session:
131 | [session.delete(db) for db in session.query(CatSource).all()]
132 | session.commit()
133 |
134 |
135 | @pytest.fixture(scope="function")
136 | def managed_session(save_catalog):
137 | with save_catalog.managed_session:
138 | yield save_catalog
139 |
140 |
141 | @pytest.fixture(scope="session")
142 | def app(setup_catalog):
143 | config = yaml.safe_load(catalog_conf)
144 | app, catalog = create_server(
145 | config["catalog"], connection=FakeStrictRedis(), is_production=False
146 | )
147 | yield app
148 | catalog.close()
149 |
150 |
151 | @pytest.fixture(scope="session")
152 | def rest_catalog(live_server, save_catalog):
153 | yield Catalog("http://{}:{}".format(live_server.host, live_server.port))
154 |
155 |
156 | @pytest.fixture(scope="session")
157 | def graph_sdk(live_server):
158 | yield Graph("http://{}:{}".format(live_server.host, live_server.port))
159 |
160 |
161 | @pytest.fixture(scope="session")
162 | def parser_sdk(live_server):
163 | yield Analyze("http://{}:{}".format(live_server.host, live_server.port))
164 |
165 |
166 | @pytest.fixture(scope="session")
167 | def scan_sdk(live_server):
168 | yield Scan("http://{}:{}".format(live_server.host, live_server.port))
169 |
--------------------------------------------------------------------------------
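The fixtures above stand up the Flask app in-process (with FakeStrictRedis in place of a real Redis) and point the SDK helpers at pytest-flask's live_server. Against a running deployment the same clients are constructed from a base URL; a minimal sketch, where the URL is an assumption to be replaced with wherever the lineage server actually listens:

# Minimal sketch: construct the SDK clients used by these fixtures against a
# running data-lineage server. The base URL is an assumption; point it at your
# own deployment.
from data_lineage import Analyze, Catalog, Graph, Scan

base_url = "http://127.0.0.1:4142"  # assumed host and port

catalog = Catalog(base_url)   # catalog objects: sources, schemata, tables, columns
graph = Graph(base_url)       # lineage graph queries
analyze = Analyze(base_url)   # submit queries for lineage analysis
scan = Scan(base_url)         # trigger catalog scans of a source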
/test/queries.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "name": "LOAD page_lookup_nonredirect",
4 | "query": "INSERT INTO page_lookup_nonredirect SELECT page.page_id as redircet_id, page.page_title as redirect_title, page.page_title true_title, page.page_id, page.page_latest FROM page LEFT OUTER JOIN redirect ON page.page_id = redirect.rd_from WHERE redirect.rd_from IS NULL "
5 | },
6 | {
7 | "name": "LOAD page_lookup_redirect",
8 | "query": "insert into page_lookup_redirect select original_page.page_id redirect_id, original_page.page_title redirect_title, final_page.page_title as true_title, final_page.page_id, final_page.page_latest from page final_page join redirect on (redirect.page_title = final_page.page_title) join page original_page on (redirect.rd_from = original_page.page_id)"
9 | },
10 | {
11 | "name": "LOAD page_lookup",
12 | "query": "INSERT INTO page_lookup SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, plr.page_version FROM page_lookup_redirect plr"
13 | },
14 | {
15 | "name": "LOAD filtered_pagecounts",
16 | "query": "INSERT INTO filtered_pagecounts(\"group\", page_title, views) SELECT regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\\s*([a-zA-Z0-9]+).*','$1') page_title,SUM (pvs.views) AS total_views, SUM (pvs.bytes_sent) AS total_bytes_sent FROM pagecounts as pvs WHERE not pvs.page_title LIKE '(MEDIA|SPECIAL||Talk|User|User_talk|Project|Project_talk|File|File_talk|MediaWiki|MediaWiki_talk|Template|Template_talk|Help|Help_talk|Category|Category_talk|Portal|Wikipedia|Wikipedia_talk|upload|Special)\\:(.*)' and pvs.page_title LIKE '^([A-Z])(.*)' and not pvs.page_title LIKE '(.*).(jpg|gif|png|JPG|GIF|PNG|txt|ico)$' and pvs.page_title <> '404_error/' and pvs.page_title <> 'Main_Page' and pvs.page_title <> 'Hypertext_Transfer_Protocol' and pvs.page_title <> 'Favicon.ico' and pvs.page_title <> 'Search' and pvs.dt = '2020-01-01' GROUP BY regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\\s*([a-zA-Z0-9]+).*','$1')"
17 | },
18 | {
19 | "name": "LOAD normalized_pagecounts",
20 | "query": "INSERT INTO normalized_pagecounts SELECT pl.page_id page_id, REGEXP_REPLACE(pl.true_title, '_', ' ') page_title, pl.true_title page_url, fp.views, fp.bytes_sent FROM page_lookup pl JOIN filtered_pagecounts fp ON fp.page_title = pl.redirect_title where fp.dt='2020-01-01'"
21 | }
22 | ]
23 |
--------------------------------------------------------------------------------
/test/test_data_lineage.py:
--------------------------------------------------------------------------------
1 | from data_lineage.parser import analyze_dml_query
2 |
3 |
4 | def test_parser(parse_queries_fixture):
5 | assert len(parse_queries_fixture) == 5
6 |
7 |
8 | def test_visitor(save_catalog, parse_queries_fixture):
9 | catalog = save_catalog
10 | with catalog.managed_session:
11 | source = catalog.get_source("test")
12 |
13 | dml = [
14 | analyze_dml_query(catalog, parsed, source)
15 | for parsed in parse_queries_fixture
16 | ]
17 | assert len(dml) == 5
18 |
19 | for d in dml:
20 | assert len(d.source_tables) > 0 and d.target_table is not None
21 |
--------------------------------------------------------------------------------
/test/test_db_graph.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 |
4 | import pytest
5 | from dbcat.catalog import ColumnLineage
6 | from networkx import edges
7 |
8 | from data_lineage import load_graph
9 | from data_lineage.parser import analyze_dml_query, extract_lineage, parse
10 | from data_lineage.parser.dml_visitor import SelectSourceVisitor
11 |
12 | logging.basicConfig(level=getattr(logging, "DEBUG"))
13 |
14 |
15 | def test_no_insert_column_graph(managed_session, graph_sdk):
16 | catalog = managed_session
17 | query = """
18 | INSERT INTO page_lookup_nonredirect
19 | SELECT page.page_id as redirect_id, page.page_title as redirect_title,
20 | page.page_title true_title, page.page_id, page.page_latest
21 | FROM page
22 | """
23 |
24 | parsed = parse(
25 | query, name="LOAD page_lookup_nonredirect-test_no_insert_column_graph"
26 | )
27 | visitor = SelectSourceVisitor(parsed.name)
28 | visitor(parsed.node)
29 | source = catalog.get_source("test")
30 | visitor.bind(catalog, source)
31 |
32 | job_execution = extract_lineage(
33 | catalog,
34 | visitor,
35 | source,
36 | parsed,
37 | datetime.datetime.now(),
38 | datetime.datetime.now(),
39 | )
40 | graph = load_graph(graph_sdk, [job_execution.job_id])
41 |
42 | assert sorted([node[1]["name"] for node in list(graph.graph.nodes(data=True))]) == [
43 | "LOAD page_lookup_nonredirect-test_no_insert_column_graph",
44 | "test.default.page.page_id",
45 | "test.default.page.page_latest",
46 | "test.default.page.page_title",
47 | "test.default.page_lookup_nonredirect.page_id",
48 | "test.default.page_lookup_nonredirect.page_version",
49 | "test.default.page_lookup_nonredirect.redirect_id",
50 | "test.default.page_lookup_nonredirect.redirect_title",
51 | "test.default.page_lookup_nonredirect.true_title",
52 | ]
53 |
54 | expected_edges = [
55 | ("column:5", "task:1"),
56 | ("task:1", "column:10"),
57 | ("task:1", "column:11"),
58 | ("task:1", "column:12"),
59 | ("task:1", "column:13"),
60 | ("task:1", "column:14"),
61 | ("column:7", "task:1"),
62 | ("column:6", "task:1"),
63 | ]
64 |
65 | assert [(edge[0], edge[1]) for edge in list(edges(graph.graph))] == expected_edges
66 |
67 | expected_db_edges = [
68 | (
69 | ("test", "default", "page", "page_id"),
70 | ("test", "default", "page_lookup_nonredirect", "redirect_id"),
71 | ),
72 | (
73 | ("test", "default", "page", "page_id"),
74 | ("test", "default", "page_lookup_nonredirect", "page_id"),
75 | ),
76 | (
77 | ("test", "default", "page", "page_title"),
78 | ("test", "default", "page_lookup_nonredirect", "redirect_title"),
79 | ),
80 | (
81 | ("test", "default", "page", "page_title"),
82 | ("test", "default", "page_lookup_nonredirect", "true_title"),
83 | ),
84 | (
85 | ("test", "default", "page", "page_latest"),
86 | ("test", "default", "page_lookup_nonredirect", "page_version"),
87 | ),
88 | ]
89 | with catalog.managed_session as session:
90 | all_edges = session.query(ColumnLineage).all()
91 | assert set([(e.source.fqdn, e.target.fqdn) for e in all_edges]) == set(
92 | expected_db_edges
93 | )
94 |
95 |
96 | def test_basic_column_graph(managed_session, graph_sdk):
97 | catalog = managed_session
98 |
99 | query = "INSERT INTO page_lookup_nonredirect(page_id, page_version) SELECT page.page_id, page.page_latest FROM page"
100 | parsed = parse(query, "basic_column_graph")
101 | visitor = SelectSourceVisitor(parsed.name)
102 | visitor(parsed.node)
103 | source = catalog.get_source("test")
104 | visitor.bind(catalog, source)
105 |
106 | job_execution = extract_lineage(
107 | catalog,
108 | visitor,
109 | source,
110 | parsed,
111 | datetime.datetime.now(),
112 | datetime.datetime.now(),
113 | )
114 | graph = load_graph(graph_sdk, [job_execution.job_id])
115 |
116 | assert sorted([node[1]["name"] for node in list(graph.graph.nodes(data=True))]) == [
117 | "basic_column_graph",
118 | "test.default.page.page_id",
119 | "test.default.page.page_latest",
120 | "test.default.page_lookup_nonredirect.page_id",
121 | "test.default.page_lookup_nonredirect.page_version",
122 | ]
123 |
124 | expected_edges = [
125 | ("column:5", "task:2"),
126 | ("task:2", "column:13"),
127 | ("task:2", "column:14"),
128 | ("column:6", "task:2"),
129 | ]
130 |
131 | assert [(edge[0], edge[1]) for edge in list(edges(graph.graph))] == expected_edges
132 |
133 | table = catalog.get_table(
134 | source_name="test", schema_name="default", table_name="page_lookup_nonredirect",
135 | )
136 | columns = catalog.get_columns_for_table(
137 | table, column_names=["page_id", "page_version"]
138 | )
139 |
140 | assert len(columns) == 2
141 |
142 | expected_db_edges = [
143 | (
144 | ("test", "default", "page", "page_id"),
145 | ("test", "default", "page_lookup_nonredirect", "page_id"),
146 | ),
147 | (
148 | ("test", "default", "page", "page_latest"),
149 | ("test", "default", "page_lookup_nonredirect", "page_version"),
150 | ),
151 | ]
152 |
153 | with catalog.managed_session as session:
154 | all_edges = (
155 | session.query(ColumnLineage)
156 | .filter(ColumnLineage.target_id.in_([c.id for c in columns]))
157 | .all()
158 | )
159 | assert set([(e.source.fqdn, e.target.fqdn) for e in all_edges]) == set(
160 | expected_db_edges
161 | )
162 |
163 |
164 | @pytest.fixture(scope="module")
165 | def get_graph(save_catalog, parse_queries_fixture, graph_sdk):
166 | catalog = save_catalog
167 | job_ids = []
168 |
169 | with catalog.managed_session:
170 | source = catalog.get_source("test")
171 | for parsed in parse_queries_fixture:
172 | visitor = analyze_dml_query(catalog, parsed, source)
173 | job_execution = extract_lineage(
174 | catalog,
175 | visitor,
176 | source,
177 | parsed,
178 | datetime.datetime.now(),
179 | datetime.datetime.now(),
180 | )
181 | job_ids.append(job_execution.job_id)
182 | graph = load_graph(graph_sdk, job_ids)
183 | yield graph, catalog
184 |
185 |
186 | def test_column_graph(get_graph):
187 | graph, catalog = get_graph
188 | assert sorted([node[1]["name"] for node in list(graph.graph.nodes(data=True))]) == [
189 | "LOAD filtered_pagecounts",
190 | "LOAD normalized_pagecounts",
191 | "LOAD page_lookup",
192 | "LOAD page_lookup_nonredirect",
193 | "LOAD page_lookup_redirect",
194 | "test.default.filtered_pagecounts.bytes_sent",
195 | "test.default.filtered_pagecounts.group",
196 | "test.default.filtered_pagecounts.page_title",
197 | "test.default.filtered_pagecounts.views",
198 | "test.default.normalized_pagecounts.bytes_sent",
199 | "test.default.normalized_pagecounts.page_id",
200 | "test.default.normalized_pagecounts.page_title",
201 | "test.default.normalized_pagecounts.page_url",
202 | "test.default.normalized_pagecounts.views",
203 | "test.default.page.page_id",
204 | "test.default.page.page_latest",
205 | "test.default.page.page_title",
206 | "test.default.page_lookup.page_id",
207 | "test.default.page_lookup.page_version",
208 | "test.default.page_lookup.redirect_id",
209 | "test.default.page_lookup.redirect_title",
210 | "test.default.page_lookup.true_title",
211 | "test.default.page_lookup_nonredirect.page_id",
212 | "test.default.page_lookup_nonredirect.page_version",
213 | "test.default.page_lookup_nonredirect.redirect_id",
214 | "test.default.page_lookup_nonredirect.redirect_title",
215 | "test.default.page_lookup_nonredirect.true_title",
216 | "test.default.page_lookup_redirect.page_id",
217 | "test.default.page_lookup_redirect.page_version",
218 | "test.default.page_lookup_redirect.redirect_id",
219 | "test.default.page_lookup_redirect.redirect_title",
220 | "test.default.page_lookup_redirect.true_title",
221 | "test.default.pagecounts.bytes_sent",
222 | "test.default.pagecounts.page_title",
223 | "test.default.pagecounts.views",
224 | ]
225 | # expected_edges = [
226 | # ("column:4", "task:1"),
227 | # ("column:4", "task:3"),
228 | # ("task:1", "column:9"),
229 | # ("task:1", "column:10"),
230 | # ("task:1", "column:11"),
231 | # ("task:1", "column:12"),
232 | # ("task:1", "column:13"),
233 | # ("column:6", "task:1"),
234 | # ("column:6", "task:3"),
235 | # ("column:5", "task:1"),
236 | # ("column:5", "task:3"),
237 | # ("column:14", "task:4"),
238 | # ("task:3", "column:14"),
239 | # ("task:3", "column:15"),
240 | # ("task:3", "column:16"),
241 | # ("task:3", "column:17"),
242 | # ("task:3", "column:18"),
243 | # ("column:15", "task:4"),
244 | # ("column:16", "task:4"),
245 | # ("column:17", "task:4"),
246 | # ("column:18", "task:4"),
247 | # ("task:4", "column:19"),
248 | # ("task:4", "column:20"),
249 | # ("task:4", "column:21"),
250 | # ("task:4", "column:22"),
251 | # ("task:4", "column:23"),
252 | # ("column:21", "task:6"),
253 | # ("column:22", "task:6"),
254 | # ("task:6", "column:28"),
255 | # ("task:6", "column:29"),
256 | # ("task:6", "column:30"),
257 | # ("task:6", "column:31"),
258 | # ("column:26", "task:6"),
259 | # ("column:27", "task:6"),
260 | # ]
261 |
262 |
263 | # assert [
264 | # (edge[0], edge[1]) for edge in list(edges(graph.graph))
265 | # ] == expected_edges
266 |
--------------------------------------------------------------------------------
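The tests above exercise the full library pipeline: parse a query, analyze it against the catalog, record lineage for a job execution, then inspect the stored ColumnLineage rows. A condensed sketch of the same flow outside pytest; the connection settings are assumptions, and the catalog database is expected to be already initialized and to contain the "test" source (for example, loaded from test/catalog.json the way the save_catalog fixture does):

# Condensed sketch of the pipeline exercised by test_db_graph.py, outside pytest.
# The catalog connection settings are assumptions; the catalog must already be
# initialized and hold the "test" source (e.g. loaded from test/catalog.json).
import datetime

from dbcat import catalog_connection
from dbcat.catalog import ColumnLineage

from data_lineage.parser import analyze_dml_query, extract_lineage, parse

catalog_conf = """
catalog:
  user: catalog_user
  password: catal0g_passw0rd
  host: 127.0.0.1
  port: 5432
  database: tokern
"""

catalog = catalog_connection(catalog_conf)

query = (
    "INSERT INTO page_lookup_nonredirect(page_id, page_version) "
    "SELECT page.page_id, page.page_latest FROM page"
)

with catalog.managed_session:
    source = catalog.get_source("test")
    parsed = parse(query, name="sketch: load page_lookup_nonredirect")
    visitor = analyze_dml_query(catalog, parsed, source)
    job_execution = extract_lineage(
        catalog,
        visitor,
        source,
        parsed,
        datetime.datetime.now(),
        datetime.datetime.now(),
    )
    print("recorded job execution:", job_execution.job_id)

with catalog.managed_session as session:
    for edge in session.query(ColumnLineage).all():
        print(edge.source.fqdn, "->", edge.target.fqdn)

catalog.close()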
/test/test_dml_visitor.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from data_lineage.parser import analyze_dml_query, parse, parse_dml_query, parse_queries
4 | from data_lineage.parser.dml_visitor import (
5 | CTASVisitor,
6 | SelectIntoVisitor,
7 | SelectSourceVisitor,
8 | )
9 |
10 |
11 | @pytest.mark.parametrize(
12 | "target, sources, sql",
13 | [
14 | ((None, "c"), [(None, "a")], "insert into c select x,y from a"),
15 | (
16 | (None, "c"),
17 | [(None, "a"), (None, "b")],
18 | "insert into c select x,y from a join b on a.id = b.id",
19 | ),
20 | (
21 | (None, "c"),
22 | [(None, "a"), (None, "b")],
23 | "insert into c select x,y from a join b on a.id = b.id",
24 | ),
25 | (
26 | (None, "c"),
27 | [(None, "a"), (None, "b")],
28 | "insert into c select x,y from a as aa join b on " "aa.id = b.id",
29 | ),
30 | ],
31 | )
32 | def test_sanity_insert(target, sources, sql):
33 | parsed = parse(sql)
34 | insert_visitor = SelectSourceVisitor("test_sanity_insert")
35 | insert_visitor(parsed.node)
36 | bound_target, bound_tables, bound_cols = insert_visitor.resolve()
37 |
38 | assert bound_target == target
39 | assert bound_tables == sources
40 |
41 |
42 | @pytest.mark.parametrize(
43 | "target, sources, sql",
44 | [
45 | ((None, "c"), [(None, "a")], "create table c as select x,y from a"),
46 | (
47 | (None, "c"),
48 | [(None, "a"), (None, "b")],
49 | "create table c as select x,y from a join b on a.id = b.id",
50 | ),
51 | (
52 | (None, "c"),
53 | [(None, "a"), (None, "b")],
54 | "create table c as select x,y from a join b on a.id = b.id",
55 | ),
56 | (
57 | (None, "c"),
58 | [(None, "a"), (None, "b")],
59 | "create table c as select x,y from a as aa join b on aa.id = b.id",
60 | ),
61 | ],
62 | )
63 | def test_sanity_ctas(target, sources, sql):
64 | parsed = parse(sql)
65 | visitor = CTASVisitor("test_sanity_ctas")
66 | visitor(parsed.node)
67 | bound_target, bound_tables, bound_cols = visitor.resolve()
68 |
69 | assert bound_target == target
70 | assert bound_tables == sources
71 |
72 |
73 | @pytest.mark.parametrize(
74 | "target, sources, sql",
75 | [
76 | (
77 | (None, "c"),
78 | [(None, "a"), (None, "b")],
79 | "select x,y into c from a join b on a.id = b.id",
80 | ),
81 | (
82 | (None, "c"),
83 | [(None, "a"), (None, "b")],
84 | "select x,y into c from a join b on a.id = b.id",
85 | ),
86 | (
87 | (None, "c"),
88 | [(None, "a"), (None, "b")],
89 | "select x,y into c from a as aa join b on aa.id = b.id",
90 | ),
91 | ],
92 | )
93 | def test_sanity_select_into(target, sources, sql):
94 | parsed = parse(sql)
95 | visitor = SelectIntoVisitor("test_sanity_select_into")
96 | visitor(parsed.node)
97 | bound_target, bound_tables, bound_cols = visitor.resolve()
98 |
99 | assert bound_target == target
100 | assert bound_tables == sources
101 |
102 |
103 | @pytest.mark.parametrize(
104 | "query",
105 | [
106 | "INSERT INTO page_lookup SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, plr.page_version FROM page_lookup_redirect plr",
107 | "INSERT INTO page_lookup SELECT redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect",
108 | "INSERT INTO page_lookup SELECT page_lookup_redirect.* FROM page_lookup_redirect",
109 | "INSERT INTO page_lookup SELECT * FROM page_lookup_redirect",
110 | 'INSERT INTO "default".page_lookup SELECT * FROM page_lookup_redirect',
111 | "SELECT * INTO page_lookup from page_lookup_redirect",
112 | 'SELECT * INTO "default".page_lookup from page_lookup_redirect',
113 | """
114 | INSERT INTO page_lookup
115 | SELECT * FROM (
116 | select redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect
117 | ) plr
118 | """,
119 | """
120 | INSERT INTO page_lookup
121 | SELECT plr.* FROM (
122 | select redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect
123 | ) plr
124 | """,
125 | """
126 | INSERT INTO page_lookup
127 | SELECT redirect_id, redirect_title, true_title, page_id, page_version FROM (
128 | select redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect
129 | ) plr
130 | """,
131 | """
132 | INSERT INTO page_lookup
133 | SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, plr.page_version FROM (
134 | select redirect_id, redirect_title, true_title, page_id, page_version FROM page_lookup_redirect
135 | ) plr
136 | """,
137 | ],
138 | )
139 | def test_insert(managed_session, query):
140 | source = managed_session.get_source("test")
141 | parsed = parse(query)
142 | visitor = analyze_dml_query(managed_session, parsed, source)
143 | assert visitor is not None
144 |
145 | assert len(visitor.target_columns) == 5
146 | assert visitor.target_table.fqdn == ("test", "default", "page_lookup")
147 | assert len(visitor.source_columns) == 5
148 | assert [table.fqdn for table in visitor.source_tables] == [
149 | ("test", "default", "page_lookup_redirect")
150 | ]
151 |
152 |
153 | def test_insert_cols(managed_session):
154 | source = managed_session.get_source("test")
155 | query = "INSERT INTO page_lookup_nonredirect(page_id, page_version) SELECT page.page_id, page.page_latest FROM page"
156 | parsed = parse(query)
157 | visitor = analyze_dml_query(managed_session, parsed, source)
158 | assert visitor is not None
159 |
160 | assert len(visitor.target_columns) == 2
161 | assert visitor.target_table.fqdn == ("test", "default", "page_lookup_nonredirect")
162 | assert len(visitor.source_columns) == 2
163 | assert [table.fqdn for table in visitor.source_tables] == [
164 | ("test", "default", "page")
165 | ]
166 |
167 |
168 | def test_insert_with_join(managed_session):
169 | source = managed_session.get_source("test")
170 | query = "insert into page_lookup_redirect select original_page.page_id redirect_id, original_page.page_title redirect_title, final_page.page_title as true_title, final_page.page_id, final_page.page_latest from page final_page join redirect on (redirect.page_title = final_page.page_title) join page original_page on (redirect.rd_from = original_page.page_id)"
171 | parsed = parse(query)
172 | visitor = analyze_dml_query(managed_session, parsed, source)
173 | assert visitor is not None
174 |
175 | assert len(visitor.target_columns) == 5
176 | assert visitor.target_table.fqdn == ("test", "default", "page_lookup_redirect")
177 | assert len(visitor.source_columns) == 5
178 | assert sorted([table.fqdn for table in visitor.source_tables]) == [
179 | ("test", "default", "page"),
180 | ("test", "default", "redirect"),
181 | ]
182 |
183 |
184 | @pytest.mark.parametrize(
185 | "query",
186 | [
187 | "with pln as (select redirect_title, true_title, page_id, page_version from page_lookup_nonredirect) insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) select redirect_title, true_title, page_id, page_version from pln;",
188 | "with pln as (select * from page_lookup_nonredirect) insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) select redirect_title, true_title, page_id, page_version from pln;",
189 | "with pln as (select redirect_title, true_title, page_id, page_version from page_lookup_nonredirect) insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) select * from pln;",
190 | "with pln as (select redirect_title as t1, true_title as t2, page_id as t3, page_version as t4 from page_lookup_nonredirect) insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) select t1, t2, t3, t4 from pln;",
191 | "insert into page_lookup_redirect (redirect_title, true_title, page_id, page_version) with pln as (select redirect_title, true_title, page_id, page_version from page_lookup_nonredirect) select redirect_title, true_title, page_id, page_version from pln;",
192 | ],
193 | )
194 | def test_with_clause(managed_session, query):
195 | source = managed_session.get_source("test")
196 | parsed = parse(query)
197 | visitor = analyze_dml_query(managed_session, parsed, source)
198 | assert visitor is not None
199 |
200 | assert len(visitor.target_columns) == 4
201 | assert visitor.target_table.fqdn == ("test", "default", "page_lookup_redirect")
202 | assert len(visitor.source_columns) == 4
203 | assert [table.fqdn for table in visitor.source_tables] == [
204 | ("test", "default", "page_lookup_nonredirect")
205 | ]
206 |
207 |
208 | def test_col_exprs(managed_session):
209 | query = """
210 | INSERT INTO page_lookup_redirect(true_title)
211 | SELECT
212 | BTRIM(TO_CHAR(DATEADD (MONTH,-1,('20' ||MAX ("redirect_id") || '-01')::DATE)::DATE,'YY-MM')) AS "max_month"
213 | FROM page_lookup_nonredirect;
214 | """
215 | source = managed_session.get_source("test")
216 | parsed = parse(query)
217 | visitor = analyze_dml_query(catalog=managed_session, parsed=parsed, source=source)
218 | assert visitor is not None
219 |
220 | assert len(visitor.target_columns) == 1
221 | assert visitor.target_table.fqdn == ("test", "default", "page_lookup_redirect")
222 | assert len(visitor.source_columns) == 1
223 | assert [table.fqdn for table in visitor.source_tables] == [
224 | ("test", "default", "page_lookup_nonredirect")
225 | ]
226 |
227 |
228 | def test_syntax_errors():
229 | queries = [
230 | "INSERT INTO page_lookup_nonredirect(page_id, latest) SELECT page.page_id, page.page_latest FROM page",
231 | "select a from table(b)",
232 | "INSERT INTO page_lookup_nonredirect SELECT page.page_id, page.page_latest FROM page",
233 | ]
234 |
235 | parsed = parse_queries(queries)
236 |
237 |     assert len(parsed) == 2  # only the two statements that parse successfully are returned
238 |
239 |
240 | def test_parse_query(managed_session):
241 | query = """
242 |     SELECT BTRIM(TO_CHAR(DATEADD (MONTH,-1,('20' ||MAX ("group") || '-01')::DATE)::DATE,'YY-MM')) AS "max_month",
243 |     DATEADD(YEAR,-1,DATEADD (MONTH,-3,LAST_DAY (DATEADD (MONTH,-1,('20' ||MAX ("group") || '-01')::DATE)::DATE))::DATE)::DATE AS "min_date",
244 |     DATEADD(MONTH,-3,LAST_DAY (DATEADD (MONTH,-1,('20' ||MAX ("group") || '-01')::DATE)::DATE))::DATE AS "max_date",
245 | page_title,
246 | bytes_sent as mb_sent
247 | INTO "new_table"
248 | FROM pagecounts;
249 | """
250 | source = managed_session.get_source("test")
251 | parsed = parse(query)
252 | binder = parse_dml_query(catalog=managed_session, parsed=parsed, source=source)
253 | assert [context.alias for context in binder.columns] == [
254 | "max_month",
255 | "min_date",
256 | "max_date",
257 | "page_title",
258 | "mb_sent",
259 | ]
260 |
261 |
262 | def test_ctas(managed_session):
263 | query = """
264 | CREATE TEMP TABLE temp_table_x(page_title) AS select redirect_title from page_lookup_nonredirect
265 | where redirect_title is not null
266 | """
267 | source = managed_session.get_source("test")
268 | schema = managed_session.get_schema("test", "default")
269 | managed_session.update_source(source, schema)
270 | parsed = parse(query)
271 | visitor = analyze_dml_query(managed_session, parsed, source)
272 | assert visitor is not None
273 |
274 | assert len(visitor.target_columns) == 1
275 | assert visitor.target_table.fqdn == ("test", "default", "temp_table_x")
276 | assert len(visitor.source_columns) == 1
277 | assert [table.fqdn for table in visitor.source_tables] == [
278 | ("test", "default", "page_lookup_nonredirect")
279 | ]
280 |
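281 | # Illustrative sketch (comments only, not executed by the suite): the pattern the
282 | # tests above follow for a single DML statement, assuming an open catalog session
283 | # and a source registered as "test":
284 | #
285 | #   parsed = parse("insert into page_lookup select * from page_lookup_redirect")
286 | #   visitor = analyze_dml_query(managed_session, parsed, source)
287 | #   print(visitor.target_table.fqdn, [t.fqdn for t in visitor.source_tables])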
--------------------------------------------------------------------------------
/test/test_scan.py:
--------------------------------------------------------------------------------
1 | import psycopg2
2 | import pytest
3 | from fakeredis import FakeStrictRedis
4 | from rq import Queue
5 |
6 | pii_data_script = """
7 | create table no_pii(a text, b text);
8 | insert into no_pii values ('abc', 'def');
9 | insert into no_pii values ('xsfr', 'asawe');
10 |
11 | create table partial_pii(a text, b text);
12 | insert into partial_pii values ('917-908-2234', 'plkj');
13 | insert into partial_pii values ('215-099-2234', 'sfrf');
14 |
15 | create table full_pii(name text, location text);
16 | insert into full_pii values ('Jonathan Smith', 'Virginia');
17 | insert into full_pii values ('Chase Ryan', 'Chennai');
18 |
19 | """
20 |
21 |
22 | pii_data_load = [
23 | "create table no_pii(a text, b text)",
24 | "insert into no_pii values ('abc', 'def')",
25 | "insert into no_pii values ('xsfr', 'asawe')",
26 | "create table partial_pii(a text, b text)",
27 | "insert into partial_pii values ('917-908-2234', 'plkj')",
28 | "insert into partial_pii values ('215-099-2234', 'sfrf')",
29 | "create table full_pii(name text, location text)",
30 | "insert into full_pii values ('Jonathan Smith', 'Virginia')",
31 | "insert into full_pii values ('Chase Ryan', 'Chennai')",
32 | ]
33 |
34 | pii_data_drop = ["DROP TABLE full_pii", "DROP TABLE partial_pii", "DROP TABLE no_pii"]
35 |
36 |
37 | def pg_conn():
38 | return (
39 | psycopg2.connect(
40 | host="127.0.0.1", user="piiuser", password="p11secret", database="piidb"
41 | ),
42 | "public",
43 | )
44 |
45 |
46 | @pytest.fixture(scope="module")
47 | def load_all_data():
48 | params = [pg_conn()]
49 | for p in params:
50 | db_conn, expected_schema = p
51 | with db_conn.cursor() as cursor:
52 | for statement in pii_data_load:
53 | cursor.execute(statement)
54 | cursor.execute("commit")
55 | yield params
56 | for p in params:
57 | db_conn, expected_schema = p
58 | with db_conn.cursor() as cursor:
59 | for statement in pii_data_drop:
60 | cursor.execute(statement)
61 | cursor.execute("commit")
62 |
63 | for p in params:
64 | db_conn, expected_schema = p
65 | db_conn.close()
66 |
67 |
68 | @pytest.fixture(scope="module")
69 | def setup_catalog_and_data(load_all_data, rest_catalog):
70 | catalog = rest_catalog
71 | source = catalog.add_source(
72 | name="pg_scan",
73 | source_type="postgresql",
74 | uri="127.0.0.1",
75 | username="piiuser",
76 | password="p11secret",
77 | database="piidb",
78 | cluster="public",
79 | )
80 | yield catalog, source
81 |
82 |
83 | @pytest.fixture(scope="module")
84 | def fake_queue():
85 |     yield Queue(is_async=False, connection=FakeStrictRedis())  # synchronous, in-memory queue so enqueued jobs run inline during tests
86 |
87 |
88 | def test_scan_source(setup_catalog_and_data, scan_sdk):
89 | catalog, source = setup_catalog_and_data
90 | scan_sdk.start(source)
91 |
92 | pg_source = catalog.get_source("pg_scan")
93 | assert pg_source is not None
94 |
95 | no_pii = catalog.get_table("pg_scan", "public", "no_pii")
96 | assert no_pii is not None
97 |
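98 | # Illustrative sketch (comments only): the scan flow exercised above, assuming a
99 | # running catalog and the postgres database populated by load_all_data:
100 | #
101 | #   source = catalog.add_source(name="pg_scan", source_type="postgresql",
102 | #                               uri="127.0.0.1", username="piiuser",
103 | #                               password="p11secret", database="piidb",
104 | #                               cluster="public")
105 | #   scan_sdk.start(source)
106 | #   assert catalog.get_table("pg_scan", "public", "no_pii") is not None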
--------------------------------------------------------------------------------
/test/test_server.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 |
4 | import pytest
5 | from dbcat.catalog.models import ColumnLineage, Job, JobExecution, JobExecutionStatus
6 |
7 | from data_lineage import (
8 | ColumnNotFound,
9 | ParseError,
10 | SchemaNotFound,
11 | SourceNotFound,
12 | TableNotFound,
13 | )
14 |
15 |
16 | def test_get_sources(rest_catalog):
17 | source = rest_catalog.get_source("test")
18 | assert source.name == "test"
19 | assert source.id is not None
20 |
21 |
22 | def test_get_schemata(rest_catalog):
23 | schema = rest_catalog.get_schema("test", "default")
24 | assert schema.name == "default"
25 | assert schema.id is not None
26 |
27 |
28 | def test_get_tables(rest_catalog):
29 | num = 0
30 | for table in rest_catalog.get_tables():
31 | assert table.id is not None
32 | assert table.name is not None
33 | num += 1
34 | assert num == 12
35 |
36 |
37 | def test_get_columns(rest_catalog):
38 | num = 0
39 | for column in rest_catalog.get_columns():
40 | assert column.id is not None
41 | assert column.name is not None
42 | assert column.data_type is not None
43 | assert column.sort_order is not None
44 | num += 1
45 |
46 | assert num == 40
47 |
48 |
49 | def test_get_source_by_id(rest_catalog):
50 | source = rest_catalog.get_source_by_id(1)
51 | print(source.__class__.__name__)
52 | assert source.name == "test"
53 | assert source.fqdn == "test"
54 | assert source.source_type == "redshift"
55 |
56 |
57 | def test_get_schema_by_id(rest_catalog):
58 | schema = rest_catalog.get_schema_by_id(1)
59 | assert schema.name == "default"
60 | assert schema.fqdn == ["test", "default"]
61 |
62 |
63 | def test_get_table_by_id(rest_catalog):
64 | table = rest_catalog.get_table_by_id(1)
65 | assert table.name == "pagecounts"
66 | assert table.fqdn == ["test", "default", "pagecounts"]
67 |
68 |
69 | def test_get_column_by_id(rest_catalog):
70 | column = rest_catalog.get_column_by_id(1)
71 | assert column.name == "group"
72 | assert column.fqdn == ["test", "default", "pagecounts", "group"]
73 |
74 |
75 | def test_get_source(rest_catalog):
76 | source = rest_catalog.get_source("test")
77 | assert source.name == "test"
78 | assert source.id is not None
79 |
80 |
81 | def test_get_schema(rest_catalog):
82 | schema = rest_catalog.get_schema("test", "default")
83 | assert schema.name == "default"
84 | assert schema.id is not None
85 |
86 |
87 | def test_get_table(rest_catalog):
88 | table = rest_catalog.get_table("test", "default", "normalized_pagecounts")
89 | assert table.id is not None
90 | assert table.name == "normalized_pagecounts"
91 |
92 |
93 | def test_get_column(rest_catalog):
94 | column = rest_catalog.get_column("test", "default", "pagecounts", "bytes_sent")
95 | assert column.id is not None
96 | assert column.name is not None
97 | assert column.sort_order is not None
98 |
99 |
100 | def test_get_source_exception(rest_catalog):
101 | with pytest.raises(SourceNotFound):
102 | rest_catalog.get_source("tes")
103 |
104 |
105 | @pytest.mark.parametrize(
106 | "source_name, schema_name", [("test", "def"), ("tes", "default")]
107 | )
108 | def test_get_schema_exception(rest_catalog, source_name, schema_name):
109 | with pytest.raises(SchemaNotFound):
110 | rest_catalog.get_schema(source_name, schema_name)
111 |
112 |
113 | def test_add_source_pg(rest_catalog):
114 | data = {
115 | "name": "pg",
116 | "source_type": "postgres",
117 | "database": "db_database",
118 | "username": "db_user",
119 | "password": "db_password",
120 | "port": "db_port",
121 | "uri": "db_uri",
122 | }
123 |
124 | pg_connection = rest_catalog.add_source(**data)
125 | assert pg_connection.name == "pg"
126 | assert pg_connection.source_type == "postgres"
127 | assert pg_connection.database == "db_database"
128 | assert pg_connection.username == "db_user"
129 | assert pg_connection.password == "db_password"
130 | assert pg_connection.port == "db_port"
131 | assert pg_connection.uri == "db_uri"
132 |
133 |
134 | def test_add_source_mysql(rest_catalog):
135 | data = {
136 | "name": "mys",
137 | "source_type": "mysql",
138 | "database": "db_database",
139 | "username": "db_user",
140 | "password": "db_password",
141 | "port": "db_port",
142 | "uri": "db_uri",
143 | }
144 |
145 | mysql_conn = rest_catalog.add_source(**data)
146 |
147 | assert mysql_conn.name == "mys"
148 | assert mysql_conn.source_type == "mysql"
149 | assert mysql_conn.database == "db_database"
150 | assert mysql_conn.username == "db_user"
151 | assert mysql_conn.password == "db_password"
152 | assert mysql_conn.port == "db_port"
153 | assert mysql_conn.uri == "db_uri"
154 |
155 |
156 | def test_add_source_bq(rest_catalog):
157 | bq_conn = rest_catalog.add_source(
158 | name="bq",
159 | source_type="bigquery",
160 | key_path="db_key_path",
161 | project_credentials="db_creds",
162 | project_id="db_project_id",
163 | )
164 | assert bq_conn.name == "bq"
165 | assert bq_conn.source_type == "bigquery"
166 | assert bq_conn.key_path == "db_key_path"
167 | assert bq_conn.project_credentials == "db_creds"
168 | assert bq_conn.project_id == "db_project_id"
169 |
170 |
171 | def test_add_source_glue(rest_catalog):
172 | glue_conn = rest_catalog.add_source(name="gl", source_type="glue")
173 | assert glue_conn.name == "gl"
174 | assert glue_conn.source_type == "glue"
175 |
176 |
177 | def test_add_source_snowflake(rest_catalog):
178 | sf_conn = rest_catalog.add_source(
179 | name="sf",
180 | source_type="snowflake",
181 | database="db_database",
182 | username="db_user",
183 | password="db_password",
184 | account="db_account",
185 | role="db_role",
186 | warehouse="db_warehouse",
187 | )
188 | assert sf_conn.name == "sf"
189 | assert sf_conn.source_type == "snowflake"
190 | assert sf_conn.database == "db_database"
191 | assert sf_conn.username == "db_user"
192 | assert sf_conn.password == "db_password"
193 | assert sf_conn.account == "db_account"
194 | assert sf_conn.role == "db_role"
195 | assert sf_conn.warehouse == "db_warehouse"
196 |
197 |
198 | def test_update_source(rest_catalog):
199 | glue_conn = rest_catalog.add_source(name="gl_2", source_type="glue")
200 | schema_1 = rest_catalog.add_schema("schema_1", glue_conn)
201 |
202 | default_schema = rest_catalog.update_source(glue_conn, schema_1)
203 |
204 | assert default_schema.source.id == glue_conn.id
205 | assert default_schema.schema.id == schema_1.id
206 |
207 | schema_2 = rest_catalog.add_schema("schema_2", glue_conn)
208 |
209 | default_schema = rest_catalog.update_source(glue_conn, schema_2)
210 |
211 | assert default_schema.source.id == glue_conn.id
212 | assert default_schema.schema.id == schema_2.id
213 |
214 |
215 | def load_edges(catalog, expected_edges, job_execution_id):
216 | column_edge_ids = []
217 | for edge in expected_edges:
218 | source = catalog.get_column(
219 | source_name=edge[0][0],
220 | schema_name=edge[0][1],
221 | table_name=edge[0][2],
222 | column_name=edge[0][3],
223 | )
224 |
225 | target = catalog.get_column(
226 | source_name=edge[1][0],
227 | schema_name=edge[1][1],
228 | table_name=edge[1][2],
229 | column_name=edge[1][3],
230 | )
231 |
232 | added_edge = catalog.add_column_lineage(source, target, job_execution_id, {})
233 |
234 | column_edge_ids.append(added_edge.id)
235 | return column_edge_ids
236 |
237 |
238 | @pytest.fixture(scope="module")
239 | def load_page_lookup_nonredirect_edges(save_catalog):
240 | catalog = save_catalog
241 |
242 | expected_edges = [
243 | (
244 | ("test", "default", "page", "page_id"),
245 | ("test", "default", "page_lookup_nonredirect", "redirect_id"),
246 | ),
247 | (
248 | ("test", "default", "page", "page_id"),
249 | ("test", "default", "page_lookup_nonredirect", "page_id"),
250 | ),
251 | (
252 | ("test", "default", "page", "page_title"),
253 | ("test", "default", "page_lookup_nonredirect", "redirect_title"),
254 | ),
255 | (
256 | ("test", "default", "page", "page_title"),
257 | ("test", "default", "page_lookup_nonredirect", "true_title"),
258 | ),
259 | (
260 | ("test", "default", "page", "page_latest"),
261 | ("test", "default", "page_lookup_nonredirect", "page_version"),
262 | ),
263 | ]
264 |
265 | job_id = None
266 |
267 | with catalog.managed_session:
268 | job = catalog.add_job(
269 | "insert_page_lookup_nonredirect",
270 | catalog.get_source("test"),
271 | {"sql": "insert into page_lookup_nonredirect select from page"},
272 | )
273 | e1 = catalog.add_job_execution(
274 | job=job,
275 | started_at=datetime.datetime.combine(
276 | datetime.date(2021, 4, 1), datetime.time(1, 0)
277 | ),
278 | ended_at=datetime.datetime.combine(
279 | datetime.date(2021, 4, 1), datetime.time(1, 15)
280 | ),
281 | status=JobExecutionStatus.SUCCESS,
282 | )
283 |
284 | executions = [e1.id]
285 | name = job.name
286 | job_id = job.id
287 |
288 | print("Inserted job {}".format(name))
289 | print("Inserted executions {}".format(",".join(str(v) for v in executions)))
290 |
291 | column_edge_ids = load_edges(catalog, expected_edges, executions[0])
292 | print("Inserted edges {}".format(",".join(str(v) for v in column_edge_ids)))
293 |
294 | yield catalog, job_id, expected_edges
295 |
296 | with catalog.managed_session as session:
297 | session.query(ColumnLineage).filter(
298 | ColumnLineage.id.in_(column_edge_ids)
299 | ).delete(synchronize_session=False)
300 | print("DELETED edges {}".format(",".join(str(v) for v in column_edge_ids)))
301 | session.commit()
302 |
303 | session.query(JobExecution).filter(JobExecution.id.in_(executions)).delete(
304 | synchronize_session=False
305 | )
306 | print("DELETED executions {}".format(",".join(str(v) for v in executions)))
307 | session.commit()
308 |
309 | session.query(Job).filter(Job.name == name).delete(synchronize_session=False)
310 | print("DELETED job {}".format(name))
311 | session.commit()
312 |
313 |
314 | def test_api_main(graph_sdk, load_page_lookup_nonredirect_edges):
315 | catalog, job_id, expected_edges = load_page_lookup_nonredirect_edges
316 | graph = graph_sdk.get([job_id])
317 | assert len(graph["edges"]) == 10
318 | assert len(graph["nodes"]) == 15
319 |
320 |
321 | def test_parser(rest_catalog, parser_sdk, graph_sdk, save_catalog):
322 | source = rest_catalog.get_source("test")
323 | data = {
324 | "name": "LOAD page_lookup",
325 | "query": "INSERT INTO page_lookup SELECT plr.redirect_id, plr.redirect_title, plr.true_title, plr.page_id, "
326 | "plr.page_version FROM page_lookup_redirect plr",
327 | "source": source,
328 | "start_time": datetime.datetime.now(),
329 | "end_time": datetime.datetime.now(),
330 | }
331 |
332 | job_execution = parser_sdk.analyze(**data)
333 | assert job_execution is not None
334 |
335 | graph = graph_sdk.get([job_execution.job_id])
336 |
337 | assert len(graph["edges"]) == 10
338 | assert len(graph["nodes"]) == 15
339 |
340 | column_lineages = rest_catalog.get_column_lineage([job_execution.job_id])
341 |     assert len(column_lineages) == 10
342 |
343 |
344 | @pytest.mark.parametrize(
345 | "query",
346 | [
347 | "insert into p_lookup select * from page_lookup_redirect",
348 | "insert into page_lookup select * from pg_lp_rt",
349 | "insert into page_lookup select plr.page_id, true_title from page_lookup_redirect",
350 | ],
351 | )
352 | def test_parser_table_not_found(rest_catalog, parser_sdk, managed_session, query):
353 | source = rest_catalog.get_source("test")
354 |
355 | with pytest.raises(TableNotFound) as exc:
356 | parser_sdk.analyze(
357 | query=query,
358 | source=source,
359 | start_time=datetime.datetime.now(),
360 | end_time=datetime.datetime.now(),
361 | )
362 | logging.debug(exc)
363 |
364 |
365 | @pytest.mark.parametrize(
366 | "query",
367 | [
368 | "insert into page_lookup(title) select true_title from page_lookup_redirect",
369 | "insert into page_lookup(true_title) select title from page_lookup_redirect",
370 | ],
371 | )
372 | def test_parser_column_not_found(rest_catalog, parser_sdk, managed_session, query):
373 | source = rest_catalog.get_source("test")
374 |
375 | with pytest.raises(ColumnNotFound) as exc:
376 | parser_sdk.analyze(
377 | query=query,
378 | source=source,
379 | start_time=datetime.datetime.now(),
380 | end_time=datetime.datetime.now(),
381 | )
382 | logging.debug(exc)
383 |
384 |
385 | @pytest.mark.parametrize(
386 | "query", ["insert page_lookup select * from page_lookup_redirect"]
387 | )
388 | def test_parser_parse_error(rest_catalog, parser_sdk, managed_session, query):
389 | source = rest_catalog.get_source("test")
390 |
391 | with pytest.raises(ParseError) as exc:
392 | parser_sdk.analyze(
393 | query=query,
394 | source=source,
395 | start_time=datetime.datetime.now(),
396 | end_time=datetime.datetime.now(),
397 | )
398 | logging.debug(exc)
399 |
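400 | # Illustrative sketch (comments only): the end-to-end lineage flow covered above,
401 | # where `sql` stands in for any supported INSERT/CTAS statement:
402 | #
403 | #   job_execution = parser_sdk.analyze(query=sql, source=source,
404 | #                                      start_time=datetime.datetime.now(),
405 | #                                      end_time=datetime.datetime.now())
406 | #   graph = graph_sdk.get([job_execution.job_id])  # {"nodes": [...], "edges": [...]}
407 | #   lineage = rest_catalog.get_column_lineage([job_execution.job_id])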
--------------------------------------------------------------------------------