├── .coveragerc ├── .editorconfig ├── .flake8 ├── .github ├── mergify.yml ├── release-drafter.yml └── workflows │ ├── build.yaml │ ├── draft-release.yaml │ └── release.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── .vscode ├── launch.json └── settings.json ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── debugger-usage.gif ├── debugger.py ├── docs ├── .gitignore ├── Makefile ├── conf.py ├── easy_sql │ ├── add_backend.md │ ├── backend │ │ └── flink.md │ ├── bnf.md │ ├── build_install.md │ ├── command_line.md │ ├── debug.md │ ├── easy_sql.md │ ├── faq.md │ ├── functions.md │ ├── functions.tpl.md │ ├── how_to.md │ ├── img │ │ └── test_case.png │ ├── index.md │ ├── linter.md │ ├── other_features.md │ ├── quick_start.md │ ├── syntax.md │ ├── testing.md │ ├── udfs.md │ ├── udfs.tpl.md │ └── variables.md ├── index.rst ├── make.bat ├── pyproject.toml ├── requirements.txt ├── scripts │ ├── generate_func_data.py │ └── update_doc.py └── sqlfluff │ ├── new_rule.md │ └── quick_start.md ├── easy_sql ├── __init__.py ├── base_test.py ├── cli │ ├── __init__.py │ └── backend_processor.py ├── config │ ├── sql_config.py │ └── sql_config_test.py ├── data_process.py ├── data_process_itest.py ├── data_process_test.py ├── local_spark.py ├── logger.py ├── report.py ├── report_test.py ├── spark_optimizer.py ├── sql_linter │ ├── __init__.py │ ├── rules │ │ ├── __init__.py │ │ └── bq_schema_rule.py │ ├── sql_linter.py │ ├── sql_linter_cli.py │ ├── sql_linter_reportor.py │ └── sql_linter_test.py ├── sql_processor │ ├── __init__.py │ ├── backend │ │ ├── __init__.py │ │ ├── base.py │ │ ├── bigquery.py │ │ ├── clickhouse.py │ │ ├── flink.py │ │ ├── flink_itest.py │ │ ├── maxcompute.py │ │ ├── maxcompute_itest.py │ │ ├── postgres.py │ │ ├── rdb.py │ │ ├── rdb_itest.py │ │ ├── rdb_test.py │ │ ├── spark.py │ │ ├── spark_test.py │ │ └── sql_dialect │ │ │ ├── __init__.py │ │ │ ├── bigquery.py │ │ │ ├── clickhouse.py │ │ │ ├── clickhouse_test.py │ │ │ └── postgres.py │ ├── common.py │ ├── context.py │ ├── context_test.py │ ├── funcs.py │ ├── funcs_common.py │ ├── funcs_flink.py │ ├── funcs_flink_itest.py │ ├── funcs_itest.py │ ├── funcs_rdb.py │ ├── funcs_spark.py │ ├── report.py │ ├── sql_processor.py │ ├── step.py │ └── step_test.py ├── sql_processor_debugger.py ├── sql_processor_debugger_itest.py ├── sql_processor_itest.py ├── sql_processor_test.py ├── sql_test.py ├── sql_test_itest.py ├── sql_tester.py ├── sql_tester_test.py ├── udf │ ├── __init__.py │ ├── check.py │ ├── udfs.py │ └── udfs_test.py └── utils │ ├── __init__.py │ ├── db_connection_utils.py │ ├── flink_test_cluster.py │ ├── flink_test_cluster_itest.py │ ├── io_utils.py │ ├── kv.py │ ├── object_utils.py │ ├── object_utils_test.py │ ├── sql_expr.py │ └── sql_expr_test.py ├── examples └── rtdw │ ├── .gitignore │ ├── Makefile │ ├── java │ ├── .gitignore │ ├── README │ ├── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ ├── settings.gradle │ └── src │ │ └── main │ │ ├── java │ │ ├── com │ │ │ └── easysql │ │ │ │ └── example │ │ │ │ ├── Example.java │ │ │ │ ├── Ingest.java │ │ │ │ ├── RowDataDebeziumDeserializationSchema.java │ │ │ │ ├── Sinks.java │ │ │ │ ├── Sources.java │ │ │ │ └── SplitTableFunction.java │ │ └── org │ │ │ └── myorg │ │ │ └── quickstart │ │ │ └── DataStreamJob.java │ │ ├── resources │ │ └── log4j2.properties │ │ └── scala │ │ └── com │ │ └── easysql │ │ └── example │ │ └── ingest.scala │ ├── readme.md │ 
├── scala │ ├── .gitignore │ ├── Makefile │ └── src │ │ └── com │ │ └── easysql │ │ └── example │ │ └── ingest.scala │ └── workflow │ └── sales │ └── ods │ ├── Makefile │ ├── data.sql │ ├── ingest.sql │ ├── ingest.test.sql │ ├── ingest_funcs.py │ ├── ingest_hudi.sql │ ├── ingest_hudi.test.sql │ ├── ingest_hudi_funcs.py │ ├── ods.flink_tables.json │ └── register-pg.json ├── poetry.lock ├── poetry.toml ├── pyproject.toml ├── requirements-all.txt └── test ├── Dockerfile ├── customized_func ├── customized_func.py └── etl_with_customized_func.sql ├── doc ├── .sqlfluff ├── debugging.sql ├── test_sqlfulff.sql └── variables.sql ├── etl_test.xlsx ├── flink └── flink_hive_conf │ └── hive-site.xml ├── sample_data_process.py ├── sample_etl.clickhouse.json ├── sample_etl.clickhouse.sql ├── sample_etl.clickhouse.xlsx ├── sample_etl.flink.hive.postgres.sql ├── sample_etl.flink.hive.sql ├── sample_etl.flink.hudi-agg.sql ├── sample_etl.flink.postgres-cdc.multi-sink.sql ├── sample_etl.flink.postgres-cdc.sql ├── sample_etl.flink.postgres-hudi.sql ├── sample_etl.flink.postgres.sql ├── sample_etl.flink_tables_file.yml ├── sample_etl.flink_tables_file_hive.yml ├── sample_etl.postgres.json ├── sample_etl.postgres.sql ├── sample_etl.postgres.xlsx ├── sample_etl.spark.json ├── sample_etl.spark.sql ├── sample_etl.spark.xlsx ├── sample_etl.syntax.xlsx ├── sample_etl_wps.syntax.xlsx └── udf ├── clickhouse ├── etl_with_udf.sql └── udf.py ├── flink-python ├── etl_with_udf.sql └── udf.py ├── flink-scala ├── .gitignore ├── Makefile ├── etl_with_udf.sql └── your │ └── company │ └── udfs.scala ├── spark-python ├── etl_with_udf.sql └── udf.py └── spark-scala ├── .gitignore ├── Makefile ├── etl_with_udf.sql └── your └── company └── udfs.scala /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = 4 | easy_sql/ 5 | omit = 6 | */*_test.py 7 | */*_itest.py 8 | */maxcompute.py 9 | */sql_dialect/bigquery.py 10 | 11 | [report] 12 | # Regexes for lines to exclude from consideration 13 | exclude_lines = 14 | # Have to re-enable the standard pragma 15 | pragma: no cover 16 | 17 | # Don't complain about missing debug-only code: 18 | def __repr__ 19 | if self\.debug 20 | 21 | # Don't complain if tests don't hit defensive assertion code: 22 | raise AssertionError 23 | raise SqlProcessorAssertionError 24 | raise NotImplementedError 25 | 26 | # Don't complain if non-runnable code isn't run: 27 | if 0: 28 | if __name__ == .__main__.: 29 | 30 | [html] 31 | directory = build/coverage 32 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: https://EditorConfig.org 2 | 3 | # top-most EditorConfig file 4 | root = true 5 | 6 | [Makefile] 7 | indent_style = tab 8 | 9 | # Unix-style newlines with a newline ending every file 10 | [*] 11 | end_of_line = lf 12 | insert_final_newline = true 13 | charset = utf-8 14 | indent_style = space 15 | indent_size = 4 16 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | extend-ignore = E203, E501, E731, SIM905 4 | extend-select = TC, TC1, B950 5 | per-file-ignores = 6 | easy_sql/sql_linter/sql_linter_test.py: W291,W293 7 | __init__.py:F403, F401, 8 | easy_sql/sql_tester.py: B950 9 | 
easy_sql/sql_processor/report.py: B950 10 | dataplat/report_test.py: B950 11 | -------------------------------------------------------------------------------- /.github/mergify.yml: -------------------------------------------------------------------------------- 1 | queue_rules: 2 | - name: shared_queue 3 | conditions: 4 | - check-success=unit-test 5 | - check-success=e2e-test-spark 6 | - check-success=e2e-test-postgres 7 | - check-success=e2e-test-clickhouse 8 | - check-success=test-coverage-all 9 | 10 | pull_request_rules: 11 | - name: delete head branch after merge 12 | conditions: 13 | - merged 14 | actions: 15 | delete_head_branch: 16 | 17 | # Push PR into queue when it passes all checks 18 | - name: put approved pr to queue 19 | conditions: 20 | - or: 21 | - and: 22 | - "#approved-reviews-by>=1" 23 | - -draft 24 | - check-success=test-coverage-all 25 | - and: 26 | - label=can-merge 27 | - -draft 28 | - check-success=test-coverage-all 29 | actions: 30 | queue: 31 | name: shared_queue 32 | method: squash 33 | 34 | # Check if PR title contain valid types 35 | - name: Comment PR if title not semantic 36 | conditions: 37 | - author!=Mergify 38 | - -draft 39 | - '-title~=^(feat|fix|refactor|ci|build|docs|website|chore)(\(.*\))?!?:' 40 | actions: 41 | comment: 42 | message: | 43 | This pull request's title is not fulfill the requirements. @{{author}} please update it 🙏. 44 | 45 | Valid format: 46 | 47 | ``` 48 | fix(query): fix group by string bug 49 | ^ ^---------------------^ 50 | | | 51 | | +-> Summary in present tense. 52 | | 53 | +-------> Type: feat, fix, refactor, ci, build, docs, website, chore 54 | ``` 55 | 56 | Valid types: 57 | 58 | - `feat`: this PR introduces a new feature to the codebase 59 | - `fix`: this PR patches a bug in codebase 60 | - `refactor`: this PR changes the code base without new features or bugfix 61 | - `ci|build`: this PR changes build/testing/ci steps 62 | - `docs|website`: this PR changes the documents or websites 63 | - `chore`: this PR only has small changes that no need to record 64 | - `type(scope)!`: this type of PR introduces breaking changes to the codebase 65 | 66 | 67 | # Assign pr label based of tags 68 | - name: label on New Feature 69 | conditions: 70 | - 'title~=^(feat)(\(.*\))?:' 71 | actions: 72 | label: 73 | add: 74 | - pr-feature 75 | - name: label on Bug Fix 76 | conditions: 77 | - 'title~=^(fix)(\(.*\))?:' 78 | actions: 79 | label: 80 | add: 81 | - pr-bugfix 82 | - name: label on Refactor 83 | conditions: 84 | - 'title~=^(refactor)(\(.*\))?:' 85 | actions: 86 | label: 87 | add: 88 | - pr-refactor 89 | - name: label on Build/Testing/CI 90 | conditions: 91 | - 'title~=^(ci|build)(\(.*\))?:' 92 | actions: 93 | label: 94 | add: 95 | - pr-build 96 | - name: label on Documentation 97 | conditions: 98 | - 'title~=^(docs|website)(\(.*\))?:' 99 | actions: 100 | label: 101 | add: 102 | - pr-doc 103 | - name: label on Not for changelog 104 | conditions: 105 | - 'title~=^(chore)(\(.*\))?:' 106 | actions: 107 | label: 108 | add: 109 | - pr-chore 110 | - name: label on breaking changes 111 | conditions: 112 | - 'title~=^.*?(\(.*\))?!:' 113 | actions: 114 | label: 115 | add: 116 | - pr-breaking 117 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name-template: 'v$RESOLVED_VERSION' 2 | tag-template: 'v$RESOLVED_VERSION' 3 | template: | 4 | $CHANGES 5 | 6 | **Full Changelog**: 
https://github.com/$OWNER/$REPOSITORY/compare/$PREVIOUS_TAG...v$RESOLVED_VERSION 7 | 8 | categories: 9 | - title: 'Breaking' 10 | label: 'pr-breaking' 11 | - title: 'New' 12 | label: 'pr-feature' 13 | - title: 'Bug Fixes' 14 | label: 'pr-bugfix' 15 | - title: 'Maintenance' 16 | label: 'pr-refactor' 17 | - title: 'Documentation' 18 | label: 'pr-doc' 19 | - title: 'Other changes' 20 | 21 | version-resolver: 22 | major: 23 | labels: 24 | - 'pr-breaking' 25 | minor: 26 | labels: 27 | - 'pr-feature' 28 | patch: 29 | labels: 30 | - 'pr-bugfix' 31 | - 'pr-refactor' 32 | - 'pr-build' 33 | - 'pr-doc' 34 | - 'pr-chore' 35 | 36 | exclude-labels: 37 | - 'skip-changelog' 38 | -------------------------------------------------------------------------------- /.github/workflows/draft-release.yaml: -------------------------------------------------------------------------------- 1 | name: Draft Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | update-release-draft: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: release-drafter/release-drafter@master 13 | env: 14 | GITHUB_TOKEN: ${{ secrets.PAT }} 15 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | push: 4 | tags: 5 | - v*.*.* 6 | 7 | jobs: 8 | release: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | with: 13 | ref: main 14 | # ssh-key: ${{ secrets.DEPLOY_KEY }} 15 | token: ${{ secrets.PAT }} 16 | - uses: actions/setup-python@v4 17 | with: 18 | python-version: 3.8 19 | - name: Load cached Poetry installation 20 | uses: actions/cache@v2 21 | with: 22 | path: ~/.local/ # the path depends on the OS 23 | key: poetry-0 # increment to reset cache 24 | - name: Install Poetry 25 | uses: snok/install-poetry@v1.3.4 26 | with: 27 | version: 1.5.1 28 | - name: version check 29 | id: version 30 | run: | 31 | tag=${GITHUB_REF_NAME} 32 | version=${tag#v} 33 | old_version=$(poetry version -s) 34 | echo "tags: $tag , version: $version , old_version: $old_version" 35 | echo "::set-output name=tag::${tag}" 36 | echo "::set-output name=version::${version}" 37 | echo "::set-output name=version_changed::$( [ $version != $old_version ] && echo 'true' )" 38 | - name: bump version 39 | if: ${{ steps.version.outputs.version_changed == 'true' }} 40 | run: poetry version ${{ steps.version.outputs.version }} 41 | - name: commit and update release tag 42 | if: ${{ steps.version.outputs.version_changed == 'true' }} 43 | run: | 44 | git config user.name 'auto-release' 45 | git config user.email 'easy_sql@thoughtworks.com' 46 | git commit -am "release: bump to version ${{ steps.version.outputs.version }} [skip ci]" 47 | git tag ${{ steps.version.outputs.tag }} -f 48 | git push --atomic origin main ${{ steps.version.outputs.tag }} -f 49 | - name: upload pypi 50 | if: ${{ steps.version.outputs.version_changed == 'true' }} 51 | run: | 52 | poetry config pypi-token.pypi ${{ secrets.PYPI_TOKEN }} 53 | make upload-pip 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tmp/ 2 | .idea/ 3 | .metals/ 4 | __pycache__ 5 | **/.DS_Store 6 | **/*.zip 7 | venv/ 8 | .python-version 9 | spark-warehouse/ 10 | build/* 11 | .ipynb_checkpoints/ 12 | easy_sql-easy_sql* 13 | easy-sql-easy-sql* 14 | easy_sql_easy_sql.egg-info 15 | dist/ 16 | dist.old/ 17 
| metastore_db/ 18 | derby.log 19 | coverage.xml 20 | .coverage 21 | readme.local.md 22 | test/flink/jars 23 | test/*.local.* 24 | test/*__local.py 25 | test/spark/jars 26 | test/flink/tools 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pycqa/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | 7 | - repo: https://github.com/psf/black 8 | rev: 24.1.1 9 | hooks: 10 | - id: black 11 | 12 | - repo: https://github.com/pycqa/flake8 13 | rev: 6.0.0 14 | hooks: 15 | - id: flake8 16 | additional_dependencies: 17 | - flake8-bugbear 18 | - flake8-comprehensions 19 | - flake8-simplify 20 | - flake8-type-checking 21 | 22 | - repo: https://github.com/pre-commit/pre-commit-hooks 23 | rev: v4.4.0 24 | hooks: 25 | - id: end-of-file-fixer 26 | - id: trailing-whitespace 27 | exclude: | 28 | (?x)^( 29 | easy_sql/sql_linter/sql_linter_test.py 30 | )$ 31 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-20.04" 5 | tools: 6 | python: "3.8" 7 | 8 | sphinx: 9 | configuration: docs/conf.py 10 | 11 | python: 12 | install: 13 | - requirements: docs/requirements.txt 14 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Debug Unit Test", 9 | "type": "python", 10 | "request": "test", 11 | "justMyCode": false 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.linting.flake8Enabled": true, 3 | "python.linting.pylintEnabled": false, 4 | "python.linting.enabled": true, 5 | "python.testing.unittestArgs": ["-v", "-s", "./", "-p", "*test.py"], 6 | "python.testing.pytestEnabled": true, 7 | "python.testing.unittestEnabled": false, 8 | "python.analysis.typeCheckingMode": "basic", 9 | "editor.formatOnSave": true, 10 | "editor.defaultFormatter": "ms-python.black-formatter", 11 | "python.testing.pytestArgs": [ 12 | "easy_sql" 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to EasySQL 2 | 3 | - [Contributing to EasySQL](#contributing-to-easysql) 4 | - [Architecture Design](#architecture-design) 5 | - [Build and Run EasySQL](#build-and-run-EasySQL) 6 | - [Create Tracking Issue if Necessary](#create-tracking-issue-if-necessary) 7 | - [Write Tests](#write-tests) 8 | - [Running Test and Checks](#running-test-and-checks) 9 | - [Submit a PR](#submit-a-pr) 10 | - [Pull Request Title](#pull-request-title) 11 | - [Pull Request Description](#pull-request-description) 12 | - [Sign DCO (Developer Certificate of Origin)](#sign-dco-developer-certificate-of-origin) 13 | 14 | Thanks for your contribution! 
The EasySQL project welcomes contribution of various types -- new features, bug fixes 15 | and reports, typo fixes, etc. If you want to contribute to the EasySQL project, you will need to pass necessary 16 | checks and sign DCO. If you have any question, feel free to ping community members on GitHub and in Slack channels. 17 | 18 | ## Architecture Design 19 | 20 | TODO: need to enhance this part 21 | 22 | ## Build and Run EasySQL 23 | 24 | TODO: need to enhance this part 25 | 26 | ## Create Tracking Issue if Necessary 27 | 28 | If you are working on a large feature (>= 300 LoCs), it is recommended to create a tracking issue first, so that 29 | contributors and maintainers can understand the issue better and discuss how to proceed and implement the features. 30 | 31 | ## Write Tests 32 | 33 | TODO: need to enhance this part 34 | 35 | ## Running Test and Checks 36 | 37 | We provide a simple make command to run all the checks: 38 | 39 | ```shell 40 | make unit-test 41 | ``` 42 | 43 | After all the checks pass, your changes will likely be accepted. 44 | 45 | ## Submit a PR 46 | 47 | ### Pull Request Title 48 | 49 | As described in [here](https://github.com/commitizen/conventional-commit-types/blob/master/index.json), a valid PR title should begin with one of the following prefixes: 50 | 51 | - `feat`: A new feature 52 | - `fix`: A bug fix 53 | - `docs`: Documentation only changes 54 | - `style`: Changes that do not affect the meaning of the code (white-space, formatting, missing semi-colons, etc) 55 | - `refactor`: A code change that neither fixes a bug nor adds a feature 56 | - `perf`: A code change that improves performance 57 | - `test`: Adding missing tests or correcting existing tests 58 | - `build`: Changes that affect the build system or external dependencies (example scopes: gulp, broccoli, npm) 59 | - `ci`: Changes to EasySQL CI configuration files and scripts 60 | - `chore`: Other changes that don't modify src or test files 61 | - `revert`: Reverts a previous commit 62 | 63 | For example, a PR title could be: 64 | 65 | - `refactor: modify sql processor protobuf package path` 66 | - `feat(processor): support clickhouse as backend.` 67 | 68 | 69 | > `(): ` 70 | > 71 | > ``` 72 | > feat(scope): add hat wobble 73 | > ^--^ ^---^ ^------------^ 74 | > | | | 75 | > | | +-> Summary in present tense. 76 | > | | 77 | > | +---> Scope: executor, storage, etc. 78 | > | 79 | > +-------> Type: chore, docs, feat, fix, refactor, style, or test. 80 | > ``` 81 | 82 | 83 | ### Pull Request Description 84 | 85 | - If your PR is small (such as a typo fix), you can go brief. 86 | - If it is large and you have changed a lot, it's better to write more details. 87 | 88 | ### Sign DCO (Developer Certificate of Origin) 89 | 90 | Contributors will need to sign DCO in their commits. From [GitHub App's DCO](https://github.com/apps/dco) page: 91 | 92 | The Developer Certificate of Origin (DCO) is a lightweight way for contributors to certify that they wrote or otherwise 93 | have the right to submit the code they are contributing to the project. 
Here is the full text of the DCO, reformatted 94 | for readability: 95 | 96 | > By making a contribution to this project, I certify that: 97 | > 98 | > The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or 99 | > 100 | > The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or 101 | > 102 | > The contribution was provided directly to me by some other person who certified 1., 2. or 3. and I have not modified it. 103 | > 104 | > I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved. 105 | 106 | Contributors will need to add a `Signed-off-by` line in all their commits: 107 | 108 | ``` 109 | Signed-off-by: Random J Developer 110 | ``` 111 | 112 | The `git` command provides `-s` parameter to attach DCO to the commits. 113 | 114 | ``` 115 | git commit -m "feat(scope): commit messages" -s 116 | ``` 117 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune test 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test-coverage: 2 | export PYSPARK_PYTHON=python3 && export PYSPARK_DRIVER_PYTHON=python3 && \ 3 | python3 -m coverage run -m pytest -o python_files=*_test.py 4 | python3 -m coverage report -m 5 | - mkdir build 6 | - rm -r build/coverage 7 | python3 -m coverage html 8 | 9 | unit-test: 10 | export PYSPARK_PYTHON=python3 && export PYSPARK_DRIVER_PYTHON=python3 && \ 11 | python3 -m pytest 12 | 13 | # Note: env var named PG_URL and CLICKHOUSE_URL must be set to run e2e test 14 | e2e-test: 15 | cd test && docker build . 
--build-arg PG_URL=${PG_URL} --build-arg CLICKHOUSE_URL=${CLICKHOUSE_URL} 16 | 17 | echo-var: 18 | echo ${PG_URL} ${CLICKHOUSE_URL} 19 | 20 | e2e-test-spark: 21 | python3 -m easy_sql.data_process -f test/sample_etl.spark.sql 22 | 23 | e2e-test-postgres: 24 | python3 -m easy_sql.data_process -f test/sample_etl.postgres.sql 25 | 26 | e2e-test-clickhouse: 27 | python3 -m easy_sql.data_process -f test/sample_etl.clickhouse.sql 28 | 29 | e2e-test-flink-postgres: 30 | python3 -m easy_sql.data_process -f test/sample_etl.flink.postgres.sql 31 | 32 | e2e-test-flink-streaming: 33 | python3 -m easy_sql.data_process -f test/sample_etl.flink.postgres-cdc.sql 34 | python3 -m easy_sql.data_process -f test/sample_etl.flink.postgres-cdc.multi-sink.sql 35 | python3 -m easy_sql.data_process -f test/sample_etl.flink.postgres-hudi.sql 36 | 37 | e2e-test-flink-hive: 38 | python3 -m easy_sql.data_process -f test/sample_etl.flink.hive.sql 39 | 40 | test-coverage-all: 41 | export PYSPARK_PYTHON=python3 && export PYSPARK_DRIVER_PYTHON=python3 && \ 42 | PG_URL=${PG_URL} CLICKHOUSE_URL=${CLICKHOUSE_URL} python3 -m coverage run -m pytest -o python_files=*test.py 43 | python3 -m coverage report -m 44 | python3 -m coverage xml 45 | 46 | package-zip: 47 | - rm build/easysql.zip 48 | mkdir -p build 49 | zip -r --exclude=*__pycache__* build/easysql.zip easy_sql 50 | 51 | package-pip: 52 | poetry build 53 | 54 | upload-test-pip: 55 | rm -rf ./dist 56 | poetry publish -r testpypi --build 57 | 58 | install-test-pip: 59 | pip3 uninstall easy_sql-easy_sql 60 | python3 -m pip install --index-url https://test.pypi.org/simple/ 'easy-sql-easy-sql[cli]' 61 | 62 | upload-pip: 63 | rm -rf ./dist 64 | poetry publish --build 65 | 66 | prepare-flink-hadoop: 67 | test -f test/flink/tools/hadoop/hadoop-3.3.5.tar.gz || ( \ 68 | mkdir -pv test/flink/tools/hadoop && \ 69 | wget -P test/flink/tools/hadoop https://dlcdn.apache.org/hadoop/common/hadoop-3.3.5/hadoop-3.3.5.tar.gz && \ 70 | cd test/flink/tools/hadoop && \ 71 | tar xf hadoop-3.3.5.tar.gz ) 72 | 73 | download-flink-jars: 74 | test -f test/flink/jars/flink-connector-jdbc-1.15.1.jar || wget -P test/flink/jars https://repo1.maven.org/maven2/org/apache/flink/flink-connector-jdbc/1.15.1/flink-connector-jdbc-1.15.1.jar 75 | test -f test/flink/jars/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar || wget -P test/flink/jars https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-hive-3.1.2_2.12/1.15.1/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar 76 | test -f test/flink/jars/postgresql-42.2.14.jar || wget -P test/flink/jars https://repo1.maven.org/maven2/org/postgresql/postgresql/42.2.14/postgresql-42.2.14.jar 77 | test -f test/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar || wget -P test/flink/jars https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-postgres-cdc/2.3.0/flink-sql-connector-postgres-cdc-2.3.0.jar 78 | test -f test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar || wget -P test/flink/jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-flink1.15-bundle/0.12.2/hudi-flink1.15-bundle-0.12.2.jar 79 | test -f test/flink/jars/flink-sql-connector-kafka-1.15.2.jar || wget -P test/flink/jars https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/1.15.2/flink-sql-connector-kafka-1.15.2.jar 80 | test -f test/flink/jars/kafka-clients-3.3.2.jar || wget -P test/flink/jars https://repo1.maven.org/maven2/org/apache/kafka/kafka-clients/3.3.2/kafka-clients-3.3.2.jar 81 | 
-------------------------------------------------------------------------------- /debugger-usage.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/debugger-usage.gif -------------------------------------------------------------------------------- /debugger.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from os import path 3 | from typing import Any, Dict, Optional 4 | 5 | src_path = path.dirname(path.abspath(__file__)) 6 | sys.path.insert(0, src_path) 7 | 8 | __all__ = ["create_debugger", "create_pg_debugger", "create_ch_debugger"] 9 | 10 | 11 | def create_debugger(sql_file_path: str, vars: Optional[Dict[str, Any]] = None, funcs: Optional[Dict[str, Any]] = None): 12 | import os 13 | import subprocess 14 | 15 | spark_home = ( 16 | subprocess.check_output(["bash", "-c", "echo 'import os; print(os.environ[\"SPARK_HOME\"])' | pyspark"]) 17 | .decode("utf8") 18 | .split("\n") 19 | ) 20 | spark_home = [c.strip() for c in spark_home if c.strip()][0] 21 | os.environ["SPARK_HOME"] = spark_home 22 | import findspark 23 | 24 | findspark.init() 25 | 26 | from pyspark.sql import SparkSession 27 | 28 | from easy_sql.sql_processor.backend import SparkBackend 29 | 30 | spark = SparkSession.builder.enableHiveSupport().getOrCreate() 31 | backend = SparkBackend(spark) 32 | from easy_sql.sql_processor_debugger import SqlProcessorDebugger 33 | 34 | debugger = SqlProcessorDebugger(sql_file_path, backend, vars, funcs) 35 | return debugger 36 | 37 | 38 | def create_pg_debugger( 39 | sql_file_path: str, vars: Optional[Dict[str, Any]] = None, funcs: Optional[Dict[str, Any]] = None 40 | ): 41 | from easy_sql.sql_processor.backend.rdb import RdbBackend 42 | 43 | pg = RdbBackend("postgresql://postgres:123456@testpg:15432/postgres") 44 | from easy_sql.sql_processor_debugger import SqlProcessorDebugger 45 | 46 | debugger = SqlProcessorDebugger(sql_file_path, pg, vars, funcs) 47 | return debugger 48 | 49 | 50 | def create_ch_debugger( 51 | sql_file_path: str, vars: Optional[Dict[str, Any]] = None, funcs: Optional[Dict[str, Any]] = None 52 | ): 53 | from easy_sql.sql_processor.backend.rdb import RdbBackend 54 | 55 | ch = RdbBackend("clickhouse+native://default@testch:30123") 56 | from easy_sql.sql_processor_debugger import SqlProcessorDebugger 57 | 58 | debugger = SqlProcessorDebugger(sql_file_path, ch, vars, funcs) 59 | return debugger 60 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build/ 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | import os.path 20 | 21 | project = "Easy SQL" 22 | copyright = "2022, easysql" 23 | author = "easysql" 24 | 25 | # The full version, including alpha/beta/rc tags 26 | release = "v0.1.0" 27 | 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | "myst_parser", 36 | "sphinx.ext.duration", 37 | "sphinx.ext.doctest", 38 | "sphinx.ext.autodoc", 39 | "sphinx.ext.autosummary", 40 | "sphinx.ext.intersphinx", 41 | "autoapi.extension", 42 | ] 43 | 44 | autoapi_type = "python" 45 | autoapi_dirs = ["../"] 46 | _docs_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 47 | autoapi_ignore = [ 48 | f"{_docs_dir}/debugger.py", 49 | f"{_docs_dir}/test/*", 50 | f"{_docs_dir}/build/*", 51 | f"{_docs_dir}/docs/*", 52 | f"{_docs_dir}/requirements/*", 53 | "*_itest.py", 54 | "*_test.py", 55 | ] 56 | autoapi_options = [ 57 | "show-module-summary", 58 | "members", 59 | "undoc-members", 60 | "imported-members", 61 | "show-inheritance", 62 | "show-inheritance-diagram", 63 | ] 64 | autoapi_member_order = "groupwise" 65 | autodoc_typehints = "description" 66 | 67 | # Add any paths that contain templates here, relative to this directory. 68 | templates_path = ["_templates"] 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This pattern also affects html_static_path and html_extra_path. 73 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 74 | 75 | intersphinx_mapping = { 76 | "python": ("https://docs.python.org/3/", None), 77 | "sphinx": ("https://www.sphinx-doc.org/en/master/", None), 78 | } 79 | intersphinx_disabled_domains = ["std"] 80 | 81 | # -- Options for HTML output ------------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = "sphinx_rtd_theme" 87 | 88 | # Add any paths that contain custom static files (such as style sheets) here, 89 | # relative to this directory. They are copied after the builtin static files, 90 | # so a file named "default.css" will overwrite the builtin "default.css". 
91 | html_static_path = ["_static"] 92 | -------------------------------------------------------------------------------- /docs/easy_sql/add_backend.md: -------------------------------------------------------------------------------- 1 | # Add Backend 2 | 3 | ##Introduction 4 | 5 | Easy-sql is designed as a tool to quick implement with different kind of sql backend. 6 | 7 | So far supported backends are: 8 | 9 | + spark sql(spark engine) 10 | + bigquery(sqlalchemy engine) 11 | + postgresql(sqlalchemy engine) 12 | + clickhouse(sqlalchemy engine) 13 | + flink(flink engine) 14 | 15 | Easy sql is designed to be flexible and scalable. If in future have requirement on build new backend engine in easy sql, it can be easily added on by implement the method. Following is the description on how to implement new engine step by step. 16 | 17 | ## 18 | -------------------------------------------------------------------------------- /docs/easy_sql/bnf.md: -------------------------------------------------------------------------------- 1 | The pseudocode BNF of Easy SQL syntax. 2 | 3 | ``` 4 | easysql: target_def | sql_body | config | include 5 | target_def: target_def_prefix (variables_def | list_variables_def | temp_def | cache_def | broadcast_def | func_def | log_def | check_def | output_def | template_def | action) (, if = func_call)? 6 | sql_body: (any var_reference any | any tpl_reference any)* comment? 7 | 8 | target_def_prefix: '-- target=' 9 | 10 | var_reference: var_reference_lit | var_reference_func 11 | var_reference_lit: ${ name } 12 | var_reference_func: ${ func_call } 13 | 14 | func_call: func_call_no_arg | func_call_with_args 15 | func_call_no_arg: name \( \) 16 | func_call_with_args: name \( func_call_args \) 17 | func_call_args: (name_wide | var_reference_lit) (, name_wide | , var_reference_lit)* 18 | 19 | variables_def: 'variables' 20 | list_variables_def: 'list_variables' 21 | temp_def: 'temp.'name 22 | cache_def: 'cache.'name 23 | broadcast_def: 'broadcast.'name 24 | func_def: 'func.'func_call 25 | log_def: 'log.'name 26 | check_def: 'check.'name | 'check.'func_call 27 | output_def: 'output.'name.name | 'output.'name.name.name 28 | template_def: 'template.'name 29 | action_def: 'action.'name 30 | 31 | config: '-- config:' name_key = any | \ 32 | '-- backend:' name | \ 33 | '-- owner:' | \ 34 | '-- owner:' name (, name)* | \ 35 | '-- schedule:' any | \ 36 | '-- prepare-sql: ' any | \ 37 | '-- inputs:' | \ 38 | '-- inputs:' (name.name | name.name.name) (, name.name | , name.name.name)* | \ 39 | '-- outputs:' | \ 40 | '-- outputs:' (name.name | name.name.name) (, name.name | , name.name.name)* 41 | 42 | include: '-- include=' any 43 | 44 | tpl_call: tpl_call_no_arg | tpl_call_with_args 45 | tpl_call_no_arg: name \( \) 46 | tpl_call_with_args: name \( tpl_call_args \) 47 | tpl_call_args: (name = name_wide | name = var_reference_lit) (, name = name_wide | , name = var_reference_lit)* 48 | 49 | 50 | tpl_reference: tpl_reference_lit | tpl_reference_func 51 | tpl_reference_lit: @{ name } 52 | tpl_reference_func: @{ tpl_call } 53 | 54 | name: r'[a-zA-Z_]\\w*' 55 | name_wide: r'[^),]*' 56 | 57 | comment: '--' any 58 | ``` 59 | -------------------------------------------------------------------------------- /docs/easy_sql/build_install.md: -------------------------------------------------------------------------------- 1 | # Build and install Easy SQL 2 | 3 | Easy SQL is a very light-weight python library. The common Python library conventions are followed. 
4 | It's easy to build or install Easy SQL. 5 | 6 | ## Install Easy SQL 7 | 8 | Install Easy SQL using pip: `python3 -m pip install 'easy-sql-easy-sql[extra,extra]'` 9 | 10 | Currently we are providing below extras, choose according to your need: 11 | - cli 12 | - linter 13 | - spark 14 | - pg 15 | - clickhouse 16 | 17 | We also provide flink backend, but because of dependency confliction between pyspark and apache-flink, you need to install the flink backend dependencies manually with the following command `python3 -m pip install apache-flink`. 18 | 19 | Usually we read data from some data source and write data to some other system using flink with different connectors. So we need to download some jars for the used connectors as well. Refer [here](https://nightlies.apache.org/flink/flink-docs-release-1.15/docs/connectors/table/overview/) to get more information and [here](https://nightlies.apache.org/flink/flink-docs-release-1.15/docs/connectors/table/downloads/) to download the required connectors. 20 | ## Building Easy SQL 21 | 22 | Internally we use `poetry` to manage the dependencies. So make sure you have [installed it](https://python-poetry.org/docs/master/#installation). Package could be built with the following make command: `make package-pip` or just `poetry build`. 23 | 24 | After the above command, there will be a file named `easy_sql*.whl` generated in the `dist` folder. 25 | You can install it with command `python3 -m pip install dist/easy_sql*.whl[extra]` or just `poetry install -E 'extra extra'`. 26 | -------------------------------------------------------------------------------- /docs/easy_sql/command_line.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/docs/easy_sql/command_line.md -------------------------------------------------------------------------------- /docs/easy_sql/debug.md: -------------------------------------------------------------------------------- 1 | # Debug ETL 2 | 3 | There is a debugger interface implemented in Easy SQL. 4 | 5 | ## Start to debug 6 | 7 | We recommend debugging ETLs from jupyter. You can follow the steps below to start debugging your ETL. 8 | 9 | 1. Install jupyter first with command `python3 -m pip install jupyterlab`. 10 | 11 | 2. Create a file named `debugger.py` with contents like below: 12 | 13 | A more detailed sample could be found [here](https://github.com/easysql/easy_sql/blob/main/debugger.py). 14 | 15 | ```python 16 | from typing import Dict, Any 17 | 18 | def create_debugger(sql_file_path: str, vars: Dict[str, Any] = None, funcs: Dict[str, Any] = None): 19 | from pyspark.sql import SparkSession 20 | from easy_sql.sql_processor.backend import SparkBackend 21 | from easy_sql.sql_processor_debugger import SqlProcessorDebugger 22 | spark = SparkSession.builder.enableHiveSupport().getOrCreate() 23 | backend = SparkBackend(spark) 24 | debugger = SqlProcessorDebugger(sql_file_path, backend, vars, funcs) 25 | return debugger 26 | 27 | ``` 28 | 29 | 3. Create a file named `test.sql` with contents as [here](https://github.com/easysql/easy_sql/blob/main/test/sample_etl.spark.sql). 30 | 31 | 4. Then start jupyter lab with command: `jupyter lab`. 32 | 33 | 5. 
Start debugging like below: 34 | 35 | ![ETL Debugging](https://raw.githubusercontent.com/easysql/easy_sql/main/debugger-usage.gif) 36 | 37 | ## Debugger API 38 | 39 | Please refer to the API doc [here](api/debugger.md) 40 | -------------------------------------------------------------------------------- /docs/easy_sql/easy_sql.md: -------------------------------------------------------------------------------- 1 | # Easy SQL 2 | 3 | Easy SQL is built to ease the data ETL development process. 4 | With Easy SQL, you can develop your ETL in SQL in an imperative way. 5 | 6 | It defines a few simple syntax elements on top of standard SQL, with which SQL statements could be executed one by one. 7 | Easy SQL also provides a processor to handle all the new syntax. 8 | 9 | Since this is SQL agnostic, any SQL engine could be plugged in as a backend. 10 | There is built-in support for several popular SQL engines, including SparkSQL, PostgreSQL, Clickhouse, Aliyun Maxcompute and Google BigQuery. 11 | More will be added in the near future. 12 | 13 | ## Background 14 | 15 | Why do we need imperative syntax in ETL? 16 | 17 | SQL is designed to be used in a declarative way, and this causes a few problems when we use SQL to develop complicated ETLs. 18 | 19 | Think about the following cases. 20 | 21 | 1. We would like to use large computing resources when we're handling data in the full-data partition, since the amount of data there is far larger than that in the other partitions. 22 | 2. We would like to send out an HTTP request to report status when some step of the ETL fails for some reason (e.g. some data does not conform to the previous assumptions). 23 | 3. We would like to reuse some code to check if some order is a valid order (think about e-commerce business). 24 | 4. We would like to stop at some step of the ETL and check if the data is what we expected. 25 | 26 | When we use SQL to develop our ETL, it is hard to handle the above cases. 27 | But for a company with a wide range of data usage, there are similar cases everywhere. 28 | 29 | ### Why imperative SQL 30 | 31 | The above cases could be easily handled if we had an imperative way to write our code. 32 | This might be the reason why a lot of developers like to write ETLs in a general programming language like Python or Scala. 33 | 34 | But for the data ETL development case, we still think that using SQL or a SQL-like language is a better choice. The main reasons are: 35 | 36 | - Consistent code style across all ETLs. 37 | - All roles in the team can easily understand the logic in an ETL. 38 | - All code about one ETL mainly stays in one file, which makes things simpler when we try to read and understand what the ETL does. 39 | 40 | ## Design principles 41 | 42 | When we first tried to design the syntax, we found several important things: 43 | 44 | - Keep compatible with standard SQL, so that every SQL editor could be used to develop in Easy SQL. 45 | - Try to use a SQL way to implement most of the features. 46 | - Use intuitive syntax which is also similar to widely-used syntax in other programming languages. 47 | - Implement widely-used debugging features, such as logging, asserting and even step-by-step debugging. 48 | 49 | These important things became the design principles of Easy SQL. They provide guidance in the whole design process. 50 | If there is an argument about which design is better, the design principles can be referred to for a decision.
51 | 52 | ## Language features in Easy SQL 53 | 54 | For Easy SQL, guided by the design principles, a few simple language features are added to support these imperative characteristics. Below is a list of these features: 55 | 56 | - An imperative structure of ETL code. 57 | - Variables which could be defined and modified at any time. 58 | - A way to call external functions. 59 | - A way to control whether a step should be executed. 60 | - Templates that could be reused in the same ETL file. 61 | - An include command that could be used to reuse code at the file level. 62 | - Logging and assertions that could be used for debugging. 63 | - A debugger interface. 64 | -------------------------------------------------------------------------------- /docs/easy_sql/faq.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/docs/easy_sql/faq.md -------------------------------------------------------------------------------- /docs/easy_sql/how_to.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/docs/easy_sql/how_to.md -------------------------------------------------------------------------------- /docs/easy_sql/img/test_case.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/docs/easy_sql/img/test_case.png -------------------------------------------------------------------------------- /docs/easy_sql/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/docs/easy_sql/index.md -------------------------------------------------------------------------------- /docs/easy_sql/linter.md: -------------------------------------------------------------------------------- 1 | # Linter 2 | 3 | Easy SQL is a powerful tool that can bring convenience to ETL developers. 4 | But so far we do not have a compiler that supports the Easy SQL grammar and can auto-check SQL quality and auto-fix the violations. 5 | That is why we developed this linter tool on top of sqlfluff. With this linter, we can do static analysis and auto-fixing of ETL code written in Easy SQL. 6 | 7 | 8 | ## Command Line Interface 9 | 10 | The command line interface usage is as follows: 11 | 12 | ```bash 13 | $ python3 -m easy_sql.sql_linter.sql_linter_cli fix --path ${path} 14 | ``` 15 | 16 | There are fix and lint modes: lint only shows the rule violations, while fix auto-fixes the query. 17 | 18 | Fix mode parameters: 19 | 20 | - path: The location of the ETL file. 21 | - config-path: Sql fluff config file path, must be named `.sqlfluff`. Used to customize lint rules. There are some customization introduced by Easy SQL. If this is specified, the customization will be lost. 22 | - include: Comma separated rule id to be included. 23 | - exclude: Comma separated rule id to be excluded. 24 | - backend: The backend of the ETL file. Will be used to find the correct rules. 25 | - easy_sql: Boolean value to indicate whether the ETL file is written in Easy SQL or normal SQL. Will default to true. 26 | - inplace: Boolean value to indicate whether to overwrite the original file with the fixed output.
If false the fixed output will be written to a new file with suffix `.fixed.sql`. 27 | 28 | Lint mode parameters: 29 | 30 | - path: The location of the ETL file. 31 | - include: Comma separated rule id to be included. 32 | - config-path: Sql fluff config file path, must be named `.sqlfluff`. Used to customize lint rules. There are some customization introduced by Easy SQL. If this is specified, the customization will be lost. 33 | - exclude: Comma separated rule id to be excluded. 34 | - backend: The backend of the ETL file. Will be used to find the correct rules. 35 | - easy_sql: Boolean value to indicate whether the ETL file is written in Easy SQL or normal SQL. Will default to true. 36 | 37 | ## Programmatical usage 38 | 39 | ```python 40 | from easy_sql.sql_linter.sql_linter import SqlLinter 41 | 42 | sql = "" 43 | sql_linter = SqlLinter(sql, include_rules=None, exclude_rules=None) 44 | result = sql_linter.lint("bigquery", easy_sql=True) 45 | fixed = sql_linter.fix("bigquery", easy_sql=True) 46 | ``` 47 | 48 | You may find out that in the lint and fix command there is an option to specify which backend the ETL file is written to. 49 | If you do not provide the option, and you are using easy sql, it will automatically detect the backend from the file. 50 | Make sure you've specified the correct options, or it will generate unexpected output. 51 | 52 | ( 53 | For developers: 54 | 55 | The backend impacts the applied rules. If defined as bigquery, all the customized rules with groups containing bigquery and sqlfluff built-in core rules will be applied. 56 | 57 | ```python 58 | # groups in customized rules 59 | groups = ("all", "bigquery") 60 | ``` 61 | ) 62 | -------------------------------------------------------------------------------- /docs/easy_sql/other_features.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/docs/easy_sql/other_features.md -------------------------------------------------------------------------------- /docs/easy_sql/variables.md: -------------------------------------------------------------------------------- 1 | # Variables 2 | 3 | Easy SQL provides several special variables to help with ETL implementation. 4 | These special variables are all starts with `__`. 5 | 6 | Here are some description about what they are and how to use them. 7 | 8 | ## Variables to control data saving 9 | 10 | - `__create_output_table__`: When true and the output table does not exist, will try to create output table automatically. 11 | - `__partition__`: If specified, will save output data to the specified partition. There must be a partition column followed in the variable name. 12 | As an example, if we defined variable `__partition__dt`, then dt will be the partition column and the value of the variable will be the partition value. 13 | - `__save_mode__`: Value could be 'overwrite' or 'append'. If not specified, default to 'overwrite'. Will do append or overwrite when write data to table. 14 | 15 | ## Variables to control execution behaviour 16 | 17 | - `__no_check__`: If true, will skip any `check` step defined by `-- target=check.xxx` for performance consideration. 18 | - `__no_log__`: If true, will skip any `log` step defined by `-- target=log.xxx` for performance consideration. 19 | - `__no_cache__`: If true, will create temporal table instead of cache table. This if for spark backend only. 
For the other backends, all the `cache` or `temp` table will be views. 20 | - `__dry_run_verify_output_schema__`: If true, will verify output table schema against the target table. Will fail if target table does not exist or there are columns in target table but not in the output query. Useful when need to do check in dryrun mode. 21 | - `__dry_run_verify_output_schema_type__`: Will be take into account when `__dry_run_verify_output_schema__` is true. If both `__dry_run_verify_output_schema__` and `__dry_run_verify_output_schema_type__` are true, will verify output table field types against the target table. 22 | - `__skip_all__`: If true, will skip execution of the following steps. Could be used when the partition of the input data does not exist. 23 | - `__exception_handler__`: When specified, the value must be a function call. 24 | The function call will be executed when there is an exception found during the execution of some step. 25 | As an example, the value could be `some_exception_handler({__step__}, {var_a}, b)`. As we see, there could be variables referenced in the function call and the variable will be resolved when exception happens (at runtime, not definition time). 26 | 27 | ## Variables for function calling 28 | 29 | - `__backend__`: An instance of [`Backend`]() class. Usually used to pass into functions. 30 | - `__step__`: An instance of [`Step`](https://easy-sql.readthedocs.io/en/latest/autoapi/easy_sql/sql_processor/step/index.html#easy_sql.sql_processor.step.Step) class. Usually used to pass into functions. 31 | - `__context__`: An instance of [`ProcessorContext`](https://easy-sql.readthedocs.io/en/latest/autoapi/easy_sql/sql_processor/context/index.html#easy_sql.sql_processor.context.ProcessorContext) class. Usually used to pass into functions. 32 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Easy SQL documentation master file, created by 2 | sphinx-quickstart on Wed Apr 27 16:59:16 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Easy SQL's documentation! 7 | ==================================== 8 | 9 | Easy SQL is built to ease the data ETL development process. 10 | With Easy SQL, you can develop your ETL in SQL in an imperative way. 11 | 12 | It defines a few simple syntax on top of standard SQL, with which SQL could be executed one by one. 13 | Easy SQL also provides a processor to handle all the new syntax. 14 | 15 | Since this is SQL agnostic, any SQL engine could be plugged-in as a backend. 16 | There are built-in supported for several popular SQL engines, including SparkSQL, PostgreSQL, Clickhouse, Aliyun Maxcompute, Google BigQuery. 17 | More will be added in the near future. 18 | 19 | Contents 20 | -------- 21 | 22 | .. 
toctree:: 23 | :maxdepth: 6 24 | 25 | easy_sql/easy_sql.md 26 | easy_sql/build_install.md 27 | easy_sql/quick_start.md 28 | easy_sql/syntax.md 29 | easy_sql/debug.md 30 | easy_sql/testing.md 31 | easy_sql/linter.md 32 | easy_sql/functions.md 33 | easy_sql/udfs.md 34 | easy_sql/variables.md 35 | easy_sql/backend/flink.md 36 | autoapi/index 37 | 38 | 39 | 40 | Indices and tables 41 | ================== 42 | 43 | * :ref:`genindex` 44 | * :ref:`modindex` 45 | * :ref:`search` 46 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [project] 6 | name = "easysql" 7 | authors = [{name = "easy_sql", email = "easy_sql@thoughtworks.com"}] 8 | dynamic = ["version", "description"] 9 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | myst-parser==0.17.2 2 | sphinx-rtd-theme==1.0.0 3 | Sphinx==4.5.0 4 | sphinx-autoapi==1.8.4 5 | -------------------------------------------------------------------------------- /docs/scripts/update_doc.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os.path 3 | import re 4 | from typing import Callable, Sequence 5 | 6 | from easy_sql.sql_processor import funcs_rdb, funcs_spark 7 | from easy_sql.udf import udfs 8 | 9 | 10 | def _render_doc_modules_functions(backend: str): 11 | print("render doc for:", backend) 12 | assert backend in ["spark", "rdb"] 13 | mod = funcs_spark if backend == "spark" else funcs_rdb 14 | groups_doc = [] 15 | for funcs_group in mod.__all__: 16 | mod_name: str = funcs_group 17 | funcs_group_mod = getattr(mod, funcs_group) 18 | funcs = [func for func in dir(funcs_group_mod) if not func.startswith("_") and func == func.lower()] 19 | assert mod_name.endswith("Func") or mod_name.endswith("Funcs") or mod_name.endswith("Functions") 20 | group_name = mod_name[: mod_name.rindex("Func")] 21 | 22 | funcs_doc = [] 23 | for func_name in funcs: 24 | func_mod = getattr(funcs_group_mod, func_name) 25 | func_sig = str(inspect.signature(func_mod)).replace("(self, ", "(", 1).replace("'", "") 26 | module = func_mod.__module__ 27 | func_doc = ( 28 | f"- 
[`{func_name}{func_sig}`]" 29 | f'(https://easy-sql.readthedocs.io/en/latest/autoapi/{module.replace(".", "/")}/index.html#{module}.{mod_name}.{func_name})' # noqa: B950 30 | ) 31 | funcs_doc.append(func_doc) 32 | funcs_doc = "\n".join(funcs_doc) 33 | 34 | funcs_group_doc = f""" 35 | #### {group_name} functions 36 | 37 | {funcs_doc} 38 | """ 39 | groups_doc.append(funcs_group_doc) 40 | return "\n".join(groups_doc) 41 | 42 | 43 | def _update_doc( 44 | doc_tpl_file: str, 45 | doc_file: str, 46 | tpl_rex: str, 47 | render: Callable[[Sequence[str]], str], 48 | ): 49 | with open(doc_tpl_file, "r") as f: 50 | doc_tpl = f.read() 51 | lines = doc_tpl.split("\n") 52 | result_lines = [] 53 | for line in lines: 54 | m = re.match(tpl_rex, line) 55 | if m: 56 | groups = m.groups() 57 | result_lines.append(render(groups)) 58 | else: 59 | result_lines.append(line) 60 | 61 | with open(doc_file, "w") as f: 62 | f.write("\n".join(result_lines)) 63 | print("updated file:", doc_file) 64 | 65 | 66 | def update_func_doc(): 67 | doc_tpl_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../easy_sql/functions.tpl.md") 68 | doc_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../easy_sql/functions.md") 69 | 70 | def render(groups: Sequence[str]) -> str: 71 | backend = groups[0] 72 | title = groups[1].strip() if len(groups) > 1 and groups[1].strip() else f"Functions for {backend} backend" 73 | return f""" 74 | ### {title} 75 | 76 | {_render_doc_modules_functions(backend)} 77 | """ 78 | 79 | _update_doc(doc_tpl_file, doc_file, r"\{\{ (spark|rdb) functions:? ?(.*)? \}\}", render) 80 | 81 | 82 | def update_udf_doc(): 83 | doc_tpl_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../easy_sql/udfs.tpl.md") 84 | doc_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../easy_sql/udfs.md") 85 | 86 | def render(groups: Sequence[str]) -> str: 87 | backend = groups[0] 88 | backend_display_names = { 89 | "spark": "Spark", 90 | "pg": "PostgreSQL", 91 | "ch": "Clickhouse", 92 | } 93 | 94 | udf_names = udfs.get_udfs(backend) 95 | udf_mods = {"spark": udfs.SparkUdfs, "pg": udfs.PgUdfs, "ch": udfs.ChUdfs} 96 | rendered_udfs_doc = [] 97 | for udf_name in udf_names: 98 | udf_mod = udf_mods[backend] 99 | udf_sig = str(inspect.signature(getattr(udf_mod, udf_name))) 100 | udf_doc = ( 101 | f"- [`{udf_name}{udf_sig}`]" 102 | f"(https://easy-sql.readthedocs.io/en/latest/autoapi/easy_sql/udf/udfs/index.html#easy_sql.udf.udfs.{udf_mod.__name__}.{udf_name})" 103 | ) 104 | rendered_udfs_doc.append(udf_doc) 105 | rendered_udfs_doc = "\n".join(rendered_udfs_doc) 106 | return f""" 107 | ### {backend_display_names.get(backend, backend)} UDFs 108 | 109 | {rendered_udfs_doc} 110 | """ 111 | 112 | _update_doc(doc_tpl_file, doc_file, r"\{\{ (spark|pg|ch) UDFs:? ?(.*)? \}\}", render) 113 | 114 | 115 | if __name__ == "__main__": 116 | update_func_doc() 117 | update_udf_doc() 118 | -------------------------------------------------------------------------------- /docs/sqlfluff/new_rule.md: -------------------------------------------------------------------------------- 1 | # Easy SQL customized code quality rules 2 | 3 | ## Introduction 4 | 5 | In SQL FLuff we already have predefined rules, it follows the common rules for clean SQL code. 6 | You can check all the implemented rule from running the command: `sqlfluff rules`. 7 | But they are not enough, in some situation we need to implement a customized rule. This documentation will go through the steps of how to achieve it. 
8 | 9 | Before we start, we need to understand the design of SQL Fluff first. When checking SQL code, it goes through the following steps: 10 | 11 | - **templates handling**: Replaces the variables used in the SQL. Jinja/dbt formats are supported. The replacement is static, and all the variables will be resolved from the config file first. 12 | - **lex**: Separates the SQL into whitespace and code segments. 13 | - **parse**: Parses the lex result and organizes the tokens into a grammar tree according to the specific SQL dialect. If no match is found for a segment, the content will be wrapped in an `UnparsableSegment`, which will be picked up as a parsing error later. 14 | - **lint**: Walks through the parsed tree-structured data and checks whether there are violations according to the rules. A `lintError` is returned if any violation is found. 15 | - **fix**: Auto-fixes the problems pointed out by lint. 16 | 17 | In SQL Fluff, segments form a tree-like structure. The top-level segment is a `FileSegment`, which contains zero or more `StatementSegment`s. 18 | Before being parsed and named according to their type, segments are `raw`, meaning that they are literal values. 19 | 20 | ## New rule 21 | 22 | To create new rules, we first need to implement a new class extending `BaseRule` from SQL Fluff. 23 | The name of the class becomes the name of the rule. The naming convention is `Rule_xxxxx_Lxxx`. `BaseRule` contains parsing logic that relies on this convention. 24 | The core function for linting is `_eval`; its input is a tree-structured element that indicates the context. 25 | By calling `context.segment.children` you can find the next segments, and the linked structure also allows you to traverse to the end. 26 | 27 | In the `groups` definition, "all" must be included. 28 | 29 | ```config 30 | # Comma separated list of rules to check, default to all 31 | rules = all 32 | ``` 33 | 34 | You can check the type of a segment with `is_type("table_reference")`. To find the correct name of the segment, you need to go into the SQL Fluff code to find the segment class and the name defined inside it. 35 | With the `is_type` check, you can correctly point to the location where the rule needs to be checked. 36 | 37 | ```python 38 | from sqlfluff.core.rules.base import BaseRule, RuleContext 39 | 40 | 41 | class Rule_BigQuery_L001(BaseRule): 42 | 43 | groups = ("all", "bigquery") 44 | 45 | def __init__(self, *args, **kwargs): 46 | """Overwrite __init__ to set config.""" 47 | super().__init__(*args, **kwargs) 48 | 49 | def _eval(self, context: RuleContext): 50 | pass 51 | ``` 52 | 53 | ```python 54 | from sqlfluff.core.parser import BaseSegment 55 | 56 | class WildcardExpressionSegment(BaseSegment): 57 | type = "wildcard_expression" 58 | ``` 59 | 60 | ## Define the rule violation 61 | 62 | The return value of the `_eval()` function is the rule violation record object. 63 | If the check passes and no error is found, it should return nothing. 64 | Otherwise the return value is a `LintResult` object. When creating this object, you can pass three arguments. 65 | 66 | + `anchor`: the segment that holds the position info. 67 | + `description`: a description of the reason why the check failed. 68 | + `fix`: a list of fix objects (`delete`/`replace`/`create_before`/`create_after`) to fix the problem. To understand this further, read the `LintFix` code. If you do not pass anything as `fix`, nothing will be done in the fix step. A minimal sketch showing these arguments in use follows.
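Putting the three arguments together, here is a minimal sketch of an `_eval` implementation, modeled on the `Rule_BigQuery_L001` rule shipped in `easy_sql/sql_linter/rules/bq_schema_rule.py`. The rule name and the `${temp_db}.` fix payload are illustrative only:

```python
from sqlfluff.core.parser import CodeSegment
from sqlfluff.core.rules.base import BaseRule, LintFix, LintResult, RuleContext
from sqlfluff.core.rules.crawlers import SegmentSeekerCrawler


class Rule_Example_L001(BaseRule):
    groups = ("all",)
    # Only visit table_reference segments, as the repo's BigQuery rule does.
    crawl_behaviour = SegmentSeekerCrawler({"table_reference"})

    def _eval(self, context: RuleContext):
        # A schema-qualified reference parses into 3 segments (schema, dot, table name);
        # anything else is reported as a violation.
        if len(context.segment.segments) != 3:
            return LintResult(
                # anchor: the segment that holds the position info shown in the report
                anchor=context.segment,
                # description: why the check failed
                description=f"No schema found when selecting from table `{context.segment.raw}`.",
                # fixes: create a new segment before the anchor so `sqlfluff fix` can repair the SQL
                fixes=[
                    LintFix.create_before(
                        context.segment,
                        [CodeSegment(raw="${temp_db}.")],
                    )
                ],
            )
        # Returning nothing means the check passed for this segment.
```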
69 | 70 | 71 | ## Add the rule into action 72 | 73 | To make the customized rule work, you need to pass a list of rule classes as the linter's `user_rules` parameter. 74 | So far, we import all the rule classes in `easy_sql.sql_linter.rules`. By adding the rule when the module is initialized, it works as expected. 75 | 76 | ```python 77 | from sqlfluff.core import Linter 78 | 79 | linter = Linter(user_rules=[]) 80 | ``` 81 | 82 | ```python 83 | from easy_sql.sql_linter.rules.bq_schema_rule import Rule_BigQuery_L001 84 | 85 | __all__ = [Rule_BigQuery_L001] 86 | ``` 87 | -------------------------------------------------------------------------------- /docs/sqlfluff/quick_start.md: -------------------------------------------------------------------------------- 1 | # Quick start 2 | 3 | ## Introduction to sqlfluff 4 | 5 | With multiple contributors to a project and varying technical backgrounds, it's really difficult to maintain consistent readability and comprehension across a codebase. 6 | sqlfluff is a tool that makes it easy to check SQL code quality across varying SQL backgrounds and dialects. 7 | 8 | ## Python Requirement 9 | 10 | Sqlfluff does not support Python 2; it needs Python 3+. Check your Python version with the following: 11 | 12 | ```bash 13 | $ python --version 14 | ``` 15 | 16 | ## Quick install 17 | 18 | Install sqlfluff with pip: `pip install sqlfluff` 19 | 20 | Check that the installation succeeded: `sqlfluff version` 21 | 22 | 23 | ## Hands on demo 24 | We can use the command line to quickly check SQL code quality. Here we use an example; save it to a SQL file. 25 | ```sql 26 | SELECT a+b AS foo, 27 | c AS bar from my_table 28 | ``` 29 | cd to the folder and run the lint 30 | ```bash 31 | $ cd /test/doc 32 | $ sqlfluff lint test_sqlfulff.sql --dialect ansi 33 | ``` 34 | output: 35 | ``` 36 | == [test_sqlfulff.sql] FAIL 37 | L: 1 | P: 1 | L034 | Select wildcards then simple targets before calculations 38 | | and aggregates. 39 | L: 1 | P: 1 | L036 | Select targets should be on a new line unless there is 40 | | only one select target. 41 | L: 1 | P: 9 | L006 | Missing whitespace before + 42 | L: 1 | P: 9 | L006 | Missing whitespace after + 43 | L: 1 | P: 11 | L039 | Unnecessary whitespace found. 44 | L: 2 | P: 1 | L003 | Expected 1 indentations, found 0 [compared to line 01] 45 | L: 2 | P: 10 | L010 | Keywords must be consistently upper case. 46 | L: 2 | P: 23 | L009 | Files must end with a single trailing newline. 47 | All Finished 📜 🎉! 48 | ``` 49 | The sqlfluff checker reports what needs to be taken care of. 50 | To further understand the rules, check https://docs.sqlfluff.com/en/stable/rules.html#ruleref 51 | 52 | Automatically fix the issues for specified rules: 53 | 54 | ```bash 55 | $ sqlfluff fix test_sqlfulff.sql --rules L003,L009,L010 --dialect ansi 56 | ``` 57 | 58 | ## Custom style 59 | The lint command can specify different dialects for varying SQL backends. 60 | Check the currently supported dialects: 61 | 62 | ```bash 63 | $ sqlfluff dialects 64 | ``` 65 | ```output 66 | ==== sqlfluff - dialects ==== 67 | ansi: ansi dialect [inherits from 'nothing'] 68 | bigquery: bigquery dialect [inherits from 'ansi'] 69 | db2: db2 dialect [inherits from 'ansi'] 70 | exasol: exasol dialect [inherits from 'ansi'] 71 | ``` 72 | All dialects inherit from the basic ansi dialect. If we want to customize a dialect, we need to fork the git repo and create a new class. 73 | 74 | The lint command can also specify which rules to use for checking.
To list the currently supported rules: 75 | 76 | ```bash 77 | $ sqlfluff rules 78 | ``` 79 | 80 | ```output 81 | ==== sqlfluff - rules ==== 82 | L001: Unnecessary trailing whitespace. 83 | L002: Mixed Tabs and Spaces in single whitespace. 84 | L003: Indentation not consistent with previous lines. 85 | ``` 86 | 87 | Rules are predefined, but their parameters and usage are flexible. 88 | All the settings can be specified in a `.sqlfluff` file. 89 | Change the config file and the custom style will be applied immediately. 90 | 91 | Only enable a few rules (the default is all): 92 | 93 | ```config 94 | rules = L001,L002 95 | ``` 96 | 97 | Ignore specific rules: 98 | 99 | ```config 100 | exclude_rules = L001,L002 101 | ``` 102 | 103 | To customize a specific rule, parameters are predefined: 104 | 105 | ```config 106 | [sqlfluff:rules:L010] 107 | capitalisation_policy = consistent 108 | ignore_words = from 109 | ignore_words_regex = None 110 | ``` 111 | 112 | ## Jinja Template 113 | 114 | SQL fluff also supports template replacement with variables for flexibility. 115 | By default the templater is Jinja, and we also use Jinja as the example here. 116 | 117 | ```sql 118 | SELECT a+b AS foo, 119 | c AS bar from my_table where name = {{ test_name }}; 120 | ``` 121 | 122 | Set the value of the parameter in the config file `.sqlfluff`: 123 | 124 | ```config 125 | [sqlfluff:templater:jinja:context] 126 | test_name=456 127 | ``` 128 | 129 | After this, you can get the parsed result by running the following command: 130 | 131 | ```bash 132 | $ sqlfluff parse test_sqlfulff.sql --rules L003,L009,L010 --dialect ansi 133 | ``` 134 | 135 | # Easy SQL integration plan 136 | 137 | 1. Parse the backend config in Easy SQL to define the dialect (see the sketch after this list) 138 | 2. Our Easy SQL files contain multiple SQL sections with comments as separators. Add a for-loop to loop through all the different SQL sections 139 | 3. Make rules enabled/disabled per SQL backend. For example, BigQuery specifically needs a schema. 140 | 4. Add rules to check, including input/output checks and partition checks 141 | 5. Allow Easy SQL functions and variables like ${temp_db} to be checked.
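As a rough sketch of how this plan maps onto the linter already in this repo, the snippet below lints an Easy SQL file with `SqlLinter` (the same entry point used by `easy_sql/sql_linter/sql_linter_cli.py`). The SQL content and the chosen backend are made-up examples, and checking variables such as `${temp_db}` is still a plan item rather than a finished feature:

```python
from easy_sql.sql_linter.sql_linter import SqlLinter

# A made-up Easy SQL ETL: the `-- backend:` comment decides the dialect (plan item 1),
# and the `-- target=...` comments separate the SQL sections to be linted (plan item 2).
sql = """-- backend: bigquery

-- target=temp.result
select * from ${temp_db}.my_table
"""

linter = SqlLinter(sql, exclude_rules=None, include_rules=None)
# easysql=True tells the linter to treat the file as Easy SQL rather than plain SQL.
violations = linter.lint("bigquery", easysql=True, config_path=None)
print(violations)
```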
142 | 143 | 144 | refer dbt tool but it is static 145 | -------------------------------------------------------------------------------- /easy_sql/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/easy_sql/__init__.py -------------------------------------------------------------------------------- /easy_sql/base_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import uuid 5 | from datetime import datetime 6 | from typing import TYPE_CHECKING, Dict, List, Optional 7 | 8 | if TYPE_CHECKING: 9 | from pyspark.sql import SparkSession 10 | 11 | from easy_sql.sql_processor import SqlProcessor 12 | from easy_sql.sql_processor.backend.rdb import SqlExpr 13 | 14 | from .local_spark import LocalSpark 15 | from .logger import log_time 16 | 17 | 18 | def should_run_integration_test(key: Optional[str] = None): 19 | if key is None or key in ["pg", "ch", "mc", "bq", "flink_hive"]: 20 | return False 21 | return True 22 | 23 | 24 | TEST_PG_URL = os.environ.get("PG_URL", "postgresql://postgres:123456@testpg:15432/postgres") 25 | TEST_PG_JDBC_URL = os.environ.get("PG_JDBC_URL", "jdbc:postgresql://testpg:15432/postgres") 26 | TEST_PG_JDBC_USER = os.environ.get("PG_JDBC_USER", "postgres") 27 | TEST_PG_JDBC_PASSWD = os.environ.get("PG_JDBC_PASSWD", "123456") 28 | TEST_CH_URL = os.environ.get("CLICKHOUSE_URL", "clickhouse+native://default@testch:30123") 29 | TEST_BQ_URL = os.environ.get("BQ_URL", "bigquery://") 30 | 31 | __partition_col_converter__ = lambda col: ( 32 | f"PARSE_DATE('%Y-%m', {col}) as {col}" if col in ["data_month", ":data_month"] else f"CAST({col} as DATE)" 33 | ) 34 | __partition_value_converter__ = lambda col, value: ( 35 | datetime.strptime(value, "%Y-%m").date() if col == "data_month" else datetime.strptime(value, "%Y-%m-%d").date() 36 | ) 37 | __column_sql_type_converter__ = lambda backend_type, col_name, col_type: ( 38 | "DATE" if col_name in ["di", "dt", "data_date", "data_month"] else None 39 | ) 40 | __partition_expr__ = lambda backend_type, partition_col: ( 41 | f"DATE_TRUNC({partition_col}, MONTH)" 42 | if backend_type == "bigqiery" and partition_col == "data_month" 43 | else partition_col 44 | ) 45 | bigquery_sql_expr = SqlExpr( 46 | column_sql_type_converter=__column_sql_type_converter__, 47 | partition_col_converter=__partition_col_converter__, 48 | partition_value_converter=__partition_value_converter__, 49 | partition_expr=__partition_expr__, 50 | ) 51 | 52 | 53 | def dt(dt_s): 54 | return datetime.strptime(dt_s, "%Y-%m-%d %H:%M:%S") 55 | 56 | 57 | def date(s): 58 | return datetime.strptime(s, "%Y-%m-%d").date() 59 | 60 | 61 | def dt_zone(dt_s: str, formate="%Y-%m-%d %H:%M:%S", timezone=None): 62 | if timezone is None: 63 | return datetime.strptime(dt_s, formate) 64 | else: 65 | return datetime.strptime(dt_s, formate).replace(tzinfo=timezone) 66 | 67 | 68 | def next_id(): 69 | return str(uuid.uuid1()).replace("-", "") 70 | 71 | 72 | @log_time 73 | def run_sql( 74 | sql: str, 75 | result_table: str, 76 | funcs: Optional[Dict] = None, 77 | variables: Optional[Dict] = None, 78 | dry_run: bool = False, 79 | spark: Optional[SparkSession] = None, 80 | spark_conf: Optional[Dict] = None, 81 | ) -> List: 82 | spark = spark or LocalSpark.get(spark_conf) 83 | processor = SqlProcessor(spark, sql, [], variables or {}) 84 | 
processor.func_runner.register_funcs(funcs or {}) 85 | processor.run(dry_run=dry_run) 86 | return spark.sql(f"select * from {result_table}").collect() 87 | -------------------------------------------------------------------------------- /easy_sql/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/easy_sql/cli/__init__.py -------------------------------------------------------------------------------- /easy_sql/data_process.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import sys 5 | from typing import List, Optional 6 | 7 | import click 8 | 9 | 10 | @click.command(name="data_process") 11 | @click.option("--sql-file", "-f", type=str) 12 | @click.option("--vars", "-v", type=str, required=False) 13 | @click.option("--dry-run", type=str, required=False, help="if dry run, one of [true, 1, false, 0]") 14 | @click.option("--python-path", type=str, required=False) 15 | @click.option("--print-command", "-p", is_flag=True) 16 | def data_process(sql_file: str, vars: str, dry_run: str, python_path: str, print_command: bool): 17 | EasySqlProcessor(sql_file, vars, dry_run, print_command, python_path=python_path).process() 18 | 19 | 20 | class EasySqlProcessor: 21 | def __init__( 22 | self, 23 | sql_file: str, 24 | vars: Optional[str], 25 | dry_run: Optional[str], 26 | print_command: bool, 27 | python_path: Optional[str] = None, 28 | ) -> None: 29 | if not sql_file.endswith(".sql"): 30 | raise Exception(f"sql_file must ends with .sql, found `{sql_file}`") 31 | 32 | try: 33 | from easy_sql.config.sql_config import EasySqlConfig 34 | except ModuleNotFoundError: 35 | assert python_path is not None 36 | sys.path.insert(0, python_path) 37 | from easy_sql.config.sql_config import EasySqlConfig 38 | 39 | self.sql_file = sql_file 40 | self.vars_arg = vars 41 | self.dry_run_arg = dry_run if dry_run is not None else "0" 42 | self.dry_run = dry_run in ["true", "1"] 43 | self.config = EasySqlConfig.from_sql(sql_file) 44 | self.print_command = print_command 45 | 46 | def process(self, backend_config: Optional[List[str]] = None) -> Optional[str]: 47 | from easy_sql.cli.backend_processor import BackendProcessor 48 | 49 | backend_processor = BackendProcessor.create_backend_processor(self.config) 50 | 51 | if self.print_command: 52 | command = backend_processor.shell_command( 53 | self.vars_arg, self.dry_run_arg, os.path.abspath(__file__), backend_config 54 | ) 55 | print(command) 56 | return command 57 | else: 58 | backend_processor.run(self.vars_arg, self.dry_run, backend_config) 59 | 60 | 61 | if __name__ == "__main__": 62 | data_process() 63 | -------------------------------------------------------------------------------- /easy_sql/data_process_itest.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import unittest 3 | from typing import List, Optional 4 | 5 | from easy_sql import data_process 6 | 7 | proj_base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | 9 | 10 | def _data_process( 11 | sql_file: str, 12 | vars: Optional[str], 13 | dry_run: Optional[str], 14 | print_command: bool, 15 | backend_config: Optional[List[str]] = None, 16 | ) -> None: 17 | data_process.EasySqlProcessor(sql_file, vars, dry_run, print_command).process(backend_config) 18 | 19 | 20 | class DataProcessTest(unittest.TestCase): 
21 | def test_spark(self): 22 | _data_process(os.path.join(proj_base_dir, "test/sample_etl.spark.sql"), None, None, False) 23 | 24 | def test_postgres(self): 25 | _data_process(os.path.join(proj_base_dir, "test/sample_etl.postgres.sql"), None, None, False) 26 | 27 | def test_clickhouse(self): 28 | _data_process(os.path.join(proj_base_dir, "test/sample_etl.clickhouse.sql"), None, None, False) 29 | 30 | def test_flink_postgres(self): 31 | print(_data_process(os.path.join(proj_base_dir, "test/sample_etl.flink.postgres.sql"), None, None, False)) 32 | -------------------------------------------------------------------------------- /easy_sql/data_process_test.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import unittest 3 | from typing import Optional 4 | 5 | from easy_sql import data_process 6 | 7 | proj_base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | 9 | 10 | def _data_process(sql_file: str, vars: Optional[str], dry_run: Optional[str], print_command: bool) -> Optional[str]: 11 | return data_process.EasySqlProcessor(sql_file, vars, dry_run, print_command).process() 12 | 13 | 14 | class DataProcessTest(unittest.TestCase): 15 | def test_spark(self): 16 | command = _data_process(os.path.join(proj_base_dir, "test/sample_etl.spark.sql"), None, None, True) 17 | assert command is not None 18 | print(command) 19 | self.assertRegex( 20 | command, 21 | r"spark-submit --conf spark.master=local\[2\] --conf spark.submit.deployMode=client " 22 | r"--conf spark.app.name=sample_etl.spark_[\d]+ " 23 | "--conf spark.sql.warehouse.dir=/tmp/spark-warehouse-localdw " 24 | '--conf spark.driver.extraJavaOptions="-Dderby.system.home=/tmp/spark-warehouse-metastore ' 25 | '-Dderby.stream.error.file=/tmp/spark-warehouse-metastore.log" ' 26 | '--conf spark.files="[^"]+test/sample_etl.spark.sql" ' 27 | '"[^"]+/easy_sql/data_process.py" ' 28 | "-f .+/test/sample_etl.spark.sql --dry-run 0", 29 | ) 30 | 31 | def test_flink_hive(self): 32 | command = _data_process(os.path.join(proj_base_dir, "test/sample_etl.flink.hive.sql"), None, None, True) 33 | 34 | assert command is not None 35 | self.assertRegex( 36 | command.strip(), 37 | r".*flink run --parallelism 2 " 38 | '--pyFiles "[^"]+test/sample_etl.flink.hive.sql" ' 39 | "-t local " 40 | '--python "[^"]+/easy_sql/data_process.py" ' 41 | r"-f .+/test/sample_etl.flink.hive.sql --dry-run 0", 42 | ) 43 | 44 | def test_flink_hive_postgres(self): 45 | command = _data_process( 46 | os.path.join(proj_base_dir, "test/sample_etl.flink.hive.postgres.sql"), None, None, True 47 | ) 48 | assert command is not None 49 | self.assertRegex( 50 | command, 51 | r".*flink run --parallelism 1 " 52 | '--pyFiles "[^"]+test/sample_etl.flink.hive.postgres.sql" ' 53 | '--python "[^"]+/easy_sql/data_process.py" ' 54 | "-f .+/test/sample_etl.flink.hive.postgres.sql --dry-run 0", 55 | ) 56 | 57 | def test_flink_scala_udf(self): 58 | command = _data_process(os.path.join(proj_base_dir, "test/udf/flink-scala/etl_with_udf.sql"), None, None, True) 59 | assert command is not None 60 | self.assertRegex( 61 | command, 62 | r".*flink run --parallelism 1 " 63 | '--pyFiles "[^"]+test/udf/flink-scala/etl_with_udf.sql" --jarfile udf.jar ' 64 | '--python "[^"]+/easy_sql/data_process.py" ' 65 | "-f .+/test/udf/flink-scala/etl_with_udf.sql --dry-run 0", 66 | ) 67 | -------------------------------------------------------------------------------- /easy_sql/local_spark.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from typing import Any, Dict, Optional 4 | 5 | from pyspark.sql import SparkSession 6 | 7 | 8 | class LocalSpark: 9 | spark: Optional[SparkSession] = None 10 | __conf: Dict = {} 11 | 12 | @staticmethod 13 | def stop(): 14 | if LocalSpark.spark: 15 | LocalSpark.spark.stop() 16 | LocalSpark.spark = None 17 | 18 | @staticmethod 19 | def get(conf: Optional[Dict[str, Any]] = None, clean_existing_data: bool = True) -> SparkSession: 20 | conf = conf or {} 21 | if LocalSpark.spark is None: 22 | default_conf = { 23 | "spark.default.parallelism": 4, 24 | "hive.exec.dynamic.partition.mode": "nonstrict", 25 | "spark.sql.warehouse.dir": "/tmp/spark-warehouse-localdw-ut", 26 | "spark.driver.extraJavaOptions": ( 27 | "-Dderby.system.home=/tmp/spark-warehouse-metastore-ut " 28 | "-Dderby.stream.error.file=/tmp/spark-warehouse-metastore-ut.log" 29 | ), 30 | } 31 | default_conf.update(conf) 32 | conf = default_conf 33 | 34 | if clean_existing_data: 35 | # delete old spark warehouse/metastore dir 36 | print(f"removing dir {conf['spark.sql.warehouse.dir']}") 37 | shutil.rmtree(conf["spark.sql.warehouse.dir"], ignore_errors=True) 38 | if "-Dderby.system.home" in conf["spark.driver.extraJavaOptions"]: 39 | import re 40 | 41 | java_options = re.sub(r"\s*=\s*", "=", conf["spark.driver.extraJavaOptions"].strip()).split() 42 | for op in java_options: 43 | if op.split("=")[0].strip() == "-Dderby.system.home": 44 | print(f"removing dir {op.split('=')[1].strip()}") 45 | shutil.rmtree(op.split("=")[1].strip(), ignore_errors=True) 46 | 47 | # ensure a local spark with default config 48 | os.environ["SPARK_CONF_DIR"] = "/tmp/local-spark-conf-ut" 49 | spark_builder = SparkSession.builder.appName("UnitTest").master("local[4]") 50 | 51 | print("using conf: ", conf) 52 | for k, v in conf.items(): 53 | spark_builder.config(k, v) 54 | LocalSpark.spark = spark_builder.enableHiveSupport().getOrCreate() 55 | 56 | spark = LocalSpark.spark 57 | spark.catalog.clearCache() 58 | for table in spark.catalog.listTables("default"): 59 | if table.isTemporary: 60 | print(f"dropping temp view {table.name}") 61 | spark.catalog.dropTempView(table.name) 62 | 63 | return LocalSpark.spark 64 | -------------------------------------------------------------------------------- /easy_sql/logger.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import logging 3 | import sys 4 | from datetime import datetime 5 | from functools import wraps 6 | from typing import Callable 7 | 8 | LOG_LEVEL = logging.DEBUG 9 | 10 | 11 | def _config_logger(): 12 | logger = logging.getLogger("simple_logger") 13 | logger.setLevel(LOG_LEVEL) 14 | python_version = sys.version_info 15 | if python_version.major == 3 and python_version.minor == 6: 16 | sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach()) # type: ignore 17 | elif hasattr(sys.stdout, "reconfigure"): 18 | sys.stdout.reconfigure(encoding="utf-8") # type: ignore 19 | handler = logging.StreamHandler(sys.stdout) 20 | handler.setLevel(LOG_LEVEL) 21 | 22 | formatter = logging.Formatter( 23 | "[%(asctime)s][%(processName)s:%(threadName)s][%(levelname)s][%(module)s.%(funcName)s:%(lineno)d] %(message)s" 24 | ) 25 | handler.setFormatter(formatter) 26 | 27 | for existing_handler in logger.handlers: 28 | logger.removeHandler(existing_handler) 29 | logger.addHandler(handler) 30 | 31 | return logger 32 | 33 | 34 | def log_time(func: Callable): 35 | 
@wraps(func) 36 | def wrapper(*args, **kwargs): 37 | start_time = datetime.now() 38 | try: 39 | return func(*args, **kwargs) 40 | finally: 41 | end_time = datetime.now() 42 | logger.debug("function {} took {}s".format(func.__name__, (end_time - start_time).total_seconds())) 43 | 44 | return wrapper 45 | 46 | 47 | logger = _config_logger() 48 | -------------------------------------------------------------------------------- /easy_sql/report.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from typing import Any 4 | 5 | 6 | class EsService: 7 | def __init__(self, base_url: str, should_send: bool = True): 8 | self.base_url = base_url.strip("/") 9 | self.should_send = should_send 10 | self.data = None 11 | 12 | def post(self, url_path: str, data: str): 13 | import requests 14 | 15 | if self.should_send: 16 | resp = requests.post( 17 | self.base_url + url_path, headers={"Content-Type": "Application/json"}, data=data.encode("utf8") 18 | ) 19 | if not resp.ok: 20 | raise Exception(f"send data quality report failed(status={resp.status_code}): {resp.text}") 21 | print("data post to es done") 22 | else: 23 | self.data = {"method": "post", "args": {"url_path": url_path, "data": data}} 24 | print("will not send data") 25 | 26 | def put(self, url_path: str, data: str): 27 | import requests 28 | 29 | if self.should_send: 30 | resp = requests.put( 31 | self.base_url + url_path, headers={"Content-Type": "Application/json"}, data=data.encode("utf8") 32 | ) 33 | if not resp.ok: 34 | raise Exception(f"send data quality report failed(status={resp.status_code}): {resp.text}") 35 | print("data put to es done") 36 | else: 37 | self.data = {"method": "put", "args": {"url_path": url_path, "data": data}} 38 | print("will not send data") 39 | 40 | def delete_by_query(self, index: str, query: object): 41 | import requests 42 | 43 | data = json.dumps({"query": query}) 44 | url_path = f"/{index}/_delete_by_query" 45 | if self.should_send: 46 | resp = requests.post( 47 | self.base_url + url_path, headers={"Content-Type": "Application/json"}, data=data.encode("utf8") 48 | ) 49 | if not resp.ok: 50 | raise Exception(f"send data quality report failed(status={resp.status_code}): {resp.text}") 51 | else: 52 | self.data = {"method": "post", "args": {"url_path": url_path, "data": data}} 53 | print("will not send data") 54 | 55 | 56 | class Reporter: 57 | def __init__(self, es_service: EsService, index_prefix: str = "", now: Any = None): 58 | self.es_service = es_service 59 | self.now = now 60 | self.index_prefix = index_prefix 61 | 62 | def _es_index_name(self, name: str): 63 | return f"{self.index_prefix}_{name}" if self.index_prefix else name 64 | 65 | def report_task_result(self, task_id: str, report: str): 66 | """ 67 | es index: 68 | 69 | PUT /{index_prefix}_task_report 70 | { 71 | "mappings": { 72 | "properties": { 73 | "task_id": { "type": "wildcard" }, 74 | "report": { "type": "text" }, 75 | "created_at": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss" } 76 | } 77 | } 78 | } 79 | """ 80 | now = (self.now or datetime.utcnow()).strftime("%Y-%m-%d %H:%M:%S") 81 | data = {"task_id": task_id, "report": report, "created_at": now} 82 | self.es_service.post(f'/{self._es_index_name("task_report")}/_doc', json.dumps(data)) 83 | -------------------------------------------------------------------------------- /easy_sql/report_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 
| from easy_sql.report import EsService, Reporter 4 | 5 | 6 | class ReporterTest(unittest.TestCase): 7 | @unittest.skip("integration test") 8 | def test_should_report_task_result(self): 9 | reporter = Reporter(EsService("http://testes:9200")) 10 | reporter.report_task_result("some-task", "some message\nsome other message") 11 | -------------------------------------------------------------------------------- /easy_sql/spark_optimizer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | 6 | class SparkDynamicConfig: 7 | def __init__(self, max_shuffle_partitions: Optional[int] = None, min_shuffle_partitions: Optional[int] = None): 8 | self.max_shuffle_partitions = max_shuffle_partitions 9 | self.min_shuffle_partitions = min_shuffle_partitions 10 | 11 | def use_min_shuffle_partitions(self, spark: SparkSession) -> "SparkDynamicConfig": 12 | assert self.min_shuffle_partitions, "must provide min_shuffle_partitions to use the conf" 13 | spark.conf.set("spark.sql.adaptive.enabled", "false") 14 | spark.conf.set("spark.sql.shuffle.partitions", str(self.min_shuffle_partitions)) 15 | spark.conf.set("spark.default.parallelism", str(self.min_shuffle_partitions)) 16 | return self 17 | 18 | def use_max_shuffle_partitions(self, spark: SparkSession) -> "SparkDynamicConfig": 19 | assert self.max_shuffle_partitions, "must provide max_shuffle_partitions to use the conf" 20 | spark.conf.set("spark.sql.adaptive.enabled", "false") 21 | spark.conf.set("spark.sql.shuffle.partitions", str(self.max_shuffle_partitions)) 22 | spark.conf.set("spark.default.parallelism", str(self.max_shuffle_partitions)) 23 | return self 24 | 25 | def use_adaptive_shuffle_partitions(self, spark: SparkSession) -> "SparkDynamicConfig": 26 | spark.conf.set("spark.sql.adaptive.enabled", "true") 27 | return self 28 | 29 | 30 | def get_spark(app_name: Optional[str] = None, conf: Optional[Dict] = None): 31 | builder = SparkSession.builder.enableHiveSupport() 32 | if app_name: 33 | builder.config("spark.app.name", app_name) 34 | conf = conf or {} 35 | for k, v in conf.items(): 36 | builder.config(k, v) 37 | 38 | spark = builder.getOrCreate() 39 | spark.conf.set("spark.sql.statistics.fallBackToHdfs", "true") 40 | # 启用 Adaptive Execution ,从而启用自动设置 Shuffle Reducer 特性 41 | spark.conf.set("spark.sql.adaptive.enabled", "true") 42 | # 设置每个 Reducer 读取的目标数据量,单位为字节。默认64M,一般改成集群块大小 43 | spark.conf.set("spark.sql.adaptive.shuffle.targetPostShuffleInputSize", "134217728") 44 | # 允许动态资源分配,配合 spark.dynamicAllocation.minExecutors, spark.dynamicAllocation.maxExecutors 等使用 45 | # spark 3.0+ 不允许动态设置以下两个参数 46 | import pyspark 47 | 48 | if str(pyspark.__version__).startswith("2."): # type: ignore 49 | spark.conf.set("spark.dynamicAllocation.enabled", "true") 50 | spark.conf.set("spark.shuffle.service.enabled", "true") 51 | 52 | # spark.conf.set("hive.exec.dynamic.partition", "true") 53 | # default strict. In strict mode, the user must specify at least one static partition, 54 | # in case the user accidentally overwrites all partitions. 55 | # In nonstrict mode all partitions are allowed to be dynamic. 
56 | spark.conf.set("hive.exec.dynamic.partition.mode", "nonstrict") 57 | 58 | return spark 59 | 60 | 61 | def clear_temp_views(spark: SparkSession): 62 | for table in spark.catalog.listTables("default"): 63 | if table.isTemporary: 64 | print(f"dropping temp view {table.name}") 65 | spark.catalog.dropTempView(table.name) 66 | -------------------------------------------------------------------------------- /easy_sql/sql_linter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/easy_sql/sql_linter/__init__.py -------------------------------------------------------------------------------- /easy_sql/sql_linter/rules/__init__.py: -------------------------------------------------------------------------------- 1 | from easy_sql.sql_linter.rules.bq_schema_rule import Rule_BigQuery_L001 2 | 3 | all_rules = [Rule_BigQuery_L001] 4 | -------------------------------------------------------------------------------- /easy_sql/sql_linter/rules/bq_schema_rule.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from sqlfluff.core.parser import CodeSegment 6 | from sqlfluff.core.rules.base import BaseRule, LintFix, LintResult 7 | from sqlfluff.core.rules.crawlers import SegmentSeekerCrawler 8 | 9 | if TYPE_CHECKING: 10 | from sqlfluff.core.rules.context import RuleContext 11 | 12 | 13 | class Rule_BigQuery_L001(BaseRule): 14 | """ 15 | Table schema is required for queries in BigQuery. 16 | 17 | **Anti-pattern** 18 | Select from some table without schema. 19 | 20 | .. code-block:: sql 21 | SELECT * 22 | FROM foo 23 | 24 | **Best practice** 25 | Select from some table with schema. 26 | 27 | .. 
code-block:: sql 28 | SELECT * 29 | FROM test.foo 30 | """ 31 | 32 | groups = ("all", "bigquery") 33 | crawl_behaviour = SegmentSeekerCrawler({"table_reference"}) 34 | 35 | def __init__(self, *args, **kwargs): 36 | """Overwrite __init__ to set config.""" 37 | super().__init__(*args, **kwargs) 38 | 39 | def _eval(self, context: RuleContext): 40 | """check from table have schema""" 41 | if len(context.segment.segments) != 3: 42 | return LintResult( 43 | anchor=context.segment, 44 | fixes=[ 45 | LintFix.create_before( 46 | context.segment, 47 | [CodeSegment(raw="${temp_db}.")], 48 | ) 49 | ], 50 | description=f"No schema found when select from table `{context.segment.raw}`.", 51 | ) 52 | -------------------------------------------------------------------------------- /easy_sql/sql_linter/sql_linter_cli.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | import warnings 4 | from typing import List, Optional 5 | 6 | import click 7 | 8 | from easy_sql.sql_linter.sql_linter import SqlLinter 9 | 10 | 11 | def split_rules_to_list(rule_description: str) -> Optional[List[str]]: 12 | if rule_description != "": 13 | return rule_description.split(",") 14 | else: 15 | return None 16 | 17 | 18 | def parse_backend(sql: str): 19 | sql_lines = sql.split("\n") 20 | parsed_backend = None 21 | for line in sql_lines: 22 | if re.match(r"^-- \s*backend:.*$", line): 23 | parsed_backend = line[line.index("backend:") + len("backend:") :].strip() 24 | break 25 | 26 | if parsed_backend is None: 27 | parsed_backend = "spark" 28 | return parsed_backend 29 | 30 | 31 | def lint_process( 32 | check_sql_file_path: str, 33 | exclude: str, 34 | include: str, 35 | backend: str, 36 | easy_sql: bool, 37 | config_path: Optional[str] = None, 38 | ): 39 | if not check_sql_file_path.endswith(".sql"): 40 | warnings.warn("file name:" + check_sql_file_path + " must end with .sql", stacklevel=2) 41 | 42 | with open(check_sql_file_path, "r") as file: 43 | sql = file.read() 44 | sql_linter = SqlLinter(sql, exclude_rules=split_rules_to_list(exclude), include_rules=split_rules_to_list(include)) 45 | backend = backend if backend else parse_backend(sql) 46 | print("using backend:", backend) 47 | result = sql_linter.lint(backend, easysql=easy_sql, config_path=config_path) 48 | fixed = sql_linter.fix(backend, easy_sql=easy_sql, config_path=config_path) 49 | 50 | return result, fixed 51 | 52 | 53 | def write_out_fixed(check_sql_file_path: str, fixed: str, inplace: bool): 54 | if inplace: 55 | write_out_file_path = check_sql_file_path 56 | else: 57 | write_out_file_path = check_sql_file_path.replace(".sql", ".fixed.sql") 58 | with open(write_out_file_path, "w") as file: 59 | file.write(fixed) 60 | 61 | 62 | @click.group() 63 | def cli(): 64 | """Check or fix violations in SQL.""" 65 | pass 66 | 67 | 68 | def fix_process( 69 | path: str, 70 | exclude: str, 71 | include: str, 72 | backend: str, 73 | inplace: bool, 74 | easy_sql: bool, 75 | config_path: Optional[str] = None, 76 | ): 77 | result, fixed = lint_process(path, exclude, include, backend, easy_sql, config_path=config_path) 78 | write_out_fixed(path, fixed, inplace) 79 | 80 | 81 | @cli.command(help="""Fix rule violations in sql""") 82 | @click.option("--path", help="sql file path", required=True, type=str) 83 | @click.option("--config-path", help="sql fluff config file path, must be named .sqlfluff", required=False, type=str) 84 | @click.option("--exclude", help="comma separated rule to be excluded", default="", 
required=False, type=str) 85 | @click.option("--include", help="comma separated rule to be included", default="", required=False, type=str) 86 | @click.option( 87 | "--backend", 88 | help=( 89 | "backend for this file, " 90 | "if easy sql it will parse from the sql file if not specify, " 91 | "if normal sql it will default to spark" 92 | ), 93 | default=None, 94 | required=False, 95 | type=str, 96 | ) 97 | @click.option("--inplace", help="fix file inplace", default=False, required=False, type=bool) 98 | @click.option("--easy_sql", help="easy sql or normal sql", default=True, required=False, type=bool) 99 | def fix(path: str, config_path: str, exclude: str, include: str, backend: str, inplace: bool, easy_sql: bool): 100 | fix_process(path, exclude, include, backend, inplace, easy_sql, config_path=config_path) 101 | 102 | 103 | @cli.command(help="""Check rule violations in sql""") 104 | @click.option("--path", help="sql file path", required=True, type=str) 105 | @click.option("--config-path", help="sql fluff config file path, must be named .sqlfluff", required=False, type=str) 106 | @click.option("--exclude", help="comma separated rule to be excluded", default="", required=False, type=str) 107 | @click.option("--include", help="comma separated rule to be included", default="", required=False, type=str) 108 | @click.option( 109 | "--backend", 110 | help=( 111 | "backend for this file, " 112 | "if easy sql it will parse from the sql file if not specify, " 113 | "if normal sql it will default to spark" 114 | ), 115 | default=None, 116 | required=False, 117 | type=str, 118 | ) 119 | @click.option("--easy_sql", help="easy sql or normal sql", default=True, required=False, type=bool) 120 | def lint(path: str, config_path: str, exclude: str, include: str, backend: str, easy_sql: bool): 121 | lint_process(path, exclude, include, backend, easy_sql, config_path=config_path) 122 | 123 | 124 | if __name__ == "__main__": 125 | cli.main(sys.argv[1:]) 126 | -------------------------------------------------------------------------------- /easy_sql/sql_linter/sql_linter_reportor.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import codecs 4 | import logging 5 | import sys 6 | from typing import TYPE_CHECKING, List, Union 7 | 8 | import colorlog 9 | 10 | if TYPE_CHECKING: 11 | from sqlfluff.core import SQLBaseError 12 | 13 | 14 | class LintReporter: 15 | def __init__(self): 16 | self.sql_linter_log = LintReporter._create_logger(logging.DEBUG) 17 | 18 | def _get_extra_default_dict(self): 19 | return {"pos_info": "", "description": "", "warn": "", "pass": ""} 20 | 21 | @staticmethod 22 | def _create_logger(log_level: Union[int, str]): 23 | logger = logging.getLogger("linter_logger") 24 | logger.setLevel(log_level) 25 | info_formater = colorlog.ColoredFormatter( 26 | fmt="%(white)s%(message)s%(red)s%(warn)s %(green)s%(pass)s %(blue)s%(pos_info)s %(white)s%(description)s " 27 | ) 28 | python_version = sys.version_info 29 | if python_version.major == 3 and python_version.minor == 6: 30 | sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach()) # type: ignore 31 | elif hasattr(sys.stdout, "reconfigure"): 32 | sys.stdout.reconfigure(encoding="utf-8") # type: ignore 33 | handler = logging.StreamHandler(sys.stdout) 34 | handler.setFormatter(info_formater) 35 | for existing_handler in logger.handlers: 36 | logger.removeHandler(existing_handler) 37 | logger.addHandler(handler) 38 | return logger 39 | 40 | def report_violation(self, 
violation: SQLBaseError, step_start_line=0): 41 | pos_info = "L: {} | P: {}: | {} :".format( 42 | violation.line_no + step_start_line, violation.line_pos, violation.rule_code() 43 | ) 44 | extra_dict = self._get_extra_default_dict() 45 | extra_dict["pos_info"] = pos_info 46 | extra_dict["description"] = violation.desc() 47 | self.sql_linter_log.info("", extra=extra_dict) 48 | 49 | def report_list_of_violations(self, lint_result: List[SQLBaseError], step_start_line=0): 50 | if len(lint_result) > 0: 51 | self.report_warning("Fail") 52 | for violation in lint_result: 53 | self.report_violation(violation, step_start_line) 54 | else: 55 | self.report_pass("Pass") 56 | 57 | def report_message(self, message): 58 | self.sql_linter_log.info(message, extra=self._get_extra_default_dict()) 59 | 60 | def report_warning(self, warning: str): 61 | extra_dict = self._get_extra_default_dict() 62 | extra_dict["warn"] = warning 63 | self.sql_linter_log.warning("", extra=extra_dict) 64 | 65 | def report_pass(self, pass_info: str): 66 | extra_dict = self._get_extra_default_dict() 67 | extra_dict["pass"] = pass_info 68 | self.sql_linter_log.warning("", extra=extra_dict) 69 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/__init__.py: -------------------------------------------------------------------------------- 1 | from easy_sql.sql_processor.common import Column, SqlProcessorException 2 | from easy_sql.sql_processor.context import VarsContext 3 | from easy_sql.sql_processor.funcs import FuncRunner 4 | from easy_sql.sql_processor.report import SqlProcessorReporter, StepReport, StepStatus 5 | from easy_sql.sql_processor.sql_processor import ( 6 | SqlProcessor, 7 | get_current_backend, 8 | get_current_config, 9 | get_current_context, 10 | get_current_sql_processor, 11 | get_current_step, 12 | ) 13 | from easy_sql.sql_processor.step import Step, StepConfig, StepType 14 | 15 | __all__ = [ 16 | "Column", 17 | "SqlProcessorException", 18 | "StepConfig", 19 | "StepType", 20 | "VarsContext", 21 | "FuncRunner", 22 | "Step", 23 | "StepStatus", 24 | "StepReport", 25 | "SqlProcessorReporter", 26 | "SqlProcessor", 27 | "get_current_backend", 28 | "get_current_config", 29 | "get_current_context", 30 | "get_current_step", 31 | "get_current_sql_processor", 32 | ] 33 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/backend/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | from .flink import * 3 | from .spark import * 4 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/backend/bigquery.py: -------------------------------------------------------------------------------- 1 | from .rdb import RdbBackend as BigQueryBackend 2 | from .rdb import RdbRow as BigQueryRow 3 | from .rdb import TimeLog, _exec_sql 4 | 5 | __all__ = ["BigQueryBackend", "TimeLog", "_exec_sql", "BigQueryRow"] 6 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/backend/clickhouse.py: -------------------------------------------------------------------------------- 1 | from .rdb import RdbBackend as ChBackend 2 | from .rdb import RdbRow as ChRow 3 | from .rdb import TimeLog, _exec_sql 4 | 5 | __all__ = ["ChBackend", "TimeLog", "_exec_sql", "ChRow"] 6 | -------------------------------------------------------------------------------- 
/easy_sql/sql_processor/backend/postgres.py: -------------------------------------------------------------------------------- 1 | from .rdb import RdbBackend as PostgresBackend 2 | from .rdb import RdbRow as PgRow 3 | from .rdb import TimeLog, _exec_sql 4 | 5 | __all__ = ["PostgresBackend", "TimeLog", "_exec_sql", "PgRow"] 6 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/backend/rdb_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | 4 | from sqlalchemy.dialects.postgresql import DOUBLE_PRECISION 5 | from sqlalchemy.engine.mock import create_mock_engine 6 | from sqlalchemy.engine.reflection import Inspector 7 | 8 | from easy_sql.sql_processor.backend.rdb import RdbBackend 9 | 10 | 11 | class RdbTest(unittest.TestCase): 12 | def test_get_column_names_should_only_get_from_name(self): 13 | mock_engine = create_mock_engine("postgresql://", None) 14 | cols = [{"name": "a"}, {"name": "b"}, {"type": "c"}] 15 | with patch.object(RdbBackend, "get_columns", return_value=cols): 16 | rdb = RdbBackend("", engine=mock_engine) # type: ignore 17 | names = rdb.get_column_names("test") 18 | self.assertSequenceEqual(names, ["a", "b"]) 19 | 20 | def test_get_columns_should_compile_type_by_dialect_when_now_raw(self): 21 | mock_engine = create_mock_engine("postgresql://", None) 22 | mock_engine.close = lambda: None # type: ignore 23 | col = {"name": "id", "type": DOUBLE_PRECISION(10)} 24 | raw_cols = [col] 25 | with patch.object(Inspector, "get_columns", return_value=[col.copy() for col in raw_cols]): 26 | rdb = RdbBackend("", engine=mock_engine) # type: ignore 27 | 28 | cols = rdb.get_columns("test") 29 | 30 | self.assertNotEqual(str(col["type"]), "DOUBLE PRECISION") 31 | self.assertEqual(cols, [{"name": "id", "type": "DOUBLE PRECISION"}]) 32 | 33 | def test_get_columns_should_compile_type_by_dialect_when_in_raw(self): 34 | mock_engine = create_mock_engine("postgresql://", None) 35 | mock_engine.close = lambda: None # type: ignore 36 | col = {"name": "id", "type": DOUBLE_PRECISION(10)} 37 | raw_cols = [col] 38 | with patch.object(Inspector, "get_columns", return_value=[col.copy() for col in raw_cols]): 39 | rdb = RdbBackend("", engine=mock_engine) # type: ignore 40 | 41 | cols = rdb.get_columns("test", raw=True) 42 | 43 | self.assertEqual(cols, raw_cols) 44 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/backend/spark_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pyspark.sql.functions import expr, lit 4 | 5 | from easy_sql.base_test import LocalSpark 6 | from easy_sql.sql_processor.backend import SparkTable 7 | from easy_sql.sql_processor.backend.base import TableMeta 8 | from easy_sql.sql_processor.backend.spark import SparkBackend 9 | from easy_sql.sql_processor.common import SqlProcessorException 10 | 11 | 12 | class SparkTest(unittest.TestCase): 13 | def test_with_column(self): 14 | spark = LocalSpark.get() 15 | df = spark.sql("select 1 as id") 16 | # expr('2021-01-01') 效果就是 select 2021-01-01,少了引号,解析就会出现奇怪的结果 17 | self.assertNotEqual( 18 | SparkTable(df).with_column("data_date", "2021-01-01").df.select("data_date").limit(1).collect(), 19 | [("2021-01-01",)], 20 | ) 21 | # expr("'2021-01-01'") 效果就是 select '2021-01-01',结果正确 22 | self.assertEqual( 23 | SparkTable(df).with_column("data_date", 
"'2021-01-01'").df.select("data_date").limit(1).collect(), 24 | [("2021-01-01",)], 25 | ) 26 | # 可以直接传入 Column 27 | self.assertEqual( 28 | SparkTable(df).with_column("data_date", lit("2021-01-01")).df.select("data_date").limit(1).collect(), 29 | [("2021-01-01",)], 30 | ) 31 | self.assertEqual(SparkTable(df).with_column("flag", "1==2").df.select("flag").limit(1).collect(), [(False,)]) 32 | self.assertEqual( 33 | SparkTable(df).with_column("flag", expr("1==2")).df.select("flag").limit(1).collect(), [(False,)] 34 | ) 35 | 36 | def test_verify_schema(self): 37 | spark = LocalSpark.get() 38 | backend = SparkBackend(spark) 39 | spark.sql('create table test_verify_schema using parquet as select 1 as id, "a" as name') 40 | 41 | # should check if target table exists 42 | spark.sql("select 1 as id").createOrReplaceTempView("test_verify_schema0") 43 | with self.assertRaises(SqlProcessorException): 44 | backend.verify_schema(TableMeta("test_verify_schema0"), TableMeta("test_verify_schema1")) 45 | 46 | # should verify column name 47 | spark.sql("select 1 as id").createOrReplaceTempView("test_verify_schema1") 48 | with self.assertRaises(SqlProcessorException): 49 | backend.verify_schema(TableMeta("test_verify_schema1"), TableMeta("test_verify_schema")) 50 | 51 | # should ignore case and not verify type 52 | spark.sql("select 1 as Id, 1 as name").createOrReplaceTempView("test_verify_schema2") 53 | # should not raise exception 54 | backend.verify_schema(TableMeta("test_verify_schema2"), TableMeta("test_verify_schema")) 55 | 56 | # should verify type and raise error 57 | spark.sql("select 1 as id, 1 as Name").createOrReplaceTempView("test_verify_schema21") 58 | with self.assertRaises(SqlProcessorException): 59 | backend.verify_schema(TableMeta("test_verify_schema21"), TableMeta("test_verify_schema"), True) 60 | 61 | # should ignore extra column 62 | spark.sql("select 1 as id, 'a' as name, 1 as id1").createOrReplaceTempView("test_verify_schema3") 63 | # should not raise exception 64 | backend.verify_schema(TableMeta("test_verify_schema3"), TableMeta("test_verify_schema")) 65 | 66 | 67 | if __name__ == "__main__": 68 | unittest.main() 69 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/backend/sql_dialect/clickhouse_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from easy_sql.sql_processor.backend import Partition 4 | from easy_sql.sql_processor.backend.sql_dialect import SqlExpr 5 | from easy_sql.sql_processor.backend.sql_dialect.clickhouse import ChSqlDialect 6 | 7 | 8 | class RdbTest(unittest.TestCase): 9 | def test_ch_config(self): 10 | ch_config = ChSqlDialect(SqlExpr(), "dataplat.__table_partitions__") 11 | sql = ch_config.delete_partition_sql("test.test", [Partition("dt", "20210101")]) 12 | self.assertEqual( 13 | sql, 14 | [ 15 | "alter table test.test drop partition tuple('20210101')", 16 | ( 17 | "alter table dataplat.__table_partitions__ delete " 18 | "where db_name = 'test' and table_name = 'test' and partition_value = '20210101'" 19 | ), 20 | ], 21 | ) 22 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, Any 4 | 5 | from ..logger import logger 6 | 7 | if TYPE_CHECKING: 8 | from pyspark.sql import DataFrame, SparkSession 9 | 10 | 11 | def 
_exec_sql(spark: SparkSession, sql: str) -> DataFrame: 12 | logger.info(f"will exec sql: {sql}") 13 | return spark.sql(sql) 14 | 15 | 16 | def is_int_type(type_name): 17 | return any(type_name.startswith(t) for t in ["integer", "long", "decimal", "short"]) 18 | 19 | 20 | class Column: 21 | def __init__(self, name: str, value: Any): 22 | self.name, self.value = name, value 23 | 24 | 25 | class SqlProcessorException(Exception): 26 | def __init__(self, message: str): 27 | super().__init__(message) 28 | 29 | 30 | class SqlProcessorAssertionError(Exception): 31 | def __init__(self, message: str): 32 | super().__init__(message) 33 | 34 | 35 | class VarsReplacer: 36 | def replace_variables(self, text: str, include_funcs: bool = True) -> str: 37 | raise NotImplementedError() 38 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/context_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from easy_sql.sql_processor.context import TemplatesContext, VarsContext 4 | from easy_sql.sql_processor.funcs import FuncRunner 5 | 6 | 7 | class TemplateContextTest(unittest.TestCase): 8 | def test_should_replace_template(self): 9 | tc = TemplatesContext(True, {"a": "xx\n#{var}=abc, 123"}) 10 | replaced = tc.replace_templates("??@{a(var=${abc})}??") 11 | self.assertEqual("??xx\n${abc}=abc, 123??", replaced) 12 | 13 | # does not support var-func in template parameters 14 | replaced = tc.replace_templates("??@{a(var=${fn(abc)})}??") 15 | self.assertNotEquals("??xx\n${fn(abc)}=abc, 123??", replaced) 16 | 17 | # if this is a comment, do not replace 18 | replaced = tc.replace_templates("??@{a(var=${abc})}?? --??@{a(var=${abc})}??") 19 | self.assertEqual("??xx\n${abc}=abc, 123?? 
--??@{a(var=${abc})}??", replaced) 20 | replaced = tc.replace_templates("-- ??@{a(var=${abc})}??") 21 | self.assertEqual("-- ??@{a(var=${abc})}??", replaced) 22 | 23 | def test_multi_line_in_template_reference(self): 24 | tc = TemplatesContext(True, {"a": "xx\n#{var}=abc, #{var1} 123"}) 25 | replaced = tc.replace_templates("??@{a(var=123\n,var1=234)}??") 26 | self.assertEqual("??xx\n123=abc, 234 123??", replaced) 27 | 28 | replaced = tc.replace_templates("??@{a(var=123,\nvar1=234)}??") 29 | self.assertEqual("??xx\n123=abc, 234 123??", replaced) 30 | 31 | replaced = tc.replace_templates("??@{a(\n var\n=123\n,\nvar1=234)}??") 32 | self.assertEqual("??xx\n123=abc, 234 123??", replaced) 33 | 34 | def test_comment_line_in_template_reference(self): 35 | tc = TemplatesContext(True, {"a": "--xx\n#{var}=abc, #{var1} 123--abc\n--abc"}) 36 | replaced = tc.replace_templates("??@{a(var=123\n,var1=234)}??") 37 | self.assertEqual("??--xx\n123=abc, 234 123--abc\n--abc\n??", replaced) 38 | 39 | tc = TemplatesContext(True, {"a": "--xx\n#{var}=abc, #{var1} 123--abc\n"}) 40 | replaced = tc.replace_templates("??@{a(var=123\n,var1=234)}??") 41 | self.assertEqual("??--xx\n123=abc, 234 123--abc\n??", replaced) 42 | 43 | tc = TemplatesContext(True, {"a": "--xx\n#{var}=abc, #{var1} 123--abc"}) 44 | replaced = tc.replace_templates("??@{a(var=123\n,var1=234)}??") 45 | self.assertEqual("??--xx\n123=abc, 234 123--abc\n??", replaced) 46 | 47 | tc = TemplatesContext(True, {"a": "\n#{var}=abc, #{var1} 123\n"}) 48 | replaced = tc.replace_templates("??@{a(var=123\n,var1=234)}??") 49 | self.assertEqual("??123=abc, 234 123??", replaced) 50 | 51 | 52 | class VarsContextTest(unittest.TestCase): 53 | def test_should_replace_vars(self): 54 | vc = VarsContext(vars={"a": "##A##", "aa": "##${a}##"}, debug_log=True) 55 | self.assertEqual("-##A##, ===####A####===", vc.replace_variables("-${a}, ===${aa}==="), "should replace all") 56 | self.assertEqual( 57 | "-- -${a}, ===${aa}===", vc.replace_variables("-- -${a}, ===${aa}==="), "do not replace comment" 58 | ) 59 | self.assertEqual( 60 | "-##A##, ==-- =${aa}===", vc.replace_variables("-${a}, ==-- =${aa}==="), "do not replace comment" 61 | ) 62 | self.assertEqual("-\\##A##, ===####A####===", vc.replace_variables("-\\${a}, ===${aa}==="), "ignore escaping") 63 | 64 | vc = VarsContext(vars={"a": "##A##", "b": "##${a}##", "aa": "##${b}##"}, debug_log=True) 65 | self.assertEqual( 66 | "-##A##, -####A####, ===######A######===", 67 | vc.replace_variables("-${a}, -${b}, ===${aa}==="), 68 | "replace vars recursively", 69 | ) 70 | 71 | vc = VarsContext(vars={"a": "##A##", "aa": "##${a}##", "b": "1"}, debug_log=True) 72 | vc.init(func_runner=FuncRunner({"f": lambda x: int(x) + 1})) 73 | self.assertEqual("-6, ===####A####===", vc.replace_variables("-${f(5)}, ===${aa}==="), "func call in vars") 74 | self.assertEqual( 75 | "-2, ===####A####===", vc.replace_variables("-${f(${b})}, ===${aa}==="), "vars as args in func call" 76 | ) 77 | self.assertEqual( 78 | "-4, ===####A####===", 79 | vc.replace_variables("-${f(${c:3})}, ===${aa}==="), 80 | "vars with default value as args in func call", 81 | ) 82 | 83 | vc = VarsContext(vars={"a": "##A##", "b": "##${a}##", "aa": "##${b}##"}, debug_log=True) 84 | self.assertEqual( 85 | "-1, -####A####, ===######A######===", 86 | vc.replace_variables("-${a1:1}, -${b}, ===${aa:b?x}==="), 87 | "vars with default value", 88 | ) 89 | 90 | # TODO: support for confliction detection 91 | -------------------------------------------------------------------------------- 
/easy_sql/sql_processor/funcs_flink_itest.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from easy_sql.base_test import TEST_PG_JDBC_PASSWD, TEST_PG_JDBC_URL, TEST_PG_JDBC_USER 4 | from easy_sql.config.sql_config import EasySqlConfig 5 | from easy_sql.sql_processor.backend import FlinkBackend, FlinkTablesConfig 6 | from easy_sql.sql_processor.step import Step 7 | from easy_sql.utils.flink_test_cluster import FlinkTestClusterManager 8 | 9 | from .funcs_flink import TestFuncs 10 | 11 | 12 | def step_with_sql(sql: str) -> Step: 13 | return Step("0", None, None, select_sql=sql) # type: ignore 14 | 15 | 16 | class FlinkFuncsTest(unittest.TestCase): 17 | def create_flink_backend(self): 18 | return FlinkBackend( 19 | True, 20 | FlinkTablesConfig( 21 | connectors={ 22 | "jdbc": FlinkTablesConfig.Connector( 23 | f""" 24 | 'url' = '{TEST_PG_JDBC_URL}', 25 | 'username' = '{TEST_PG_JDBC_USER}', 26 | 'password' = '{TEST_PG_JDBC_PASSWD}' 27 | """ 28 | ), 29 | }, 30 | catalogs={}, 31 | ), 32 | ) 33 | 34 | def test_exec_sql_in_source(self): 35 | fb = self.create_flink_backend() 36 | tf = TestFuncs(fb) 37 | tf.exec_sql_in_source(step_with_sql("select 1;\nselect now();"), "db", "jdbc") 38 | 39 | def test_run_etl_streaming(self): 40 | fb = self.create_flink_backend() 41 | tf = TestFuncs(fb) 42 | with open("/tmp/flink_func_test__test_run_etl.sql", "w") as f: 43 | f.write( 44 | """ 45 | -- backend: flink 46 | -- config: easy_sql.etl_type=streaming 47 | -- config: flink.cmd=-pyexec python3 48 | -- config: flink.cmd=-t remote 49 | -- config: flink.cmd=-pyclientexec python3 50 | -- target=variables 51 | select 52 | 'append' as __save_mode__ 53 | """ 54 | ) 55 | fm = FlinkTestClusterManager() 56 | if fm.is_not_started(): 57 | fm.start_cluster() 58 | tf.test_run_etl(None, "/tmp/flink_func_test__test_run_etl.sql") 59 | self.assertTrue(fm.is_started()) 60 | fm.stop_cluster() 61 | 62 | tf.test_run_etl(None, "/tmp/flink_func_test__test_run_etl.sql") 63 | self.assertTrue(fm.is_not_started()) 64 | 65 | def test_run_etl_batch(self): 66 | fb = self.create_flink_backend() 67 | tf = TestFuncs(fb) 68 | with open("/tmp/flink_func_test__test_run_etl.sql", "w") as f: 69 | f.write( 70 | """ 71 | -- backend: flink 72 | -- config: easy_sql.etl_type=batch 73 | -- config: flink.cmd=-pyexec python3 74 | -- config: flink.cmd=-t local 75 | -- config: flink.cmd=-pyclientexec python3 76 | -- target=variables 77 | select 78 | 'append' as __save_mode__ 79 | """ 80 | ) 81 | tf.test_run_etl( 82 | EasySqlConfig.from_sql(sql_file="/tmp/flink_func_test__test_run_etl.sql"), 83 | "/tmp/flink_func_test__test_run_etl.sql", 84 | ) 85 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/step_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from easy_sql.sql_processor import SqlProcessorException, StepConfig 4 | from easy_sql.sql_processor.step import SqlCleaner, StepFactory 5 | 6 | 7 | class StepConfigTest(unittest.TestCase): 8 | def test_should_parse_config(self): 9 | self.assertEqual(StepConfig.from_config_line("-- target=check.f1", 0), StepConfig("check", "f1", None, 0)) 10 | self.assertEqual( 11 | StepConfig.from_config_line("-- target=check.f1(a, ${b})", 0), StepConfig("check", "f1(a, ${b})", None, 0) 12 | ) 13 | self.assertEqual( 14 | StepConfig.from_config_line("-- target=check.f1(a, ${b}), if=f2(c, ${d})", 0), 15 | StepConfig("check", "f1(a, ${b})", "f2(c, 
${d})", 0), 16 | ) 17 | self.assertEqual( 18 | StepConfig.from_config_line("-- target=check.f1(a, ${b}),if=f2(c, ${d})", 0), 19 | StepConfig("check", "f1(a, ${b})", "f2(c, ${d})", 0), 20 | ) 21 | self.assertEqual(StepConfig.from_config_line("-- target=variables", 0), StepConfig("variables", None, None, 0)) 22 | self.assertEqual( 23 | StepConfig.from_config_line("-- target=variables, if=f2(c, ${d})", 0), 24 | StepConfig("variables", None, "f2(c, ${d})", 0), 25 | ) 26 | with self.assertRaises(expected_exception=SqlProcessorException): 27 | StepConfig.from_config_line("-- target=check.f1(a, ${b}),if=f2-(c, ${d})", 0) 28 | with self.assertRaises(expected_exception=SqlProcessorException): 29 | StepConfig.from_config_line("-- target=unknown_type", 0) 30 | 31 | def test_should_clean_sql(self): 32 | self.assertEquals( 33 | """ 34 | with a as (select 1 as a) -- comment 35 | --comment 36 | select * from a 37 | """.strip(), 38 | SqlCleaner().clean_sql( 39 | """ 40 | -- comment 41 | with a as (select 1 as a) -- comment 42 | --comment 43 | select * from a -- comment 44 | ; 45 | --comment 46 | """ 47 | ), 48 | ) 49 | 50 | def test_should_clean_sql_with_semicolon_before_comment(self): 51 | self.assertEquals( 52 | """ 53 | with a as (select 1 as a) -- comment 54 | --comment 55 | select * from a 56 | """.strip(), 57 | SqlCleaner().clean_sql( 58 | """ 59 | -- comment 60 | with a as (select 1 as a) -- comment 61 | --comment 62 | select * from a; -- comment 63 | ; 64 | --comment 65 | """ 66 | ), 67 | ) 68 | 69 | def test_should_read_sql_correctly(self): 70 | sql = """ 71 | -- target=temp.test 72 | select ';' as a 73 | """ 74 | steps = StepFactory(None, None).create_from_sql(sql, {}) # type: ignore 75 | self.assertEquals(1, len(steps)) 76 | assert steps[0].target_config is not None 77 | self.assertEquals(steps[0].target_config.name, "test") 78 | assert steps[0].select_sql is not None 79 | self.assertEquals(steps[0].select_sql.strip(), "select ';' as a") 80 | 81 | def test_should_skip_duplicate_include(self): 82 | sql0 = """ 83 | -- target=temp.test 84 | select 1 as a 85 | """ 86 | sql1 = """ 87 | -- include 0 start 88 | -- include=0.sql 89 | """ 90 | sql = """ 91 | -- outer include start 92 | -- include=1.sql 93 | -- include=1.sql 94 | -- include=0.sql 95 | """ 96 | sql_expected = """ 97 | -- outer include start 98 | -- include 0 start 99 | -- target=temp.test 100 | select 1 as a 101 | """ 102 | sf = StepFactory(None, None, skip_duplicate_include=True) # type: ignore 103 | sf.create_from_sql(sql, {"0.sql": sql0, "1.sql": sql1}) 104 | print(sf.resolved_sql) 105 | assert ( 106 | "\n".join([line.strip() for line in sf.resolved_sql.splitlines() if line.strip()]) == sql_expected.strip() 107 | ) 108 | 109 | 110 | if __name__ == "__main__": 111 | unittest.main() 112 | -------------------------------------------------------------------------------- /easy_sql/sql_test_itest.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from . 
import sql_test 4 | 5 | 6 | class SqlTestTest(unittest.TestCase): 7 | def test_convert_test_for_spark(self): 8 | self.run_test("spark") 9 | 10 | def test_convert_test_for_postgres(self): 11 | self.run_test("postgres") 12 | 13 | def test_convert_test_for_clickhouse(self): 14 | self.run_test("clickhouse") 15 | 16 | def run_test(self, backend: str): 17 | sql_test._convert_json(f"test/sample_etl.{backend}.xlsx") 18 | sql_test._run_test(f"test/sample_etl.{backend}.xlsx", backend=backend) 19 | sql_test._run_test(f"test/sample_etl.{backend}.json", backend=backend) 20 | -------------------------------------------------------------------------------- /easy_sql/udf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/easy_sql/udf/__init__.py -------------------------------------------------------------------------------- /easy_sql/udf/check.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | from typing import TYPE_CHECKING, Callable 5 | 6 | if TYPE_CHECKING: 7 | from pyspark.sql.types import DataType 8 | 9 | 10 | class UDF: 11 | def __init__(self, func: Callable, return_type: DataType): 12 | self.func = func 13 | self.return_type = return_type 14 | 15 | def __call__(self, *args, **kwargs): 16 | return self.func(*args, **kwargs) 17 | 18 | 19 | def check_regex_func(pattern): 20 | return lambda any_str: any_str if any_str and re.match(pattern, any_str) else None 21 | -------------------------------------------------------------------------------- /easy_sql/udf/udfs.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, List, Optional, Union 2 | 3 | 4 | def _all_udfs(cls: Any): 5 | return { 6 | attr: getattr(cls, attr) 7 | for attr in dir(cls) 8 | if callable(getattr(cls, attr)) and not attr.startswith("_") and attr != "all" 9 | } 10 | 11 | 12 | def get_udfs(type: str) -> Union[Dict[str, Callable[[], Union[str, List[str]]]], Dict[str, Callable]]: 13 | if type == "pg": 14 | return PgUdfs.all() 15 | elif type == "ch": 16 | return ChUdfs.all() 17 | elif type == "spark": 18 | return SparkUdfs.all() 19 | else: 20 | return {} 21 | 22 | 23 | class SparkUdfs: 24 | @staticmethod 25 | def all() -> Dict[str, Callable]: 26 | return _all_udfs(SparkUdfs) 27 | 28 | @staticmethod 29 | def remove_all_whitespaces(value: Optional[str]) -> Optional[str]: 30 | return "".join(value.split()) if value is not None else None 31 | 32 | @staticmethod 33 | def trim_all(value: Optional[str]) -> Optional[str]: 34 | return value.strip() if value is not None else None 35 | 36 | 37 | class PgUdfs: 38 | @staticmethod 39 | def all() -> Dict[str, Callable[[], str]]: 40 | return _all_udfs(PgUdfs) 41 | 42 | @staticmethod 43 | def trim_all(): 44 | return """ 45 | create or replace function trim_all(value text) returns text 46 | as $$ select regexp_replace(regexp_replace($1, E'^[\\\\a\\\\b\\\\e\\\\f\\\\n\\\\r\\\\t\\\\v\\\\0 ]+', ''), E'[\\\\a\\\\b\\\\e\\\\f\\\\n\\\\r\\\\t\\\\v\\\\0 ]+$', '') $$ 47 | LANGUAGE SQL 48 | IMMUTABLE 49 | RETURNS NULL ON NULL INPUT 50 | """ # noqa : B950 51 | 52 | @staticmethod 53 | def split(): 54 | return """ 55 | create or replace function split(value text, sep text) returns text[] 56 | as $$ select string_to_array($1, $2) $$ 57 | LANGUAGE SQL 58 | IMMUTABLE 59 | RETURNS NULL ON NULL INPUT 60 | """ 61 | 62 | 
@staticmethod 63 | def from_unixtime(): 64 | return """ 65 | create or replace function from_unixtime(value float) returns timestamp 66 | as $$ select to_timestamp($1) $$ 67 | LANGUAGE SQL 68 | IMMUTABLE 69 | RETURNS NULL ON NULL INPUT 70 | """ 71 | 72 | @staticmethod 73 | def date_format(): 74 | return """ 75 | create or replace function date_format(value timestamp, format text) returns text 76 | as $$ select to_char($1, $2) $$ 77 | LANGUAGE SQL 78 | IMMUTABLE 79 | RETURNS NULL ON NULL INPUT 80 | """ 81 | 82 | @staticmethod 83 | def get_json_object(): 84 | return """ 85 | create or replace function get_json_object(value text, path text) returns text 86 | as $$ select $1::json#>(string_to_array($2, '.'))[2:] $$ 87 | LANGUAGE SQL 88 | IMMUTABLE 89 | RETURNS NULL ON NULL INPUT 90 | """ 91 | 92 | @staticmethod 93 | def sha1(): 94 | return """ 95 | CREATE EXTENSION IF NOT EXISTS pgcrypto with schema public; 96 | create or replace function sha1(value text) returns text 97 | as $$ select encode(public.digest($1::bytea, cast('sha1' as text)), 'hex') $$ 98 | LANGUAGE SQL 99 | IMMUTABLE 100 | RETURNS NULL ON NULL INPUT 101 | """.split( 102 | ";" 103 | ) 104 | 105 | 106 | class ChUdfs: 107 | """ 108 | https://clickhouse.com/docs/en/sql-reference/statements/create/function 109 | CREATE FUNCTION name AS (parameter0, ...) -> expression 110 | """ 111 | 112 | @staticmethod 113 | def all() -> Dict[str, Callable[[], str]]: 114 | return _all_udfs(ChUdfs) 115 | 116 | @staticmethod 117 | def translate(): 118 | return """ 119 | CREATE FUNCTION IF NOT EXISTS translate AS (input, from, to) -> replaceAll(input, from, to) 120 | """ 121 | -------------------------------------------------------------------------------- /easy_sql/udf/udfs_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from easy_sql.base_test import LocalSpark 4 | from easy_sql.sql_processor import SqlProcessor 5 | 6 | 7 | class FunctionsTest(unittest.TestCase): 8 | def test_remove_all_whitespaces(self): 9 | spark = LocalSpark.get() 10 | SqlProcessor(spark, "") 11 | 12 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces(' a b ')").collect()[0][0]) 13 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces(' \ta\t b\t ')").collect()[0][0]) 14 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces(' \na\n b\n ')").collect()[0][0]) 15 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces(' \fa\f b\f ')").collect()[0][0]) 16 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces(' \ra\r b\r ')").collect()[0][0]) 17 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces(' \va\v b\v ')").collect()[0][0]) 18 | 19 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces('  a  b  ')").collect()[0][0]) # \u00A0 20 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces('  a  b  ')").collect()[0][0]) # \u2007 21 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces('  a  b  ')").collect()[0][0]) # 202F 22 | self.assertEqual(None, spark.sql("select remove_all_whitespaces(NULL)").collect()[0][0]) 23 | self.assertEqual("", spark.sql("select remove_all_whitespaces('')").collect()[0][0]) 24 | self.assertEqual("", spark.sql("select remove_all_whitespaces(' ')").collect()[0][0]) 25 | 26 | def test_trim_all(self): 27 | spark = LocalSpark.get() 28 | SqlProcessor(spark, "") 29 | 30 | self.assertEqual("a b", spark.sql("select trim_all(' a b ')").collect()[0][0]) 31 | self.assertEqual("a b", 
spark.sql("select trim_all(' \ta b\t ')").collect()[0][0]) 32 | self.assertEqual("a b", spark.sql("select trim_all(' \na b\n ')").collect()[0][0]) 33 | self.assertEqual("a b", spark.sql("select trim_all(' \fa b\f ')").collect()[0][0]) 34 | self.assertEqual("a b", spark.sql("select trim_all(' \ra b\r ')").collect()[0][0]) 35 | self.assertEqual("a b", spark.sql("select trim_all(' \va b\v ')").collect()[0][0]) 36 | 37 | self.assertEqual("a b", spark.sql("select trim_all('  a b  ')").collect()[0][0]) # \u00A0 38 | self.assertEqual("a b", spark.sql("select trim_all('  a b  ')").collect()[0][0]) # \u2007 39 | self.assertEqual("a b", spark.sql("select trim_all('  a b  ')").collect()[0][0]) # 202F 40 | self.assertEqual(None, spark.sql("select trim_all(NULL)").collect()[0][0]) 41 | self.assertEqual("", spark.sql("select trim_all('')").collect()[0][0]) 42 | self.assertEqual("", spark.sql("select trim_all(' ')").collect()[0][0]) 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /easy_sql/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/easy_sql/utils/__init__.py -------------------------------------------------------------------------------- /easy_sql/utils/db_connection_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, Dict, Optional 4 | 5 | from easy_sql.utils.kv import KV 6 | 7 | if TYPE_CHECKING: 8 | from sqlalchemy.engine.base import Connection, Engine 9 | 10 | from easy_sql.sql_processor.backend.flink import FlinkBackend 11 | 12 | 13 | def _create_sqlalchemy_conn(flink_connector_config: Dict[str, str]) -> Optional[Connection]: 14 | base_url = flink_connector_config["url"] 15 | username = flink_connector_config["username"] 16 | password = flink_connector_config["password"] 17 | split_expr = "://" 18 | split_expr_index = base_url.index(split_expr) 19 | db_type = base_url[len("jdbc:") : split_expr_index] 20 | sqlalchemy_db_url = f"{db_type}{split_expr}{username}:{password}@{KV.from_config(base_url, split_expr).v}" 21 | if sqlalchemy_db_url: 22 | from sqlalchemy import create_engine 23 | 24 | engine: Engine = create_engine(sqlalchemy_db_url, isolation_level="AUTOCOMMIT", pool_size=1) 25 | conn: Connection = engine.connect() 26 | return conn 27 | 28 | 29 | def get_connector_raw_conn_for_flink_backend(backend: FlinkBackend, connector_name: str) -> Optional[Connection]: 30 | connector_options = backend.flink_tables_config.get_connector_options(connector_name) 31 | return _create_sqlalchemy_conn(connector_options) 32 | -------------------------------------------------------------------------------- /easy_sql/utils/flink_test_cluster.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | 5 | from easy_sql.logger import logger 6 | 7 | 8 | def _check_call(command: str) -> bool: 9 | logger.info(f"will exec command: {command}") 10 | try: 11 | return subprocess.check_call(["bash", "-c", command]) == 0 12 | except subprocess.CalledProcessError: 13 | return False 14 | 15 | 16 | def _check_call_for_script(script_file: str) -> bool: 17 | logger.info(f"will exec script: {script_file}") 18 | try: 19 | return subprocess.check_call(["bash", script_file]) == 0 20 | except 
subprocess.CalledProcessError: 21 | return False 22 | 23 | 24 | class FlinkTestClusterManager: 25 | def __init__(self, op_wait_secs: float = 3): 26 | import pyflink 27 | 28 | if not _check_call("type curl") or not _check_call("type grep"): 29 | raise Exception( 30 | "Can not find curl or grep. This module only works in a unix environment with curl and grep installed." 31 | ) 32 | self.flink_home = os.path.dirname(pyflink.__file__) 33 | self.wait_secs = op_wait_secs 34 | 35 | def is_started(self): 36 | return _check_call("curl -s localhost:8081 | grep 'Apache Flink Web Dashboard'") 37 | 38 | def is_not_started(self): 39 | return _check_call("curl localhost:8081 2>&1 | grep 'Connection refused'") 40 | 41 | def start_cluster(self): 42 | success = _check_call_for_script(os.path.join(self.flink_home, "bin/start-cluster.sh")) 43 | if success: 44 | logger.info(f"Wait {self.wait_secs} for flink to be fully started.") 45 | time.sleep(self.wait_secs) 46 | else: 47 | raise Exception("Start flink cluster failed, please check the output.") 48 | 49 | def stop_cluster(self): 50 | success = _check_call_for_script(os.path.join(self.flink_home, "bin/stop-cluster.sh")) 51 | if success: 52 | logger.info(f"Wait {self.wait_secs} for flink to be fully stopped.") 53 | time.sleep(self.wait_secs) 54 | else: 55 | raise Exception("Stop flink cluster failed, please check the output.") 56 | -------------------------------------------------------------------------------- /easy_sql/utils/flink_test_cluster_itest.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from .flink_test_cluster import FlinkTestClusterManager 4 | 5 | 6 | class FlinkTestClusterManagerTest(unittest.TestCase): 7 | def test_cluster_manager(self): 8 | fm = FlinkTestClusterManager(10) 9 | if fm.is_started(): 10 | fm.stop_cluster() 11 | fm.start_cluster() 12 | self.assertTrue(fm.is_started()) 13 | self.assertFalse(fm.is_not_started()) 14 | fm.stop_cluster() 15 | self.assertFalse(fm.is_started()) 16 | self.assertTrue(fm.is_not_started()) 17 | -------------------------------------------------------------------------------- /easy_sql/utils/io_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from os import path 4 | 5 | from easy_sql.logger import logger 6 | 7 | 8 | def resolve_file(file_path: str, abs_path: bool = False, prefix: str = "", relative_to: str = "") -> str: 9 | if file_path.lower().startswith("hdfs://") or file_path.lower().startswith("file://"): 10 | # do not resolve if it is hdfs or absolute file path 11 | return file_path 12 | base_path = os.path.abspath(os.curdir) 13 | if not path.exists(file_path): 14 | if path.exists(path.join(base_path, file_path)): 15 | file_path = path.join(base_path, file_path) 16 | elif path.exists(path.basename(file_path)): 17 | file_path = path.basename(file_path) 18 | elif relative_to and path.isfile(relative_to) and path.exists(path.join(path.dirname(relative_to), file_path)): 19 | file_path = path.join(path.dirname(relative_to), file_path) 20 | elif relative_to and path.isdir(relative_to) and path.exists(path.join(relative_to, file_path)): 21 | file_path = path.join(relative_to, file_path) 22 | else: 23 | raise Exception(f"file not found: {file_path}, curdir: {base_path}") 24 | if abs_path: 25 | file_path = path.abspath(file_path) 26 | if " " in file_path: 27 | parts = file_path.split("/") 28 | file_path_no_space = "/".join([re.sub(r" .*$", "", part) for part in parts]) 29 | 
logger.warn( 30 | "Remove space inside file path, since spark will raise issue with space in path. " 31 | "We must ensure there is a soft link to the path with space removed to the end. " 32 | f'Will resolve file path from "{file_path}" to "{file_path}".' 33 | ) 34 | file_path = file_path_no_space 35 | return prefix + file_path 36 | 37 | 38 | def resolve_files(files_path: str, abs_path: bool = False, relative_to: str = "") -> str: 39 | return ",".join( 40 | [resolve_file(f.strip(), abs_path, relative_to=relative_to) for f in files_path.split(",") if f.strip()] 41 | ) 42 | 43 | 44 | def read_sql(sql_file: str): 45 | with open(resolve_file(sql_file)) as f: 46 | return f.read() 47 | -------------------------------------------------------------------------------- /easy_sql/utils/kv.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Callable, Dict, Optional, Tuple 4 | 5 | 6 | def get_key_by_splitter_and_strip(source: str, splitter: Optional[str] = "=", strip_chars: Optional[str] = None): 7 | source = source.strip() 8 | splitter = splitter or "=" 9 | assert splitter in source, f"splitter {splitter} not found in source {source}" 10 | return source[: source.index(splitter)].strip(strip_chars) 11 | 12 | 13 | def get_value_by_splitter_and_strip(source: str, splitter: Optional[str] = "=", strip_chars: Optional[str] = None): 14 | source = source.strip() 15 | splitter = splitter or "=" 16 | assert splitter in source, f"splitter {splitter} not found in source {source}" 17 | return source[source.index(splitter) + len(splitter) :].strip(strip_chars) 18 | 19 | 20 | class KV: 21 | def __init__(self, k: str, v: str) -> None: 22 | self.k, self.v = k, v 23 | 24 | @staticmethod 25 | def from_config(config_line: str, splitter: Optional[str] = "=", strip_chars: Optional[str] = None) -> KV: 26 | return KV( 27 | get_key_by_splitter_and_strip(config_line, splitter, strip_chars), 28 | get_value_by_splitter_and_strip(config_line, splitter, strip_chars), 29 | ) 30 | 31 | def as_tuple( 32 | self, k_convert: Optional[Callable[[str], Any]] = None, v_convert: Optional[Callable[[str], Any]] = None 33 | ) -> Tuple[Any, Any]: 34 | return (k_convert(self.k) if k_convert else self.k, v_convert(self.v) if v_convert else self.v) 35 | 36 | def as_dict(self) -> Dict[str, str]: 37 | return {self.k: self.v} 38 | -------------------------------------------------------------------------------- /easy_sql/utils/object_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | 4 | def get_attr(obj: Dict, path: str): 5 | data_current = obj 6 | if not path: 7 | return data_current 8 | for attr_current in path.split("."): 9 | assert attr_current != "", f"Neither part of path should be empty: path=`{path}`, current_part=`{attr_current}`" 10 | if attr_current not in data_current: 11 | data_current[attr_current] = {} 12 | data_current = data_current[attr_current] 13 | return data_current 14 | -------------------------------------------------------------------------------- /easy_sql/utils/object_utils_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from easy_sql.utils.object_utils import get_attr 4 | 5 | 6 | class ObjectUtilsTest(unittest.TestCase): 7 | def test_get_attr(self): 8 | self.assertEqual(get_attr({}, "a.b.c"), {}) 9 | self.assertEqual(get_attr({"a": {}}, "a.b.c"), {}) 10 | 
self.assertEqual(get_attr({"a": {"b": {"c": [1, 2, 3]}}}, "a.b.c"), [1, 2, 3]) 11 | 12 | self.assertEqual(get_attr({}, "a"), {}) 13 | self.assertEqual(get_attr({"a": ""}, "a"), "") 14 | 15 | self.assertEqual(get_attr({"a": 1}, ""), {"a": 1}) 16 | 17 | 18 | if __name__ == "__main__": 19 | unittest.main() 20 | -------------------------------------------------------------------------------- /examples/rtdw/.gitignore: -------------------------------------------------------------------------------- 1 | lib/ 2 | workflow/**/jars 3 | *.jar 4 | *.log 5 | -------------------------------------------------------------------------------- /examples/rtdw/Makefile: -------------------------------------------------------------------------------- 1 | download-flink-jars: 2 | test -f lib/flink/jars/flink-connector-jdbc-1.15.1.jar || wget -P lib/flink/jars https://repo1.maven.org/maven2/org/apache/flink/flink-connector-jdbc/1.15.1/flink-connector-jdbc-1.15.1.jar 3 | test -f lib/flink/jars/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar || wget -P lib/flink/jars https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-hive-3.1.2_2.12/1.15.1/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar 4 | test -f lib/flink/jars/postgresql-42.2.14.jar || wget -P lib/flink/jars https://repo1.maven.org/maven2/org/postgresql/postgresql/42.2.14/postgresql-42.2.14.jar 5 | test -f lib/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar || wget -P lib/flink/jars https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-postgres-cdc/2.3.0/flink-sql-connector-postgres-cdc-2.3.0.jar 6 | test -f lib/flink/jars/hudi-flink1.15-bundle-0.12.2.jar || wget -P lib/flink/jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-flink1.15-bundle/0.12.2/hudi-flink1.15-bundle-0.12.2.jar 7 | -------------------------------------------------------------------------------- /examples/rtdw/java/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | out/ 3 | .gradle/ 4 | src/**/generated/ 5 | src/test/.local/ 6 | bin/ 7 | -------------------------------------------------------------------------------- /examples/rtdw/java/README: -------------------------------------------------------------------------------- 1 | A Flink application project using Java and Gradle. 2 | 3 | To package your job for submission to Flink, use: 'gradle shadowJar'. Afterwards, you'll find the 4 | jar to use in the 'build/libs' folder. 5 | 6 | To run and test your application with an embedded instance of Flink use: 'gradle run' 7 | -------------------------------------------------------------------------------- /examples/rtdw/java/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.6-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /examples/rtdw/java/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 
6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /examples/rtdw/java/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'quickstart' 2 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/java/com/easysql/example/Example.java: -------------------------------------------------------------------------------- 1 | package com.easysql.example; 2 | 3 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 4 | import org.apache.flink.streaming.api.datastream.DataStream; 5 | import org.apache.flink.api.common.functions.FilterFunction; 6 | 7 | public class Example { 8 | 9 | public static void test(StreamExecutionEnvironment env) throws Exception { 10 | DataStream flintstones = env.fromElements( 11 | new Person("Fred", 35), 12 | new Person("Wilma", 35), 13 | new Person("Pebbles", 2)); 14 | 15 | DataStream adults = flintstones.filter(new FilterFunction() { 16 | @Override 17 | public boolean filter(Person person) throws Exception { 18 | return person.age >= 18; 19 | } 20 | }); 21 | 22 | adults.print(); 23 | 24 | env.execute(); 25 | } 26 | 27 | public static class Person { 28 | public String name; 29 | public Integer age; 30 | public Person() {} 31 | 32 | public Person(String name, Integer age) { 33 | this.name = name; 34 | this.age = age; 35 | } 36 | 37 | public String toString() { 38 | return this.name.toString() + ": age " + this.age.toString(); 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/java/com/easysql/example/Ingest.java: -------------------------------------------------------------------------------- 1 | package com.easysql.example; 2 | 3 | import lombok.Data; 4 | import lombok.extern.slf4j.Slf4j; 5 | import lombok.val; 6 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper; 7 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 8 | import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; 9 | 10 | import java.util.Arrays; 11 | import java.util.HashMap; 12 | import java.util.List; 13 | 14 | @Slf4j 15 | public class Ingest { 16 | 17 | private static ObjectMapper objectMapper = new ObjectMapper(); 18 | 19 | public static void ingest(StreamExecutionEnvironment env, StreamTableEnvironment tEnv, String jsonOpts) throws Exception { 20 | val ic = objectMapper.readValue(jsonOpts, IngestConfig.class); 21 | env.getCheckpointConfig().setCheckpointInterval(10000); 22 | val cdcSource = Sources.createPgCDCSource(ic.source_connector.options, ic.schema_list, ic.table_list); 23 | final SplitTableFunction splitTableFunction = new SplitTableFunction(Arrays.asList("inventory.user,inventory.product,inventory.user_order".split(","))); 24 | val ds = env.addSource(cdcSource) 25 | .process(splitTableFunction); 26 | splitTableFunction.getOutputTags().values().forEach(tag -> { 27 | tEnv.fromDataStream(ds.getSideOutput(tag).map(data -> data).setParallelism(1)); 28 | val tmpTableName = "_tmp__" + tag.getId().replace(".", "__"); 29 | tEnv.createTemporaryView(tmpTableName, ds); 30 | }); 31 | env.execute(); 32 | } 33 | 34 | @Data 35 | public static class IngestConfig { 36 | 37 | private List> catalogs; 38 | private List databases; 39 | private 
List schema_list; 40 | private List table_list; 41 | private Connector source_connector; 42 | 43 | public Database db(String dbName) { 44 | return databases.stream().filter(db -> dbName.equals(db.getName())).findFirst().orElse(null); 45 | } 46 | 47 | public HashMap connector(String dbName, String connectorName) { 48 | val db = db(dbName); 49 | val conn = db.connector(connectorName); 50 | return conn.getOptions(); 51 | } 52 | 53 | @Data 54 | public static class Database { 55 | 56 | private String name; 57 | private List connectors; 58 | private List tables; 59 | 60 | public Connector connector(String name) { 61 | return connectors.stream().filter(prop -> name.equals(prop.getName())).findFirst().orElse(null); 62 | } 63 | 64 | public Table table(String name) { 65 | return tables.stream().filter(prop -> name.equals(prop.getName())).findFirst().orElse(null); 66 | } 67 | 68 | } 69 | 70 | @Data 71 | public static class Connector { 72 | 73 | private String name; 74 | private HashMap options; 75 | } 76 | 77 | @Data 78 | public static class Table { 79 | 80 | private String name; 81 | private Connector connector; 82 | private List schema; 83 | 84 | public HashMap fullOptions(Database db) { 85 | val dbConn = db.connector(connector.name); 86 | val result = new HashMap<>(dbConn.getOptions()); 87 | if (connector.getOptions() != null) { 88 | result.putAll(connector.getOptions()); 89 | if (connector.getOptions().containsKey("path")) { 90 | result.put("path", String.format("%s/%s.db/%s", result.get("path"), db.name, name)); 91 | } 92 | } 93 | return result; 94 | } 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/java/com/easysql/example/Sinks.java: -------------------------------------------------------------------------------- 1 | package com.easysql.example; 2 | 3 | import com.easysql.example.Ingest.IngestConfig; 4 | import com.easysql.example.Ingest.IngestConfig.Database; 5 | import com.easysql.example.Ingest.IngestConfig.Table; 6 | import lombok.val; 7 | import org.apache.flink.table.data.RowData; 8 | import org.apache.hudi.common.model.HoodieTableType; 9 | import org.apache.hudi.configuration.FlinkOptions; 10 | import org.apache.hudi.util.HoodiePipeline; 11 | 12 | import java.util.HashMap; 13 | import java.util.Map; 14 | 15 | public class Sinks { 16 | public static void createHudiSink(String db, String table, IngestConfig config) { 17 | final Database dbObj = config.db(db); 18 | final Table tableObj = dbObj.table(table); 19 | val tableOpts = tableObj.fullOptions(dbObj); 20 | val tableSchema = tableObj.getSchema(); 21 | 22 | HoodiePipeline.Builder builder = HoodiePipeline.builder(table) 23 | .column("uuid VARCHAR(20)") 24 | .column("name VARCHAR(10)") 25 | .column("age INT") 26 | .column("ts TIMESTAMP(3)") 27 | .column("_di VARCHAR(20)") 28 | .partition("_di") 29 | .options(tableOpts); 30 | 31 | // builder.sink(dataStream, false); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/java/com/easysql/example/SplitTableFunction.java: -------------------------------------------------------------------------------- 1 | package com.easysql.example; 2 | 3 | import com.easysql.example.RowDataDebeziumDeserializationSchema.GenericRowDataWithSchema; 4 | import lombok.extern.slf4j.Slf4j; 5 | import lombok.val; 6 | import org.apache.flink.api.java.tuple.Tuple2; 7 | import org.apache.flink.streaming.api.functions.ProcessFunction; 8 | import 
org.apache.flink.util.Collector; 9 | import org.apache.flink.util.OutputTag; 10 | 11 | import java.util.List; 12 | import java.util.Map; 13 | import java.util.function.Function; 14 | import java.util.stream.Collectors; 15 | 16 | @Slf4j 17 | public class SplitTableFunction extends ProcessFunction { 18 | 19 | private List tables; 20 | private transient Map> outputTags = null; 21 | 22 | public SplitTableFunction(List tables) { this.tables = tables; } 23 | 24 | public Map> getOutputTags() { 25 | if (outputTags == null) { 26 | val dbTables = tables.stream() 27 | .map(table -> new Tuple2<>(table.substring(0, table.indexOf(".")), table.substring(table.indexOf(".") + 1))) 28 | .collect(Collectors.toList()); 29 | outputTags = dbTables.stream() 30 | .map(table -> new OutputTag(table.f0 + "." + table.f1) { }) 31 | .collect(Collectors.toMap(OutputTag::getId, Function.identity())); 32 | } 33 | return outputTags; 34 | } 35 | 36 | @Override 37 | public void processElement(GenericRowDataWithSchema rowData, ProcessFunction.Context ctx, Collector out) throws Exception { 38 | val table = rowData.getTable(); 39 | val tags = this.getOutputTags(); 40 | if (tags.containsKey(table)) { 41 | val tag = tags.get(table); 42 | ctx.output(tag, rowData); 43 | } else { 44 | log.debug("Ignore message for table {} since it it not configured to process.", table); 45 | } 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/java/org/myorg/quickstart/DataStreamJob.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package org.myorg.quickstart; 20 | 21 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 22 | 23 | /** 24 | * Skeleton for a Flink DataStream Job. 25 | * 26 | *
<p>For a tutorial how to write a Flink application, check the 27 | * tutorials and examples on the Flink Website. 28 | * 29 | *
<p>To package your application into a JAR file for execution, run 30 | * 'mvn clean package' on the command line. 31 | * 32 | *
<p>
If you change the name of the main class (with the public static void main(String[] args)) 33 | * method, change the respective entry in the POM.xml file (simply search for 'mainClass'). 34 | */ 35 | public class DataStreamJob { 36 | 37 | public static void main(String[] args) throws Exception { 38 | // Sets up the execution environment, which is the main entry point 39 | // to building Flink applications. 40 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 41 | 42 | /* 43 | * Here, you can start creating your execution plan for Flink. 44 | * 45 | * Start with getting some data from the environment, like 46 | * env.fromSequence(1, 10); 47 | * 48 | * then, transform the resulting DataStream using operations 49 | * like 50 | * .filter() 51 | * .flatMap() 52 | * .window() 53 | * .process() 54 | * 55 | * and many more. 56 | * Have a look at the programming guide: 57 | * 58 | * https://nightlies.apache.org/flink/flink-docs-stable/ 59 | * 60 | */ 61 | 62 | // Execute program, beginning computation. 63 | env.execute("Flink Java API Skeleton"); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | ################################################################################ 18 | 19 | rootLogger.level = INFO 20 | rootLogger.appenderRef.console.ref = ConsoleAppender 21 | 22 | appender.console.name = ConsoleAppender 23 | appender.console.type = CONSOLE 24 | appender.console.layout.type = PatternLayout 25 | appender.console.layout.pattern = %d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n 26 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/scala/com/easysql/example/ingest.scala: -------------------------------------------------------------------------------- 1 | package com.easysql.example 2 | 3 | import com.ververica.cdc.connectors.postgres.PostgreSQLSource 4 | import com.ververica.cdc.debezium.{DebeziumSourceFunction, JsonDebeziumDeserializationSchema} 5 | import org.apache.flink.api.common.serialization.SimpleStringSchema 6 | import org.apache.flink.connector.base.DeliveryGuarantee 7 | import org.apache.flink.connector.kafka.sink.{KafkaRecordSerializationSchema, KafkaSink} 8 | import org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend 9 | import org.apache.flink.streaming.api.CheckpointingMode 10 | import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup 11 | import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _} 12 | 13 | import java.util.Properties 14 | 15 | object PostgresCDC { 16 | def createCDCSource(): DebeziumSourceFunction[String]={ 17 | val prop = new Properties() 18 | prop.setProperty("decimal.handling.mode","string") 19 | PostgreSQLSource.builder[String] 20 | .hostname("testpg") 21 | .port(15432) 22 | .username("postgres") 23 | .password("123456") 24 | .database("postgres") 25 | .schemaList("inventory") 26 | .slotName("pg_cdc") 27 | .decodingPluginName("pgoutput") 28 | .debeziumProperties(prop) 29 | .deserializer(new JsonDebeziumDeserializationSchema) 30 | .build 31 | } 32 | 33 | def createKafkaSink(): KafkaSink[String] ={ 34 | val sinkTopic = "pgcdc" 35 | KafkaSink.builder[String].setBootstrapServers("localhost:9092") 36 | .setRecordSerializer(KafkaRecordSerializationSchema.builder() 37 | .setTopic(sinkTopic) 38 | .setValueSerializationSchema(new SimpleStringSchema()) 39 | .build()) 40 | .setDeliverGuarantee(DeliveryGuarantee.EXACTLY_ONCE) 41 | .setTransactionalIdPrefix("pgcdc-transaction-id") 42 | .setKafkaProducerConfig(Map("transaction.timeout.ms"-> "300000")) 43 | .build 44 | } 45 | 46 | implicit def map2Properties(map: Map[String, String]): java.util.Properties = { 47 | map.foldLeft(new java.util.Properties){ case (props, (k, v)) => props.put(k, v); props } 48 | } 49 | 50 | def main(args: Array[String]): Unit = { 51 | val env = StreamExecutionEnvironment.getExecutionEnvironment 52 | env.enableCheckpointing(10 * 1000) 53 | env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE) 54 | env.getCheckpointConfig.setMinPauseBetweenCheckpoints(500) 55 | env.getCheckpointConfig.setCheckpointTimeout(60000) 56 | env.getCheckpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) 57 | val rocksBackend = new EmbeddedRocksDBStateBackend() 58 | rocksBackend.setDbStoragePath("/tmp/cdc-flink-states") 59 | env.setStateBackend(rocksBackend) 60 | 61 | env.addSource(createCDCSource()).name("postgres cdc source") 62 | .map(data => { 63 | data 64 | }) 65 | .setParallelism(1) 66 | // .print() 67 | .sinkTo(createKafkaSink()).name("cdc sink kafka") 68 | 69 | env.execute("Postgres CDC") 70 | } 71 | 
72 | } 73 | -------------------------------------------------------------------------------- /examples/rtdw/readme.md: -------------------------------------------------------------------------------- 1 | ## Target 2 | 3 | - ingest data: support both snapshot and incremental data 4 | - etl development: join big tables with updates 5 | 6 | ## TODO 7 | - [√] setup a scenario 8 | - kafka to help ingest data 9 | + if use flink cdc directly, it will use multiple connections and add pressure to db 10 | + consider ingest snapshot first and then data change log 11 | 12 | ## V1 13 | 14 | **Solution:** 15 | 16 | pg -----------> kafka --------------> dw 17 | flink-cdc spark-streaming 18 | 19 | 20 | **Prepare environment:** 21 | 22 | ```bash 23 | # execute the commands below in separate terminals 24 | 25 | # prepare data in pg: run workflow/sales/ods/data.sql in pg manually 26 | # start kafka 27 | bin/zookeeper-server-start.sh config/zookeeper.properties 28 | bin/kafka-server-start.sh config/server.properties 29 | bin/kafka-console-consumer.sh --topic pgcdc --from-beginning --bootstrap-server localhost:9092 30 | # start flink cdc to kafka (in sales/ods) 31 | java -cp '~/dev/sdks/scala-2.12.10/lib/scala-library.jar:easysql-example-ods.jar:/usr/local/lib/python3.8/site-packages/pyflink/lib/*:jars/*' com.easysql.example.PostgresCDC 32 | # start spark streaming app to ingest data to hudi (in workflow) 33 | bash -c "$(python3 -m easy_sql.data_process -f sales/ods/ingest_hudi.sql -p)" 2>&1 | tee ingest_hudi.log 34 | # query the ingested data to hudi (in workflow) 35 | bash -c "$(python3 -m easy_sql.data_process -f sales/ods/ingest_hudi.test.sql -p)" 36 | ``` 37 | 38 | **Test** 39 | 40 | Emit queries in postgres. 41 | 42 | Cases to test: 43 | - add data: `insert into inventory.product(pid,pname,pprice) values ('6','prodcut-006',225.31);` 44 | - change data: `update inventory.product set pname='p6' where pid=6;` 45 | - delete data: `delete from inventory.product where pid=6;` 46 | - add column with default value: `alter table inventory.product add column ex int default 0;` 47 | - delete column: `alter table inventory.product delete column ex;` 48 | - rename column: `alter table inventory.product rename column ex to ex1;` 49 | - change column type: `alter table inventory.product change column ex ex1 int;` 50 | -------------------------------------------------------------------------------- /examples/rtdw/scala/.gitignore: -------------------------------------------------------------------------------- 1 | classes/ 2 | ref/ 3 | -------------------------------------------------------------------------------- /examples/rtdw/scala/Makefile: -------------------------------------------------------------------------------- 1 | SCALA_BIN=~/dev/sdks/scala-2.12.10/bin 2 | FLINK_JAR_PATH=/usr/local/lib/python3.8/site-packages/pyflink/lib/* 3 | SCALA_CP="${FLINK_JAR_PATH}:../lib/flink/jars/*" 4 | 5 | 6 | ods-jar: 7 | - rm -r classes 8 | mkdir -pv classes 9 | ${SCALA_BIN}/scalac -nobootcp -cp ${SCALA_CP} -d classes src/com/easysql/example/*.scala 10 | cd classes && jar -cvfe ../easysql-example-ods.jar com.easysql.example.PostgresCDC com 11 | -------------------------------------------------------------------------------- /examples/rtdw/scala/src/com/easysql/example/ingest.scala: -------------------------------------------------------------------------------- 1 | package com.easysql.example 2 | 3 | import com.ververica.cdc.connectors.postgres.PostgreSQLSource 4 | import com.ververica.cdc.debezium.{DebeziumSourceFunction, 
JsonDebeziumDeserializationSchema} 5 | import org.apache.flink.api.common.serialization.SimpleStringSchema 6 | import org.apache.flink.connector.base.DeliveryGuarantee 7 | import org.apache.flink.connector.kafka.sink.{KafkaRecordSerializationSchema, KafkaSink} 8 | import org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend 9 | import org.apache.flink.streaming.api.CheckpointingMode 10 | import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup 11 | import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _} 12 | 13 | import java.util.Properties 14 | 15 | object PostgresCDC { 16 | def createCDCSource(): DebeziumSourceFunction[String]={ 17 | val prop = new Properties() 18 | prop.setProperty("decimal.handling.mode","string") 19 | PostgreSQLSource.builder[String] 20 | .hostname("testpg") 21 | .port(15432) 22 | .username("postgres") 23 | .password("123456") 24 | .database("postgres") 25 | .schemaList("inventory") 26 | .slotName("test_pg_cdc") 27 | .decodingPluginName("pgoutput") 28 | .debeziumProperties(prop) 29 | .deserializer(new JsonDebeziumDeserializationSchema) 30 | .build 31 | } 32 | 33 | def createKafkaSink(): KafkaSink[String] ={ 34 | val sinkTopic = "pgcdc" 35 | KafkaSink.builder[String].setBootstrapServers("localhost:9092") 36 | .setRecordSerializer(KafkaRecordSerializationSchema.builder() 37 | .setTopic(sinkTopic) 38 | .setValueSerializationSchema(new SimpleStringSchema()) 39 | .build()) 40 | .setDeliverGuarantee(DeliveryGuarantee.EXACTLY_ONCE) 41 | .setTransactionalIdPrefix("pgcdc-transaction-id") 42 | .setKafkaProducerConfig(Map("transaction.timeout.ms"-> "300000")) 43 | .build 44 | } 45 | 46 | implicit def map2Properties(map: Map[String, String]): java.util.Properties = { 47 | map.foldLeft(new java.util.Properties){ case (props, (k, v)) => props.put(k, v); props } 48 | } 49 | 50 | def main(args: Array[String]): Unit = { 51 | val env = StreamExecutionEnvironment.getExecutionEnvironment 52 | env.enableCheckpointing(10 * 1000) 53 | env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE) 54 | env.getCheckpointConfig.setMinPauseBetweenCheckpoints(500) 55 | env.getCheckpointConfig.setCheckpointTimeout(60000) 56 | env.getCheckpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) 57 | val rocksBackend = new EmbeddedRocksDBStateBackend() 58 | rocksBackend.setDbStoragePath("/tmp/cdc-flink-states") 59 | env.setStateBackend(rocksBackend) 60 | 61 | env.addSource(createCDCSource()).name("postgres cdc source") 62 | .setParallelism(1) 63 | .sinkTo(createKafkaSink()).name("cdc sink kafka") 64 | 65 | env.execute("Postgres CDC") 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/Makefile: -------------------------------------------------------------------------------- 1 | cdc-to-kafka: 2 | java -cp '~/dev/sdks/scala-2.12.10/lib/scala-library.jar:easysql-example-ods.jar:/usr/local/lib/python3.8/site-packages/pyflink/lib/*:jars/*' com.easysql.example.PostgresCDC 3 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/data.sql: -------------------------------------------------------------------------------- 1 | drop schema if exists inventory; 2 | create schema inventory; 3 | set search_path=inventory; 4 | 5 | CREATE OR REPLACE FUNCTION update_modify_time_column() 6 | RETURNS TRIGGER AS $$ 7 | BEGIN 8 | NEW.modify_time = 
now(); 9 | RETURN NEW; 10 | END; 11 | $$ language 'plpgsql'; 12 | 13 | -- create user table 14 | drop table if exists inventory.user; 15 | create table if not exists inventory.user ( 16 | id serial not null, 17 | name varchar(155) null, 18 | device_model varchar(155) null, 19 | email varchar(50) null, 20 | phone varchar(50) null, 21 | create_time timestamp default CURRENT_TIMESTAMP not null, 22 | modify_time timestamp default CURRENT_TIMESTAMP not null, 23 | primary key (id) 24 | ); 25 | 26 | CREATE TRIGGER update_user_modify_time BEFORE UPDATE 27 | ON inventory.user FOR EACH ROW EXECUTE PROCEDURE 28 | update_modify_time_column(); 29 | 30 | -- insert data 31 | insert into inventory.user(name,device_model,email,phone) values 32 | ('customer-01','dm-01','abc01@email.com','188776xxxxx'), 33 | ('customer-02','dm-02','abc02@email.com','166776xxxxx'); 34 | 35 | -- create product table 36 | create table if not exists inventory.product 37 | ( 38 | pid serial not null, 39 | pname varchar(155) null, 40 | pprice decimal(10,2) , 41 | create_time timestamp default CURRENT_TIMESTAMP not null, 42 | modify_time timestamp default CURRENT_TIMESTAMP not null, 43 | primary key (pid) 44 | ); 45 | 46 | CREATE TRIGGER update_user_modify_time BEFORE UPDATE 47 | ON inventory.product FOR EACH ROW EXECUTE PROCEDURE 48 | update_modify_time_column(); 49 | 50 | -- insert data 51 | insert into inventory.product(pid,pname,pprice) values 52 | ('1','prodcut-001',125.12), 53 | ('2','prodcut-002',225.31); 54 | 55 | -- create order table 56 | drop table if exists inventory.user_order; 57 | create table if not exists inventory.user_order 58 | ( 59 | id serial, 60 | oid varchar(155) not null, 61 | uid int , 62 | pid int , 63 | onum int , 64 | create_time timestamp default CURRENT_TIMESTAMP not null, 65 | modify_time timestamp default CURRENT_TIMESTAMP not null, 66 | primary key (id) 67 | ); 68 | 69 | CREATE TRIGGER update_user_modify_time BEFORE UPDATE 70 | ON inventory.user_order FOR EACH ROW EXECUTE PROCEDURE 71 | update_modify_time_column(); 72 | 73 | -- insert data 74 | insert into user_order(oid,uid,pid,onum) values 75 | ('o10001',1,1,100), 76 | ('o10002',1,2,30), 77 | ('o10001',2,1,22), 78 | ('o10002',2,2,16); 79 | 80 | -- select data 81 | select * from user; 82 | select * from product; 83 | select * from user_order; 84 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/ingest.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | 3 | -- config: easy_sql.flink_tables_file_path=ods.flink_tables.json 4 | -- config: easy_sql.func_file_path=ingest_funcs.py 5 | -- config: easy_sql.etl_type=streaming 6 | 7 | -- config: flink.cmd=-pyexec python3 8 | -- config: flink.cmd=-pyclientexec python3 9 | -- config: flink.cmd=-t remote 10 | -- config: flink.execution.checkpointing.interval=3s 11 | -- config: flink.pipeline.jars=../lib/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;../lib/flink/jars/hudi-flink1.15-bundle-0.12.2.jar;sales/ods/easysql-example-ingest.jar;sales/ods/jars/flink-connector-jdbc-1.15.2.jar;sales/ods/jars/postgresql-42.2.14.jar 12 | -- config: flink.pipeline.name=sales.ingest 13 | 14 | -- target=variables 15 | select 16 | 'append' as __save_mode__ 17 | , 'inventory.user,inventory.product,inventory.user_order' as tables_ 18 | 19 | -- target=func.ingest_cdc_pg(${__backend__}, db_pg, connector_cdc, ${tables_}, sales) 20 | 
-------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/ingest.test.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | 3 | -- config: easy_sql.flink_tables_file_path=ods.flink_tables.json 4 | -- config: easy_sql.etl_type=batch 5 | 6 | -- config: flink.cmd=-pyexec python3 7 | -- config: flink.cmd=-pyclientexec python3 8 | -- config: flink.cmd=-t remote 9 | -- config: flink.execution.checkpointing.interval=3s 10 | -- config: flink.pipeline.jars=../lib/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;../lib/flink/jars/hudi-flink1.15-bundle-0.12.2.jar 11 | 12 | -- target=variables 13 | select DATE_FORMAT(now(), 'yyyy-MM-dd') as TODAY; 14 | 15 | -- target=func.exec_sql_in_source(${__step__}, db_pg, connector_jdbc) 16 | -- prepare data to ingest 17 | drop schema if exists ingest_test_sales cascade; 18 | create schema ingest_test_sales; 19 | create table ingest_test_sales.order (id int, product_id int, created_at timestamp, updated_at timestamp, primary key (id)); 20 | create table ingest_test_sales.product (id int, name text, category text, created_at timestamp, updated_at timestamp, primary key (id)); 21 | insert into ingest_test_sales.product values (1, 'p1', 'c1', '${TODAY} 00:00:00', '${TODAY} 00:00:00'), (2, 'p2', 'c2', '${TODAY} 00:00:01', '${TODAY} 00:00:01'); 22 | insert into ingest_test_sales.order values (1, 1, '${TODAY} 00:00:01', '${TODAY} 00:00:01'), (2, 1, '${TODAY} 00:00:01', '${TODAY} 00:00:01'), (3, 1, '${TODAY} 00:00:01', '${TODAY} 00:00:01'); 23 | insert into ingest_test_sales.order values (4, 2, '${TODAY} 00:00:01', '${TODAY} 00:00:01'), (5, 2, '${TODAY} 00:00:01', '${TODAY} 00:00:01'); 24 | 25 | -- target=func.test_run_etl(${__config__}, ingest.sql) 26 | 27 | -- target=func.sleep(10) 28 | 29 | 30 | -- target=check.ensure_product_data_ingested 31 | select 32 | 2 as expected 33 | , count(1) as actual 34 | from ods_rt_sales.ingest_test_sales_product 35 | 36 | -- target=check.ensure_order_data_ingested 37 | select 38 | 5 as expected 39 | , count(1) as actual 40 | from ods_rt_sales.ingest_test_sales_order 41 | 42 | 43 | -- target=func.exec_sql_in_source(${__step__}, db_pg, connector_jdbc) 44 | -- prepare data to ingest 45 | insert into ingest_test_sales.product values (3, 'p3', 'c3', '${TODAY} 00:00:00', '${TODAY} 00:00:00'); 46 | insert into ingest_test_sales.order values (6, 2, '${TODAY} 00:00:01', '${TODAY} 00:00:01'); 47 | 48 | -- target=func.sleep(5) 49 | 50 | 51 | -- target=check.ensure_product_data_ingested 52 | select 53 | 3 as expected 54 | , count(1) as actual 55 | from ods_rt_sales.ingest_test_sales_product 56 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/ingest_funcs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | from typing import TYPE_CHECKING, Dict 5 | 6 | from easy_sql.sql_processor.backend.base import SaveMode, TableMeta 7 | 8 | if TYPE_CHECKING: 9 | from easy_sql.sql_processor.backend import FlinkBackend 10 | 11 | 12 | __all__ = ["ingest_cdc_pg"] 13 | 14 | 15 | def ingest_cdc_pg(backend: FlinkBackend, db: str, connector: str, table_list: str, domain: str): 16 | db_config = backend.flink_tables_config.database(db) 17 | if not db_config: 18 | raise Exception("Db not configured: " + db) 19 | connector_config = backend.flink_tables_config.connector(db_config, 
connector_name=connector) 20 | if not connector_config: 21 | raise Exception(f"Connector {connector} not configured for db {db}") 22 | target_tables = {table: f'ods_{domain}.{domain}_{table.split(".")[1]}' for table in table_list.split(",")} 23 | table_with_fields_list = [ 24 | { 25 | "name": table, 26 | "schemaRefTableName": target_table, 27 | "fields": backend.flink_tables_config.table_fields(target_table, ["_di"]), 28 | } 29 | for table, target_table in target_tables.items() 30 | ] 31 | 32 | backend.register_tables() 33 | 34 | from py4j.java_gateway import java_import 35 | from pyflink.java_gateway import get_gateway 36 | 37 | gw = get_gateway() 38 | java_import(gw.jvm, "com.easysql.example.Sources") 39 | readPgCDC = eval( 40 | "gw.jvm.com.easysql.example.Sources.readPgCDC", 41 | { 42 | "gw": gw, 43 | }, 44 | ) 45 | _j_env = backend.flink_stream_env._j_stream_execution_environment # type: ignore 46 | result_tables: Dict[str, str] = readPgCDC( 47 | _j_env, backend.flink._j_tenv, connector_config["options"], json.dumps(table_with_fields_list) 48 | ) 49 | 50 | ingest_tables = { 51 | f'ods_{domain}.{domain}_{table.split(".")[1]}': read_temp_table 52 | for table, read_temp_table in result_tables.items() 53 | } 54 | for hudi_table, read_temp_table in ingest_tables.items(): 55 | backend.exec_native_sql_query(f"select * from {read_temp_table}").print_schema() 56 | table_with_partition = backend.exec_native_sql_query( 57 | f"select *, from_unixtime(_op_ts / 1000, 'yyyyMMdd') as _di from {read_temp_table}" 58 | ) 59 | backend.flink.create_temporary_view(read_temp_table, table_with_partition) 60 | assert "." not in read_temp_table 61 | backend.save_table(TableMeta(read_temp_table), TableMeta(hudi_table), SaveMode.append) 62 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/ingest_hudi.sql: -------------------------------------------------------------------------------- 1 | -- config: easy_sql.spark_submit=spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2 2 | -- config: easy_sql.func_file_path=ingest_hudi_funcs.py 3 | -- config: easy_sql.etl_type=streaming 4 | 5 | -- config: spark.serializer=org.apache.spark.serializer.KryoSerializer 6 | -- config: spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension 7 | -- config: spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog 8 | 9 | -- target=func.read_kafka(pgcdc) 10 | -- target=func.write_hudi(pgcdc) 11 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/ingest_hudi.test.sql: -------------------------------------------------------------------------------- 1 | -- config: easy_sql.spark_submit=spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2 2 | -- config: easy_sql.func_file_path=ingest_hudi_funcs.py 3 | -- config: easy_sql.etl_type=batch 4 | 5 | -- config: spark.serializer=org.apache.spark.serializer.KryoSerializer 6 | -- config: spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension 7 | -- config: spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog 8 | 9 | -- target=func.read_hudi(user) 10 | -- target=log.sample_user 11 | select * from user order by _dt; 12 | -------------------------------------------------------------------------------- 
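
The ingest_cdc_pg function above derives Hudi target table names from the source tables with a simple convention: schema.table in Postgres becomes ods_<domain>.<domain>_<table>. A standalone sketch of that mapping, using the table list from ingest.sql and mirroring the dict comprehension in ingest_funcs.py:

# Mirrors the target-table naming used in ingest_cdc_pg above.
table_list = "inventory.user,inventory.product,inventory.user_order"
domain = "sales"
target_tables = {t: f'ods_{domain}.{domain}_{t.split(".")[1]}' for t in table_list.split(",")}
print(target_tables)
# {'inventory.user': 'ods_sales.sales_user',
#  'inventory.product': 'ods_sales.sales_product',
#  'inventory.user_order': 'ods_sales.sales_user_order'}

These are exactly the table names declared under ods_sales in ods.flink_tables.json below.
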
/examples/rtdw/workflow/sales/ods/ods.flink_tables.json: -------------------------------------------------------------------------------- 1 | { 2 | "catalogs": [ 3 | { 4 | "name": "myhiveCatalog", 5 | "type": "hive", 6 | "hive-conf-dir": "test/flink/flink_hive_conf" 7 | }, 8 | { 9 | "name": "myhiveCatalog_1", 10 | "type": "hive", 11 | "hive-conf-dir": "test/flink/flink_hive_conf" 12 | }, 13 | { 14 | "name": "testpg", 15 | "type": "jdbc", 16 | "default-database": "postgres", 17 | "username": "postgres", 18 | "password": "123456", 19 | "base-url": "jdbc:postgresql://testpg:15432" 20 | } 21 | ], 22 | "databases": [ 23 | { 24 | "name": "db_pg", 25 | "connectors": [ 26 | { 27 | "name": "connector_jdbc", 28 | "options": { 29 | "connector": "jdbc", 30 | "url": "jdbc:postgresql://testpg:15432/postgres", 31 | "username": "postgres", 32 | "password": "123456" 33 | } 34 | }, 35 | { 36 | "name": "connector_cdc", 37 | "options": { 38 | "connector": "postgres-cdc", 39 | "hostname": "testpg", 40 | "port": "15432", 41 | "database-name": "postgres", 42 | "slot.name": "cdc_slot", 43 | "decoding.plugin.name": "pgoutput", 44 | "slot.drop.on.stop": "true", 45 | "schema-name": "sample", 46 | "username": "postgres", 47 | "password": "123456" 48 | } 49 | } 50 | ] 51 | }, 52 | { 53 | "name": "ods_sales", 54 | "connectors": [ 55 | { 56 | "name": "connector_hudi", 57 | "options": { 58 | "connector": "hudi", 59 | "path": "/tmp/hudi-flink-test/ods_sales.db", 60 | "table.type": "COPY_ON_WRITE", 61 | "precombine.field": "_op_ts", 62 | "changelog.enabled": true, 63 | "compaction.async.enabled": false 64 | } 65 | } 66 | ], 67 | "tables": [ 68 | { 69 | "name": "sales_user", 70 | "connector": { 71 | "name": "connector_hudi" 72 | }, 73 | "partition_by": ["_di"], 74 | "schema": [ 75 | "id INT NOT NULL PRIMARY KEY NOT ENFORCED", 76 | "name VARCHAR", 77 | "device_model VARCHAR", 78 | "email VARCHAR", 79 | "phone VARCHAR", 80 | "create_time timestamp", 81 | "modify_time timestamp", 82 | "_di INT", 83 | "_op_ts BIGINT" 84 | ] 85 | }, 86 | { 87 | "name": "sales_product", 88 | "connector": { 89 | "name": "connector_hudi" 90 | }, 91 | "partition_by": ["_di"], 92 | "schema": [ 93 | "pid INT NOT NULL PRIMARY KEY NOT ENFORCED", 94 | "pname VARCHAR", 95 | "pprice decimal", 96 | "phone VARCHAR", 97 | "create_time timestamp", 98 | "modify_time timestamp", 99 | "_di INT", 100 | "_op_ts BIGINT" 101 | ] 102 | }, 103 | { 104 | "name": "sales_user_order", 105 | "connector": { 106 | "name": "connector_hudi" 107 | }, 108 | "partition_by": ["_di"], 109 | "schema": [ 110 | "id INT NOT NULL PRIMARY KEY NOT ENFORCED", 111 | "oid VARCHAR", 112 | "uid INT", 113 | "pid INT", 114 | "onum INT", 115 | "create_time timestamp", 116 | "modify_time timestamp", 117 | "_di INT", 118 | "_op_ts BIGINT" 119 | ] 120 | } 121 | ] 122 | } 123 | ], 124 | "table_list": ["a"], 125 | "schema_list": ["a"] 126 | } 127 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/register-pg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "inventory-connector", 3 | "config": { 4 | "connector.class": "io.debezium.connector.postgresql.PostgresConnector", 5 | "tasks.max": "1", 6 | "database.hostname": "testpg", 7 | "database.port": "15432", 8 | "database.user": "postgres", 9 | "database.password": "123456", 10 | "database.dbname": "postgres", 11 | "database.server.name": "dbserver1", 12 | "schema.include.list": "inventory" 13 | } 14 | } 15 | 
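
For reference, ods.flink_tables.json above keeps catalogs at the top level, and databases as a list in which each entry holds its connectors and, optionally, its table schemas; backend.flink_tables_config in ingest_funcs.py reads the same structure. A minimal sketch that walks the file with plain json (not Easy SQL's own loader):

import json

# List connectors and tables per database declared in ods.flink_tables.json.
with open("examples/rtdw/workflow/sales/ods/ods.flink_tables.json") as f:
    cfg = json.load(f)

for db in cfg["databases"]:
    connectors = [c["name"] for c in db.get("connectors", [])]
    tables = [t["name"] for t in db.get("tables", [])]
    print(db["name"], "connectors:", connectors, "tables:", tables)
# db_pg connectors: ['connector_jdbc', 'connector_cdc'] tables: []
# ods_sales connectors: ['connector_hudi'] tables: ['sales_user', 'sales_product', 'sales_user_order']
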
-------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [repositories] 2 | [repositories.aliyun] 3 | url = "https://mirrors.aliyun.com/pypi/simple/" 4 | 5 | [repositories.testpypi] 6 | url = "https://test.pypi.org/legacy/" 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "easy_sql-easy_sql" 3 | version = "1.1.0" 4 | description = "A library developed to ease the data ETL development process." 5 | authors = ["Easy SQL from Thoughtworks "] 6 | license = "Apache-2.0" 7 | readme = "README.md" 8 | repository = "https://github.com/easysql/easy_sql" 9 | homepage = "https://easy-sql.readthedocs.io" 10 | 11 | classifiers = [ 12 | "Programming Language :: Python :: 3.8", 13 | "License :: OSI Approved :: Apache Software License", 14 | "Operating System :: MacOS", 15 | "Operating System :: POSIX :: Linux", 16 | ] 17 | packages = [ 18 | { include = "easy_sql" }, 19 | ] 20 | 21 | [tool.poetry.urls] 22 | "Bug Tracker" = "https://github.com/easysql/easy_sql/issues" 23 | 24 | [tool.poetry.dependencies] 25 | python = "^3.7" 26 | click = {version = "^8.1.3", optional = true} 27 | regex = {version = "^2022.7.25", optional = true} 28 | colorlog = {version = "^6.6.0", optional = true} 29 | sqlfluff = {version = "~1.4.5", optional = true} 30 | SQLAlchemy = {version = "^1.4.40", optional = true} 31 | clickhouse-driver = {version = "^0.2.4", optional = true} 32 | clickhouse-sqlalchemy = {version = "^0.2.1", optional = true} 33 | psycopg2 = {version = "^2.9.3", optional = true} 34 | pyodps = {version = "^0.11.2.1", optional = true} 35 | pyspark = [{version = ">=2.3.0, != 3.1.1, != 3.1.2, != 3.1.3, !=3.2.0, != 3.2.1", optional = true}] 36 | numpy = {version="~1.21.4", python=">=3.7,<3.11", optional=true} 37 | pandas = {version="~1.3", python=">=3.7.1", optional=true} 38 | apache-flink = {version = "^1.17.0", optional = true} 39 | ydata-profiling = {version = "^4.2.0", optional = true, python = ">=3.8,<3.12"} 40 | pyyaml = {version = "^6.0", optional = true} 41 | pymongo = "^3.8.0" 42 | 43 | [tool.poetry.group.test.dependencies] 44 | pytest = "^7.1.2" 45 | coverage = "^6.4.3" 46 | openpyxl = "^3.0.10" 47 | 48 | [tool.poetry.group.dev.dependencies] 49 | pre-commit = "^2.20.0" 50 | flake8 = {version = "^6.0.0", python = ">=3.8.1"} 51 | flake8-bugbear = {version = "^23.5.9", python = ">=3.8.1"} 52 | flake8-comprehensions = "^3.12.0" 53 | flake8-simplify = "^0.20.0" 54 | flake8-type-checking = {version = "^2.4.0", python = ">=3.8"} 55 | 56 | [tool.poetry.extras] 57 | cli = ["click"] 58 | linter = ["sqlfluff","colorlog","regex"] 59 | spark = ["pyspark"] 60 | pg = ["SQLAlchemy", "psycopg2"] 61 | clickhouse = ["SQLAlchemy","clickhouse-driver","clickhouse-sqlalchemy"] 62 | maxcompute = ["pyodps"] 63 | flink = ["apache-flink", "pyyaml"] 64 | ydata-profiling=["ydata-profiling"] 65 | 66 | [tool.isort] 67 | profile = "black" 68 | src_paths = ["easy_sql"] 69 | 70 | [tool.black] 71 | line-length = 120 72 | preview = true 73 | 74 | [tool.pytest.ini_options] 75 | testpaths = [ 76 | "easy_sql", 77 | ] 78 | python_files = [ 79 | "*_itest.py", 80 | "*_test.py", 81 | ] 82 | 83 | [build-system] 84 | requires = ["poetry-core>=1.0.0"] 85 | build-backend = "poetry.core.masonry.api" 86 | 
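
The optional dependencies above are grouped into install extras (cli, linter, spark, pg, clickhouse, maxcompute, flink, ydata-profiling), which is what makes installs like pip install 'easy-sql-easy-sql[spark,pg,clickhouse,cli]' in test/Dockerfile work. A small sketch that prints the extras table straight from pyproject.toml (assumes Python 3.11+ for the stdlib tomllib; older versions can use the third-party tomli package, which exposes the same API):

import tomllib  # Python 3.11+; on older versions: import tomli as tomllib

# Show which optional packages each install extra pulls in.
with open("pyproject.toml", "rb") as f:
    pyproject = tomllib.load(f)

for extra, packages in pyproject["tool"]["poetry"]["extras"].items():
    print(f"{extra}: {', '.join(packages)}")
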
-------------------------------------------------------------------------------- /requirements-all.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 ; python_version >= "3.7" and python_version < "4.0" 2 | attrs==22.1.0 ; python_version >= "3.7" and python_version < "4.0" 3 | backports-cached-property==1.0.2 ; python_version >= "3.7" and python_version < "3.8" 4 | backports-zoneinfo==0.2.1 ; python_version >= "3.7" and python_version < "3.9" 5 | certifi==2022.6.15 ; python_version >= "3.7" and python_version < "4" 6 | chardet==5.0.0 ; python_version >= "3.7" and python_version < "4.0" 7 | charset-normalizer==2.1.1 ; python_version >= "3.7" and python_version < "4" 8 | click==8.1.3 ; python_version >= "3.7" and python_version < "4.0" 9 | clickhouse-driver==0.2.4 ; python_version >= "3.7" and python_version < "4" 10 | clickhouse-sqlalchemy==0.2.2 ; python_version >= "3.7" and python_version < "4" 11 | colorama==0.4.5 ; python_version >= "3.7" and python_version < "4.0" 12 | colorlog==6.7.0 ; python_version >= "3.7" and python_version < "4.0" 13 | diff-cover==6.5.1 ; python_version >= "3.7" and python_version < "4.0" 14 | greenlet==1.1.3 ; python_version >= "3.7" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0" 15 | idna==3.3 ; python_version >= "3.7" and python_version < "4" 16 | importlib-metadata==4.2.0 ; python_version >= "3.7" and python_version < "3.8" 17 | iniconfig==1.1.1 ; python_version >= "3.7" and python_version < "4.0" 18 | jinja2==3.1.2 ; python_version >= "3.7" and python_version < "4.0" 19 | markupsafe==2.1.1 ; python_version >= "3.7" and python_version < "4.0" 20 | packaging==21.3 ; python_version >= "3.7" and python_version < "4.0" 21 | pathspec==0.10.1 ; python_version >= "3.7" and python_version < "4.0" 22 | pluggy==1.0.0 ; python_version >= "3.7" and python_version < "4.0" 23 | psycopg2==2.9.3 ; python_version >= "3.7" and python_version < "4.0" 24 | py4j==0.10.9.5 ; python_version >= "3.7" and python_version < "4.0" 25 | py==1.11.0 ; python_version >= "3.7" and python_version < "4.0" 26 | pygments==2.13.0 ; python_version >= "3.7" and python_version < "4.0" 27 | pyodps==0.11.2.1 ; python_version >= "3.7" and python_version < "4.0" 28 | pyparsing==3.0.9 ; python_version >= "3.7" and python_version < "4.0" 29 | pyspark==3.3.0 ; python_version >= "3.7" and python_version < "4.0" 30 | pytest==7.1.3 ; python_version >= "3.7" and python_version < "4.0" 31 | pytz-deprecation-shim==0.1.0.post0 ; python_version >= "3.7" and python_version < "4" 32 | pytz==2022.2.1 ; python_version >= "3.7" and python_version < "4" 33 | pyyaml==6.0 ; python_version >= "3.7" and python_version < "4.0" 34 | regex==2022.8.17 ; python_version >= "3.7" and python_version < "4.0" 35 | requests==2.28.1 ; python_version >= "3.7" and python_version < "4" 36 | setuptools==65.3.0 ; python_version >= "3.7" and python_version < "3.8" 37 | sqlalchemy==1.4.40 ; python_version >= "3.7" and python_version < "4.0" 38 | sqlfluff==1.2.1 ; python_version >= "3.7" and python_version < "4.0" 39 | tblib==1.7.0 ; python_version >= "3.7" and python_version < "4.0" 40 | toml==0.10.2 ; python_version >= "3.7" and python_version < "4.0" 41 | tomli==2.0.1 ; python_version >= "3.7" and python_version < "4.0" 42 | tqdm==4.64.0 ; python_version >= "3.7" and python_version < 
"4.0" 43 | typing-extensions==4.3.0 ; python_version >= "3.7" and python_version < "4.0" 44 | tzdata==2022.2 ; python_version >= "3.7" and python_version < "4" 45 | tzlocal==4.2 ; python_version >= "3.7" and python_version < "4" 46 | urllib3==1.26.12 ; python_version >= "3.7" and python_version < "4" 47 | zipp==3.8.1 ; python_version >= "3.7" and python_version < "3.8" 48 | -------------------------------------------------------------------------------- /test/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nikolaik/python-nodejs:python3.8-nodejs12 2 | 3 | RUN apt-get update && apt-get install -y vim wget openjdk-11-jdk zip unzip lsof less 4 | 5 | WORKDIR /tmp 6 | 7 | ADD sample_etl.spark.sql /tmp 8 | ADD sample_etl.postgres.sql /tmp 9 | ADD sample_etl.clickhouse.sql /tmp 10 | 11 | RUN python3 -m pip install 'easy-sql-easy-sql[spark,pg,clickhouse,cli]' 12 | 13 | ARG PG_URL= 14 | ARG CLICKHOUSE_URL= 15 | 16 | RUN bash -c "$(python3 -m easy_sql.data_process -f sample_etl.spark.sql -p)" 17 | RUN PG_URL=$PG_URL python3 -m easy_sql.data_process -f sample_etl.postgres.sql 18 | RUN CLICKHOUSE_URL=$CLICKHOUSE_URL python3 -m easy_sql.data_process -f sample_etl.clickhouse.sql 19 | -------------------------------------------------------------------------------- /test/customized_func/customized_func.py: -------------------------------------------------------------------------------- 1 | __all__ = ["count_partitions"] 2 | 3 | 4 | def count_partitions(table_name: str) -> int: 5 | from pyspark.sql import SparkSession 6 | 7 | spark: SparkSession = SparkSession.builder.getOrCreate() 8 | partitions = spark.sql(f"show partitions {table_name}").collect() 9 | return len(partitions) 10 | -------------------------------------------------------------------------------- /test/customized_func/etl_with_customized_func.sql: -------------------------------------------------------------------------------- 1 | -- backend: spark 2 | -- config: easy_sql.func_file_path=customized_func.py 3 | 4 | -- target=action.define_table 5 | create table some_table partitioned by (pt) as 6 | select * from ( 7 | select 1 as a, 2 as b, 1 as pt 8 | union 9 | select 1 as a, 2 as b, 2 as pt 10 | ) t 11 | 12 | -- target=log.partition_count 13 | select ${count_partitions(some_table)} as partition_count 14 | -------------------------------------------------------------------------------- /test/doc/debugging.sql: -------------------------------------------------------------------------------- 1 | -- prepare-sql: drop database if exists sample cascade 2 | -- prepare-sql: create database sample 3 | -- prepare-sql: create table sample.order_table as select 1 as id, '1' as val 4 | -- prepare-sql: create table sample.order_table_after_joined as select 1 as id, '1' as val 5 | 6 | -- target=variables 7 | select 8 | 3 as c 9 | 10 | -- target=log.i_would_like_to_log_something 11 | select 12 | 1 as a 13 | , 2 as b 14 | , ${c} as c 15 | 16 | -- target=log.order_count 17 | select 18 | count(1) 19 | from sample.order_table 20 | 21 | -- target=check.order_count_must_be_equal_after_joined_product 22 | select 23 | (select count(1) from sample.order_table) as expected 24 | , (select count(1) from sample.order_table_after_joined) as actual 25 | 26 | -- target=check.equal(${c}, 3) 27 | -------------------------------------------------------------------------------- /test/doc/test_sqlfulff.sql: -------------------------------------------------------------------------------- 1 | SELECT a+b AS foo, 2 | c AS bar 
from my_table where name = {{ test_name }}; -- noqa: L014,L034 3 | -------------------------------------------------------------------------------- /test/doc/variables.sql: -------------------------------------------------------------------------------- 1 | -- target=variables 2 | select 1 as a, '2' as b 3 | 4 | -- target=variables 5 | select 6 | ${a} as a 7 | , ${b} as b 8 | , 1${a} as a1 9 | , ${a} + ${b} as ab 10 | 11 | -- target=log.variables 12 | select ${a} as a, ${b} as b, ${a1} as a1, ${ab} as ab 13 | -------------------------------------------------------------------------------- /test/etl_test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/test/etl_test.xlsx -------------------------------------------------------------------------------- /test/flink/flink_hive_conf/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | hive.metastore.uris 6 | thrift://localhost:9083 7 | 8 | 9 | -------------------------------------------------------------------------------- /test/sample_data_process.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | from easy_sql.sql_processor import SqlProcessor 6 | 7 | 8 | def run_spark_etl(): 9 | from easy_sql.sql_processor.backend import SparkBackend 10 | 11 | spark = SparkSession.builder.enableHiveSupport().getOrCreate() 12 | backend = SparkBackend(spark) 13 | sql = """ 14 | -- target=log.some_log 15 | select 1 as a 16 | """ 17 | sql_processor = SqlProcessor(backend, sql) 18 | sql_processor.run() 19 | 20 | 21 | def run_postgres_etl(): 22 | from easy_sql.sql_processor.backend.rdb import RdbBackend 23 | 24 | backend = RdbBackend(os.environ["PG_URL"]) 25 | sql = """ 26 | -- target=log.some_log 27 | select 1 as a 28 | """ 29 | sql_processor = SqlProcessor(backend, sql) 30 | sql_processor.run() 31 | 32 | 33 | def run_clickhouse_etl(): 34 | from easy_sql.sql_processor.backend.rdb import RdbBackend 35 | 36 | backend = RdbBackend(os.environ["CLICKHOUSE_URL"]) 37 | sql = """ 38 | -- target=log.some_log 39 | select 1 as a 40 | """ 41 | sql_processor = SqlProcessor(backend, sql) 42 | sql_processor.run() 43 | 44 | 45 | if __name__ == "__main__": 46 | run_spark_etl() 47 | run_postgres_etl() 48 | run_clickhouse_etl() 49 | -------------------------------------------------------------------------------- /test/sample_etl.clickhouse.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "completed": 1, 4 | "default_col_type": "string", 5 | "func_file_paths": [], 6 | "includes": {}, 7 | "inputs": [ 8 | { 9 | "column_types": "[\"int\", \"String\"]", 10 | "columns": "[\"id\", \"val\"]", 11 | "name": "sample.test", 12 | "value_descriptions": [ 13 | "Some sample data for testing" 14 | ], 15 | "values": [ 16 | "[1, \"1\"]" 17 | ] 18 | } 19 | ], 20 | "missed_fields": [], 21 | "name": "test for sample etl", 22 | "outputs": [ 23 | { 24 | "column_types": "[\"int\", \"String\"]", 25 | "columns": "[\"id\", \"val\"]", 26 | "name": "sample.result", 27 | "value_descriptions": [], 28 | "values": [ 29 | "[1, \"1\"]", 30 | "[1, \"2\"]" 31 | ] 32 | } 33 | ], 34 | "simple_sql_name": "sample_etl.clickhouse.sql", 35 | "sql_file_content": null, 36 | "sql_file_path": "test/sample_etl.clickhouse.sql", 37 | "udf_file_paths": [], 38 | "vars": {} 39 | } 40 | ] 41 | 
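
The sample_etl.*.json files in test/ (such as sample_etl.clickhouse.json above) describe test cases for the corresponding sample ETLs: each case names the ETL file, its input tables with typed sample rows, and the expected output rows, with a matching .xlsx file next to each one. A minimal sketch that summarizes one case with plain json (column lists are stored as JSON-encoded strings, hence the nested json.loads):

import json

# Summarize the test case described in test/sample_etl.clickhouse.json.
with open("test/sample_etl.clickhouse.json") as f:
    cases = json.load(f)

for case in cases:
    print(case["name"], "->", case["sql_file_path"])
    for table in case["inputs"]:
        print("  input:   ", table["name"], json.loads(table["columns"]), table["values"])
    for table in case["outputs"]:
        print("  expected:", table["name"], table["values"])
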
-------------------------------------------------------------------------------- /test/sample_etl.clickhouse.sql: -------------------------------------------------------------------------------- 1 | -- backend: clickhouse 2 | -- prepare-sql: drop database if exists sample 3 | -- prepare-sql: create database sample 4 | -- prepare-sql: create table sample.test engine MergeTree() order by tuple() as select 1 as id, '1' as val 5 | 6 | -- target=variables 7 | select 1 as __create_output_table__ 8 | 9 | -- target=variables 10 | select 1 as a 11 | 12 | -- target=log.a 13 | select '${a}' as a 14 | 15 | -- target=log.test_log 16 | select 1 as some_log 17 | 18 | -- target=check.should_equal 19 | select 1 as actual, 1 as expected 20 | 21 | -- target=temp.result 22 | select 23 | ${a} as id, cast(${a} + 1 as text) as val 24 | union all 25 | select id, val from sample.test 26 | 27 | -- target=output.sample.result 28 | select * from result 29 | 30 | -- target=log.sample_result 31 | select * from result 32 | -------------------------------------------------------------------------------- /test/sample_etl.clickhouse.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/test/sample_etl.clickhouse.xlsx -------------------------------------------------------------------------------- /test/sample_etl.flink.hive.postgres.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | -- config: easy_sql.flink_tables_file_path=sample_etl.flink_tables_file_hive.yml 3 | -- inputs: db_pg.source_1 4 | 5 | -- target=variables 6 | select 2 as a 7 | 8 | -- target=log.a 9 | select '${a}' as a 10 | 11 | -- target=log.test_log 12 | select 1 as some_log 13 | 14 | -- target=check.should_equal 15 | select 1 as actual, 1 as expected 16 | 17 | -- target=temp.result_view 18 | select 19 | ${a} as id, 20 | '2' as val 21 | union all 22 | select id, val from myhiveCatalog.default.hive_table 23 | union all 24 | select id, val from db_pg.source_1 25 | 26 | -- target=output.myhiveCatalog.default.hive_out_table 27 | select id, val from result_view 28 | 29 | -- target=log.sample_result 30 | select * from result_view 31 | -------------------------------------------------------------------------------- /test/sample_etl.flink.hive.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | -- config: easy_sql.flink_tables_file_path=sample_etl.flink_tables_file_hive.yml 3 | -- config: flink.cmd=-t local 4 | -- config: flink.cmd=--parallelism 2 5 | -- config: flink.python.fn-execution.bundle.size=1000 6 | -- config: flink.python.client.executable=python 7 | -- config: flink.jobmanager.memory.process.size=1024m 8 | -- config: flink.taskmanager.memory.process.size=4096m 9 | 10 | -- target=variables 11 | select 2 as a 12 | 13 | -- target=log.a 14 | select '${a}' as a 15 | 16 | -- target=log.test_log 17 | select 1 as some_log 18 | 19 | -- target=check.should_equal 20 | select 1 as actual, 1 as expected 21 | 22 | -- target=temp.result_view 23 | select 24 | ${a} as id, 25 | '2' as val 26 | union all 27 | select id, val from myhiveCatalog.default.hive_table 28 | 29 | -- target=output.myhiveCatalog_1.default.hive_out_table 30 | select id, val from result_view 31 | 32 | -- target=log.sample_result 33 | select * from result_view 34 | -------------------------------------------------------------------------------- 
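
Each of the sample ETL files here can be submitted through the easy_sql.data_process entry point with the -f flag, as done in test/Dockerfile. A sketch of the same invocation from Python via subprocess, using the local-mode Flink + Hive sample above (assumes the flink extra is installed and a Hive metastore is reachable as configured in test/flink/flink_hive_conf/hive-site.xml):

import subprocess
import sys

# Equivalent to: python3 -m easy_sql.data_process -f test/sample_etl.flink.hive.sql
subprocess.run(
    [sys.executable, "-m", "easy_sql.data_process", "-f", "test/sample_etl.flink.hive.sql"],
    check=True,
)
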
/test/sample_etl.flink.hudi-agg.sql: -------------------------------------------------------------------------------- 1 | -- Preparation: refer sample_etl.flink.postgres-hudi.sql 2 | -- 3 | -- Verification: 4 | -- 1. verify there are two rows in hudi table: 5 | -- - start sql client: /usr/local/lib/python3.8/site-packages/pyflink/bin/sql-client.sh embedded -j test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar shell 6 | -- - emit sql: create table hudi_agg (val varchar NOT NULL PRIMARY KEY NOT ENFORCED, val_count bigint) WITH ( 7 | -- 'connector' = 'hudi' , 'path' = '/tmp/hudi-flink-test/db_hudi.db/target_hudi_agg' , 'table.type' = 'MERGE_ON_READ' , 'changelog.enabled' = 'True' , 'compaction.async.enabled' = 'False' 8 | -- ); 9 | -- - emit sql: select * from hudi_agg; 10 | -- 2. insert data into sample.test and check if it aggregates correctly in hudi table 11 | -- 12 | -- Cleanup: 13 | -- 1. cancel applicaiton from flink dashboard (http://localhost:8081/) 14 | 15 | 16 | -- backend: flink 17 | 18 | -- config: easy_sql.flink_tables_file_path=test/sample_etl.flink_tables_file.yml 19 | -- config: easy_sql.etl_type=streaming 20 | 21 | -- config: flink.cmd=-pyexec python3 22 | -- config: flink.cmd=-pyclientexec python3 23 | -- config: flink.cmd=-t remote 24 | -- config: flink.execution.checkpointing.interval=3s 25 | -- config: flink.pipeline.jars=test/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar 26 | 27 | -- prepare-sql: drop schema if exists sample cascade 28 | -- prepare-sql: create schema sample 29 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 30 | 31 | -- inputs: db_pg.source_cdc 32 | -- add db_pg.target_1 below to allow the prepare-sql command to execute against. 33 | -- outputs: db_hudi.target_hudi_agg, db_pg.target_1 34 | 35 | -- target=variables 36 | select 37 | 'append' as __save_mode__ 38 | 39 | -- target=temp.result_view 40 | select 41 | 2 as id 42 | ,'2' as val 43 | union all 44 | select id, val from db_pg.source_cdc 45 | 46 | -- target=output.db_hudi.target_hudi_agg 47 | select val, count(*) as val_count from result_view group by val 48 | -------------------------------------------------------------------------------- /test/sample_etl.flink.postgres-cdc.multi-sink.sql: -------------------------------------------------------------------------------- 1 | -- Preparation: 2 | -- 1. start a local flink cluster: your/site-packages/path/pyflink/bin/start-cluster.sh 3 | -- 2. ensure postgres started with configuration: `wal_level=logical` (in file /var/lib/postgresql/data/postgresql.conf) 4 | -- 3. use remote mode to run flink application: configure `flink.cmd=-t remote` (already done below) 5 | -- 6 | -- Verification: 7 | -- 1. verify there are two rows in postgres table public.output_table 8 | -- 2. verify there are two rows in postgres table public.output_table_agg 9 | -- 3. insert data into sample.test and check if it shows up in public.output_table and aggregates correctly in public.output_table_agg 10 | -- 11 | -- Cleanup: 12 | -- 1. 
cancel applicaiton from flink dashboard (http://localhost:8081/) 13 | 14 | 15 | -- backend: flink 16 | 17 | -- config: easy_sql.flink_tables_file_path=test/sample_etl.flink_tables_file.yml 18 | -- config: easy_sql.etl_type=streaming 19 | -- config: easy_sql.prepare_sql_connector=connector_1 20 | 21 | -- config: flink.cmd=-pyexec python3 22 | -- config: flink.cmd=-pyclientexec python3 23 | -- config: flink.cmd=-t remote 24 | -- config: flink.pipeline.jars=test/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar;test/flink/jars/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar;test/flink/jars/postgresql-42.2.14.jar;test/flink/jars/flink-connector-jdbc-1.15.1.jar 25 | 26 | -- inputs: db_pg.source_cdc 27 | -- outputs: db_pg.target_1, db_pg.target_agg 28 | 29 | -- prepare-sql: drop schema if exists sample cascade 30 | -- prepare-sql: create schema sample 31 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 32 | -- prepare-sql: drop table if exists public.output_table 33 | -- prepare-sql: create table public.output_table (id int4 PRIMARY KEY, val text) 34 | -- prepare-sql: drop table if exists public.output_table_agg 35 | -- prepare-sql: create table public.output_table_agg (val text PRIMARY KEY, count_val bigint) 36 | 37 | -- target=variables 38 | select 39 | 'append' as __save_mode__ 40 | 41 | -- target=variables 42 | select 2 as a 43 | 44 | -- target=log.a 45 | select '${a}' as a 46 | 47 | -- target=log.test_log 48 | select 1 as some_log 49 | 50 | -- target=check.should_equal 51 | select 1 as actual, 1 as expected 52 | 53 | -- target=temp.result_view 54 | select 55 | ${a} as id, 56 | '2' as val 57 | union all 58 | select id, val from db_pg.source_cdc 59 | 60 | -- target=output.db_pg.target_1 61 | select id, val from result_view 62 | 63 | -- target=output.db_pg.target_agg 64 | select val, count(1) as count_val from result_view group by val 65 | 66 | -- target=func.execute_streaming_inserts() 67 | -- if there are multiple inserts and we call the function above, these inserts will be merged into one job and share streams 68 | -- it takes the optimization method here: https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/dev/table/sql/insert/ 69 | 70 | -- target=log.db_pg__target_1__count 71 | select count(*) from db_pg.target_1 72 | -------------------------------------------------------------------------------- /test/sample_etl.flink.postgres-cdc.sql: -------------------------------------------------------------------------------- 1 | -- Preparation: 2 | -- 1. start a local flink cluster: your/site-packages/path/pyflink/bin/start-cluster.sh 3 | -- 2. ensure postgres started with configuration: `wal_level=logical` (in file /var/lib/postgresql/data/postgresql.conf) 4 | -- 3. use remote mode to run flink application: configure `flink.cmd=-t remote` (already done below) 5 | -- 6 | -- Verification: 7 | -- 1. verify there are two rows in postgres table public.output_table 8 | -- 2. insert data into sample.test and check if it shows up in public.output_table 9 | -- 10 | -- Cleanup: 11 | -- 1. 
cancel applicaiton from flink dashboard (http://localhost:8081/) 12 | 13 | 14 | -- backend: flink 15 | 16 | -- config: easy_sql.flink_tables_file_path=test/sample_etl.flink_tables_file.yml 17 | -- config: easy_sql.etl_type=streaming 18 | -- config: easy_sql.prepare_sql_connector=connector_1 19 | 20 | -- config: flink.cmd=-pyexec python3 21 | -- config: flink.cmd=-pyclientexec python3 22 | -- config: flink.cmd=-t remote 23 | -- config: flink.pipeline.jars=test/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar;test/flink/jars/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar;test/flink/jars/postgresql-42.2.14.jar;test/flink/jars/flink-connector-jdbc-1.15.1.jar 24 | 25 | -- inputs: db_pg.source_cdc 26 | -- outputs: db_pg.target_1 27 | 28 | -- prepare-sql: drop schema if exists sample cascade 29 | -- prepare-sql: create schema sample 30 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 31 | -- prepare-sql: drop table if exists public.output_table 32 | -- prepare-sql: create table public.output_table (id int4 PRIMARY KEY, val text) 33 | 34 | -- target=variables 35 | select 36 | 'append' as __save_mode__ 37 | 38 | -- target=variables 39 | select 2 as a 40 | 41 | -- target=log.a 42 | select '${a}' as a 43 | 44 | -- target=log.test_log 45 | select 1 as some_log 46 | 47 | -- target=check.should_equal 48 | select 1 as actual, 1 as expected 49 | 50 | -- target=temp.result_view 51 | select 52 | ${a} as id, 53 | '2' as val 54 | union all 55 | select id, val from db_pg.source_cdc 56 | 57 | -- target=output.db_pg.target_1 58 | select id, val from result_view 59 | 60 | -- target=log.db_pg__target_1 61 | select * from db_pg.target_1 62 | -------------------------------------------------------------------------------- /test/sample_etl.flink.postgres-hudi.sql: -------------------------------------------------------------------------------- 1 | -- Preparation: 2 | -- 1. download a hadoop release: wget https://dlcdn.apache.org/hadoop/common/hadoop-3.3.5/hadoop-3.3.5.tar.gz 3 | -- 2. set hadoop classpath: tar xf hadoop-3.3.5.tar.gz && export HADOOP_CLASSPATH=$($(pwd)/hadoop-3.3.5/bin/hadoop classpath) 4 | -- 3. start a local flink cluster: your/site-packages/path/pyflink/bin/start-cluster.sh 5 | -- 4. ensure postgres started with configuration: `wal_level=logical` (in file /var/lib/postgresql/data/postgresql.conf) 6 | -- 5. use remote mode to run flink application: configure `flink.cmd=-t remote` (already done below) 7 | -- 8 | -- Verification: 9 | -- 1. verify there are two rows in hudi table /tmp/hudi-flink-test: 10 | -- echo 'drop table if exists hudi_table;create table hudi_table using hudi location "/tmp/hudi-flink-test/db_hudi.db/target_hudi";select * from hudi_table;' | \ 11 | -- spark-sql --packages org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2 \ 12 | -- --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \ 13 | -- --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \ 14 | -- --conf 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog' \ 15 | -- --conf 'spark.driver.extraJavaOptions="-Dderby.system.home=/tmp/spark-warehouse-metastore-hudi -Dderby.stream.error.file=/tmp/spark-warehouse-metastore-hudi.log"' 16 | -- 2. insert data into sample.test and check if it shows up in the hudi table 17 | -- 18 | -- Cleanup: 19 | -- 1. 
cancel applicaiton from flink dashboard (http://localhost:8081/) 20 | 21 | 22 | -- backend: flink 23 | 24 | -- config: easy_sql.flink_tables_file_path=test/sample_etl.flink_tables_file.yml 25 | -- config: easy_sql.etl_type=streaming 26 | -- config: easy_sql.prepare_sql_connector=connector_1 27 | 28 | -- config: flink.cmd=-pyexec python3 29 | -- config: flink.cmd=-pyclientexec python3 30 | -- config: flink.cmd=-t remote 31 | -- config: flink.execution.checkpointing.interval=3s 32 | -- config: flink.pipeline.jars=test/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar;test/flink/jars/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar;test/flink/jars/postgresql-42.2.14.jar;test/flink/jars/flink-connector-jdbc-1.15.1.jar 33 | 34 | -- inputs: db_pg.source_cdc 35 | -- add db_pg.target_1 below to allow the prepare-sql command to execute against. 36 | -- outputs: db_hudi.target_hudi, db_pg.target_1 37 | 38 | -- prepare-sql: drop schema if exists sample cascade 39 | -- prepare-sql: create schema sample 40 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 41 | 42 | -- target=variables 43 | select 44 | 'append' as __save_mode__ 45 | 46 | -- target=temp.result_view 47 | select 48 | 2 as id, 49 | '2' as val 50 | union all 51 | select id, val from db_pg.source_cdc 52 | 53 | -- target=output.db_hudi.target_hudi 54 | select id, val from result_view 55 | 56 | -- target=func.execute_streaming_inserts() 57 | -- trigger execution of inserts manually, or it will be triggered at the end of the job and the query of db_hudi.target_hudi fails. 58 | 59 | -- hack below as we didn't prepared the hudi table 60 | -- target1=log.db_hudi__target_hudi 61 | -- select * from db_hudi.target_hudi 62 | -------------------------------------------------------------------------------- /test/sample_etl.flink.postgres.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | 3 | -- config: easy_sql.flink_tables_file_path=test/sample_etl.flink_tables_file.yml 4 | -- config: easy_sql.prepare_sql_connector=connector_1 5 | 6 | -- config: flink.cmd=-pyexec python3 7 | -- config: flink.cmd=-pyclientexec python3 8 | -- config: flink.cmd=-t local 9 | -- config: flink.pipeline.jars=test/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar;test/flink/jars/postgresql-42.2.14.jar;test/flink/jars/flink-connector-jdbc-1.15.1.jar 10 | 11 | -- inputs: db_pg.source_1, db_pg.target_1 12 | -- outputs: db_pg.target_1 13 | 14 | -- prepare-sql: drop schema if exists sample cascade 15 | -- prepare-sql: create schema sample 16 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 17 | -- prepare-sql: drop table if exists public.output_table 18 | -- prepare-sql: create table public.output_table (id int4 PRIMARY KEY, val text) 19 | 20 | -- target=variables 21 | select 22 | 'append' as __save_mode__ 23 | 24 | -- target=variables 25 | select 2 as a 26 | 27 | -- target=log.a 28 | select '${a}' as a 29 | 30 | -- target=log.test_log 31 | select 1 as some_log 32 | 33 | -- target=check.should_equal 34 | select 1 as actual, 1 as expected 35 | 36 | -- target=temp.result_view 37 | select 38 | ${a} as id, 39 | '2' as val 40 | union all 41 | select id, val from db_pg.source_1 42 | 43 | -- target=output.db_pg.target_1 44 | select id, val from result_view 45 | 46 | -- target=log.sample_result 47 | select * from result_view 48 | 
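
The Flink samples above all point easy_sql.flink_tables_file_path at the YAML file that follows, which declares connectors plus per-catalog, per-database table schemas. A minimal sketch that lists the declared tables with plain PyYAML (pyyaml ships with the flink extra; this is not Easy SQL's own loader):

import yaml

# List the tables declared in test/sample_etl.flink_tables_file.yml
# and the connector each one uses.
with open("test/sample_etl.flink_tables_file.yml") as f:
    cfg = yaml.safe_load(f)

for db_name, db in cfg["catalogs"]["default_catalog"]["databases"].items():
    for table_name, table in db["tables"].items():
        print(f"{db_name}.{table_name}: connector={table['connector']}")
# db_pg.source_1: connector=connector_1
# db_pg.source_cdc: connector=connector_cdc
# db_pg.target_1: connector=connector_1
# db_pg.target_agg: connector=connector_1
# db_hudi.target_hudi: connector=connector_hudi
# db_hudi.target_hudi_agg: connector=connector_hudi
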
-------------------------------------------------------------------------------- /test/sample_etl.flink_tables_file.yml: -------------------------------------------------------------------------------- 1 | connectors: 2 | connector_1: 3 | options: | 4 | 'connector' = 'jdbc', 5 | 'url' = 'jdbc:postgresql://localhost:5432/postgres', 6 | 'username' = 'postgres', 7 | 'password' = '123456' 8 | connector_cdc: 9 | options: | 10 | 'connector' = 'postgres-cdc', 11 | 'hostname' = 'localhost', 12 | 'port' = '5432', 13 | 'username' = 'postgres', 14 | 'password' = '123456', 15 | 'database-name' = 'postgres', 16 | 'schema-name' = 'sample', 17 | 'decoding.plugin.name' = 'pgoutput' 18 | connector_hudi: 19 | options: | 20 | 'connector' = 'hudi', 21 | 'path' = '/tmp/hudi-flink-test', 22 | 'table.type' = 'MERGE_ON_READ', 23 | 'changelog.enabled' = 'true', 24 | 'compaction.async.enabled' = 'false' 25 | catalogs: 26 | default_catalog: 27 | databases: 28 | db_pg: 29 | tables: 30 | source_1: 31 | connector: connector_1 32 | options: | 33 | 'table-name' = 'sample.test' 34 | partition_by: "id, val" 35 | schema: | 36 | `id` INT, 37 | val VARCHAR, 38 | PRIMARY KEY (id) NOT ENFORCED 39 | source_cdc: 40 | connector: connector_cdc 41 | options: | 42 | 'table-name' = 'test' 43 | partition_by: "id, val" 44 | schema: | 45 | `id` INT, 46 | val VARCHAR, 47 | PRIMARY KEY (id) NOT ENFORCED 48 | target_1: 49 | connector: connector_1 50 | options: | 51 | 'table-name' = 'output_table' 52 | schema: | 53 | `id` INT, 54 | val VARCHAR, 55 | PRIMARY KEY (id) NOT ENFORCED 56 | target_agg: 57 | connector: connector_1 58 | options: | 59 | 'table-name' = 'output_table_agg' 60 | schema: | 61 | count_val BIGINT, 62 | val VARCHAR, 63 | PRIMARY KEY (val) NOT ENFORCED 64 | db_hudi: 65 | tables: 66 | target_hudi: 67 | connector: connector_hudi 68 | schema: | 69 | id INT NOT NULL PRIMARY KEY NOT ENFORCED, 70 | val VARCHAR 71 | target_hudi_agg: 72 | connector: connector_hudi 73 | schema: | 74 | val VARCHAR NOT NULL PRIMARY KEY NOT ENFORCED, 75 | val_count BIGINT NOT NULL 76 | -------------------------------------------------------------------------------- /test/sample_etl.flink_tables_file_hive.yml: -------------------------------------------------------------------------------- 1 | connectors: 2 | connector_1: 3 | options: | 4 | 'connector' = 'jdbc', 5 | 'url' = 'jdbc:postgresql://localhost:5432/postgres', 6 | 'username' = 'postgres', 7 | 'password' = '123456' 8 | connector_cdc: 9 | options: | 10 | 'connector' = 'postgres-cdc', 11 | 'hostname' = 'localhost', 12 | 'port' = '5432', 13 | 'username' = 'postgres', 14 | 'password' = '123456', 15 | 'database-name' = 'postgres', 16 | 'schema-name' = 'sample', 17 | 'decoding.plugin.name' = 'pgoutput' 18 | connector_hudi: 19 | options: | 20 | 'connector' = 'hudi', 21 | 'path' = '/tmp/hudi-flink-test', 22 | 'table.type': 'MERGE_ON_READ', 23 | 'changelog.enabled': 'true', 24 | 'compaction.async.enabled': 'false' 25 | catalogs: 26 | myhiveCatalog: 27 | options: | 28 | 'type' = 'hive', 29 | 'hive-conf-dir' = 'test/flink/flink_hive_conf' 30 | databases: 31 | db_pg: 32 | tables: 33 | source_1: 34 | connector: connector_1 35 | options: | 36 | 'table-name' = 'sample.test' 37 | partition_by: "id, val" 38 | schema: | 39 | `id` INT, 40 | val VARCHAR, 41 | PRIMARY KEY (id) NOT ENFORCED 42 | source_cdc: 43 | connector: connector_cdc 44 | options: | 45 | 'table-name' = 'test' 46 | partition_by: "id, val" 47 | schema: | 48 | `id` INT, 49 | val VARCHAR, 50 | PRIMARY KEY (id) NOT ENFORCED 51 | target_1: 52 | connector: 
connector_1 53 | options: | 54 | 'table-name' = 'output_table' 55 | partition_by: "id, val" 56 | schema: | 57 | `id` INT, 58 | val VARCHAR, 59 | PRIMARY KEY (id) NOT ENFORCED 60 | target_agg: 61 | connector: connector_1 62 | options: | 63 | 'table-name' = 'output_table_agg' 64 | schema: | 65 | count_val BIGINT, 66 | val VARCHAR, 67 | PRIMARY KEY (val) NOT ENFORCED 68 | db_hudi: 69 | tables: 70 | target_hudi: 71 | connector: connector_hudi 72 | schema: | 73 | id INT NOT NULL PRIMARY KEY NOT ENFORCED, 74 | val VARCHAR 75 | target_hudi_agg: 76 | connector: connector_hudi 77 | schema: | 78 | val VARCHAR NOT NULL PRIMARY KEY NOT ENFORCED, 79 | val_count BIGINT NOT NULL 80 | -------------------------------------------------------------------------------- /test/sample_etl.postgres.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "completed": 1, 4 | "default_col_type": "string", 5 | "func_file_paths": [], 6 | "includes": {}, 7 | "inputs": [ 8 | { 9 | "column_types": "[\"int\", \"text\"]", 10 | "columns": "[\"id\", \"val\"]", 11 | "name": "sample.test", 12 | "value_descriptions": [ 13 | "Some sample data for testing" 14 | ], 15 | "values": [ 16 | "[1, \"1\"]" 17 | ] 18 | } 19 | ], 20 | "missed_fields": [], 21 | "name": "test for sample etl", 22 | "outputs": [ 23 | { 24 | "column_types": "[\"int\", \"text\"]", 25 | "columns": "[\"id\", \"val\"]", 26 | "name": "sample.result", 27 | "value_descriptions": [], 28 | "values": [ 29 | "[1, \"1\"]", 30 | "[1, \"2\"]" 31 | ] 32 | } 33 | ], 34 | "simple_sql_name": "sample_etl.postgres.sql", 35 | "sql_file_content": null, 36 | "sql_file_path": "test/sample_etl.postgres.sql", 37 | "udf_file_paths": [], 38 | "vars": {} 39 | } 40 | ] 41 | -------------------------------------------------------------------------------- /test/sample_etl.postgres.sql: -------------------------------------------------------------------------------- 1 | -- backend: postgres 2 | -- prepare-sql: drop schema if exists sample cascade 3 | -- prepare-sql: create schema sample 4 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 5 | 6 | -- target=variables 7 | select true as __create_output_table__ 8 | 9 | -- target=variables 10 | select 1 as a 11 | 12 | -- target=log.a 13 | select '${a}' as a 14 | 15 | -- target=log.test_log 16 | select 1 as some_log 17 | 18 | -- target=check.should_equal 19 | select 1 as actual, 1 as expected 20 | 21 | -- target=temp.result 22 | select 23 | ${a} as id, cast(${a} + 1 as text) as val 24 | union all 25 | select id, val from sample.test 26 | 27 | -- target=output.sample.result 28 | select * from result 29 | 30 | -- target=log.sample_result 31 | select * from result 32 | -------------------------------------------------------------------------------- /test/sample_etl.postgres.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/test/sample_etl.postgres.xlsx -------------------------------------------------------------------------------- /test/sample_etl.spark.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "completed": 1, 4 | "default_col_type": "string", 5 | "func_file_paths": [], 6 | "includes": {}, 7 | "inputs": [ 8 | { 9 | "column_types": "[\"int\", \"string\"]", 10 | "columns": "[\"id\", \"val\"]", 11 | "name": "sample.test", 12 | "value_descriptions": [ 13 | "Some sample data for testing" 14 | ], 15 | 
"values": [ 16 | "[1, \"1\"]" 17 | ] 18 | } 19 | ], 20 | "missed_fields": [], 21 | "name": "test for sample etl", 22 | "outputs": [ 23 | { 24 | "column_types": "[\"int\", \"string\"]", 25 | "columns": "[\"id\", \"val\"]", 26 | "name": "sample.result", 27 | "value_descriptions": [], 28 | "values": [ 29 | "[1, \"1\"]", 30 | "[1, \"2\"]" 31 | ] 32 | } 33 | ], 34 | "simple_sql_name": "sample_etl.spark.sql", 35 | "sql_file_content": null, 36 | "sql_file_path": "test/sample_etl.spark.sql", 37 | "udf_file_paths": [], 38 | "vars": {} 39 | } 40 | ] 41 | -------------------------------------------------------------------------------- /test/sample_etl.spark.sql: -------------------------------------------------------------------------------- 1 | -- prepare-sql: drop database if exists sample cascade 2 | -- prepare-sql: create database sample 3 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 4 | 5 | -- target=variables 6 | select true as __create_output_table__ 7 | 8 | -- target=variables 9 | select 1 as a 10 | 11 | -- target=log.a 12 | select '${a}' as a 13 | 14 | -- target=log.test_log 15 | select 1 as some_log 16 | 17 | -- target=check.should_equal 18 | select 1 as actual, 1 as expected 19 | 20 | -- target=temp.result 21 | select 22 | ${a} as id, ${a} + 1 as val 23 | union all 24 | select id, val from sample.test 25 | 26 | -- target=output.sample.result 27 | select * from result 28 | 29 | -- target=log.sample_result 30 | select * from result 31 | -------------------------------------------------------------------------------- /test/sample_etl.spark.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/test/sample_etl.spark.xlsx -------------------------------------------------------------------------------- /test/sample_etl.syntax.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/test/sample_etl.syntax.xlsx -------------------------------------------------------------------------------- /test/sample_etl_wps.syntax.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/test/sample_etl_wps.syntax.xlsx -------------------------------------------------------------------------------- /test/udf/clickhouse/etl_with_udf.sql: -------------------------------------------------------------------------------- 1 | -- backend: clickhouse 2 | -- config: easy_sql.udf_file_path=udf.py 3 | 4 | -- target=log.test_udf 5 | select translate('abcad', 'a', '') as translated_str 6 | -------------------------------------------------------------------------------- /test/udf/clickhouse/udf.py: -------------------------------------------------------------------------------- 1 | def translate(): 2 | return "CREATE FUNCTION IF NOT EXISTS translate AS (input, from, to) -> replaceAll(input, from, to)" 3 | -------------------------------------------------------------------------------- /test/udf/flink-python/etl_with_udf.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | -- config: easy_sql.udf_file_path=udf.py 3 | 4 | -- target=log.test_udf 5 | select test_func(1, 2) as sum_value 6 | -------------------------------------------------------------------------------- 
/test/udf/flink-python/udf.py: -------------------------------------------------------------------------------- 1 | from pyflink.table import DataTypes 2 | from pyflink.table.udf import udf 3 | 4 | __all__ = ["test_func"] 5 | 6 | 7 | @udf(result_type=DataTypes.BIGINT()) 8 | def test_func(a: int, b: int) -> int: 9 | return a + b 10 | -------------------------------------------------------------------------------- /test/udf/flink-scala/.gitignore: -------------------------------------------------------------------------------- 1 | classes/ 2 | *.jar 3 | -------------------------------------------------------------------------------- /test/udf/flink-scala/Makefile: -------------------------------------------------------------------------------- 1 | SCALA_BIN=/usr/local/bin 2 | SCALA_CP="/Users/yuewu/.pyenv/versions/3.8.13/lib/python3.8/site-packages/pyflink/lib/*" 3 | 4 | jar: 5 | - rm -r classes 6 | mkdir -pv classes 7 | ${SCALA_BIN}/scalac -nobootcp -cp ${SCALA_CP} -d classes your/company/*.scala 8 | cd classes && jar -cvf ../udf.jar . 9 | -------------------------------------------------------------------------------- /test/udf/flink-scala/etl_with_udf.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | -- config: flink.cmd=--jarfile udf.jar 3 | -- config: easy_sql.scala_udf_initializer=your.company.udfs 4 | 5 | -- target=log.test_udf 6 | select test_func(1, 2) as sum_value 7 | -------------------------------------------------------------------------------- /test/udf/flink-scala/your/company/udfs.scala: -------------------------------------------------------------------------------- 1 | package your.company 2 | 3 | import org.apache.flink.table.api._ 4 | import org.apache.flink.table.functions.ScalarFunction 5 | 6 | class TestFunction extends ScalarFunction { 7 | def eval(a: Integer, b: Integer): Integer = { 8 | a + b + 10 9 | } 10 | } 11 | 12 | object udfs { 13 | def initUdfs(flink: TableEnvironment) { 14 | flink.createTemporarySystemFunction("test_func", classOf[TestFunction]) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /test/udf/spark-python/etl_with_udf.sql: -------------------------------------------------------------------------------- 1 | -- backend: spark 2 | -- config: easy_sql.udf_file_path=udf.py 3 | 4 | -- target=log.test_udf 5 | select string_set(array("a", "a", "b")) as stringset 6 | -------------------------------------------------------------------------------- /test/udf/spark-python/udf.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | __all__ = ["string_set"] 4 | 5 | 6 | def string_set(string_arr: List[str]) -> List[str]: 7 | return list(set(string_arr)) 8 | -------------------------------------------------------------------------------- /test/udf/spark-scala/.gitignore: -------------------------------------------------------------------------------- 1 | classes/ 2 | *.jar 3 | -------------------------------------------------------------------------------- /test/udf/spark-scala/Makefile: -------------------------------------------------------------------------------- 1 | SCALA_BIN=~/dev/sdks/scala-2.12.10/bin 2 | SCALA_CP="/usr/local/lib/python3.8/site-packages/pyspark/jars/*" 3 | 4 | jar: 5 | - rm -r classes 6 | mkdir -pv classes 7 | ${SCALA_BIN}/scalac -nobootcp -cp ${SCALA_CP} -d classes your/company/*.scala 8 | cd classes && jar -cvf ../udf.jar . 
9 | -------------------------------------------------------------------------------- /test/udf/spark-scala/etl_with_udf.sql: -------------------------------------------------------------------------------- 1 | -- backend: spark 2 | -- config: spark.jars=udf.jar 3 | -- config: easy_sql.scala_udf_initializer=your.company.udfs 4 | 5 | -- target=log.test_udf 6 | select string_set(array("a", "a", "b")) as stringset 7 | -------------------------------------------------------------------------------- /test/udf/spark-scala/your/company/udfs.scala: -------------------------------------------------------------------------------- 1 | package your.company 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.udf 5 | import org.apache.spark.sql.types._ 6 | 7 | object udfs { 8 | def initUdfs(spark: SparkSession) { 9 | val string_set = udf((s: Seq[String]) => s.filter(_ != null).toSet.toArray) 10 | spark.udf.register("string_set", string_set) 11 | } 12 | } 13 | --------------------------------------------------------------------------------
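
For comparison with the Scala UDF above: the Python variant in test/udf/spark-python/udf.py is a plain function that Easy SQL registers when easy_sql.udf_file_path is set. Outside Easy SQL, the same function could be registered directly with PySpark, roughly as sketched below (not how the library wires it up internally):

from typing import List

from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType


def string_set(string_arr: List[str]) -> List[str]:
    # Same implementation as test/udf/spark-python/udf.py: de-duplicate a string array.
    return list(set(string_arr))


spark = SparkSession.builder.getOrCreate()
spark.udf.register("string_set", string_set, ArrayType(StringType()))
spark.sql('select string_set(array("a", "a", "b")) as stringset').show()
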