├── .coveragerc ├── .editorconfig ├── .flake8 ├── .github ├── mergify.yml ├── release-drafter.yml └── workflows │ ├── build.yaml │ ├── draft-release.yaml │ └── release.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── .vscode ├── launch.json └── settings.json ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── debugger-usage.gif ├── debugger.py ├── docs ├── .gitignore ├── Makefile ├── conf.py ├── easy_sql │ ├── add_backend.md │ ├── backend │ │ └── flink.md │ ├── bnf.md │ ├── build_install.md │ ├── command_line.md │ ├── debug.md │ ├── easy_sql.md │ ├── faq.md │ ├── functions.md │ ├── functions.tpl.md │ ├── how_to.md │ ├── img │ │ └── test_case.png │ ├── index.md │ ├── linter.md │ ├── other_features.md │ ├── quick_start.md │ ├── syntax.md │ ├── testing.md │ ├── udfs.md │ ├── udfs.tpl.md │ └── variables.md ├── index.rst ├── make.bat ├── pyproject.toml ├── requirements.txt ├── scripts │ ├── generate_func_data.py │ └── update_doc.py └── sqlfluff │ ├── new_rule.md │ └── quick_start.md ├── easy_sql ├── __init__.py ├── base_test.py ├── cli │ ├── __init__.py │ └── backend_processor.py ├── config │ ├── sql_config.py │ └── sql_config_test.py ├── data_process.py ├── data_process_itest.py ├── data_process_test.py ├── local_spark.py ├── logger.py ├── report.py ├── report_test.py ├── spark_optimizer.py ├── sql_linter │ ├── __init__.py │ ├── rules │ │ ├── __init__.py │ │ └── bq_schema_rule.py │ ├── sql_linter.py │ ├── sql_linter_cli.py │ ├── sql_linter_reportor.py │ └── sql_linter_test.py ├── sql_processor │ ├── __init__.py │ ├── backend │ │ ├── __init__.py │ │ ├── base.py │ │ ├── bigquery.py │ │ ├── clickhouse.py │ │ ├── flink.py │ │ ├── flink_itest.py │ │ ├── maxcompute.py │ │ ├── maxcompute_itest.py │ │ ├── postgres.py │ │ ├── rdb.py │ │ ├── rdb_itest.py │ │ ├── rdb_test.py │ │ ├── spark.py │ │ ├── spark_test.py │ │ └── sql_dialect │ │ │ ├── __init__.py │ │ │ ├── bigquery.py │ │ │ ├── clickhouse.py │ │ │ ├── clickhouse_test.py │ │ │ └── postgres.py │ ├── common.py │ ├── context.py │ ├── context_test.py │ ├── funcs.py │ ├── funcs_common.py │ ├── funcs_flink.py │ ├── funcs_flink_itest.py │ ├── funcs_itest.py │ ├── funcs_rdb.py │ ├── funcs_spark.py │ ├── report.py │ ├── sql_processor.py │ ├── step.py │ └── step_test.py ├── sql_processor_debugger.py ├── sql_processor_debugger_itest.py ├── sql_processor_itest.py ├── sql_processor_test.py ├── sql_test.py ├── sql_test_itest.py ├── sql_tester.py ├── sql_tester_test.py ├── udf │ ├── __init__.py │ ├── check.py │ ├── udfs.py │ └── udfs_test.py └── utils │ ├── __init__.py │ ├── db_connection_utils.py │ ├── flink_test_cluster.py │ ├── flink_test_cluster_itest.py │ ├── io_utils.py │ ├── kv.py │ ├── object_utils.py │ ├── object_utils_test.py │ ├── sql_expr.py │ └── sql_expr_test.py ├── examples └── rtdw │ ├── .gitignore │ ├── Makefile │ ├── java │ ├── .gitignore │ ├── README │ ├── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ ├── settings.gradle │ └── src │ │ └── main │ │ ├── java │ │ ├── com │ │ │ └── easysql │ │ │ │ └── example │ │ │ │ ├── Example.java │ │ │ │ ├── Ingest.java │ │ │ │ ├── RowDataDebeziumDeserializationSchema.java │ │ │ │ ├── Sinks.java │ │ │ │ ├── Sources.java │ │ │ │ └── SplitTableFunction.java │ │ └── org │ │ │ └── myorg │ │ │ └── quickstart │ │ │ └── DataStreamJob.java │ │ ├── resources │ │ └── log4j2.properties │ │ └── scala │ │ └── com │ │ └── easysql │ │ └── example │ │ └── ingest.scala │ ├── readme.md │ 
├── scala │ ├── .gitignore │ ├── Makefile │ └── src │ │ └── com │ │ └── easysql │ │ └── example │ │ └── ingest.scala │ └── workflow │ └── sales │ └── ods │ ├── Makefile │ ├── data.sql │ ├── ingest.sql │ ├── ingest.test.sql │ ├── ingest_funcs.py │ ├── ingest_hudi.sql │ ├── ingest_hudi.test.sql │ ├── ingest_hudi_funcs.py │ ├── ods.flink_tables.json │ └── register-pg.json ├── poetry.lock ├── poetry.toml ├── pyproject.toml ├── requirements-all.txt └── test ├── Dockerfile ├── customized_func ├── customized_func.py └── etl_with_customized_func.sql ├── doc ├── .sqlfluff ├── debugging.sql ├── test_sqlfulff.sql └── variables.sql ├── etl_test.xlsx ├── flink └── flink_hive_conf │ └── hive-site.xml ├── sample_data_process.py ├── sample_etl.clickhouse.json ├── sample_etl.clickhouse.sql ├── sample_etl.clickhouse.xlsx ├── sample_etl.flink.hive.postgres.sql ├── sample_etl.flink.hive.sql ├── sample_etl.flink.hudi-agg.sql ├── sample_etl.flink.postgres-cdc.multi-sink.sql ├── sample_etl.flink.postgres-cdc.sql ├── sample_etl.flink.postgres-hudi.sql ├── sample_etl.flink.postgres.sql ├── sample_etl.flink_tables_file.yml ├── sample_etl.flink_tables_file_hive.yml ├── sample_etl.postgres.json ├── sample_etl.postgres.sql ├── sample_etl.postgres.xlsx ├── sample_etl.spark.json ├── sample_etl.spark.sql ├── sample_etl.spark.xlsx ├── sample_etl.syntax.xlsx ├── sample_etl_wps.syntax.xlsx └── udf ├── clickhouse ├── etl_with_udf.sql └── udf.py ├── flink-python ├── etl_with_udf.sql └── udf.py ├── flink-scala ├── .gitignore ├── Makefile ├── etl_with_udf.sql └── your │ └── company │ └── udfs.scala ├── spark-python ├── etl_with_udf.sql └── udf.py └── spark-scala ├── .gitignore ├── Makefile ├── etl_with_udf.sql └── your └── company └── udfs.scala /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = 4 | easy_sql/ 5 | omit = 6 | */*_test.py 7 | */*_itest.py 8 | */maxcompute.py 9 | */sql_dialect/bigquery.py 10 | 11 | [report] 12 | # Regexes for lines to exclude from consideration 13 | exclude_lines = 14 | # Have to re-enable the standard pragma 15 | pragma: no cover 16 | 17 | # Don't complain about missing debug-only code: 18 | def __repr__ 19 | if self\.debug 20 | 21 | # Don't complain if tests don't hit defensive assertion code: 22 | raise AssertionError 23 | raise SqlProcessorAssertionError 24 | raise NotImplementedError 25 | 26 | # Don't complain if non-runnable code isn't run: 27 | if 0: 28 | if __name__ == .__main__.: 29 | 30 | [html] 31 | directory = build/coverage 32 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: https://EditorConfig.org 2 | 3 | # top-most EditorConfig file 4 | root = true 5 | 6 | [Makefile] 7 | indent_style = tab 8 | 9 | # Unix-style newlines with a newline ending every file 10 | [*] 11 | end_of_line = lf 12 | insert_final_newline = true 13 | charset = utf-8 14 | indent_style = space 15 | indent_size = 4 16 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | extend-ignore = E203, E501, E731, SIM905 4 | extend-select = TC, TC1, B950 5 | per-file-ignores = 6 | easy_sql/sql_linter/sql_linter_test.py: W291,W293 7 | __init__.py:F403, F401, 8 | easy_sql/sql_tester.py: B950 9 | 
easy_sql/sql_processor/report.py: B950 10 | dataplat/report_test.py: B950 11 | -------------------------------------------------------------------------------- /.github/mergify.yml: -------------------------------------------------------------------------------- 1 | queue_rules: 2 | - name: shared_queue 3 | conditions: 4 | - check-success=unit-test 5 | - check-success=e2e-test-spark 6 | - check-success=e2e-test-postgres 7 | - check-success=e2e-test-clickhouse 8 | - check-success=test-coverage-all 9 | 10 | pull_request_rules: 11 | - name: delete head branch after merge 12 | conditions: 13 | - merged 14 | actions: 15 | delete_head_branch: 16 | 17 | # Push PR into queue when it passes all checks 18 | - name: put approved pr to queue 19 | conditions: 20 | - or: 21 | - and: 22 | - "#approved-reviews-by>=1" 23 | - -draft 24 | - check-success=test-coverage-all 25 | - and: 26 | - label=can-merge 27 | - -draft 28 | - check-success=test-coverage-all 29 | actions: 30 | queue: 31 | name: shared_queue 32 | method: squash 33 | 34 | # Check if PR title contain valid types 35 | - name: Comment PR if title not semantic 36 | conditions: 37 | - author!=Mergify 38 | - -draft 39 | - '-title~=^(feat|fix|refactor|ci|build|docs|website|chore)(\(.*\))?!?:' 40 | actions: 41 | comment: 42 | message: | 43 | This pull request's title is not fulfill the requirements. @{{author}} please update it 🙏. 44 | 45 | Valid format: 46 | 47 | ``` 48 | fix(query): fix group by string bug 49 | ^ ^---------------------^ 50 | | | 51 | | +-> Summary in present tense. 52 | | 53 | +-------> Type: feat, fix, refactor, ci, build, docs, website, chore 54 | ``` 55 | 56 | Valid types: 57 | 58 | - `feat`: this PR introduces a new feature to the codebase 59 | - `fix`: this PR patches a bug in codebase 60 | - `refactor`: this PR changes the code base without new features or bugfix 61 | - `ci|build`: this PR changes build/testing/ci steps 62 | - `docs|website`: this PR changes the documents or websites 63 | - `chore`: this PR only has small changes that no need to record 64 | - `type(scope)!`: this type of PR introduces breaking changes to the codebase 65 | 66 | 67 | # Assign pr label based of tags 68 | - name: label on New Feature 69 | conditions: 70 | - 'title~=^(feat)(\(.*\))?:' 71 | actions: 72 | label: 73 | add: 74 | - pr-feature 75 | - name: label on Bug Fix 76 | conditions: 77 | - 'title~=^(fix)(\(.*\))?:' 78 | actions: 79 | label: 80 | add: 81 | - pr-bugfix 82 | - name: label on Refactor 83 | conditions: 84 | - 'title~=^(refactor)(\(.*\))?:' 85 | actions: 86 | label: 87 | add: 88 | - pr-refactor 89 | - name: label on Build/Testing/CI 90 | conditions: 91 | - 'title~=^(ci|build)(\(.*\))?:' 92 | actions: 93 | label: 94 | add: 95 | - pr-build 96 | - name: label on Documentation 97 | conditions: 98 | - 'title~=^(docs|website)(\(.*\))?:' 99 | actions: 100 | label: 101 | add: 102 | - pr-doc 103 | - name: label on Not for changelog 104 | conditions: 105 | - 'title~=^(chore)(\(.*\))?:' 106 | actions: 107 | label: 108 | add: 109 | - pr-chore 110 | - name: label on breaking changes 111 | conditions: 112 | - 'title~=^.*?(\(.*\))?!:' 113 | actions: 114 | label: 115 | add: 116 | - pr-breaking 117 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name-template: 'v$RESOLVED_VERSION' 2 | tag-template: 'v$RESOLVED_VERSION' 3 | template: | 4 | $CHANGES 5 | 6 | **Full Changelog**: 
https://github.com/$OWNER/$REPOSITORY/compare/$PREVIOUS_TAG...v$RESOLVED_VERSION 7 | 8 | categories: 9 | - title: 'Breaking' 10 | label: 'pr-breaking' 11 | - title: 'New' 12 | label: 'pr-feature' 13 | - title: 'Bug Fixes' 14 | label: 'pr-bugfix' 15 | - title: 'Maintenance' 16 | label: 'pr-refactor' 17 | - title: 'Documentation' 18 | label: 'pr-doc' 19 | - title: 'Other changes' 20 | 21 | version-resolver: 22 | major: 23 | labels: 24 | - 'pr-breaking' 25 | minor: 26 | labels: 27 | - 'pr-feature' 28 | patch: 29 | labels: 30 | - 'pr-bugfix' 31 | - 'pr-refactor' 32 | - 'pr-build' 33 | - 'pr-doc' 34 | - 'pr-chore' 35 | 36 | exclude-labels: 37 | - 'skip-changelog' 38 | -------------------------------------------------------------------------------- /.github/workflows/draft-release.yaml: -------------------------------------------------------------------------------- 1 | name: Draft Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | update-release-draft: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: release-drafter/release-drafter@master 13 | env: 14 | GITHUB_TOKEN: ${{ secrets.PAT }} 15 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | push: 4 | tags: 5 | - v*.*.* 6 | 7 | jobs: 8 | release: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | with: 13 | ref: main 14 | # ssh-key: ${{ secrets.DEPLOY_KEY }} 15 | token: ${{ secrets.PAT }} 16 | - uses: actions/setup-python@v4 17 | with: 18 | python-version: 3.8 19 | - name: Load cached Poetry installation 20 | uses: actions/cache@v2 21 | with: 22 | path: ~/.local/ # the path depends on the OS 23 | key: poetry-0 # increment to reset cache 24 | - name: Install Poetry 25 | uses: snok/install-poetry@v1.3.4 26 | with: 27 | version: 1.5.1 28 | - name: version check 29 | id: version 30 | run: | 31 | tag=${GITHUB_REF_NAME} 32 | version=${tag#v} 33 | old_version=$(poetry version -s) 34 | echo "tags: $tag , version: $version , old_version: $old_version" 35 | echo "::set-output name=tag::${tag}" 36 | echo "::set-output name=version::${version}" 37 | echo "::set-output name=version_changed::$( [ $version != $old_version ] && echo 'true' )" 38 | - name: bump version 39 | if: ${{ steps.version.outputs.version_changed == 'true' }} 40 | run: poetry version ${{ steps.version.outputs.version }} 41 | - name: commit and update release tag 42 | if: ${{ steps.version.outputs.version_changed == 'true' }} 43 | run: | 44 | git config user.name 'auto-release' 45 | git config user.email 'easy_sql@thoughtworks.com' 46 | git commit -am "release: bump to version ${{ steps.version.outputs.version }} [skip ci]" 47 | git tag ${{ steps.version.outputs.tag }} -f 48 | git push --atomic origin main ${{ steps.version.outputs.tag }} -f 49 | - name: upload pypi 50 | if: ${{ steps.version.outputs.version_changed == 'true' }} 51 | run: | 52 | poetry config pypi-token.pypi ${{ secrets.PYPI_TOKEN }} 53 | make upload-pip 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tmp/ 2 | .idea/ 3 | .metals/ 4 | __pycache__ 5 | **/.DS_Store 6 | **/*.zip 7 | venv/ 8 | .python-version 9 | spark-warehouse/ 10 | build/* 11 | .ipynb_checkpoints/ 12 | easy_sql-easy_sql* 13 | easy-sql-easy-sql* 14 | easy_sql_easy_sql.egg-info 15 | dist/ 16 | dist.old/ 17 
| metastore_db/ 18 | derby.log 19 | coverage.xml 20 | .coverage 21 | readme.local.md 22 | test/flink/jars 23 | test/*.local.* 24 | test/*__local.py 25 | test/spark/jars 26 | test/flink/tools 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pycqa/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | 7 | - repo: https://github.com/psf/black 8 | rev: 24.1.1 9 | hooks: 10 | - id: black 11 | 12 | - repo: https://github.com/pycqa/flake8 13 | rev: 6.0.0 14 | hooks: 15 | - id: flake8 16 | additional_dependencies: 17 | - flake8-bugbear 18 | - flake8-comprehensions 19 | - flake8-simplify 20 | - flake8-type-checking 21 | 22 | - repo: https://github.com/pre-commit/pre-commit-hooks 23 | rev: v4.4.0 24 | hooks: 25 | - id: end-of-file-fixer 26 | - id: trailing-whitespace 27 | exclude: | 28 | (?x)^( 29 | easy_sql/sql_linter/sql_linter_test.py 30 | )$ 31 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-20.04" 5 | tools: 6 | python: "3.8" 7 | 8 | sphinx: 9 | configuration: docs/conf.py 10 | 11 | python: 12 | install: 13 | - requirements: docs/requirements.txt 14 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Debug Unit Test", 9 | "type": "python", 10 | "request": "test", 11 | "justMyCode": false 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.linting.flake8Enabled": true, 3 | "python.linting.pylintEnabled": false, 4 | "python.linting.enabled": true, 5 | "python.testing.unittestArgs": ["-v", "-s", "./", "-p", "*test.py"], 6 | "python.testing.pytestEnabled": true, 7 | "python.testing.unittestEnabled": false, 8 | "python.analysis.typeCheckingMode": "basic", 9 | "editor.formatOnSave": true, 10 | "editor.defaultFormatter": "ms-python.black-formatter", 11 | "python.testing.pytestArgs": [ 12 | "easy_sql" 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to EasySQL 2 | 3 | - [Contributing to EasySQL](#contributing-to-easysql) 4 | - [Architecture Design](#architecture-design) 5 | - [Build and Run EasySQL](#build-and-run-EasySQL) 6 | - [Create Tracking Issue if Necessary](#create-tracking-issue-if-necessary) 7 | - [Write Tests](#write-tests) 8 | - [Running Test and Checks](#running-test-and-checks) 9 | - [Submit a PR](#submit-a-pr) 10 | - [Pull Request Title](#pull-request-title) 11 | - [Pull Request Description](#pull-request-description) 12 | - [Sign DCO (Developer Certificate of Origin)](#sign-dco-developer-certificate-of-origin) 13 | 14 | Thanks for your contribution! 
The EasySQL project welcomes contribution of various types -- new features, bug fixes 15 | and reports, typo fixes, etc. If you want to contribute to the EasySQL project, you will need to pass necessary 16 | checks and sign DCO. If you have any question, feel free to ping community members on GitHub and in Slack channels. 17 | 18 | ## Architecture Design 19 | 20 | TODO: need to enhance this part 21 | 22 | ## Build and Run EasySQL 23 | 24 | TODO: need to enhance this part 25 | 26 | ## Create Tracking Issue if Necessary 27 | 28 | If you are working on a large feature (>= 300 LoCs), it is recommended to create a tracking issue first, so that 29 | contributors and maintainers can understand the issue better and discuss how to proceed and implement the features. 30 | 31 | ## Write Tests 32 | 33 | TODO: need to enhance this part 34 | 35 | ## Running Test and Checks 36 | 37 | We provide a simple make command to run all the checks: 38 | 39 | ```shell 40 | make unit-test 41 | ``` 42 | 43 | After all the checks pass, your changes will likely be accepted. 44 | 45 | ## Submit a PR 46 | 47 | ### Pull Request Title 48 | 49 | As described in [here](https://github.com/commitizen/conventional-commit-types/blob/master/index.json), a valid PR title should begin with one of the following prefixes: 50 | 51 | - `feat`: A new feature 52 | - `fix`: A bug fix 53 | - `docs`: Documentation only changes 54 | - `style`: Changes that do not affect the meaning of the code (white-space, formatting, missing semi-colons, etc) 55 | - `refactor`: A code change that neither fixes a bug nor adds a feature 56 | - `perf`: A code change that improves performance 57 | - `test`: Adding missing tests or correcting existing tests 58 | - `build`: Changes that affect the build system or external dependencies (example scopes: gulp, broccoli, npm) 59 | - `ci`: Changes to EasySQL CI configuration files and scripts 60 | - `chore`: Other changes that don't modify src or test files 61 | - `revert`: Reverts a previous commit 62 | 63 | For example, a PR title could be: 64 | 65 | - `refactor: modify sql processor protobuf package path` 66 | - `feat(processor): support clickhouse as backend.` 67 | 68 | 69 | > `(): ` 70 | > 71 | > ``` 72 | > feat(scope): add hat wobble 73 | > ^--^ ^---^ ^------------^ 74 | > | | | 75 | > | | +-> Summary in present tense. 76 | > | | 77 | > | +---> Scope: executor, storage, etc. 78 | > | 79 | > +-------> Type: chore, docs, feat, fix, refactor, style, or test. 80 | > ``` 81 | 82 | 83 | ### Pull Request Description 84 | 85 | - If your PR is small (such as a typo fix), you can go brief. 86 | - If it is large and you have changed a lot, it's better to write more details. 87 | 88 | ### Sign DCO (Developer Certificate of Origin) 89 | 90 | Contributors will need to sign DCO in their commits. From [GitHub App's DCO](https://github.com/apps/dco) page: 91 | 92 | The Developer Certificate of Origin (DCO) is a lightweight way for contributors to certify that they wrote or otherwise 93 | have the right to submit the code they are contributing to the project. 
Here is the full text of the DCO, reformatted 94 | for readability: 95 | 96 | > By making a contribution to this project, I certify that: 97 | > 98 | > The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or 99 | > 100 | > The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or 101 | > 102 | > The contribution was provided directly to me by some other person who certified 1., 2. or 3. and I have not modified it. 103 | > 104 | > I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved. 105 | 106 | Contributors will need to add a `Signed-off-by` line in all their commits: 107 | 108 | ``` 109 | Signed-off-by: Random J Developer 110 | ``` 111 | 112 | The `git` command provides `-s` parameter to attach DCO to the commits. 113 | 114 | ``` 115 | git commit -m "feat(scope): commit messages" -s 116 | ``` 117 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune test 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test-coverage: 2 | export PYSPARK_PYTHON=python3 && export PYSPARK_DRIVER_PYTHON=python3 && \ 3 | python3 -m coverage run -m pytest -o python_files=*_test.py 4 | python3 -m coverage report -m 5 | - mkdir build 6 | - rm -r build/coverage 7 | python3 -m coverage html 8 | 9 | unit-test: 10 | export PYSPARK_PYTHON=python3 && export PYSPARK_DRIVER_PYTHON=python3 && \ 11 | python3 -m pytest 12 | 13 | # Note: env var named PG_URL and CLICKHOUSE_URL must be set to run e2e test 14 | e2e-test: 15 | cd test && docker build . 
--build-arg PG_URL=${PG_URL} --build-arg CLICKHOUSE_URL=${CLICKHOUSE_URL} 16 | 17 | echo-var: 18 | echo ${PG_URL} ${CLICKHOUSE_URL} 19 | 20 | e2e-test-spark: 21 | python3 -m easy_sql.data_process -f test/sample_etl.spark.sql 22 | 23 | e2e-test-postgres: 24 | python3 -m easy_sql.data_process -f test/sample_etl.postgres.sql 25 | 26 | e2e-test-clickhouse: 27 | python3 -m easy_sql.data_process -f test/sample_etl.clickhouse.sql 28 | 29 | e2e-test-flink-postgres: 30 | python3 -m easy_sql.data_process -f test/sample_etl.flink.postgres.sql 31 | 32 | e2e-test-flink-streaming: 33 | python3 -m easy_sql.data_process -f test/sample_etl.flink.postgres-cdc.sql 34 | python3 -m easy_sql.data_process -f test/sample_etl.flink.postgres-cdc.multi-sink.sql 35 | python3 -m easy_sql.data_process -f test/sample_etl.flink.postgres-hudi.sql 36 | 37 | e2e-test-flink-hive: 38 | python3 -m easy_sql.data_process -f test/sample_etl.flink.hive.sql 39 | 40 | test-coverage-all: 41 | export PYSPARK_PYTHON=python3 && export PYSPARK_DRIVER_PYTHON=python3 && \ 42 | PG_URL=${PG_URL} CLICKHOUSE_URL=${CLICKHOUSE_URL} python3 -m coverage run -m pytest -o python_files=*test.py 43 | python3 -m coverage report -m 44 | python3 -m coverage xml 45 | 46 | package-zip: 47 | - rm build/easysql.zip 48 | mkdir -p build 49 | zip -r --exclude=*__pycache__* build/easysql.zip easy_sql 50 | 51 | package-pip: 52 | poetry build 53 | 54 | upload-test-pip: 55 | rm -rf ./dist 56 | poetry publish -r testpypi --build 57 | 58 | install-test-pip: 59 | pip3 uninstall easy_sql-easy_sql 60 | python3 -m pip install --index-url https://test.pypi.org/simple/ 'easy-sql-easy-sql[cli]' 61 | 62 | upload-pip: 63 | rm -rf ./dist 64 | poetry publish --build 65 | 66 | prepare-flink-hadoop: 67 | test -f test/flink/tools/hadoop/hadoop-3.3.5.tar.gz || ( \ 68 | mkdir -pv test/flink/tools/hadoop && \ 69 | wget -P test/flink/tools/hadoop https://dlcdn.apache.org/hadoop/common/hadoop-3.3.5/hadoop-3.3.5.tar.gz && \ 70 | cd test/flink/tools/hadoop && \ 71 | tar xf hadoop-3.3.5.tar.gz ) 72 | 73 | download-flink-jars: 74 | test -f test/flink/jars/flink-connector-jdbc-1.15.1.jar || wget -P test/flink/jars https://repo1.maven.org/maven2/org/apache/flink/flink-connector-jdbc/1.15.1/flink-connector-jdbc-1.15.1.jar 75 | test -f test/flink/jars/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar || wget -P test/flink/jars https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-hive-3.1.2_2.12/1.15.1/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar 76 | test -f test/flink/jars/postgresql-42.2.14.jar || wget -P test/flink/jars https://repo1.maven.org/maven2/org/postgresql/postgresql/42.2.14/postgresql-42.2.14.jar 77 | test -f test/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar || wget -P test/flink/jars https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-postgres-cdc/2.3.0/flink-sql-connector-postgres-cdc-2.3.0.jar 78 | test -f test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar || wget -P test/flink/jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-flink1.15-bundle/0.12.2/hudi-flink1.15-bundle-0.12.2.jar 79 | test -f test/flink/jars/flink-sql-connector-kafka-1.15.2.jar || wget -P test/flink/jars https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/1.15.2/flink-sql-connector-kafka-1.15.2.jar 80 | test -f test/flink/jars/kafka-clients-3.3.2.jar || wget -P test/flink/jars https://repo1.maven.org/maven2/org/apache/kafka/kafka-clients/3.3.2/kafka-clients-3.3.2.jar 81 | 
-------------------------------------------------------------------------------- /debugger-usage.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/debugger-usage.gif -------------------------------------------------------------------------------- /debugger.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from os import path 3 | from typing import Any, Dict, Optional 4 | 5 | src_path = path.dirname(path.abspath(__file__)) 6 | sys.path.insert(0, src_path) 7 | 8 | __all__ = ["create_debugger", "create_pg_debugger", "create_ch_debugger"] 9 | 10 | 11 | def create_debugger(sql_file_path: str, vars: Optional[Dict[str, Any]] = None, funcs: Optional[Dict[str, Any]] = None): 12 | import os 13 | import subprocess 14 | 15 | spark_home = ( 16 | subprocess.check_output(["bash", "-c", "echo 'import os; print(os.environ[\"SPARK_HOME\"])' | pyspark"]) 17 | .decode("utf8") 18 | .split("\n") 19 | ) 20 | spark_home = [c.strip() for c in spark_home if c.strip()][0] 21 | os.environ["SPARK_HOME"] = spark_home 22 | import findspark 23 | 24 | findspark.init() 25 | 26 | from pyspark.sql import SparkSession 27 | 28 | from easy_sql.sql_processor.backend import SparkBackend 29 | 30 | spark = SparkSession.builder.enableHiveSupport().getOrCreate() 31 | backend = SparkBackend(spark) 32 | from easy_sql.sql_processor_debugger import SqlProcessorDebugger 33 | 34 | debugger = SqlProcessorDebugger(sql_file_path, backend, vars, funcs) 35 | return debugger 36 | 37 | 38 | def create_pg_debugger( 39 | sql_file_path: str, vars: Optional[Dict[str, Any]] = None, funcs: Optional[Dict[str, Any]] = None 40 | ): 41 | from easy_sql.sql_processor.backend.rdb import RdbBackend 42 | 43 | pg = RdbBackend("postgresql://postgres:123456@testpg:15432/postgres") 44 | from easy_sql.sql_processor_debugger import SqlProcessorDebugger 45 | 46 | debugger = SqlProcessorDebugger(sql_file_path, pg, vars, funcs) 47 | return debugger 48 | 49 | 50 | def create_ch_debugger( 51 | sql_file_path: str, vars: Optional[Dict[str, Any]] = None, funcs: Optional[Dict[str, Any]] = None 52 | ): 53 | from easy_sql.sql_processor.backend.rdb import RdbBackend 54 | 55 | ch = RdbBackend("clickhouse+native://default@testch:30123") 56 | from easy_sql.sql_processor_debugger import SqlProcessorDebugger 57 | 58 | debugger = SqlProcessorDebugger(sql_file_path, ch, vars, funcs) 59 | return debugger 60 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build/ 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | import os.path 20 | 21 | project = "Easy SQL" 22 | copyright = "2022, easysql" 23 | author = "easysql" 24 | 25 | # The full version, including alpha/beta/rc tags 26 | release = "v0.1.0" 27 | 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | "myst_parser", 36 | "sphinx.ext.duration", 37 | "sphinx.ext.doctest", 38 | "sphinx.ext.autodoc", 39 | "sphinx.ext.autosummary", 40 | "sphinx.ext.intersphinx", 41 | "autoapi.extension", 42 | ] 43 | 44 | autoapi_type = "python" 45 | autoapi_dirs = ["../"] 46 | _docs_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 47 | autoapi_ignore = [ 48 | f"{_docs_dir}/debugger.py", 49 | f"{_docs_dir}/test/*", 50 | f"{_docs_dir}/build/*", 51 | f"{_docs_dir}/docs/*", 52 | f"{_docs_dir}/requirements/*", 53 | "*_itest.py", 54 | "*_test.py", 55 | ] 56 | autoapi_options = [ 57 | "show-module-summary", 58 | "members", 59 | "undoc-members", 60 | "imported-members", 61 | "show-inheritance", 62 | "show-inheritance-diagram", 63 | ] 64 | autoapi_member_order = "groupwise" 65 | autodoc_typehints = "description" 66 | 67 | # Add any paths that contain templates here, relative to this directory. 68 | templates_path = ["_templates"] 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This pattern also affects html_static_path and html_extra_path. 73 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 74 | 75 | intersphinx_mapping = { 76 | "python": ("https://docs.python.org/3/", None), 77 | "sphinx": ("https://www.sphinx-doc.org/en/master/", None), 78 | } 79 | intersphinx_disabled_domains = ["std"] 80 | 81 | # -- Options for HTML output ------------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = "sphinx_rtd_theme" 87 | 88 | # Add any paths that contain custom static files (such as style sheets) here, 89 | # relative to this directory. They are copied after the builtin static files, 90 | # so a file named "default.css" will overwrite the builtin "default.css". 
91 | html_static_path = ["_static"] 92 | -------------------------------------------------------------------------------- /docs/easy_sql/add_backend.md: -------------------------------------------------------------------------------- 1 | # Add Backend 2 | 3 | ##Introduction 4 | 5 | Easy-sql is designed as a tool to quick implement with different kind of sql backend. 6 | 7 | So far supported backends are: 8 | 9 | + spark sql(spark engine) 10 | + bigquery(sqlalchemy engine) 11 | + postgresql(sqlalchemy engine) 12 | + clickhouse(sqlalchemy engine) 13 | + flink(flink engine) 14 | 15 | Easy sql is designed to be flexible and scalable. If in future have requirement on build new backend engine in easy sql, it can be easily added on by implement the method. Following is the description on how to implement new engine step by step. 16 | 17 | ## 18 | -------------------------------------------------------------------------------- /docs/easy_sql/bnf.md: -------------------------------------------------------------------------------- 1 | The pseudocode BNF of Easy SQL syntax. 2 | 3 | ``` 4 | easysql: target_def | sql_body | config | include 5 | target_def: target_def_prefix (variables_def | list_variables_def | temp_def | cache_def | broadcast_def | func_def | log_def | check_def | output_def | template_def | action) (, if = func_call)? 6 | sql_body: (any var_reference any | any tpl_reference any)* comment? 7 | 8 | target_def_prefix: '-- target=' 9 | 10 | var_reference: var_reference_lit | var_reference_func 11 | var_reference_lit: ${ name } 12 | var_reference_func: ${ func_call } 13 | 14 | func_call: func_call_no_arg | func_call_with_args 15 | func_call_no_arg: name \( \) 16 | func_call_with_args: name \( func_call_args \) 17 | func_call_args: (name_wide | var_reference_lit) (, name_wide | , var_reference_lit)* 18 | 19 | variables_def: 'variables' 20 | list_variables_def: 'list_variables' 21 | temp_def: 'temp.'name 22 | cache_def: 'cache.'name 23 | broadcast_def: 'broadcast.'name 24 | func_def: 'func.'func_call 25 | log_def: 'log.'name 26 | check_def: 'check.'name | 'check.'func_call 27 | output_def: 'output.'name.name | 'output.'name.name.name 28 | template_def: 'template.'name 29 | action_def: 'action.'name 30 | 31 | config: '-- config:' name_key = any | \ 32 | '-- backend:' name | \ 33 | '-- owner:' | \ 34 | '-- owner:' name (, name)* | \ 35 | '-- schedule:' any | \ 36 | '-- prepare-sql: ' any | \ 37 | '-- inputs:' | \ 38 | '-- inputs:' (name.name | name.name.name) (, name.name | , name.name.name)* | \ 39 | '-- outputs:' | \ 40 | '-- outputs:' (name.name | name.name.name) (, name.name | , name.name.name)* 41 | 42 | include: '-- include=' any 43 | 44 | tpl_call: tpl_call_no_arg | tpl_call_with_args 45 | tpl_call_no_arg: name \( \) 46 | tpl_call_with_args: name \( tpl_call_args \) 47 | tpl_call_args: (name = name_wide | name = var_reference_lit) (, name = name_wide | , name = var_reference_lit)* 48 | 49 | 50 | tpl_reference: tpl_reference_lit | tpl_reference_func 51 | tpl_reference_lit: @{ name } 52 | tpl_reference_func: @{ tpl_call } 53 | 54 | name: r'[a-zA-Z_]\\w*' 55 | name_wide: r'[^),]*' 56 | 57 | comment: '--' any 58 | ``` 59 | -------------------------------------------------------------------------------- /docs/easy_sql/build_install.md: -------------------------------------------------------------------------------- 1 | # Build and install Easy SQL 2 | 3 | Easy SQL is a very light-weight python library. The common Python library conventions are followed. 
4 | It's easy to build or install Easy SQL. 5 | 6 | ## Install Easy SQL 7 | 8 | Install Easy SQL using pip: `python3 -m pip install 'easy-sql-easy-sql[extra,extra]'` 9 | 10 | Currently we are providing below extras, choose according to your need: 11 | - cli 12 | - linter 13 | - spark 14 | - pg 15 | - clickhouse 16 | 17 | We also provide flink backend, but because of dependency confliction between pyspark and apache-flink, you need to install the flink backend dependencies manually with the following command `python3 -m pip install apache-flink`. 18 | 19 | Usually we read data from some data source and write data to some other system using flink with different connectors. So we need to download some jars for the used connectors as well. Refer [here](https://nightlies.apache.org/flink/flink-docs-release-1.15/docs/connectors/table/overview/) to get more information and [here](https://nightlies.apache.org/flink/flink-docs-release-1.15/docs/connectors/table/downloads/) to download the required connectors. 20 | ## Building Easy SQL 21 | 22 | Internally we use `poetry` to manage the dependencies. So make sure you have [installed it](https://python-poetry.org/docs/master/#installation). Package could be built with the following make command: `make package-pip` or just `poetry build`. 23 | 24 | After the above command, there will be a file named `easy_sql*.whl` generated in the `dist` folder. 25 | You can install it with command `python3 -m pip install dist/easy_sql*.whl[extra]` or just `poetry install -E 'extra extra'`. 26 | -------------------------------------------------------------------------------- /docs/easy_sql/command_line.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/docs/easy_sql/command_line.md -------------------------------------------------------------------------------- /docs/easy_sql/debug.md: -------------------------------------------------------------------------------- 1 | # Debug ETL 2 | 3 | There is a debugger interface implemented in Easy SQL. 4 | 5 | ## Start to debug 6 | 7 | We recommend debugging ETLs from jupyter. You can follow the steps below to start debugging your ETL. 8 | 9 | 1. Install jupyter first with command `python3 -m pip install jupyterlab`. 10 | 11 | 2. Create a file named `debugger.py` with contents like below: 12 | 13 | A more detailed sample could be found [here](https://github.com/easysql/easy_sql/blob/main/debugger.py). 14 | 15 | ```python 16 | from typing import Dict, Any 17 | 18 | def create_debugger(sql_file_path: str, vars: Dict[str, Any] = None, funcs: Dict[str, Any] = None): 19 | from pyspark.sql import SparkSession 20 | from easy_sql.sql_processor.backend import SparkBackend 21 | from easy_sql.sql_processor_debugger import SqlProcessorDebugger 22 | spark = SparkSession.builder.enableHiveSupport().getOrCreate() 23 | backend = SparkBackend(spark) 24 | debugger = SqlProcessorDebugger(sql_file_path, backend, vars, funcs) 25 | return debugger 26 | 27 | ``` 28 | 29 | 3. Create a file named `test.sql` with contents as [here](https://github.com/easysql/easy_sql/blob/main/test/sample_etl.spark.sql). 30 | 31 | 4. Then start jupyter lab with command: `jupyter lab`. 32 | 33 | 5. 
Start debugging like below: 34 | 35 | ![ETL Debugging](https://raw.githubusercontent.com/easysql/easy_sql/main/debugger-usage.gif) 36 | 37 | ## Debugger API 38 | 39 | Please refer to the API doc [here](api/debugger.md) 40 | -------------------------------------------------------------------------------- /docs/easy_sql/easy_sql.md: -------------------------------------------------------------------------------- 1 | # Easy SQL 2 | 3 | Easy SQL is built to ease the data ETL development process. 4 | With Easy SQL, you can develop your ETL in SQL in an imperative way. 5 | 6 | It defines a few simple syntax elements on top of standard SQL, with which SQL statements could be executed one by one. 7 | Easy SQL also provides a processor to handle all the new syntax. 8 | 9 | Since this is SQL agnostic, any SQL engine could be plugged in as a backend. 10 | There is built-in support for several popular SQL engines, including SparkSQL, PostgreSQL, Clickhouse, Aliyun Maxcompute and Google BigQuery. 11 | More will be added in the near future. 12 | 13 | ## Background 14 | 15 | Why do we need imperative syntax in ETL? 16 | 17 | SQL is designed to be used in a declarative way, and this causes a few problems when we use SQL to develop complicated ETLs. 18 | 19 | Think about the following cases. 20 | 21 | 1. We would like to use large computing resources when we're handling data in the full-data partition, since the amount of data there is far larger than that in the other partitions. 22 | 2. We would like to send out an HTTP request to report status when some step of the ETL fails for some reason (e.g. some data does not conform to the previous assumptions). 23 | 3. We would like to reuse some code to check if some order is a valid order (think about e-commerce business). 24 | 4. We would like to stop at some step of the ETL and check if the data is what we expected. 25 | 26 | When we use SQL to develop our ETL, it is hard to handle the above cases. 27 | But for a company with a wide range of data usage, there are similar cases everywhere. 28 | 29 | ### Why imperative SQL 30 | 31 | The above cases could be easily handled if we had an imperative way to write our code. 32 | This might be the reason why a lot of developers like to write ETLs in a general programming language like Python or Scala. 33 | 34 | But for the data ETL development case, we still think that using SQL or a SQL-like language is a better choice. The main reasons are: 35 | 36 | - Consistent code style across all ETLs. 37 | - All roles in the team can easily understand the logic in an ETL. 38 | - All code about one ETL mainly stays in one file, which makes things simpler when we try to read and understand what the ETL does. 39 | 40 | ## Design principles 41 | 42 | When we first tried to design the syntax, we found several important things: 43 | 44 | - Keep compatible with standard SQL, so that every SQL editor could be used to develop in Easy SQL. 45 | - Try to use a SQL way to implement most of the features. 46 | - Use intuitive syntax which is also similar to widely-used syntax in other programming languages. 47 | - Implement widely-used debugging features, such as logging, asserting and even step-by-step debugging. 48 | 49 | These important things became the design principles of Easy SQL. They provide guidance in the whole design process. 50 | If there is an argument about which design is better, the design principles can be referred to for a decision.
51 | 52 | ## Language features in Easy SQL 53 | 54 | For Easy SQL, guided by the design principles, a few simple language features are added to support these imperative characteristics. Below is a list of these features: 55 | 56 | - An imperative structure of ETL code. 57 | - Variables which could be defined and modified at any time. 58 | - A way to call external functions. 59 | - A way to control whether a step should be executed. 60 | - Templates that could be reused in the same ETL file. 61 | - An include command that could be used to reuse code at the file level. 62 | - Logging and assertions that could be used for debugging. 63 | - A debugger interface. 64 | -------------------------------------------------------------------------------- /docs/easy_sql/faq.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/docs/easy_sql/faq.md -------------------------------------------------------------------------------- /docs/easy_sql/how_to.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/docs/easy_sql/how_to.md -------------------------------------------------------------------------------- /docs/easy_sql/img/test_case.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/docs/easy_sql/img/test_case.png -------------------------------------------------------------------------------- /docs/easy_sql/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/docs/easy_sql/index.md -------------------------------------------------------------------------------- /docs/easy_sql/linter.md: -------------------------------------------------------------------------------- 1 | # Linter 2 | 3 | Easy SQL is a powerful tool that can bring convenience to ETL developers. 4 | But so far we do not have a compiler that supports the Easy SQL grammar and can auto-check SQL quality and auto-fix the violations. 5 | That is why we developed this linter tool on top of sqlfluff. With this linter, we can do static analysis and auto-fixing of ETL code written in Easy SQL. 6 | 7 | 8 | ## Command Line Interface 9 | 10 | The command line interface usage is as follows: 11 | 12 | ```bash 13 | $ python3 -m easy_sql.sql_linter.sql_linter_cli fix --path ${path} 14 | ``` 15 | 16 | There are fix and lint modes: lint only shows the rule violations, while fix auto-fixes the query. 17 | 18 | Fix mode parameters: 19 | 20 | - path: The location of the ETL file. 21 | - config-path: Sql fluff config file path, must be named `.sqlfluff`. Used to customize lint rules. There are some customization introduced by Easy SQL. If this is specified, the customization will be lost. 22 | - include: Comma separated rule id to be included. 23 | - exclude: Comma separated rule id to be excluded. 24 | - backend: The backend of the ETL file. Will be used to find the correct rules. 25 | - easy_sql: Boolean value to indicate whether the ETL file is written in Easy SQL or normal SQL. Will default to true. 26 | - inplace: Boolean value to indicate whether to overwrite the original file with the fixed output.
If false the fixed output will be written to a new file with suffix `.fixed.sql`. 27 | 28 | Lint mode parameters: 29 | 30 | - path: The location of the ETL file. 31 | - include: Comma separated rule id to be included. 32 | - config-path: Sql fluff config file path, must be named `.sqlfluff`. Used to customize lint rules. There are some customization introduced by Easy SQL. If this is specified, the customization will be lost. 33 | - exclude: Comma separated rule id to be excluded. 34 | - backend: The backend of the ETL file. Will be used to find the correct rules. 35 | - easy_sql: Boolean value to indicate whether the ETL file is written in Easy SQL or normal SQL. Will default to true. 36 | 37 | ## Programmatical usage 38 | 39 | ```python 40 | from easy_sql.sql_linter.sql_linter import SqlLinter 41 | 42 | sql = "" 43 | sql_linter = SqlLinter(sql, include_rules=None, exclude_rules=None) 44 | result = sql_linter.lint("bigquery", easy_sql=True) 45 | fixed = sql_linter.fix("bigquery", easy_sql=True) 46 | ``` 47 | 48 | You may find out that in the lint and fix command there is an option to specify which backend the ETL file is written to. 49 | If you do not provide the option, and you are using easy sql, it will automatically detect the backend from the file. 50 | Make sure you've specified the correct options, or it will generate unexpected output. 51 | 52 | ( 53 | For developers: 54 | 55 | The backend impacts the applied rules. If defined as bigquery, all the customized rules with groups containing bigquery and sqlfluff built-in core rules will be applied. 56 | 57 | ```python 58 | # groups in customized rules 59 | groups = ("all", "bigquery") 60 | ``` 61 | ) 62 | -------------------------------------------------------------------------------- /docs/easy_sql/other_features.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/docs/easy_sql/other_features.md -------------------------------------------------------------------------------- /docs/easy_sql/variables.md: -------------------------------------------------------------------------------- 1 | # Variables 2 | 3 | Easy SQL provides several special variables to help with ETL implementation. 4 | These special variables are all starts with `__`. 5 | 6 | Here are some description about what they are and how to use them. 7 | 8 | ## Variables to control data saving 9 | 10 | - `__create_output_table__`: When true and the output table does not exist, will try to create output table automatically. 11 | - `__partition__`: If specified, will save output data to the specified partition. There must be a partition column followed in the variable name. 12 | As an example, if we defined variable `__partition__dt`, then dt will be the partition column and the value of the variable will be the partition value. 13 | - `__save_mode__`: Value could be 'overwrite' or 'append'. If not specified, default to 'overwrite'. Will do append or overwrite when write data to table. 14 | 15 | ## Variables to control execution behaviour 16 | 17 | - `__no_check__`: If true, will skip any `check` step defined by `-- target=check.xxx` for performance consideration. 18 | - `__no_log__`: If true, will skip any `log` step defined by `-- target=log.xxx` for performance consideration. 19 | - `__no_cache__`: If true, will create temporal table instead of cache table. This if for spark backend only. 
For the other backends, all the `cache` or `temp` table will be views. 20 | - `__dry_run_verify_output_schema__`: If true, will verify output table schema against the target table. Will fail if target table does not exist or there are columns in target table but not in the output query. Useful when need to do check in dryrun mode. 21 | - `__dry_run_verify_output_schema_type__`: Will be take into account when `__dry_run_verify_output_schema__` is true. If both `__dry_run_verify_output_schema__` and `__dry_run_verify_output_schema_type__` are true, will verify output table field types against the target table. 22 | - `__skip_all__`: If true, will skip execution of the following steps. Could be used when the partition of the input data does not exist. 23 | - `__exception_handler__`: When specified, the value must be a function call. 24 | The function call will be executed when there is an exception found during the execution of some step. 25 | As an example, the value could be `some_exception_handler({__step__}, {var_a}, b)`. As we see, there could be variables referenced in the function call and the variable will be resolved when exception happens (at runtime, not definition time). 26 | 27 | ## Variables for function calling 28 | 29 | - `__backend__`: An instance of [`Backend`]() class. Usually used to pass into functions. 30 | - `__step__`: An instance of [`Step`](https://easy-sql.readthedocs.io/en/latest/autoapi/easy_sql/sql_processor/step/index.html#easy_sql.sql_processor.step.Step) class. Usually used to pass into functions. 31 | - `__context__`: An instance of [`ProcessorContext`](https://easy-sql.readthedocs.io/en/latest/autoapi/easy_sql/sql_processor/context/index.html#easy_sql.sql_processor.context.ProcessorContext) class. Usually used to pass into functions. 32 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Easy SQL documentation master file, created by 2 | sphinx-quickstart on Wed Apr 27 16:59:16 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Easy SQL's documentation! 7 | ==================================== 8 | 9 | Easy SQL is built to ease the data ETL development process. 10 | With Easy SQL, you can develop your ETL in SQL in an imperative way. 11 | 12 | It defines a few simple syntax on top of standard SQL, with which SQL could be executed one by one. 13 | Easy SQL also provides a processor to handle all the new syntax. 14 | 15 | Since this is SQL agnostic, any SQL engine could be plugged-in as a backend. 16 | There are built-in supported for several popular SQL engines, including SparkSQL, PostgreSQL, Clickhouse, Aliyun Maxcompute, Google BigQuery. 17 | More will be added in the near future. 18 | 19 | Contents 20 | -------- 21 | 22 | .. 
toctree:: 23 | :maxdepth: 6 24 | 25 | easy_sql/easy_sql.md 26 | easy_sql/build_install.md 27 | easy_sql/quick_start.md 28 | easy_sql/syntax.md 29 | easy_sql/debug.md 30 | easy_sql/testing.md 31 | easy_sql/linter.md 32 | easy_sql/functions.md 33 | easy_sql/udfs.md 34 | easy_sql/variables.md 35 | easy_sql/backend/flink.md 36 | autoapi/index 37 | 38 | 39 | 40 | Indices and tables 41 | ================== 42 | 43 | * :ref:`genindex` 44 | * :ref:`modindex` 45 | * :ref:`search` 46 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [project] 6 | name = "easysql" 7 | authors = [{name = "easy_sql", email = "easy_sql@thoughtworks.com"}] 8 | dynamic = ["version", "description"] 9 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | myst-parser==0.17.2 2 | sphinx-rtd-theme==1.0.0 3 | Sphinx==4.5.0 4 | sphinx-autoapi==1.8.4 5 | -------------------------------------------------------------------------------- /docs/scripts/update_doc.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os.path 3 | import re 4 | from typing import Callable, Sequence 5 | 6 | from easy_sql.sql_processor import funcs_rdb, funcs_spark 7 | from easy_sql.udf import udfs 8 | 9 | 10 | def _render_doc_modules_functions(backend: str): 11 | print("render doc for:", backend) 12 | assert backend in ["spark", "rdb"] 13 | mod = funcs_spark if backend == "spark" else funcs_rdb 14 | groups_doc = [] 15 | for funcs_group in mod.__all__: 16 | mod_name: str = funcs_group 17 | funcs_group_mod = getattr(mod, funcs_group) 18 | funcs = [func for func in dir(funcs_group_mod) if not func.startswith("_") and func == func.lower()] 19 | assert mod_name.endswith("Func") or mod_name.endswith("Funcs") or mod_name.endswith("Functions") 20 | group_name = mod_name[: mod_name.rindex("Func")] 21 | 22 | funcs_doc = [] 23 | for func_name in funcs: 24 | func_mod = getattr(funcs_group_mod, func_name) 25 | func_sig = str(inspect.signature(func_mod)).replace("(self, ", "(", 1).replace("'", "") 26 | module = func_mod.__module__ 27 | func_doc = ( 28 | f"- 
[`{func_name}{func_sig}`]" 29 | f'(https://easy-sql.readthedocs.io/en/latest/autoapi/{module.replace(".", "/")}/index.html#{module}.{mod_name}.{func_name})' # noqa: B950 30 | ) 31 | funcs_doc.append(func_doc) 32 | funcs_doc = "\n".join(funcs_doc) 33 | 34 | funcs_group_doc = f""" 35 | #### {group_name} functions 36 | 37 | {funcs_doc} 38 | """ 39 | groups_doc.append(funcs_group_doc) 40 | return "\n".join(groups_doc) 41 | 42 | 43 | def _update_doc( 44 | doc_tpl_file: str, 45 | doc_file: str, 46 | tpl_rex: str, 47 | render: Callable[[Sequence[str]], str], 48 | ): 49 | with open(doc_tpl_file, "r") as f: 50 | doc_tpl = f.read() 51 | lines = doc_tpl.split("\n") 52 | result_lines = [] 53 | for line in lines: 54 | m = re.match(tpl_rex, line) 55 | if m: 56 | groups = m.groups() 57 | result_lines.append(render(groups)) 58 | else: 59 | result_lines.append(line) 60 | 61 | with open(doc_file, "w") as f: 62 | f.write("\n".join(result_lines)) 63 | print("updated file:", doc_file) 64 | 65 | 66 | def update_func_doc(): 67 | doc_tpl_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../easy_sql/functions.tpl.md") 68 | doc_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../easy_sql/functions.md") 69 | 70 | def render(groups: Sequence[str]) -> str: 71 | backend = groups[0] 72 | title = groups[1].strip() if len(groups) > 1 and groups[1].strip() else f"Functions for {backend} backend" 73 | return f""" 74 | ### {title} 75 | 76 | {_render_doc_modules_functions(backend)} 77 | """ 78 | 79 | _update_doc(doc_tpl_file, doc_file, r"\{\{ (spark|rdb) functions:? ?(.*)? \}\}", render) 80 | 81 | 82 | def update_udf_doc(): 83 | doc_tpl_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../easy_sql/udfs.tpl.md") 84 | doc_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../easy_sql/udfs.md") 85 | 86 | def render(groups: Sequence[str]) -> str: 87 | backend = groups[0] 88 | backend_display_names = { 89 | "spark": "Spark", 90 | "pg": "PostgreSQL", 91 | "ch": "Clickhouse", 92 | } 93 | 94 | udf_names = udfs.get_udfs(backend) 95 | udf_mods = {"spark": udfs.SparkUdfs, "pg": udfs.PgUdfs, "ch": udfs.ChUdfs} 96 | rendered_udfs_doc = [] 97 | for udf_name in udf_names: 98 | udf_mod = udf_mods[backend] 99 | udf_sig = str(inspect.signature(getattr(udf_mod, udf_name))) 100 | udf_doc = ( 101 | f"- [`{udf_name}{udf_sig}`]" 102 | f"(https://easy-sql.readthedocs.io/en/latest/autoapi/easy_sql/udf/udfs/index.html#easy_sql.udf.udfs.{udf_mod.__name__}.{udf_name})" 103 | ) 104 | rendered_udfs_doc.append(udf_doc) 105 | rendered_udfs_doc = "\n".join(rendered_udfs_doc) 106 | return f""" 107 | ### {backend_display_names.get(backend, backend)} UDFs 108 | 109 | {rendered_udfs_doc} 110 | """ 111 | 112 | _update_doc(doc_tpl_file, doc_file, r"\{\{ (spark|pg|ch) UDFs:? ?(.*)? \}\}", render) 113 | 114 | 115 | if __name__ == "__main__": 116 | update_func_doc() 117 | update_udf_doc() 118 | -------------------------------------------------------------------------------- /docs/sqlfluff/new_rule.md: -------------------------------------------------------------------------------- 1 | # Easy SQL customized code quality rules 2 | 3 | ## Introduction 4 | 5 | In SQL FLuff we already have predefined rules, it follows the common rules for clean SQL code. 6 | You can check all the implemented rule from running the command: `sqlfluff rules`. 7 | But they are not enough, in some situation we need to implement a customized rule. This documentation will go through the steps of how to achieve it. 
8 | 9 | Before we start, we need to understand the design of SQL Fluff first. When checking SQL code, it goes through the following steps: 10 | 11 | - **templates handling**: Replaces the variables used in the SQL. Jinja/dbt formats are supported. The replacement is static, and all the variables will be resolved from the config file first. 12 | - **lex**: Separates the SQL into whitespace and code segments. 13 | - **parse**: Parses the lex result and organizes the tokens into a grammar tree according to the specific SQL dialect. If no match is found for a segment, the content will be wrapped in an `UnparsableSegment`, which will be picked up as a parsing error later. 14 | - **lint**: Walks through the parsed tree-structured data and checks whether there are violations according to the rules. A `lintError` is returned if any violation is found. 15 | - **fix**: Auto-fixes the problems pointed out by lint. 16 | 17 | In SQL Fluff, segments form a tree-like structure. The top-level segment is a `FileSegment`, which contains zero or more `StatementSegment`s. 18 | Before being parsed and named according to their type, segments are `raw`, meaning that they are literal values. 19 | 20 | ## New rule 21 | 22 | To create new rules, we first need to implement a new class extending `BaseRule` from SQL Fluff. 23 | The name of the class becomes the name of the rule. The naming convention is `Rule_xxxxx_Lxxx`. `BaseRule` contains parsing logic that relies on this convention. 24 | The core function for linting is `_eval`; its input is a tree-structured element that indicates the context. 25 | By calling `context.segment.children` you can find the next segments, and the linked structure also allows you to traverse to the end. 26 | 27 | In the `groups` definition, "all" must be included. 28 | 29 | ```config 30 | # Comma separated list of rules to check, default to all 31 | rules = all 32 | ``` 33 | 34 | You can check the type of a segment with `is_type("table_reference")`. To find the correct name of the segment, you need to go into the SQL Fluff code to find the segment class and the name defined inside it. 35 | With the `is_type` check, you can correctly point to the location where the rule needs to be checked. 36 | 37 | ```python 38 | from sqlfluff.core.rules.base import BaseRule, RuleContext 39 | 40 | 41 | class Rule_BigQuery_L001(BaseRule): 42 | 43 | groups = ("all", "bigquery") 44 | 45 | def __init__(self, *args, **kwargs): 46 | """Overwrite __init__ to set config.""" 47 | super().__init__(*args, **kwargs) 48 | 49 | def _eval(self, context: RuleContext): 50 | pass 51 | ``` 52 | 53 | ```python 54 | from sqlfluff.core.parser import BaseSegment 55 | 56 | class WildcardExpressionSegment(BaseSegment): 57 | type = "wildcard_expression" 58 | ``` 59 | 60 | ## Define the rule violation 61 | 62 | The return value of the `_eval()` function is the rule violation record object. 63 | If the check passes and no error is found, it should return nothing. 64 | Otherwise the return value is a `LintResult` object. When creating this object, you can pass three arguments. 65 | 66 | + `anchor`: the segment that holds the position info. 67 | + `description`: a description of the reason why the check failed. 68 | + `fix`: a list of fix objects (`delete`/`replace`/`create_before`/`create_after`) to fix the problem. To understand this further, read the `LintFix` code. If you do not pass anything as `fix`, nothing will be done in the fix step. A minimal sketch showing these arguments in use follows.
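Putting the three arguments together, here is a minimal sketch of an `_eval` implementation, modeled on the `Rule_BigQuery_L001` rule shipped in `easy_sql/sql_linter/rules/bq_schema_rule.py`. The rule name and the `${temp_db}.` fix payload are illustrative only:

```python
from sqlfluff.core.parser import CodeSegment
from sqlfluff.core.rules.base import BaseRule, LintFix, LintResult, RuleContext
from sqlfluff.core.rules.crawlers import SegmentSeekerCrawler


class Rule_Example_L001(BaseRule):
    groups = ("all",)
    # Only visit table_reference segments, as the repo's BigQuery rule does.
    crawl_behaviour = SegmentSeekerCrawler({"table_reference"})

    def _eval(self, context: RuleContext):
        # A schema-qualified reference parses into 3 segments (schema, dot, table name);
        # anything else is reported as a violation.
        if len(context.segment.segments) != 3:
            return LintResult(
                # anchor: the segment that holds the position info shown in the report
                anchor=context.segment,
                # description: why the check failed
                description=f"No schema found when selecting from table `{context.segment.raw}`.",
                # fixes: create a new segment before the anchor so `sqlfluff fix` can repair the SQL
                fixes=[
                    LintFix.create_before(
                        context.segment,
                        [CodeSegment(raw="${temp_db}.")],
                    )
                ],
            )
        # Returning nothing means the check passed for this segment.
```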
69 | 70 | 71 | ## Add the rule into action 72 | 73 | To make the customized rule work, you need to pass a list of rule classes as the linter's `user_rules` parameter. 74 | So far, we import all the rule classes in `easy_sql.sql_linter.rules`. By adding the rule when the module is initialized, it works as expected. 75 | 76 | ```python 77 | from sqlfluff.core import Linter 78 | 79 | linter = Linter(user_rules=[]) 80 | ``` 81 | 82 | ```python 83 | from easy_sql.sql_linter.rules.bq_schema_rule import Rule_BigQuery_L001 84 | 85 | __all__ = [Rule_BigQuery_L001] 86 | ``` 87 | -------------------------------------------------------------------------------- /docs/sqlfluff/quick_start.md: -------------------------------------------------------------------------------- 1 | # Quick start 2 | 3 | ## Introduction to sqlfluff 4 | 5 | With multiple contributors to a project and varying technical backgrounds, it's really difficult to maintain consistent readability and comprehension across a codebase. 6 | sqlfluff is a tool that makes it easy to check SQL code quality across varying SQL backgrounds and dialects. 7 | 8 | ## Python Requirement 9 | 10 | Sqlfluff does not support Python 2; it needs Python 3+. Check your Python version with the following: 11 | 12 | ```bash 13 | $ python --version 14 | ``` 15 | 16 | ## Quick install 17 | 18 | Install sqlfluff with pip: `pip install sqlfluff` 19 | 20 | Check that the installation succeeded: `sqlfluff version` 21 | 22 | 23 | ## Hands on demo 24 | We can use the command line to quickly check SQL code quality. Here we use an example; save it to a SQL file. 25 | ```sql 26 | SELECT a+b AS foo, 27 | c AS bar from my_table 28 | ``` 29 | cd to the folder and run the lint 30 | ```bash 31 | $ cd /test/doc 32 | $ sqlfluff lint test_sqlfulff.sql --dialect ansi 33 | ``` 34 | output: 35 | ``` 36 | == [test_sqlfulff.sql] FAIL 37 | L: 1 | P: 1 | L034 | Select wildcards then simple targets before calculations 38 | | and aggregates. 39 | L: 1 | P: 1 | L036 | Select targets should be on a new line unless there is 40 | | only one select target. 41 | L: 1 | P: 9 | L006 | Missing whitespace before + 42 | L: 1 | P: 9 | L006 | Missing whitespace after + 43 | L: 1 | P: 11 | L039 | Unnecessary whitespace found. 44 | L: 2 | P: 1 | L003 | Expected 1 indentations, found 0 [compared to line 01] 45 | L: 2 | P: 10 | L010 | Keywords must be consistently upper case. 46 | L: 2 | P: 23 | L009 | Files must end with a single trailing newline. 47 | All Finished 📜 🎉! 48 | ``` 49 | The sqlfluff checker reports what needs to be taken care of. 50 | To further understand the rules, check https://docs.sqlfluff.com/en/stable/rules.html#ruleref 51 | 52 | Automatically fix the issues for specified rules: 53 | 54 | ```bash 55 | $ sqlfluff fix test_sqlfulff.sql --rules L003,L009,L010 --dialect ansi 56 | ``` 57 | 58 | ## Custom style 59 | The lint command can specify different dialects for varying SQL backends. 60 | Check the currently supported dialects: 61 | 62 | ```bash 63 | $ sqlfluff dialects 64 | ``` 65 | ```output 66 | ==== sqlfluff - dialects ==== 67 | ansi: ansi dialect [inherits from 'nothing'] 68 | bigquery: bigquery dialect [inherits from 'ansi'] 69 | db2: db2 dialect [inherits from 'ansi'] 70 | exasol: exasol dialect [inherits from 'ansi'] 71 | ``` 72 | All dialects inherit from the basic ansi dialect. If we want to customize a dialect, we need to fork the git repo and create a new class. 73 | 74 | The lint command can also specify which rules to use for checking.
To list the currently supported rules: 75 | 76 | ```bash 77 | $ sqlfluff rules 78 | ``` 79 | 80 | ```output 81 | ==== sqlfluff - rules ==== 82 | L001: Unnecessary trailing whitespace. 83 | L002: Mixed Tabs and Spaces in single whitespace. 84 | L003: Indentation not consistent with previous lines. 85 | ``` 86 | 87 | Rules are predefined, but their parameters and usage are flexible. 88 | All the settings can be specified in a `.sqlfluff` file. 89 | Change the config file and the custom style will be applied immediately. 90 | 91 | Only enable a few rules (the default is all): 92 | 93 | ```config 94 | rules = L001,L002 95 | ``` 96 | 97 | Ignore specific rules: 98 | 99 | ```config 100 | exclude_rules = L001,L002 101 | ``` 102 | 103 | To customize a specific rule, parameters are predefined: 104 | 105 | ```config 106 | [sqlfluff:rules:L010] 107 | capitalisation_policy = consistent 108 | ignore_words = from 109 | ignore_words_regex = None 110 | ``` 111 | 112 | ## Jinja Template 113 | 114 | SQL fluff also supports template replacement with variables for flexibility. 115 | By default the templater is Jinja, and we also use Jinja as the example here. 116 | 117 | ```sql 118 | SELECT a+b AS foo, 119 | c AS bar from my_table where name = {{ test_name }}; 120 | ``` 121 | 122 | Set the value of the parameter in the config file `.sqlfluff`: 123 | 124 | ```config 125 | [sqlfluff:templater:jinja:context] 126 | test_name=456 127 | ``` 128 | 129 | After this, you can get the parsed result by running the following command: 130 | 131 | ```bash 132 | $ sqlfluff parse test_sqlfulff.sql --rules L003,L009,L010 --dialect ansi 133 | ``` 134 | 135 | # Easy SQL integration plan 136 | 137 | 1. Parse the backend config in Easy SQL to define the dialect (see the sketch after this list) 138 | 2. Our Easy SQL files contain multiple SQL sections with comments as separators. Add a for-loop to loop through all the different SQL sections 139 | 3. Make rules enabled/disabled per SQL backend. For example, BigQuery specifically needs a schema. 140 | 4. Add rules to check, including input/output checks and partition checks 141 | 5. Allow Easy SQL functions and variables like ${temp_db} to be checked.
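As a rough sketch of how this plan maps onto the linter already in this repo, the snippet below lints an Easy SQL file with `SqlLinter` (the same entry point used by `easy_sql/sql_linter/sql_linter_cli.py`). The SQL content and the chosen backend are made-up examples, and checking variables such as `${temp_db}` is still a plan item rather than a finished feature:

```python
from easy_sql.sql_linter.sql_linter import SqlLinter

# A made-up Easy SQL ETL: the `-- backend:` comment decides the dialect (plan item 1),
# and the `-- target=...` comments separate the SQL sections to be linted (plan item 2).
sql = """-- backend: bigquery

-- target=temp.result
select * from ${temp_db}.my_table
"""

linter = SqlLinter(sql, exclude_rules=None, include_rules=None)
# easysql=True tells the linter to treat the file as Easy SQL rather than plain SQL.
violations = linter.lint("bigquery", easysql=True, config_path=None)
print(violations)
```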
142 | 143 | 144 | refer dbt tool but it is static 145 | -------------------------------------------------------------------------------- /easy_sql/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/easy_sql/__init__.py -------------------------------------------------------------------------------- /easy_sql/base_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import uuid 5 | from datetime import datetime 6 | from typing import TYPE_CHECKING, Dict, List, Optional 7 | 8 | if TYPE_CHECKING: 9 | from pyspark.sql import SparkSession 10 | 11 | from easy_sql.sql_processor import SqlProcessor 12 | from easy_sql.sql_processor.backend.rdb import SqlExpr 13 | 14 | from .local_spark import LocalSpark 15 | from .logger import log_time 16 | 17 | 18 | def should_run_integration_test(key: Optional[str] = None): 19 | if key is None or key in ["pg", "ch", "mc", "bq", "flink_hive"]: 20 | return False 21 | return True 22 | 23 | 24 | TEST_PG_URL = os.environ.get("PG_URL", "postgresql://postgres:123456@testpg:15432/postgres") 25 | TEST_PG_JDBC_URL = os.environ.get("PG_JDBC_URL", "jdbc:postgresql://testpg:15432/postgres") 26 | TEST_PG_JDBC_USER = os.environ.get("PG_JDBC_USER", "postgres") 27 | TEST_PG_JDBC_PASSWD = os.environ.get("PG_JDBC_PASSWD", "123456") 28 | TEST_CH_URL = os.environ.get("CLICKHOUSE_URL", "clickhouse+native://default@testch:30123") 29 | TEST_BQ_URL = os.environ.get("BQ_URL", "bigquery://") 30 | 31 | __partition_col_converter__ = lambda col: ( 32 | f"PARSE_DATE('%Y-%m', {col}) as {col}" if col in ["data_month", ":data_month"] else f"CAST({col} as DATE)" 33 | ) 34 | __partition_value_converter__ = lambda col, value: ( 35 | datetime.strptime(value, "%Y-%m").date() if col == "data_month" else datetime.strptime(value, "%Y-%m-%d").date() 36 | ) 37 | __column_sql_type_converter__ = lambda backend_type, col_name, col_type: ( 38 | "DATE" if col_name in ["di", "dt", "data_date", "data_month"] else None 39 | ) 40 | __partition_expr__ = lambda backend_type, partition_col: ( 41 | f"DATE_TRUNC({partition_col}, MONTH)" 42 | if backend_type == "bigqiery" and partition_col == "data_month" 43 | else partition_col 44 | ) 45 | bigquery_sql_expr = SqlExpr( 46 | column_sql_type_converter=__column_sql_type_converter__, 47 | partition_col_converter=__partition_col_converter__, 48 | partition_value_converter=__partition_value_converter__, 49 | partition_expr=__partition_expr__, 50 | ) 51 | 52 | 53 | def dt(dt_s): 54 | return datetime.strptime(dt_s, "%Y-%m-%d %H:%M:%S") 55 | 56 | 57 | def date(s): 58 | return datetime.strptime(s, "%Y-%m-%d").date() 59 | 60 | 61 | def dt_zone(dt_s: str, formate="%Y-%m-%d %H:%M:%S", timezone=None): 62 | if timezone is None: 63 | return datetime.strptime(dt_s, formate) 64 | else: 65 | return datetime.strptime(dt_s, formate).replace(tzinfo=timezone) 66 | 67 | 68 | def next_id(): 69 | return str(uuid.uuid1()).replace("-", "") 70 | 71 | 72 | @log_time 73 | def run_sql( 74 | sql: str, 75 | result_table: str, 76 | funcs: Optional[Dict] = None, 77 | variables: Optional[Dict] = None, 78 | dry_run: bool = False, 79 | spark: Optional[SparkSession] = None, 80 | spark_conf: Optional[Dict] = None, 81 | ) -> List: 82 | spark = spark or LocalSpark.get(spark_conf) 83 | processor = SqlProcessor(spark, sql, [], variables or {}) 84 | 
processor.func_runner.register_funcs(funcs or {}) 85 | processor.run(dry_run=dry_run) 86 | return spark.sql(f"select * from {result_table}").collect() 87 | -------------------------------------------------------------------------------- /easy_sql/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/easy_sql/cli/__init__.py -------------------------------------------------------------------------------- /easy_sql/data_process.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import sys 5 | from typing import List, Optional 6 | 7 | import click 8 | 9 | 10 | @click.command(name="data_process") 11 | @click.option("--sql-file", "-f", type=str) 12 | @click.option("--vars", "-v", type=str, required=False) 13 | @click.option("--dry-run", type=str, required=False, help="if dry run, one of [true, 1, false, 0]") 14 | @click.option("--python-path", type=str, required=False) 15 | @click.option("--print-command", "-p", is_flag=True) 16 | def data_process(sql_file: str, vars: str, dry_run: str, python_path: str, print_command: bool): 17 | EasySqlProcessor(sql_file, vars, dry_run, print_command, python_path=python_path).process() 18 | 19 | 20 | class EasySqlProcessor: 21 | def __init__( 22 | self, 23 | sql_file: str, 24 | vars: Optional[str], 25 | dry_run: Optional[str], 26 | print_command: bool, 27 | python_path: Optional[str] = None, 28 | ) -> None: 29 | if not sql_file.endswith(".sql"): 30 | raise Exception(f"sql_file must ends with .sql, found `{sql_file}`") 31 | 32 | try: 33 | from easy_sql.config.sql_config import EasySqlConfig 34 | except ModuleNotFoundError: 35 | assert python_path is not None 36 | sys.path.insert(0, python_path) 37 | from easy_sql.config.sql_config import EasySqlConfig 38 | 39 | self.sql_file = sql_file 40 | self.vars_arg = vars 41 | self.dry_run_arg = dry_run if dry_run is not None else "0" 42 | self.dry_run = dry_run in ["true", "1"] 43 | self.config = EasySqlConfig.from_sql(sql_file) 44 | self.print_command = print_command 45 | 46 | def process(self, backend_config: Optional[List[str]] = None) -> Optional[str]: 47 | from easy_sql.cli.backend_processor import BackendProcessor 48 | 49 | backend_processor = BackendProcessor.create_backend_processor(self.config) 50 | 51 | if self.print_command: 52 | command = backend_processor.shell_command( 53 | self.vars_arg, self.dry_run_arg, os.path.abspath(__file__), backend_config 54 | ) 55 | print(command) 56 | return command 57 | else: 58 | backend_processor.run(self.vars_arg, self.dry_run, backend_config) 59 | 60 | 61 | if __name__ == "__main__": 62 | data_process() 63 | -------------------------------------------------------------------------------- /easy_sql/data_process_itest.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import unittest 3 | from typing import List, Optional 4 | 5 | from easy_sql import data_process 6 | 7 | proj_base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | 9 | 10 | def _data_process( 11 | sql_file: str, 12 | vars: Optional[str], 13 | dry_run: Optional[str], 14 | print_command: bool, 15 | backend_config: Optional[List[str]] = None, 16 | ) -> None: 17 | data_process.EasySqlProcessor(sql_file, vars, dry_run, print_command).process(backend_config) 18 | 19 | 20 | class DataProcessTest(unittest.TestCase): 
21 | def test_spark(self): 22 | _data_process(os.path.join(proj_base_dir, "test/sample_etl.spark.sql"), None, None, False) 23 | 24 | def test_postgres(self): 25 | _data_process(os.path.join(proj_base_dir, "test/sample_etl.postgres.sql"), None, None, False) 26 | 27 | def test_clickhouse(self): 28 | _data_process(os.path.join(proj_base_dir, "test/sample_etl.clickhouse.sql"), None, None, False) 29 | 30 | def test_flink_postgres(self): 31 | print(_data_process(os.path.join(proj_base_dir, "test/sample_etl.flink.postgres.sql"), None, None, False)) 32 | -------------------------------------------------------------------------------- /easy_sql/data_process_test.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import unittest 3 | from typing import Optional 4 | 5 | from easy_sql import data_process 6 | 7 | proj_base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | 9 | 10 | def _data_process(sql_file: str, vars: Optional[str], dry_run: Optional[str], print_command: bool) -> Optional[str]: 11 | return data_process.EasySqlProcessor(sql_file, vars, dry_run, print_command).process() 12 | 13 | 14 | class DataProcessTest(unittest.TestCase): 15 | def test_spark(self): 16 | command = _data_process(os.path.join(proj_base_dir, "test/sample_etl.spark.sql"), None, None, True) 17 | assert command is not None 18 | print(command) 19 | self.assertRegex( 20 | command, 21 | r"spark-submit --conf spark.master=local\[2\] --conf spark.submit.deployMode=client " 22 | r"--conf spark.app.name=sample_etl.spark_[\d]+ " 23 | "--conf spark.sql.warehouse.dir=/tmp/spark-warehouse-localdw " 24 | '--conf spark.driver.extraJavaOptions="-Dderby.system.home=/tmp/spark-warehouse-metastore ' 25 | '-Dderby.stream.error.file=/tmp/spark-warehouse-metastore.log" ' 26 | '--conf spark.files="[^"]+test/sample_etl.spark.sql" ' 27 | '"[^"]+/easy_sql/data_process.py" ' 28 | "-f .+/test/sample_etl.spark.sql --dry-run 0", 29 | ) 30 | 31 | def test_flink_hive(self): 32 | command = _data_process(os.path.join(proj_base_dir, "test/sample_etl.flink.hive.sql"), None, None, True) 33 | 34 | assert command is not None 35 | self.assertRegex( 36 | command.strip(), 37 | r".*flink run --parallelism 2 " 38 | '--pyFiles "[^"]+test/sample_etl.flink.hive.sql" ' 39 | "-t local " 40 | '--python "[^"]+/easy_sql/data_process.py" ' 41 | r"-f .+/test/sample_etl.flink.hive.sql --dry-run 0", 42 | ) 43 | 44 | def test_flink_hive_postgres(self): 45 | command = _data_process( 46 | os.path.join(proj_base_dir, "test/sample_etl.flink.hive.postgres.sql"), None, None, True 47 | ) 48 | assert command is not None 49 | self.assertRegex( 50 | command, 51 | r".*flink run --parallelism 1 " 52 | '--pyFiles "[^"]+test/sample_etl.flink.hive.postgres.sql" ' 53 | '--python "[^"]+/easy_sql/data_process.py" ' 54 | "-f .+/test/sample_etl.flink.hive.postgres.sql --dry-run 0", 55 | ) 56 | 57 | def test_flink_scala_udf(self): 58 | command = _data_process(os.path.join(proj_base_dir, "test/udf/flink-scala/etl_with_udf.sql"), None, None, True) 59 | assert command is not None 60 | self.assertRegex( 61 | command, 62 | r".*flink run --parallelism 1 " 63 | '--pyFiles "[^"]+test/udf/flink-scala/etl_with_udf.sql" --jarfile udf.jar ' 64 | '--python "[^"]+/easy_sql/data_process.py" ' 65 | "-f .+/test/udf/flink-scala/etl_with_udf.sql --dry-run 0", 66 | ) 67 | -------------------------------------------------------------------------------- /easy_sql/local_spark.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from typing import Any, Dict, Optional 4 | 5 | from pyspark.sql import SparkSession 6 | 7 | 8 | class LocalSpark: 9 | spark: Optional[SparkSession] = None 10 | __conf: Dict = {} 11 | 12 | @staticmethod 13 | def stop(): 14 | if LocalSpark.spark: 15 | LocalSpark.spark.stop() 16 | LocalSpark.spark = None 17 | 18 | @staticmethod 19 | def get(conf: Optional[Dict[str, Any]] = None, clean_existing_data: bool = True) -> SparkSession: 20 | conf = conf or {} 21 | if LocalSpark.spark is None: 22 | default_conf = { 23 | "spark.default.parallelism": 4, 24 | "hive.exec.dynamic.partition.mode": "nonstrict", 25 | "spark.sql.warehouse.dir": "/tmp/spark-warehouse-localdw-ut", 26 | "spark.driver.extraJavaOptions": ( 27 | "-Dderby.system.home=/tmp/spark-warehouse-metastore-ut " 28 | "-Dderby.stream.error.file=/tmp/spark-warehouse-metastore-ut.log" 29 | ), 30 | } 31 | default_conf.update(conf) 32 | conf = default_conf 33 | 34 | if clean_existing_data: 35 | # delete old spark warehouse/metastore dir 36 | print(f"removing dir {conf['spark.sql.warehouse.dir']}") 37 | shutil.rmtree(conf["spark.sql.warehouse.dir"], ignore_errors=True) 38 | if "-Dderby.system.home" in conf["spark.driver.extraJavaOptions"]: 39 | import re 40 | 41 | java_options = re.sub(r"\s*=\s*", "=", conf["spark.driver.extraJavaOptions"].strip()).split() 42 | for op in java_options: 43 | if op.split("=")[0].strip() == "-Dderby.system.home": 44 | print(f"removing dir {op.split('=')[1].strip()}") 45 | shutil.rmtree(op.split("=")[1].strip(), ignore_errors=True) 46 | 47 | # ensure a local spark with default config 48 | os.environ["SPARK_CONF_DIR"] = "/tmp/local-spark-conf-ut" 49 | spark_builder = SparkSession.builder.appName("UnitTest").master("local[4]") 50 | 51 | print("using conf: ", conf) 52 | for k, v in conf.items(): 53 | spark_builder.config(k, v) 54 | LocalSpark.spark = spark_builder.enableHiveSupport().getOrCreate() 55 | 56 | spark = LocalSpark.spark 57 | spark.catalog.clearCache() 58 | for table in spark.catalog.listTables("default"): 59 | if table.isTemporary: 60 | print(f"dropping temp view {table.name}") 61 | spark.catalog.dropTempView(table.name) 62 | 63 | return LocalSpark.spark 64 | -------------------------------------------------------------------------------- /easy_sql/logger.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import logging 3 | import sys 4 | from datetime import datetime 5 | from functools import wraps 6 | from typing import Callable 7 | 8 | LOG_LEVEL = logging.DEBUG 9 | 10 | 11 | def _config_logger(): 12 | logger = logging.getLogger("simple_logger") 13 | logger.setLevel(LOG_LEVEL) 14 | python_version = sys.version_info 15 | if python_version.major == 3 and python_version.minor == 6: 16 | sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach()) # type: ignore 17 | elif hasattr(sys.stdout, "reconfigure"): 18 | sys.stdout.reconfigure(encoding="utf-8") # type: ignore 19 | handler = logging.StreamHandler(sys.stdout) 20 | handler.setLevel(LOG_LEVEL) 21 | 22 | formatter = logging.Formatter( 23 | "[%(asctime)s][%(processName)s:%(threadName)s][%(levelname)s][%(module)s.%(funcName)s:%(lineno)d] %(message)s" 24 | ) 25 | handler.setFormatter(formatter) 26 | 27 | for existing_handler in logger.handlers: 28 | logger.removeHandler(existing_handler) 29 | logger.addHandler(handler) 30 | 31 | return logger 32 | 33 | 34 | def log_time(func: Callable): 35 | 
@wraps(func) 36 | def wrapper(*args, **kwargs): 37 | start_time = datetime.now() 38 | try: 39 | return func(*args, **kwargs) 40 | finally: 41 | end_time = datetime.now() 42 | logger.debug("function {} took {}s".format(func.__name__, (end_time - start_time).total_seconds())) 43 | 44 | return wrapper 45 | 46 | 47 | logger = _config_logger() 48 | -------------------------------------------------------------------------------- /easy_sql/report.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from typing import Any 4 | 5 | 6 | class EsService: 7 | def __init__(self, base_url: str, should_send: bool = True): 8 | self.base_url = base_url.strip("/") 9 | self.should_send = should_send 10 | self.data = None 11 | 12 | def post(self, url_path: str, data: str): 13 | import requests 14 | 15 | if self.should_send: 16 | resp = requests.post( 17 | self.base_url + url_path, headers={"Content-Type": "Application/json"}, data=data.encode("utf8") 18 | ) 19 | if not resp.ok: 20 | raise Exception(f"send data quality report failed(status={resp.status_code}): {resp.text}") 21 | print("data post to es done") 22 | else: 23 | self.data = {"method": "post", "args": {"url_path": url_path, "data": data}} 24 | print("will not send data") 25 | 26 | def put(self, url_path: str, data: str): 27 | import requests 28 | 29 | if self.should_send: 30 | resp = requests.put( 31 | self.base_url + url_path, headers={"Content-Type": "Application/json"}, data=data.encode("utf8") 32 | ) 33 | if not resp.ok: 34 | raise Exception(f"send data quality report failed(status={resp.status_code}): {resp.text}") 35 | print("data put to es done") 36 | else: 37 | self.data = {"method": "put", "args": {"url_path": url_path, "data": data}} 38 | print("will not send data") 39 | 40 | def delete_by_query(self, index: str, query: object): 41 | import requests 42 | 43 | data = json.dumps({"query": query}) 44 | url_path = f"/{index}/_delete_by_query" 45 | if self.should_send: 46 | resp = requests.post( 47 | self.base_url + url_path, headers={"Content-Type": "Application/json"}, data=data.encode("utf8") 48 | ) 49 | if not resp.ok: 50 | raise Exception(f"send data quality report failed(status={resp.status_code}): {resp.text}") 51 | else: 52 | self.data = {"method": "post", "args": {"url_path": url_path, "data": data}} 53 | print("will not send data") 54 | 55 | 56 | class Reporter: 57 | def __init__(self, es_service: EsService, index_prefix: str = "", now: Any = None): 58 | self.es_service = es_service 59 | self.now = now 60 | self.index_prefix = index_prefix 61 | 62 | def _es_index_name(self, name: str): 63 | return f"{self.index_prefix}_{name}" if self.index_prefix else name 64 | 65 | def report_task_result(self, task_id: str, report: str): 66 | """ 67 | es index: 68 | 69 | PUT /{index_prefix}_task_report 70 | { 71 | "mappings": { 72 | "properties": { 73 | "task_id": { "type": "wildcard" }, 74 | "report": { "type": "text" }, 75 | "created_at": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss" } 76 | } 77 | } 78 | } 79 | """ 80 | now = (self.now or datetime.utcnow()).strftime("%Y-%m-%d %H:%M:%S") 81 | data = {"task_id": task_id, "report": report, "created_at": now} 82 | self.es_service.post(f'/{self._es_index_name("task_report")}/_doc', json.dumps(data)) 83 | -------------------------------------------------------------------------------- /easy_sql/report_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 
| from easy_sql.report import EsService, Reporter 4 | 5 | 6 | class ReporterTest(unittest.TestCase): 7 | @unittest.skip("integration test") 8 | def test_should_report_task_result(self): 9 | reporter = Reporter(EsService("http://testes:9200")) 10 | reporter.report_task_result("some-task", "some message\nsome other message") 11 | -------------------------------------------------------------------------------- /easy_sql/spark_optimizer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | 6 | class SparkDynamicConfig: 7 | def __init__(self, max_shuffle_partitions: Optional[int] = None, min_shuffle_partitions: Optional[int] = None): 8 | self.max_shuffle_partitions = max_shuffle_partitions 9 | self.min_shuffle_partitions = min_shuffle_partitions 10 | 11 | def use_min_shuffle_partitions(self, spark: SparkSession) -> "SparkDynamicConfig": 12 | assert self.min_shuffle_partitions, "must provide min_shuffle_partitions to use the conf" 13 | spark.conf.set("spark.sql.adaptive.enabled", "false") 14 | spark.conf.set("spark.sql.shuffle.partitions", str(self.min_shuffle_partitions)) 15 | spark.conf.set("spark.default.parallelism", str(self.min_shuffle_partitions)) 16 | return self 17 | 18 | def use_max_shuffle_partitions(self, spark: SparkSession) -> "SparkDynamicConfig": 19 | assert self.max_shuffle_partitions, "must provide max_shuffle_partitions to use the conf" 20 | spark.conf.set("spark.sql.adaptive.enabled", "false") 21 | spark.conf.set("spark.sql.shuffle.partitions", str(self.max_shuffle_partitions)) 22 | spark.conf.set("spark.default.parallelism", str(self.max_shuffle_partitions)) 23 | return self 24 | 25 | def use_adaptive_shuffle_partitions(self, spark: SparkSession) -> "SparkDynamicConfig": 26 | spark.conf.set("spark.sql.adaptive.enabled", "true") 27 | return self 28 | 29 | 30 | def get_spark(app_name: Optional[str] = None, conf: Optional[Dict] = None): 31 | builder = SparkSession.builder.enableHiveSupport() 32 | if app_name: 33 | builder.config("spark.app.name", app_name) 34 | conf = conf or {} 35 | for k, v in conf.items(): 36 | builder.config(k, v) 37 | 38 | spark = builder.getOrCreate() 39 | spark.conf.set("spark.sql.statistics.fallBackToHdfs", "true") 40 | # 启用 Adaptive Execution ,从而启用自动设置 Shuffle Reducer 特性 41 | spark.conf.set("spark.sql.adaptive.enabled", "true") 42 | # 设置每个 Reducer 读取的目标数据量,单位为字节。默认64M,一般改成集群块大小 43 | spark.conf.set("spark.sql.adaptive.shuffle.targetPostShuffleInputSize", "134217728") 44 | # 允许动态资源分配,配合 spark.dynamicAllocation.minExecutors, spark.dynamicAllocation.maxExecutors 等使用 45 | # spark 3.0+ 不允许动态设置以下两个参数 46 | import pyspark 47 | 48 | if str(pyspark.__version__).startswith("2."): # type: ignore 49 | spark.conf.set("spark.dynamicAllocation.enabled", "true") 50 | spark.conf.set("spark.shuffle.service.enabled", "true") 51 | 52 | # spark.conf.set("hive.exec.dynamic.partition", "true") 53 | # default strict. In strict mode, the user must specify at least one static partition, 54 | # in case the user accidentally overwrites all partitions. 55 | # In nonstrict mode all partitions are allowed to be dynamic. 
56 | spark.conf.set("hive.exec.dynamic.partition.mode", "nonstrict") 57 | 58 | return spark 59 | 60 | 61 | def clear_temp_views(spark: SparkSession): 62 | for table in spark.catalog.listTables("default"): 63 | if table.isTemporary: 64 | print(f"dropping temp view {table.name}") 65 | spark.catalog.dropTempView(table.name) 66 | -------------------------------------------------------------------------------- /easy_sql/sql_linter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/easy_sql/sql_linter/__init__.py -------------------------------------------------------------------------------- /easy_sql/sql_linter/rules/__init__.py: -------------------------------------------------------------------------------- 1 | from easy_sql.sql_linter.rules.bq_schema_rule import Rule_BigQuery_L001 2 | 3 | all_rules = [Rule_BigQuery_L001] 4 | -------------------------------------------------------------------------------- /easy_sql/sql_linter/rules/bq_schema_rule.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from sqlfluff.core.parser import CodeSegment 6 | from sqlfluff.core.rules.base import BaseRule, LintFix, LintResult 7 | from sqlfluff.core.rules.crawlers import SegmentSeekerCrawler 8 | 9 | if TYPE_CHECKING: 10 | from sqlfluff.core.rules.context import RuleContext 11 | 12 | 13 | class Rule_BigQuery_L001(BaseRule): 14 | """ 15 | Table schema is required for queries in BigQuery. 16 | 17 | **Anti-pattern** 18 | Select from some table without schema. 19 | 20 | .. code-block:: sql 21 | SELECT * 22 | FROM foo 23 | 24 | **Best practice** 25 | Select from some table with schema. 26 | 27 | .. 
code-block:: sql 28 | SELECT * 29 | FROM test.foo 30 | """ 31 | 32 | groups = ("all", "bigquery") 33 | crawl_behaviour = SegmentSeekerCrawler({"table_reference"}) 34 | 35 | def __init__(self, *args, **kwargs): 36 | """Overwrite __init__ to set config.""" 37 | super().__init__(*args, **kwargs) 38 | 39 | def _eval(self, context: RuleContext): 40 | """check from table have schema""" 41 | if len(context.segment.segments) != 3: 42 | return LintResult( 43 | anchor=context.segment, 44 | fixes=[ 45 | LintFix.create_before( 46 | context.segment, 47 | [CodeSegment(raw="${temp_db}.")], 48 | ) 49 | ], 50 | description=f"No schema found when select from table `{context.segment.raw}`.", 51 | ) 52 | -------------------------------------------------------------------------------- /easy_sql/sql_linter/sql_linter_cli.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | import warnings 4 | from typing import List, Optional 5 | 6 | import click 7 | 8 | from easy_sql.sql_linter.sql_linter import SqlLinter 9 | 10 | 11 | def split_rules_to_list(rule_description: str) -> Optional[List[str]]: 12 | if rule_description != "": 13 | return rule_description.split(",") 14 | else: 15 | return None 16 | 17 | 18 | def parse_backend(sql: str): 19 | sql_lines = sql.split("\n") 20 | parsed_backend = None 21 | for line in sql_lines: 22 | if re.match(r"^-- \s*backend:.*$", line): 23 | parsed_backend = line[line.index("backend:") + len("backend:") :].strip() 24 | break 25 | 26 | if parsed_backend is None: 27 | parsed_backend = "spark" 28 | return parsed_backend 29 | 30 | 31 | def lint_process( 32 | check_sql_file_path: str, 33 | exclude: str, 34 | include: str, 35 | backend: str, 36 | easy_sql: bool, 37 | config_path: Optional[str] = None, 38 | ): 39 | if not check_sql_file_path.endswith(".sql"): 40 | warnings.warn("file name:" + check_sql_file_path + " must end with .sql", stacklevel=2) 41 | 42 | with open(check_sql_file_path, "r") as file: 43 | sql = file.read() 44 | sql_linter = SqlLinter(sql, exclude_rules=split_rules_to_list(exclude), include_rules=split_rules_to_list(include)) 45 | backend = backend if backend else parse_backend(sql) 46 | print("using backend:", backend) 47 | result = sql_linter.lint(backend, easysql=easy_sql, config_path=config_path) 48 | fixed = sql_linter.fix(backend, easy_sql=easy_sql, config_path=config_path) 49 | 50 | return result, fixed 51 | 52 | 53 | def write_out_fixed(check_sql_file_path: str, fixed: str, inplace: bool): 54 | if inplace: 55 | write_out_file_path = check_sql_file_path 56 | else: 57 | write_out_file_path = check_sql_file_path.replace(".sql", ".fixed.sql") 58 | with open(write_out_file_path, "w") as file: 59 | file.write(fixed) 60 | 61 | 62 | @click.group() 63 | def cli(): 64 | """Check or fix violations in SQL.""" 65 | pass 66 | 67 | 68 | def fix_process( 69 | path: str, 70 | exclude: str, 71 | include: str, 72 | backend: str, 73 | inplace: bool, 74 | easy_sql: bool, 75 | config_path: Optional[str] = None, 76 | ): 77 | result, fixed = lint_process(path, exclude, include, backend, easy_sql, config_path=config_path) 78 | write_out_fixed(path, fixed, inplace) 79 | 80 | 81 | @cli.command(help="""Fix rule violations in sql""") 82 | @click.option("--path", help="sql file path", required=True, type=str) 83 | @click.option("--config-path", help="sql fluff config file path, must be named .sqlfluff", required=False, type=str) 84 | @click.option("--exclude", help="comma separated rule to be excluded", default="", 
required=False, type=str) 85 | @click.option("--include", help="comma separated rule to be included", default="", required=False, type=str) 86 | @click.option( 87 | "--backend", 88 | help=( 89 | "backend for this file, " 90 | "if easy sql it will parse from the sql file if not specify, " 91 | "if normal sql it will default to spark" 92 | ), 93 | default=None, 94 | required=False, 95 | type=str, 96 | ) 97 | @click.option("--inplace", help="fix file inplace", default=False, required=False, type=bool) 98 | @click.option("--easy_sql", help="easy sql or normal sql", default=True, required=False, type=bool) 99 | def fix(path: str, config_path: str, exclude: str, include: str, backend: str, inplace: bool, easy_sql: bool): 100 | fix_process(path, exclude, include, backend, inplace, easy_sql, config_path=config_path) 101 | 102 | 103 | @cli.command(help="""Check rule violations in sql""") 104 | @click.option("--path", help="sql file path", required=True, type=str) 105 | @click.option("--config-path", help="sql fluff config file path, must be named .sqlfluff", required=False, type=str) 106 | @click.option("--exclude", help="comma separated rule to be excluded", default="", required=False, type=str) 107 | @click.option("--include", help="comma separated rule to be included", default="", required=False, type=str) 108 | @click.option( 109 | "--backend", 110 | help=( 111 | "backend for this file, " 112 | "if easy sql it will parse from the sql file if not specify, " 113 | "if normal sql it will default to spark" 114 | ), 115 | default=None, 116 | required=False, 117 | type=str, 118 | ) 119 | @click.option("--easy_sql", help="easy sql or normal sql", default=True, required=False, type=bool) 120 | def lint(path: str, config_path: str, exclude: str, include: str, backend: str, easy_sql: bool): 121 | lint_process(path, exclude, include, backend, easy_sql, config_path=config_path) 122 | 123 | 124 | if __name__ == "__main__": 125 | cli.main(sys.argv[1:]) 126 | -------------------------------------------------------------------------------- /easy_sql/sql_linter/sql_linter_reportor.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import codecs 4 | import logging 5 | import sys 6 | from typing import TYPE_CHECKING, List, Union 7 | 8 | import colorlog 9 | 10 | if TYPE_CHECKING: 11 | from sqlfluff.core import SQLBaseError 12 | 13 | 14 | class LintReporter: 15 | def __init__(self): 16 | self.sql_linter_log = LintReporter._create_logger(logging.DEBUG) 17 | 18 | def _get_extra_default_dict(self): 19 | return {"pos_info": "", "description": "", "warn": "", "pass": ""} 20 | 21 | @staticmethod 22 | def _create_logger(log_level: Union[int, str]): 23 | logger = logging.getLogger("linter_logger") 24 | logger.setLevel(log_level) 25 | info_formater = colorlog.ColoredFormatter( 26 | fmt="%(white)s%(message)s%(red)s%(warn)s %(green)s%(pass)s %(blue)s%(pos_info)s %(white)s%(description)s " 27 | ) 28 | python_version = sys.version_info 29 | if python_version.major == 3 and python_version.minor == 6: 30 | sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach()) # type: ignore 31 | elif hasattr(sys.stdout, "reconfigure"): 32 | sys.stdout.reconfigure(encoding="utf-8") # type: ignore 33 | handler = logging.StreamHandler(sys.stdout) 34 | handler.setFormatter(info_formater) 35 | for existing_handler in logger.handlers: 36 | logger.removeHandler(existing_handler) 37 | logger.addHandler(handler) 38 | return logger 39 | 40 | def report_violation(self, 
violation: SQLBaseError, step_start_line=0): 41 | pos_info = "L: {} | P: {}: | {} :".format( 42 | violation.line_no + step_start_line, violation.line_pos, violation.rule_code() 43 | ) 44 | extra_dict = self._get_extra_default_dict() 45 | extra_dict["pos_info"] = pos_info 46 | extra_dict["description"] = violation.desc() 47 | self.sql_linter_log.info("", extra=extra_dict) 48 | 49 | def report_list_of_violations(self, lint_result: List[SQLBaseError], step_start_line=0): 50 | if len(lint_result) > 0: 51 | self.report_warning("Fail") 52 | for violation in lint_result: 53 | self.report_violation(violation, step_start_line) 54 | else: 55 | self.report_pass("Pass") 56 | 57 | def report_message(self, message): 58 | self.sql_linter_log.info(message, extra=self._get_extra_default_dict()) 59 | 60 | def report_warning(self, warning: str): 61 | extra_dict = self._get_extra_default_dict() 62 | extra_dict["warn"] = warning 63 | self.sql_linter_log.warning("", extra=extra_dict) 64 | 65 | def report_pass(self, pass_info: str): 66 | extra_dict = self._get_extra_default_dict() 67 | extra_dict["pass"] = pass_info 68 | self.sql_linter_log.warning("", extra=extra_dict) 69 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/__init__.py: -------------------------------------------------------------------------------- 1 | from easy_sql.sql_processor.common import Column, SqlProcessorException 2 | from easy_sql.sql_processor.context import VarsContext 3 | from easy_sql.sql_processor.funcs import FuncRunner 4 | from easy_sql.sql_processor.report import SqlProcessorReporter, StepReport, StepStatus 5 | from easy_sql.sql_processor.sql_processor import ( 6 | SqlProcessor, 7 | get_current_backend, 8 | get_current_config, 9 | get_current_context, 10 | get_current_sql_processor, 11 | get_current_step, 12 | ) 13 | from easy_sql.sql_processor.step import Step, StepConfig, StepType 14 | 15 | __all__ = [ 16 | "Column", 17 | "SqlProcessorException", 18 | "StepConfig", 19 | "StepType", 20 | "VarsContext", 21 | "FuncRunner", 22 | "Step", 23 | "StepStatus", 24 | "StepReport", 25 | "SqlProcessorReporter", 26 | "SqlProcessor", 27 | "get_current_backend", 28 | "get_current_config", 29 | "get_current_context", 30 | "get_current_step", 31 | "get_current_sql_processor", 32 | ] 33 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/backend/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | from .flink import * 3 | from .spark import * 4 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/backend/bigquery.py: -------------------------------------------------------------------------------- 1 | from .rdb import RdbBackend as BigQueryBackend 2 | from .rdb import RdbRow as BigQueryRow 3 | from .rdb import TimeLog, _exec_sql 4 | 5 | __all__ = ["BigQueryBackend", "TimeLog", "_exec_sql", "BigQueryRow"] 6 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/backend/clickhouse.py: -------------------------------------------------------------------------------- 1 | from .rdb import RdbBackend as ChBackend 2 | from .rdb import RdbRow as ChRow 3 | from .rdb import TimeLog, _exec_sql 4 | 5 | __all__ = ["ChBackend", "TimeLog", "_exec_sql", "ChRow"] 6 | -------------------------------------------------------------------------------- 
/easy_sql/sql_processor/backend/postgres.py: -------------------------------------------------------------------------------- 1 | from .rdb import RdbBackend as PostgresBackend 2 | from .rdb import RdbRow as PgRow 3 | from .rdb import TimeLog, _exec_sql 4 | 5 | __all__ = ["PostgresBackend", "TimeLog", "_exec_sql", "PgRow"] 6 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/backend/rdb_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | 4 | from sqlalchemy.dialects.postgresql import DOUBLE_PRECISION 5 | from sqlalchemy.engine.mock import create_mock_engine 6 | from sqlalchemy.engine.reflection import Inspector 7 | 8 | from easy_sql.sql_processor.backend.rdb import RdbBackend 9 | 10 | 11 | class RdbTest(unittest.TestCase): 12 | def test_get_column_names_should_only_get_from_name(self): 13 | mock_engine = create_mock_engine("postgresql://", None) 14 | cols = [{"name": "a"}, {"name": "b"}, {"type": "c"}] 15 | with patch.object(RdbBackend, "get_columns", return_value=cols): 16 | rdb = RdbBackend("", engine=mock_engine) # type: ignore 17 | names = rdb.get_column_names("test") 18 | self.assertSequenceEqual(names, ["a", "b"]) 19 | 20 | def test_get_columns_should_compile_type_by_dialect_when_now_raw(self): 21 | mock_engine = create_mock_engine("postgresql://", None) 22 | mock_engine.close = lambda: None # type: ignore 23 | col = {"name": "id", "type": DOUBLE_PRECISION(10)} 24 | raw_cols = [col] 25 | with patch.object(Inspector, "get_columns", return_value=[col.copy() for col in raw_cols]): 26 | rdb = RdbBackend("", engine=mock_engine) # type: ignore 27 | 28 | cols = rdb.get_columns("test") 29 | 30 | self.assertNotEqual(str(col["type"]), "DOUBLE PRECISION") 31 | self.assertEqual(cols, [{"name": "id", "type": "DOUBLE PRECISION"}]) 32 | 33 | def test_get_columns_should_compile_type_by_dialect_when_in_raw(self): 34 | mock_engine = create_mock_engine("postgresql://", None) 35 | mock_engine.close = lambda: None # type: ignore 36 | col = {"name": "id", "type": DOUBLE_PRECISION(10)} 37 | raw_cols = [col] 38 | with patch.object(Inspector, "get_columns", return_value=[col.copy() for col in raw_cols]): 39 | rdb = RdbBackend("", engine=mock_engine) # type: ignore 40 | 41 | cols = rdb.get_columns("test", raw=True) 42 | 43 | self.assertEqual(cols, raw_cols) 44 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/backend/spark_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pyspark.sql.functions import expr, lit 4 | 5 | from easy_sql.base_test import LocalSpark 6 | from easy_sql.sql_processor.backend import SparkTable 7 | from easy_sql.sql_processor.backend.base import TableMeta 8 | from easy_sql.sql_processor.backend.spark import SparkBackend 9 | from easy_sql.sql_processor.common import SqlProcessorException 10 | 11 | 12 | class SparkTest(unittest.TestCase): 13 | def test_with_column(self): 14 | spark = LocalSpark.get() 15 | df = spark.sql("select 1 as id") 16 | # expr('2021-01-01') 效果就是 select 2021-01-01,少了引号,解析就会出现奇怪的结果 17 | self.assertNotEqual( 18 | SparkTable(df).with_column("data_date", "2021-01-01").df.select("data_date").limit(1).collect(), 19 | [("2021-01-01",)], 20 | ) 21 | # expr("'2021-01-01'") 效果就是 select '2021-01-01',结果正确 22 | self.assertEqual( 23 | SparkTable(df).with_column("data_date", 
"'2021-01-01'").df.select("data_date").limit(1).collect(), 24 | [("2021-01-01",)], 25 | ) 26 | # 可以直接传入 Column 27 | self.assertEqual( 28 | SparkTable(df).with_column("data_date", lit("2021-01-01")).df.select("data_date").limit(1).collect(), 29 | [("2021-01-01",)], 30 | ) 31 | self.assertEqual(SparkTable(df).with_column("flag", "1==2").df.select("flag").limit(1).collect(), [(False,)]) 32 | self.assertEqual( 33 | SparkTable(df).with_column("flag", expr("1==2")).df.select("flag").limit(1).collect(), [(False,)] 34 | ) 35 | 36 | def test_verify_schema(self): 37 | spark = LocalSpark.get() 38 | backend = SparkBackend(spark) 39 | spark.sql('create table test_verify_schema using parquet as select 1 as id, "a" as name') 40 | 41 | # should check if target table exists 42 | spark.sql("select 1 as id").createOrReplaceTempView("test_verify_schema0") 43 | with self.assertRaises(SqlProcessorException): 44 | backend.verify_schema(TableMeta("test_verify_schema0"), TableMeta("test_verify_schema1")) 45 | 46 | # should verify column name 47 | spark.sql("select 1 as id").createOrReplaceTempView("test_verify_schema1") 48 | with self.assertRaises(SqlProcessorException): 49 | backend.verify_schema(TableMeta("test_verify_schema1"), TableMeta("test_verify_schema")) 50 | 51 | # should ignore case and not verify type 52 | spark.sql("select 1 as Id, 1 as name").createOrReplaceTempView("test_verify_schema2") 53 | # should not raise exception 54 | backend.verify_schema(TableMeta("test_verify_schema2"), TableMeta("test_verify_schema")) 55 | 56 | # should verify type and raise error 57 | spark.sql("select 1 as id, 1 as Name").createOrReplaceTempView("test_verify_schema21") 58 | with self.assertRaises(SqlProcessorException): 59 | backend.verify_schema(TableMeta("test_verify_schema21"), TableMeta("test_verify_schema"), True) 60 | 61 | # should ignore extra column 62 | spark.sql("select 1 as id, 'a' as name, 1 as id1").createOrReplaceTempView("test_verify_schema3") 63 | # should not raise exception 64 | backend.verify_schema(TableMeta("test_verify_schema3"), TableMeta("test_verify_schema")) 65 | 66 | 67 | if __name__ == "__main__": 68 | unittest.main() 69 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/backend/sql_dialect/clickhouse_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from easy_sql.sql_processor.backend import Partition 4 | from easy_sql.sql_processor.backend.sql_dialect import SqlExpr 5 | from easy_sql.sql_processor.backend.sql_dialect.clickhouse import ChSqlDialect 6 | 7 | 8 | class RdbTest(unittest.TestCase): 9 | def test_ch_config(self): 10 | ch_config = ChSqlDialect(SqlExpr(), "dataplat.__table_partitions__") 11 | sql = ch_config.delete_partition_sql("test.test", [Partition("dt", "20210101")]) 12 | self.assertEqual( 13 | sql, 14 | [ 15 | "alter table test.test drop partition tuple('20210101')", 16 | ( 17 | "alter table dataplat.__table_partitions__ delete " 18 | "where db_name = 'test' and table_name = 'test' and partition_value = '20210101'" 19 | ), 20 | ], 21 | ) 22 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, Any 4 | 5 | from ..logger import logger 6 | 7 | if TYPE_CHECKING: 8 | from pyspark.sql import DataFrame, SparkSession 9 | 10 | 11 | def 
_exec_sql(spark: SparkSession, sql: str) -> DataFrame: 12 | logger.info(f"will exec sql: {sql}") 13 | return spark.sql(sql) 14 | 15 | 16 | def is_int_type(type_name): 17 | return any(type_name.startswith(t) for t in ["integer", "long", "decimal", "short"]) 18 | 19 | 20 | class Column: 21 | def __init__(self, name: str, value: Any): 22 | self.name, self.value = name, value 23 | 24 | 25 | class SqlProcessorException(Exception): 26 | def __init__(self, message: str): 27 | super().__init__(message) 28 | 29 | 30 | class SqlProcessorAssertionError(Exception): 31 | def __init__(self, message: str): 32 | super().__init__(message) 33 | 34 | 35 | class VarsReplacer: 36 | def replace_variables(self, text: str, include_funcs: bool = True) -> str: 37 | raise NotImplementedError() 38 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/context_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from easy_sql.sql_processor.context import TemplatesContext, VarsContext 4 | from easy_sql.sql_processor.funcs import FuncRunner 5 | 6 | 7 | class TemplateContextTest(unittest.TestCase): 8 | def test_should_replace_template(self): 9 | tc = TemplatesContext(True, {"a": "xx\n#{var}=abc, 123"}) 10 | replaced = tc.replace_templates("??@{a(var=${abc})}??") 11 | self.assertEqual("??xx\n${abc}=abc, 123??", replaced) 12 | 13 | # does not support var-func in template parameters 14 | replaced = tc.replace_templates("??@{a(var=${fn(abc)})}??") 15 | self.assertNotEquals("??xx\n${fn(abc)}=abc, 123??", replaced) 16 | 17 | # if this is a comment, do not replace 18 | replaced = tc.replace_templates("??@{a(var=${abc})}?? --??@{a(var=${abc})}??") 19 | self.assertEqual("??xx\n${abc}=abc, 123?? 
--??@{a(var=${abc})}??", replaced) 20 | replaced = tc.replace_templates("-- ??@{a(var=${abc})}??") 21 | self.assertEqual("-- ??@{a(var=${abc})}??", replaced) 22 | 23 | def test_multi_line_in_template_reference(self): 24 | tc = TemplatesContext(True, {"a": "xx\n#{var}=abc, #{var1} 123"}) 25 | replaced = tc.replace_templates("??@{a(var=123\n,var1=234)}??") 26 | self.assertEqual("??xx\n123=abc, 234 123??", replaced) 27 | 28 | replaced = tc.replace_templates("??@{a(var=123,\nvar1=234)}??") 29 | self.assertEqual("??xx\n123=abc, 234 123??", replaced) 30 | 31 | replaced = tc.replace_templates("??@{a(\n var\n=123\n,\nvar1=234)}??") 32 | self.assertEqual("??xx\n123=abc, 234 123??", replaced) 33 | 34 | def test_comment_line_in_template_reference(self): 35 | tc = TemplatesContext(True, {"a": "--xx\n#{var}=abc, #{var1} 123--abc\n--abc"}) 36 | replaced = tc.replace_templates("??@{a(var=123\n,var1=234)}??") 37 | self.assertEqual("??--xx\n123=abc, 234 123--abc\n--abc\n??", replaced) 38 | 39 | tc = TemplatesContext(True, {"a": "--xx\n#{var}=abc, #{var1} 123--abc\n"}) 40 | replaced = tc.replace_templates("??@{a(var=123\n,var1=234)}??") 41 | self.assertEqual("??--xx\n123=abc, 234 123--abc\n??", replaced) 42 | 43 | tc = TemplatesContext(True, {"a": "--xx\n#{var}=abc, #{var1} 123--abc"}) 44 | replaced = tc.replace_templates("??@{a(var=123\n,var1=234)}??") 45 | self.assertEqual("??--xx\n123=abc, 234 123--abc\n??", replaced) 46 | 47 | tc = TemplatesContext(True, {"a": "\n#{var}=abc, #{var1} 123\n"}) 48 | replaced = tc.replace_templates("??@{a(var=123\n,var1=234)}??") 49 | self.assertEqual("??123=abc, 234 123??", replaced) 50 | 51 | 52 | class VarsContextTest(unittest.TestCase): 53 | def test_should_replace_vars(self): 54 | vc = VarsContext(vars={"a": "##A##", "aa": "##${a}##"}, debug_log=True) 55 | self.assertEqual("-##A##, ===####A####===", vc.replace_variables("-${a}, ===${aa}==="), "should replace all") 56 | self.assertEqual( 57 | "-- -${a}, ===${aa}===", vc.replace_variables("-- -${a}, ===${aa}==="), "do not replace comment" 58 | ) 59 | self.assertEqual( 60 | "-##A##, ==-- =${aa}===", vc.replace_variables("-${a}, ==-- =${aa}==="), "do not replace comment" 61 | ) 62 | self.assertEqual("-\\##A##, ===####A####===", vc.replace_variables("-\\${a}, ===${aa}==="), "ignore escaping") 63 | 64 | vc = VarsContext(vars={"a": "##A##", "b": "##${a}##", "aa": "##${b}##"}, debug_log=True) 65 | self.assertEqual( 66 | "-##A##, -####A####, ===######A######===", 67 | vc.replace_variables("-${a}, -${b}, ===${aa}==="), 68 | "replace vars recursively", 69 | ) 70 | 71 | vc = VarsContext(vars={"a": "##A##", "aa": "##${a}##", "b": "1"}, debug_log=True) 72 | vc.init(func_runner=FuncRunner({"f": lambda x: int(x) + 1})) 73 | self.assertEqual("-6, ===####A####===", vc.replace_variables("-${f(5)}, ===${aa}==="), "func call in vars") 74 | self.assertEqual( 75 | "-2, ===####A####===", vc.replace_variables("-${f(${b})}, ===${aa}==="), "vars as args in func call" 76 | ) 77 | self.assertEqual( 78 | "-4, ===####A####===", 79 | vc.replace_variables("-${f(${c:3})}, ===${aa}==="), 80 | "vars with default value as args in func call", 81 | ) 82 | 83 | vc = VarsContext(vars={"a": "##A##", "b": "##${a}##", "aa": "##${b}##"}, debug_log=True) 84 | self.assertEqual( 85 | "-1, -####A####, ===######A######===", 86 | vc.replace_variables("-${a1:1}, -${b}, ===${aa:b?x}==="), 87 | "vars with default value", 88 | ) 89 | 90 | # TODO: support for confliction detection 91 | -------------------------------------------------------------------------------- 
/easy_sql/sql_processor/funcs_flink_itest.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from easy_sql.base_test import TEST_PG_JDBC_PASSWD, TEST_PG_JDBC_URL, TEST_PG_JDBC_USER 4 | from easy_sql.config.sql_config import EasySqlConfig 5 | from easy_sql.sql_processor.backend import FlinkBackend, FlinkTablesConfig 6 | from easy_sql.sql_processor.step import Step 7 | from easy_sql.utils.flink_test_cluster import FlinkTestClusterManager 8 | 9 | from .funcs_flink import TestFuncs 10 | 11 | 12 | def step_with_sql(sql: str) -> Step: 13 | return Step("0", None, None, select_sql=sql) # type: ignore 14 | 15 | 16 | class FlinkFuncsTest(unittest.TestCase): 17 | def create_flink_backend(self): 18 | return FlinkBackend( 19 | True, 20 | FlinkTablesConfig( 21 | connectors={ 22 | "jdbc": FlinkTablesConfig.Connector( 23 | f""" 24 | 'url' = '{TEST_PG_JDBC_URL}', 25 | 'username' = '{TEST_PG_JDBC_USER}', 26 | 'password' = '{TEST_PG_JDBC_PASSWD}' 27 | """ 28 | ), 29 | }, 30 | catalogs={}, 31 | ), 32 | ) 33 | 34 | def test_exec_sql_in_source(self): 35 | fb = self.create_flink_backend() 36 | tf = TestFuncs(fb) 37 | tf.exec_sql_in_source(step_with_sql("select 1;\nselect now();"), "db", "jdbc") 38 | 39 | def test_run_etl_streaming(self): 40 | fb = self.create_flink_backend() 41 | tf = TestFuncs(fb) 42 | with open("/tmp/flink_func_test__test_run_etl.sql", "w") as f: 43 | f.write( 44 | """ 45 | -- backend: flink 46 | -- config: easy_sql.etl_type=streaming 47 | -- config: flink.cmd=-pyexec python3 48 | -- config: flink.cmd=-t remote 49 | -- config: flink.cmd=-pyclientexec python3 50 | -- target=variables 51 | select 52 | 'append' as __save_mode__ 53 | """ 54 | ) 55 | fm = FlinkTestClusterManager() 56 | if fm.is_not_started(): 57 | fm.start_cluster() 58 | tf.test_run_etl(None, "/tmp/flink_func_test__test_run_etl.sql") 59 | self.assertTrue(fm.is_started()) 60 | fm.stop_cluster() 61 | 62 | tf.test_run_etl(None, "/tmp/flink_func_test__test_run_etl.sql") 63 | self.assertTrue(fm.is_not_started()) 64 | 65 | def test_run_etl_batch(self): 66 | fb = self.create_flink_backend() 67 | tf = TestFuncs(fb) 68 | with open("/tmp/flink_func_test__test_run_etl.sql", "w") as f: 69 | f.write( 70 | """ 71 | -- backend: flink 72 | -- config: easy_sql.etl_type=batch 73 | -- config: flink.cmd=-pyexec python3 74 | -- config: flink.cmd=-t local 75 | -- config: flink.cmd=-pyclientexec python3 76 | -- target=variables 77 | select 78 | 'append' as __save_mode__ 79 | """ 80 | ) 81 | tf.test_run_etl( 82 | EasySqlConfig.from_sql(sql_file="/tmp/flink_func_test__test_run_etl.sql"), 83 | "/tmp/flink_func_test__test_run_etl.sql", 84 | ) 85 | -------------------------------------------------------------------------------- /easy_sql/sql_processor/step_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from easy_sql.sql_processor import SqlProcessorException, StepConfig 4 | from easy_sql.sql_processor.step import SqlCleaner, StepFactory 5 | 6 | 7 | class StepConfigTest(unittest.TestCase): 8 | def test_should_parse_config(self): 9 | self.assertEqual(StepConfig.from_config_line("-- target=check.f1", 0), StepConfig("check", "f1", None, 0)) 10 | self.assertEqual( 11 | StepConfig.from_config_line("-- target=check.f1(a, ${b})", 0), StepConfig("check", "f1(a, ${b})", None, 0) 12 | ) 13 | self.assertEqual( 14 | StepConfig.from_config_line("-- target=check.f1(a, ${b}), if=f2(c, ${d})", 0), 15 | StepConfig("check", "f1(a, ${b})", "f2(c, 
${d})", 0), 16 | ) 17 | self.assertEqual( 18 | StepConfig.from_config_line("-- target=check.f1(a, ${b}),if=f2(c, ${d})", 0), 19 | StepConfig("check", "f1(a, ${b})", "f2(c, ${d})", 0), 20 | ) 21 | self.assertEqual(StepConfig.from_config_line("-- target=variables", 0), StepConfig("variables", None, None, 0)) 22 | self.assertEqual( 23 | StepConfig.from_config_line("-- target=variables, if=f2(c, ${d})", 0), 24 | StepConfig("variables", None, "f2(c, ${d})", 0), 25 | ) 26 | with self.assertRaises(expected_exception=SqlProcessorException): 27 | StepConfig.from_config_line("-- target=check.f1(a, ${b}),if=f2-(c, ${d})", 0) 28 | with self.assertRaises(expected_exception=SqlProcessorException): 29 | StepConfig.from_config_line("-- target=unknown_type", 0) 30 | 31 | def test_should_clean_sql(self): 32 | self.assertEquals( 33 | """ 34 | with a as (select 1 as a) -- comment 35 | --comment 36 | select * from a 37 | """.strip(), 38 | SqlCleaner().clean_sql( 39 | """ 40 | -- comment 41 | with a as (select 1 as a) -- comment 42 | --comment 43 | select * from a -- comment 44 | ; 45 | --comment 46 | """ 47 | ), 48 | ) 49 | 50 | def test_should_clean_sql_with_semicolon_before_comment(self): 51 | self.assertEquals( 52 | """ 53 | with a as (select 1 as a) -- comment 54 | --comment 55 | select * from a 56 | """.strip(), 57 | SqlCleaner().clean_sql( 58 | """ 59 | -- comment 60 | with a as (select 1 as a) -- comment 61 | --comment 62 | select * from a; -- comment 63 | ; 64 | --comment 65 | """ 66 | ), 67 | ) 68 | 69 | def test_should_read_sql_correctly(self): 70 | sql = """ 71 | -- target=temp.test 72 | select ';' as a 73 | """ 74 | steps = StepFactory(None, None).create_from_sql(sql, {}) # type: ignore 75 | self.assertEquals(1, len(steps)) 76 | assert steps[0].target_config is not None 77 | self.assertEquals(steps[0].target_config.name, "test") 78 | assert steps[0].select_sql is not None 79 | self.assertEquals(steps[0].select_sql.strip(), "select ';' as a") 80 | 81 | def test_should_skip_duplicate_include(self): 82 | sql0 = """ 83 | -- target=temp.test 84 | select 1 as a 85 | """ 86 | sql1 = """ 87 | -- include 0 start 88 | -- include=0.sql 89 | """ 90 | sql = """ 91 | -- outer include start 92 | -- include=1.sql 93 | -- include=1.sql 94 | -- include=0.sql 95 | """ 96 | sql_expected = """ 97 | -- outer include start 98 | -- include 0 start 99 | -- target=temp.test 100 | select 1 as a 101 | """ 102 | sf = StepFactory(None, None, skip_duplicate_include=True) # type: ignore 103 | sf.create_from_sql(sql, {"0.sql": sql0, "1.sql": sql1}) 104 | print(sf.resolved_sql) 105 | assert ( 106 | "\n".join([line.strip() for line in sf.resolved_sql.splitlines() if line.strip()]) == sql_expected.strip() 107 | ) 108 | 109 | 110 | if __name__ == "__main__": 111 | unittest.main() 112 | -------------------------------------------------------------------------------- /easy_sql/sql_test_itest.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from . 
import sql_test 4 | 5 | 6 | class SqlTestTest(unittest.TestCase): 7 | def test_convert_test_for_spark(self): 8 | self.run_test("spark") 9 | 10 | def test_convert_test_for_postgres(self): 11 | self.run_test("postgres") 12 | 13 | def test_convert_test_for_clickhouse(self): 14 | self.run_test("clickhouse") 15 | 16 | def run_test(self, backend: str): 17 | sql_test._convert_json(f"test/sample_etl.{backend}.xlsx") 18 | sql_test._run_test(f"test/sample_etl.{backend}.xlsx", backend=backend) 19 | sql_test._run_test(f"test/sample_etl.{backend}.json", backend=backend) 20 | -------------------------------------------------------------------------------- /easy_sql/udf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/easy_sql/udf/__init__.py -------------------------------------------------------------------------------- /easy_sql/udf/check.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | from typing import TYPE_CHECKING, Callable 5 | 6 | if TYPE_CHECKING: 7 | from pyspark.sql.types import DataType 8 | 9 | 10 | class UDF: 11 | def __init__(self, func: Callable, return_type: DataType): 12 | self.func = func 13 | self.return_type = return_type 14 | 15 | def __call__(self, *args, **kwargs): 16 | return self.func(*args, **kwargs) 17 | 18 | 19 | def check_regex_func(pattern): 20 | return lambda any_str: any_str if any_str and re.match(pattern, any_str) else None 21 | -------------------------------------------------------------------------------- /easy_sql/udf/udfs.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, List, Optional, Union 2 | 3 | 4 | def _all_udfs(cls: Any): 5 | return { 6 | attr: getattr(cls, attr) 7 | for attr in dir(cls) 8 | if callable(getattr(cls, attr)) and not attr.startswith("_") and attr != "all" 9 | } 10 | 11 | 12 | def get_udfs(type: str) -> Union[Dict[str, Callable[[], Union[str, List[str]]]], Dict[str, Callable]]: 13 | if type == "pg": 14 | return PgUdfs.all() 15 | elif type == "ch": 16 | return ChUdfs.all() 17 | elif type == "spark": 18 | return SparkUdfs.all() 19 | else: 20 | return {} 21 | 22 | 23 | class SparkUdfs: 24 | @staticmethod 25 | def all() -> Dict[str, Callable]: 26 | return _all_udfs(SparkUdfs) 27 | 28 | @staticmethod 29 | def remove_all_whitespaces(value: Optional[str]) -> Optional[str]: 30 | return "".join(value.split()) if value is not None else None 31 | 32 | @staticmethod 33 | def trim_all(value: Optional[str]) -> Optional[str]: 34 | return value.strip() if value is not None else None 35 | 36 | 37 | class PgUdfs: 38 | @staticmethod 39 | def all() -> Dict[str, Callable[[], str]]: 40 | return _all_udfs(PgUdfs) 41 | 42 | @staticmethod 43 | def trim_all(): 44 | return """ 45 | create or replace function trim_all(value text) returns text 46 | as $$ select regexp_replace(regexp_replace($1, E'^[\\\\a\\\\b\\\\e\\\\f\\\\n\\\\r\\\\t\\\\v\\\\0 ]+', ''), E'[\\\\a\\\\b\\\\e\\\\f\\\\n\\\\r\\\\t\\\\v\\\\0 ]+$', '') $$ 47 | LANGUAGE SQL 48 | IMMUTABLE 49 | RETURNS NULL ON NULL INPUT 50 | """ # noqa : B950 51 | 52 | @staticmethod 53 | def split(): 54 | return """ 55 | create or replace function split(value text, sep text) returns text[] 56 | as $$ select string_to_array($1, $2) $$ 57 | LANGUAGE SQL 58 | IMMUTABLE 59 | RETURNS NULL ON NULL INPUT 60 | """ 61 | 62 | 
@staticmethod 63 | def from_unixtime(): 64 | return """ 65 | create or replace function from_unixtime(value float) returns timestamp 66 | as $$ select to_timestamp($1) $$ 67 | LANGUAGE SQL 68 | IMMUTABLE 69 | RETURNS NULL ON NULL INPUT 70 | """ 71 | 72 | @staticmethod 73 | def date_format(): 74 | return """ 75 | create or replace function date_format(value timestamp, format text) returns text 76 | as $$ select to_char($1, $2) $$ 77 | LANGUAGE SQL 78 | IMMUTABLE 79 | RETURNS NULL ON NULL INPUT 80 | """ 81 | 82 | @staticmethod 83 | def get_json_object(): 84 | return """ 85 | create or replace function get_json_object(value text, path text) returns text 86 | as $$ select $1::json#>(string_to_array($2, '.'))[2:] $$ 87 | LANGUAGE SQL 88 | IMMUTABLE 89 | RETURNS NULL ON NULL INPUT 90 | """ 91 | 92 | @staticmethod 93 | def sha1(): 94 | return """ 95 | CREATE EXTENSION IF NOT EXISTS pgcrypto with schema public; 96 | create or replace function sha1(value text) returns text 97 | as $$ select encode(public.digest($1::bytea, cast('sha1' as text)), 'hex') $$ 98 | LANGUAGE SQL 99 | IMMUTABLE 100 | RETURNS NULL ON NULL INPUT 101 | """.split( 102 | ";" 103 | ) 104 | 105 | 106 | class ChUdfs: 107 | """ 108 | https://clickhouse.com/docs/en/sql-reference/statements/create/function 109 | CREATE FUNCTION name AS (parameter0, ...) -> expression 110 | """ 111 | 112 | @staticmethod 113 | def all() -> Dict[str, Callable[[], str]]: 114 | return _all_udfs(ChUdfs) 115 | 116 | @staticmethod 117 | def translate(): 118 | return """ 119 | CREATE FUNCTION IF NOT EXISTS translate AS (input, from, to) -> replaceAll(input, from, to) 120 | """ 121 | -------------------------------------------------------------------------------- /easy_sql/udf/udfs_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from easy_sql.base_test import LocalSpark 4 | from easy_sql.sql_processor import SqlProcessor 5 | 6 | 7 | class FunctionsTest(unittest.TestCase): 8 | def test_remove_all_whitespaces(self): 9 | spark = LocalSpark.get() 10 | SqlProcessor(spark, "") 11 | 12 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces(' a b ')").collect()[0][0]) 13 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces(' \ta\t b\t ')").collect()[0][0]) 14 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces(' \na\n b\n ')").collect()[0][0]) 15 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces(' \fa\f b\f ')").collect()[0][0]) 16 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces(' \ra\r b\r ')").collect()[0][0]) 17 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces(' \va\v b\v ')").collect()[0][0]) 18 | 19 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces('  a  b  ')").collect()[0][0]) # \u00A0 20 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces('  a  b  ')").collect()[0][0]) # \u2007 21 | self.assertEqual("ab", spark.sql("select remove_all_whitespaces('  a  b  ')").collect()[0][0]) # 202F 22 | self.assertEqual(None, spark.sql("select remove_all_whitespaces(NULL)").collect()[0][0]) 23 | self.assertEqual("", spark.sql("select remove_all_whitespaces('')").collect()[0][0]) 24 | self.assertEqual("", spark.sql("select remove_all_whitespaces(' ')").collect()[0][0]) 25 | 26 | def test_trim_all(self): 27 | spark = LocalSpark.get() 28 | SqlProcessor(spark, "") 29 | 30 | self.assertEqual("a b", spark.sql("select trim_all(' a b ')").collect()[0][0]) 31 | self.assertEqual("a b", 
spark.sql("select trim_all(' \ta b\t ')").collect()[0][0]) 32 | self.assertEqual("a b", spark.sql("select trim_all(' \na b\n ')").collect()[0][0]) 33 | self.assertEqual("a b", spark.sql("select trim_all(' \fa b\f ')").collect()[0][0]) 34 | self.assertEqual("a b", spark.sql("select trim_all(' \ra b\r ')").collect()[0][0]) 35 | self.assertEqual("a b", spark.sql("select trim_all(' \va b\v ')").collect()[0][0]) 36 | 37 | self.assertEqual("a b", spark.sql("select trim_all('  a b  ')").collect()[0][0]) # \u00A0 38 | self.assertEqual("a b", spark.sql("select trim_all('  a b  ')").collect()[0][0]) # \u2007 39 | self.assertEqual("a b", spark.sql("select trim_all('  a b  ')").collect()[0][0]) # 202F 40 | self.assertEqual(None, spark.sql("select trim_all(NULL)").collect()[0][0]) 41 | self.assertEqual("", spark.sql("select trim_all('')").collect()[0][0]) 42 | self.assertEqual("", spark.sql("select trim_all(' ')").collect()[0][0]) 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /easy_sql/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/easy_sql/utils/__init__.py -------------------------------------------------------------------------------- /easy_sql/utils/db_connection_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, Dict, Optional 4 | 5 | from easy_sql.utils.kv import KV 6 | 7 | if TYPE_CHECKING: 8 | from sqlalchemy.engine.base import Connection, Engine 9 | 10 | from easy_sql.sql_processor.backend.flink import FlinkBackend 11 | 12 | 13 | def _create_sqlalchemy_conn(flink_connector_config: Dict[str, str]) -> Optional[Connection]: 14 | base_url = flink_connector_config["url"] 15 | username = flink_connector_config["username"] 16 | password = flink_connector_config["password"] 17 | split_expr = "://" 18 | split_expr_index = base_url.index(split_expr) 19 | db_type = base_url[len("jdbc:") : split_expr_index] 20 | sqlalchemy_db_url = f"{db_type}{split_expr}{username}:{password}@{KV.from_config(base_url, split_expr).v}" 21 | if sqlalchemy_db_url: 22 | from sqlalchemy import create_engine 23 | 24 | engine: Engine = create_engine(sqlalchemy_db_url, isolation_level="AUTOCOMMIT", pool_size=1) 25 | conn: Connection = engine.connect() 26 | return conn 27 | 28 | 29 | def get_connector_raw_conn_for_flink_backend(backend: FlinkBackend, connector_name: str) -> Optional[Connection]: 30 | connector_options = backend.flink_tables_config.get_connector_options(connector_name) 31 | return _create_sqlalchemy_conn(connector_options) 32 | -------------------------------------------------------------------------------- /easy_sql/utils/flink_test_cluster.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | 5 | from easy_sql.logger import logger 6 | 7 | 8 | def _check_call(command: str) -> bool: 9 | logger.info(f"will exec command: {command}") 10 | try: 11 | return subprocess.check_call(["bash", "-c", command]) == 0 12 | except subprocess.CalledProcessError: 13 | return False 14 | 15 | 16 | def _check_call_for_script(script_file: str) -> bool: 17 | logger.info(f"will exec script: {script_file}") 18 | try: 19 | return subprocess.check_call(["bash", script_file]) == 0 20 | except 
subprocess.CalledProcessError: 21 | return False 22 | 23 | 24 | class FlinkTestClusterManager: 25 | def __init__(self, op_wait_secs: float = 3): 26 | import pyflink 27 | 28 | if not _check_call("type curl") or not _check_call("type grep"): 29 | raise Exception( 30 | "Can not find curl or grep. This module only works in a unix environment with curl and grep installed." 31 | ) 32 | self.flink_home = os.path.dirname(pyflink.__file__) 33 | self.wait_secs = op_wait_secs 34 | 35 | def is_started(self): 36 | return _check_call("curl -s localhost:8081 | grep 'Apache Flink Web Dashboard'") 37 | 38 | def is_not_started(self): 39 | return _check_call("curl localhost:8081 2>&1 | grep 'Connection refused'") 40 | 41 | def start_cluster(self): 42 | success = _check_call_for_script(os.path.join(self.flink_home, "bin/start-cluster.sh")) 43 | if success: 44 | logger.info(f"Wait {self.wait_secs} for flink to be fully started.") 45 | time.sleep(self.wait_secs) 46 | else: 47 | raise Exception("Start flink cluster failed, please check the output.") 48 | 49 | def stop_cluster(self): 50 | success = _check_call_for_script(os.path.join(self.flink_home, "bin/stop-cluster.sh")) 51 | if success: 52 | logger.info(f"Wait {self.wait_secs} for flink to be fully stopped.") 53 | time.sleep(self.wait_secs) 54 | else: 55 | raise Exception("Stop flink cluster failed, please check the output.") 56 | -------------------------------------------------------------------------------- /easy_sql/utils/flink_test_cluster_itest.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from .flink_test_cluster import FlinkTestClusterManager 4 | 5 | 6 | class FlinkTestClusterManagerTest(unittest.TestCase): 7 | def test_cluster_manager(self): 8 | fm = FlinkTestClusterManager(10) 9 | if fm.is_started(): 10 | fm.stop_cluster() 11 | fm.start_cluster() 12 | self.assertTrue(fm.is_started()) 13 | self.assertFalse(fm.is_not_started()) 14 | fm.stop_cluster() 15 | self.assertFalse(fm.is_started()) 16 | self.assertTrue(fm.is_not_started()) 17 | -------------------------------------------------------------------------------- /easy_sql/utils/io_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from os import path 4 | 5 | from easy_sql.logger import logger 6 | 7 | 8 | def resolve_file(file_path: str, abs_path: bool = False, prefix: str = "", relative_to: str = "") -> str: 9 | if file_path.lower().startswith("hdfs://") or file_path.lower().startswith("file://"): 10 | # do not resolve if it is hdfs or absolute file path 11 | return file_path 12 | base_path = os.path.abspath(os.curdir) 13 | if not path.exists(file_path): 14 | if path.exists(path.join(base_path, file_path)): 15 | file_path = path.join(base_path, file_path) 16 | elif path.exists(path.basename(file_path)): 17 | file_path = path.basename(file_path) 18 | elif relative_to and path.isfile(relative_to) and path.exists(path.join(path.dirname(relative_to), file_path)): 19 | file_path = path.join(path.dirname(relative_to), file_path) 20 | elif relative_to and path.isdir(relative_to) and path.exists(path.join(relative_to, file_path)): 21 | file_path = path.join(relative_to, file_path) 22 | else: 23 | raise Exception(f"file not found: {file_path}, curdir: {base_path}") 24 | if abs_path: 25 | file_path = path.abspath(file_path) 26 | if " " in file_path: 27 | parts = file_path.split("/") 28 | file_path_no_space = "/".join([re.sub(r" .*$", "", part) for part in parts]) 29 | 
logger.warn( 30 | "Remove space inside file path, since spark will raise issue with space in path. " 31 | "We must ensure there is a soft link to the path with space removed to the end. " 32 | f'Will resolve file path from "{file_path}" to "{file_path}".' 33 | ) 34 | file_path = file_path_no_space 35 | return prefix + file_path 36 | 37 | 38 | def resolve_files(files_path: str, abs_path: bool = False, relative_to: str = "") -> str: 39 | return ",".join( 40 | [resolve_file(f.strip(), abs_path, relative_to=relative_to) for f in files_path.split(",") if f.strip()] 41 | ) 42 | 43 | 44 | def read_sql(sql_file: str): 45 | with open(resolve_file(sql_file)) as f: 46 | return f.read() 47 | -------------------------------------------------------------------------------- /easy_sql/utils/kv.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Callable, Dict, Optional, Tuple 4 | 5 | 6 | def get_key_by_splitter_and_strip(source: str, splitter: Optional[str] = "=", strip_chars: Optional[str] = None): 7 | source = source.strip() 8 | splitter = splitter or "=" 9 | assert splitter in source, f"splitter {splitter} not found in source {source}" 10 | return source[: source.index(splitter)].strip(strip_chars) 11 | 12 | 13 | def get_value_by_splitter_and_strip(source: str, splitter: Optional[str] = "=", strip_chars: Optional[str] = None): 14 | source = source.strip() 15 | splitter = splitter or "=" 16 | assert splitter in source, f"splitter {splitter} not found in source {source}" 17 | return source[source.index(splitter) + len(splitter) :].strip(strip_chars) 18 | 19 | 20 | class KV: 21 | def __init__(self, k: str, v: str) -> None: 22 | self.k, self.v = k, v 23 | 24 | @staticmethod 25 | def from_config(config_line: str, splitter: Optional[str] = "=", strip_chars: Optional[str] = None) -> KV: 26 | return KV( 27 | get_key_by_splitter_and_strip(config_line, splitter, strip_chars), 28 | get_value_by_splitter_and_strip(config_line, splitter, strip_chars), 29 | ) 30 | 31 | def as_tuple( 32 | self, k_convert: Optional[Callable[[str], Any]] = None, v_convert: Optional[Callable[[str], Any]] = None 33 | ) -> Tuple[Any, Any]: 34 | return (k_convert(self.k) if k_convert else self.k, v_convert(self.v) if v_convert else self.v) 35 | 36 | def as_dict(self) -> Dict[str, str]: 37 | return {self.k: self.v} 38 | -------------------------------------------------------------------------------- /easy_sql/utils/object_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | 4 | def get_attr(obj: Dict, path: str): 5 | data_current = obj 6 | if not path: 7 | return data_current 8 | for attr_current in path.split("."): 9 | assert attr_current != "", f"Neither part of path should be empty: path=`{path}`, current_part=`{attr_current}`" 10 | if attr_current not in data_current: 11 | data_current[attr_current] = {} 12 | data_current = data_current[attr_current] 13 | return data_current 14 | -------------------------------------------------------------------------------- /easy_sql/utils/object_utils_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from easy_sql.utils.object_utils import get_attr 4 | 5 | 6 | class ObjectUtilsTest(unittest.TestCase): 7 | def test_get_attr(self): 8 | self.assertEqual(get_attr({}, "a.b.c"), {}) 9 | self.assertEqual(get_attr({"a": {}}, "a.b.c"), {}) 10 | 
self.assertEqual(get_attr({"a": {"b": {"c": [1, 2, 3]}}}, "a.b.c"), [1, 2, 3]) 11 | 12 | self.assertEqual(get_attr({}, "a"), {}) 13 | self.assertEqual(get_attr({"a": ""}, "a"), "") 14 | 15 | self.assertEqual(get_attr({"a": 1}, ""), {"a": 1}) 16 | 17 | 18 | if __name__ == "__main__": 19 | unittest.main() 20 | -------------------------------------------------------------------------------- /examples/rtdw/.gitignore: -------------------------------------------------------------------------------- 1 | lib/ 2 | workflow/**/jars 3 | *.jar 4 | *.log 5 | -------------------------------------------------------------------------------- /examples/rtdw/Makefile: -------------------------------------------------------------------------------- 1 | download-flink-jars: 2 | test -f lib/flink/jars/flink-connector-jdbc-1.15.1.jar || wget -P lib/flink/jars https://repo1.maven.org/maven2/org/apache/flink/flink-connector-jdbc/1.15.1/flink-connector-jdbc-1.15.1.jar 3 | test -f lib/flink/jars/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar || wget -P lib/flink/jars https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-hive-3.1.2_2.12/1.15.1/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar 4 | test -f lib/flink/jars/postgresql-42.2.14.jar || wget -P lib/flink/jars https://repo1.maven.org/maven2/org/postgresql/postgresql/42.2.14/postgresql-42.2.14.jar 5 | test -f lib/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar || wget -P lib/flink/jars https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-postgres-cdc/2.3.0/flink-sql-connector-postgres-cdc-2.3.0.jar 6 | test -f lib/flink/jars/hudi-flink1.15-bundle-0.12.2.jar || wget -P lib/flink/jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-flink1.15-bundle/0.12.2/hudi-flink1.15-bundle-0.12.2.jar 7 | -------------------------------------------------------------------------------- /examples/rtdw/java/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | out/ 3 | .gradle/ 4 | src/**/generated/ 5 | src/test/.local/ 6 | bin/ 7 | -------------------------------------------------------------------------------- /examples/rtdw/java/README: -------------------------------------------------------------------------------- 1 | A Flink application project using Java and Gradle. 2 | 3 | To package your job for submission to Flink, use: 'gradle shadowJar'. Afterwards, you'll find the 4 | jar to use in the 'build/libs' folder. 5 | 6 | To run and test your application with an embedded instance of Flink use: 'gradle run' 7 | -------------------------------------------------------------------------------- /examples/rtdw/java/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.6-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /examples/rtdw/java/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 
6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /examples/rtdw/java/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'quickstart' 2 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/java/com/easysql/example/Example.java: -------------------------------------------------------------------------------- 1 | package com.easysql.example; 2 | 3 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 4 | import org.apache.flink.streaming.api.datastream.DataStream; 5 | import org.apache.flink.api.common.functions.FilterFunction; 6 | 7 | public class Example { 8 | 9 | public static void test(StreamExecutionEnvironment env) throws Exception { 10 | DataStream flintstones = env.fromElements( 11 | new Person("Fred", 35), 12 | new Person("Wilma", 35), 13 | new Person("Pebbles", 2)); 14 | 15 | DataStream adults = flintstones.filter(new FilterFunction() { 16 | @Override 17 | public boolean filter(Person person) throws Exception { 18 | return person.age >= 18; 19 | } 20 | }); 21 | 22 | adults.print(); 23 | 24 | env.execute(); 25 | } 26 | 27 | public static class Person { 28 | public String name; 29 | public Integer age; 30 | public Person() {} 31 | 32 | public Person(String name, Integer age) { 33 | this.name = name; 34 | this.age = age; 35 | } 36 | 37 | public String toString() { 38 | return this.name.toString() + ": age " + this.age.toString(); 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/java/com/easysql/example/Ingest.java: -------------------------------------------------------------------------------- 1 | package com.easysql.example; 2 | 3 | import lombok.Data; 4 | import lombok.extern.slf4j.Slf4j; 5 | import lombok.val; 6 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper; 7 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 8 | import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; 9 | 10 | import java.util.Arrays; 11 | import java.util.HashMap; 12 | import java.util.List; 13 | 14 | @Slf4j 15 | public class Ingest { 16 | 17 | private static ObjectMapper objectMapper = new ObjectMapper(); 18 | 19 | public static void ingest(StreamExecutionEnvironment env, StreamTableEnvironment tEnv, String jsonOpts) throws Exception { 20 | val ic = objectMapper.readValue(jsonOpts, IngestConfig.class); 21 | env.getCheckpointConfig().setCheckpointInterval(10000); 22 | val cdcSource = Sources.createPgCDCSource(ic.source_connector.options, ic.schema_list, ic.table_list); 23 | final SplitTableFunction splitTableFunction = new SplitTableFunction(Arrays.asList("inventory.user,inventory.product,inventory.user_order".split(","))); 24 | val ds = env.addSource(cdcSource) 25 | .process(splitTableFunction); 26 | splitTableFunction.getOutputTags().values().forEach(tag -> { 27 | tEnv.fromDataStream(ds.getSideOutput(tag).map(data -> data).setParallelism(1)); 28 | val tmpTableName = "_tmp__" + tag.getId().replace(".", "__"); 29 | tEnv.createTemporaryView(tmpTableName, ds); 30 | }); 31 | env.execute(); 32 | } 33 | 34 | @Data 35 | public static class IngestConfig { 36 | 37 | private List> catalogs; 38 | private List databases; 39 | private 
List schema_list; 40 | private List table_list; 41 | private Connector source_connector; 42 | 43 | public Database db(String dbName) { 44 | return databases.stream().filter(db -> dbName.equals(db.getName())).findFirst().orElse(null); 45 | } 46 | 47 | public HashMap connector(String dbName, String connectorName) { 48 | val db = db(dbName); 49 | val conn = db.connector(connectorName); 50 | return conn.getOptions(); 51 | } 52 | 53 | @Data 54 | public static class Database { 55 | 56 | private String name; 57 | private List connectors; 58 | private List tables; 59 | 60 | public Connector connector(String name) { 61 | return connectors.stream().filter(prop -> name.equals(prop.getName())).findFirst().orElse(null); 62 | } 63 | 64 | public Table table(String name) { 65 | return tables.stream().filter(prop -> name.equals(prop.getName())).findFirst().orElse(null); 66 | } 67 | 68 | } 69 | 70 | @Data 71 | public static class Connector { 72 | 73 | private String name; 74 | private HashMap options; 75 | } 76 | 77 | @Data 78 | public static class Table { 79 | 80 | private String name; 81 | private Connector connector; 82 | private List schema; 83 | 84 | public HashMap fullOptions(Database db) { 85 | val dbConn = db.connector(connector.name); 86 | val result = new HashMap<>(dbConn.getOptions()); 87 | if (connector.getOptions() != null) { 88 | result.putAll(connector.getOptions()); 89 | if (connector.getOptions().containsKey("path")) { 90 | result.put("path", String.format("%s/%s.db/%s", result.get("path"), db.name, name)); 91 | } 92 | } 93 | return result; 94 | } 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/java/com/easysql/example/Sinks.java: -------------------------------------------------------------------------------- 1 | package com.easysql.example; 2 | 3 | import com.easysql.example.Ingest.IngestConfig; 4 | import com.easysql.example.Ingest.IngestConfig.Database; 5 | import com.easysql.example.Ingest.IngestConfig.Table; 6 | import lombok.val; 7 | import org.apache.flink.table.data.RowData; 8 | import org.apache.hudi.common.model.HoodieTableType; 9 | import org.apache.hudi.configuration.FlinkOptions; 10 | import org.apache.hudi.util.HoodiePipeline; 11 | 12 | import java.util.HashMap; 13 | import java.util.Map; 14 | 15 | public class Sinks { 16 | public static void createHudiSink(String db, String table, IngestConfig config) { 17 | final Database dbObj = config.db(db); 18 | final Table tableObj = dbObj.table(table); 19 | val tableOpts = tableObj.fullOptions(dbObj); 20 | val tableSchema = tableObj.getSchema(); 21 | 22 | HoodiePipeline.Builder builder = HoodiePipeline.builder(table) 23 | .column("uuid VARCHAR(20)") 24 | .column("name VARCHAR(10)") 25 | .column("age INT") 26 | .column("ts TIMESTAMP(3)") 27 | .column("_di VARCHAR(20)") 28 | .partition("_di") 29 | .options(tableOpts); 30 | 31 | // builder.sink(dataStream, false); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/java/com/easysql/example/SplitTableFunction.java: -------------------------------------------------------------------------------- 1 | package com.easysql.example; 2 | 3 | import com.easysql.example.RowDataDebeziumDeserializationSchema.GenericRowDataWithSchema; 4 | import lombok.extern.slf4j.Slf4j; 5 | import lombok.val; 6 | import org.apache.flink.api.java.tuple.Tuple2; 7 | import org.apache.flink.streaming.api.functions.ProcessFunction; 8 | import 
org.apache.flink.util.Collector; 9 | import org.apache.flink.util.OutputTag; 10 | 11 | import java.util.List; 12 | import java.util.Map; 13 | import java.util.function.Function; 14 | import java.util.stream.Collectors; 15 | 16 | @Slf4j 17 | public class SplitTableFunction extends ProcessFunction { 18 | 19 | private List tables; 20 | private transient Map> outputTags = null; 21 | 22 | public SplitTableFunction(List tables) { this.tables = tables; } 23 | 24 | public Map> getOutputTags() { 25 | if (outputTags == null) { 26 | val dbTables = tables.stream() 27 | .map(table -> new Tuple2<>(table.substring(0, table.indexOf(".")), table.substring(table.indexOf(".") + 1))) 28 | .collect(Collectors.toList()); 29 | outputTags = dbTables.stream() 30 | .map(table -> new OutputTag(table.f0 + "." + table.f1) { }) 31 | .collect(Collectors.toMap(OutputTag::getId, Function.identity())); 32 | } 33 | return outputTags; 34 | } 35 | 36 | @Override 37 | public void processElement(GenericRowDataWithSchema rowData, ProcessFunction.Context ctx, Collector out) throws Exception { 38 | val table = rowData.getTable(); 39 | val tags = this.getOutputTags(); 40 | if (tags.containsKey(table)) { 41 | val tag = tags.get(table); 42 | ctx.output(tag, rowData); 43 | } else { 44 | log.debug("Ignore message for table {} since it it not configured to process.", table); 45 | } 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/java/org/myorg/quickstart/DataStreamJob.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package org.myorg.quickstart; 20 | 21 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 22 | 23 | /** 24 | * Skeleton for a Flink DataStream Job. 25 | * 26 | *
<p>For a tutorial how to write a Flink application, check the 27 | * tutorials and examples on the Flink Website. 28 | * 29 | *
<p>To package your application into a JAR file for execution, run 30 | * 'mvn clean package' on the command line. 31 | * 32 | *
<p>
If you change the name of the main class (with the public static void main(String[] args)) 33 | * method, change the respective entry in the POM.xml file (simply search for 'mainClass'). 34 | */ 35 | public class DataStreamJob { 36 | 37 | public static void main(String[] args) throws Exception { 38 | // Sets up the execution environment, which is the main entry point 39 | // to building Flink applications. 40 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 41 | 42 | /* 43 | * Here, you can start creating your execution plan for Flink. 44 | * 45 | * Start with getting some data from the environment, like 46 | * env.fromSequence(1, 10); 47 | * 48 | * then, transform the resulting DataStream using operations 49 | * like 50 | * .filter() 51 | * .flatMap() 52 | * .window() 53 | * .process() 54 | * 55 | * and many more. 56 | * Have a look at the programming guide: 57 | * 58 | * https://nightlies.apache.org/flink/flink-docs-stable/ 59 | * 60 | */ 61 | 62 | // Execute program, beginning computation. 63 | env.execute("Flink Java API Skeleton"); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | ################################################################################ 18 | 19 | rootLogger.level = INFO 20 | rootLogger.appenderRef.console.ref = ConsoleAppender 21 | 22 | appender.console.name = ConsoleAppender 23 | appender.console.type = CONSOLE 24 | appender.console.layout.type = PatternLayout 25 | appender.console.layout.pattern = %d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n 26 | -------------------------------------------------------------------------------- /examples/rtdw/java/src/main/scala/com/easysql/example/ingest.scala: -------------------------------------------------------------------------------- 1 | package com.easysql.example 2 | 3 | import com.ververica.cdc.connectors.postgres.PostgreSQLSource 4 | import com.ververica.cdc.debezium.{DebeziumSourceFunction, JsonDebeziumDeserializationSchema} 5 | import org.apache.flink.api.common.serialization.SimpleStringSchema 6 | import org.apache.flink.connector.base.DeliveryGuarantee 7 | import org.apache.flink.connector.kafka.sink.{KafkaRecordSerializationSchema, KafkaSink} 8 | import org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend 9 | import org.apache.flink.streaming.api.CheckpointingMode 10 | import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup 11 | import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _} 12 | 13 | import java.util.Properties 14 | 15 | object PostgresCDC { 16 | def createCDCSource(): DebeziumSourceFunction[String]={ 17 | val prop = new Properties() 18 | prop.setProperty("decimal.handling.mode","string") 19 | PostgreSQLSource.builder[String] 20 | .hostname("testpg") 21 | .port(15432) 22 | .username("postgres") 23 | .password("123456") 24 | .database("postgres") 25 | .schemaList("inventory") 26 | .slotName("pg_cdc") 27 | .decodingPluginName("pgoutput") 28 | .debeziumProperties(prop) 29 | .deserializer(new JsonDebeziumDeserializationSchema) 30 | .build 31 | } 32 | 33 | def createKafkaSink(): KafkaSink[String] ={ 34 | val sinkTopic = "pgcdc" 35 | KafkaSink.builder[String].setBootstrapServers("localhost:9092") 36 | .setRecordSerializer(KafkaRecordSerializationSchema.builder() 37 | .setTopic(sinkTopic) 38 | .setValueSerializationSchema(new SimpleStringSchema()) 39 | .build()) 40 | .setDeliverGuarantee(DeliveryGuarantee.EXACTLY_ONCE) 41 | .setTransactionalIdPrefix("pgcdc-transaction-id") 42 | .setKafkaProducerConfig(Map("transaction.timeout.ms"-> "300000")) 43 | .build 44 | } 45 | 46 | implicit def map2Properties(map: Map[String, String]): java.util.Properties = { 47 | map.foldLeft(new java.util.Properties){ case (props, (k, v)) => props.put(k, v); props } 48 | } 49 | 50 | def main(args: Array[String]): Unit = { 51 | val env = StreamExecutionEnvironment.getExecutionEnvironment 52 | env.enableCheckpointing(10 * 1000) 53 | env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE) 54 | env.getCheckpointConfig.setMinPauseBetweenCheckpoints(500) 55 | env.getCheckpointConfig.setCheckpointTimeout(60000) 56 | env.getCheckpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) 57 | val rocksBackend = new EmbeddedRocksDBStateBackend() 58 | rocksBackend.setDbStoragePath("/tmp/cdc-flink-states") 59 | env.setStateBackend(rocksBackend) 60 | 61 | env.addSource(createCDCSource()).name("postgres cdc source") 62 | .map(data => { 63 | data 64 | }) 65 | .setParallelism(1) 66 | // .print() 67 | .sinkTo(createKafkaSink()).name("cdc sink kafka") 68 | 69 | env.execute("Postgres CDC") 70 | } 71 | 
72 | } 73 | -------------------------------------------------------------------------------- /examples/rtdw/readme.md: -------------------------------------------------------------------------------- 1 | ## Target 2 | 3 | - ingest data: support both snapshot and incremental data 4 | - etl development: join big tables with updates 5 | 6 | ## TODO 7 | - [√] setup a scenario 8 | - kafka to help ingest data 9 | + if use flink cdc directly, it will use multiple connections and add pressure to db 10 | + consider ingest snapshot first and then data change log 11 | 12 | ## V1 13 | 14 | **Solution:** 15 | 16 | pg -----------> kafka --------------> dw 17 | flink-cdc spark-streaming 18 | 19 | 20 | **Prepare environment:** 21 | 22 | ```bash 23 | # execute the commands below in separate terminals 24 | 25 | # prepare data in pg: run workflow/sales/ods/data.sql in pg manually 26 | # start kafka 27 | bin/zookeeper-server-start.sh config/zookeeper.properties 28 | bin/kafka-server-start.sh config/server.properties 29 | bin/kafka-console-consumer.sh --topic pgcdc --from-beginning --bootstrap-server localhost:9092 30 | # start flink cdc to kafka (in sales/ods) 31 | java -cp '~/dev/sdks/scala-2.12.10/lib/scala-library.jar:easysql-example-ods.jar:/usr/local/lib/python3.8/site-packages/pyflink/lib/*:jars/*' com.easysql.example.PostgresCDC 32 | # start spark streaming app to ingest data to hudi (in workflow) 33 | bash -c "$(python3 -m easy_sql.data_process -f sales/ods/ingest_hudi.sql -p)" 2>&1 | tee ingest_hudi.log 34 | # query the ingested data to hudi (in workflow) 35 | bash -c "$(python3 -m easy_sql.data_process -f sales/ods/ingest_hudi.test.sql -p)" 36 | ``` 37 | 38 | **Test** 39 | 40 | Emit queries in postgres. 41 | 42 | Cases to test: 43 | - add data: `insert into inventory.product(pid,pname,pprice) values ('6','prodcut-006',225.31);` 44 | - change data: `update inventory.product set pname='p6' where pid=6;` 45 | - delete data: `delete from inventory.product where pid=6;` 46 | - add column with default value: `alter table inventory.product add column ex int default 0;` 47 | - delete column: `alter table inventory.product delete column ex;` 48 | - rename column: `alter table inventory.product rename column ex to ex1;` 49 | - change column type: `alter table inventory.product change column ex ex1 int;` 50 | -------------------------------------------------------------------------------- /examples/rtdw/scala/.gitignore: -------------------------------------------------------------------------------- 1 | classes/ 2 | ref/ 3 | -------------------------------------------------------------------------------- /examples/rtdw/scala/Makefile: -------------------------------------------------------------------------------- 1 | SCALA_BIN=~/dev/sdks/scala-2.12.10/bin 2 | FLINK_JAR_PATH=/usr/local/lib/python3.8/site-packages/pyflink/lib/* 3 | SCALA_CP="${FLINK_JAR_PATH}:../lib/flink/jars/*" 4 | 5 | 6 | ods-jar: 7 | - rm -r classes 8 | mkdir -pv classes 9 | ${SCALA_BIN}/scalac -nobootcp -cp ${SCALA_CP} -d classes src/com/easysql/example/*.scala 10 | cd classes && jar -cvfe ../easysql-example-ods.jar com.easysql.example.PostgresCDC com 11 | -------------------------------------------------------------------------------- /examples/rtdw/scala/src/com/easysql/example/ingest.scala: -------------------------------------------------------------------------------- 1 | package com.easysql.example 2 | 3 | import com.ververica.cdc.connectors.postgres.PostgreSQLSource 4 | import com.ververica.cdc.debezium.{DebeziumSourceFunction, 
JsonDebeziumDeserializationSchema} 5 | import org.apache.flink.api.common.serialization.SimpleStringSchema 6 | import org.apache.flink.connector.base.DeliveryGuarantee 7 | import org.apache.flink.connector.kafka.sink.{KafkaRecordSerializationSchema, KafkaSink} 8 | import org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend 9 | import org.apache.flink.streaming.api.CheckpointingMode 10 | import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup 11 | import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _} 12 | 13 | import java.util.Properties 14 | 15 | object PostgresCDC { 16 | def createCDCSource(): DebeziumSourceFunction[String]={ 17 | val prop = new Properties() 18 | prop.setProperty("decimal.handling.mode","string") 19 | PostgreSQLSource.builder[String] 20 | .hostname("testpg") 21 | .port(15432) 22 | .username("postgres") 23 | .password("123456") 24 | .database("postgres") 25 | .schemaList("inventory") 26 | .slotName("test_pg_cdc") 27 | .decodingPluginName("pgoutput") 28 | .debeziumProperties(prop) 29 | .deserializer(new JsonDebeziumDeserializationSchema) 30 | .build 31 | } 32 | 33 | def createKafkaSink(): KafkaSink[String] ={ 34 | val sinkTopic = "pgcdc" 35 | KafkaSink.builder[String].setBootstrapServers("localhost:9092") 36 | .setRecordSerializer(KafkaRecordSerializationSchema.builder() 37 | .setTopic(sinkTopic) 38 | .setValueSerializationSchema(new SimpleStringSchema()) 39 | .build()) 40 | .setDeliverGuarantee(DeliveryGuarantee.EXACTLY_ONCE) 41 | .setTransactionalIdPrefix("pgcdc-transaction-id") 42 | .setKafkaProducerConfig(Map("transaction.timeout.ms"-> "300000")) 43 | .build 44 | } 45 | 46 | implicit def map2Properties(map: Map[String, String]): java.util.Properties = { 47 | map.foldLeft(new java.util.Properties){ case (props, (k, v)) => props.put(k, v); props } 48 | } 49 | 50 | def main(args: Array[String]): Unit = { 51 | val env = StreamExecutionEnvironment.getExecutionEnvironment 52 | env.enableCheckpointing(10 * 1000) 53 | env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE) 54 | env.getCheckpointConfig.setMinPauseBetweenCheckpoints(500) 55 | env.getCheckpointConfig.setCheckpointTimeout(60000) 56 | env.getCheckpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) 57 | val rocksBackend = new EmbeddedRocksDBStateBackend() 58 | rocksBackend.setDbStoragePath("/tmp/cdc-flink-states") 59 | env.setStateBackend(rocksBackend) 60 | 61 | env.addSource(createCDCSource()).name("postgres cdc source") 62 | .setParallelism(1) 63 | .sinkTo(createKafkaSink()).name("cdc sink kafka") 64 | 65 | env.execute("Postgres CDC") 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/Makefile: -------------------------------------------------------------------------------- 1 | cdc-to-kafka: 2 | java -cp '~/dev/sdks/scala-2.12.10/lib/scala-library.jar:easysql-example-ods.jar:/usr/local/lib/python3.8/site-packages/pyflink/lib/*:jars/*' com.easysql.example.PostgresCDC 3 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/data.sql: -------------------------------------------------------------------------------- 1 | drop schema if exists inventory; 2 | create schema inventory; 3 | set search_path=inventory; 4 | 5 | CREATE OR REPLACE FUNCTION update_modify_time_column() 6 | RETURNS TRIGGER AS $$ 7 | BEGIN 8 | NEW.modify_time = 
now(); 9 | RETURN NEW; 10 | END; 11 | $$ language 'plpgsql'; 12 | 13 | -- create user table 14 | drop table if exists inventory.user; 15 | create table if not exists inventory.user ( 16 | id serial not null, 17 | name varchar(155) null, 18 | device_model varchar(155) null, 19 | email varchar(50) null, 20 | phone varchar(50) null, 21 | create_time timestamp default CURRENT_TIMESTAMP not null, 22 | modify_time timestamp default CURRENT_TIMESTAMP not null, 23 | primary key (id) 24 | ); 25 | 26 | CREATE TRIGGER update_user_modify_time BEFORE UPDATE 27 | ON inventory.user FOR EACH ROW EXECUTE PROCEDURE 28 | update_modify_time_column(); 29 | 30 | -- insert data 31 | insert into inventory.user(name,device_model,email,phone) values 32 | ('customer-01','dm-01','abc01@email.com','188776xxxxx'), 33 | ('customer-02','dm-02','abc02@email.com','166776xxxxx'); 34 | 35 | -- create product table 36 | create table if not exists inventory.product 37 | ( 38 | pid serial not null, 39 | pname varchar(155) null, 40 | pprice decimal(10,2) , 41 | create_time timestamp default CURRENT_TIMESTAMP not null, 42 | modify_time timestamp default CURRENT_TIMESTAMP not null, 43 | primary key (pid) 44 | ); 45 | 46 | CREATE TRIGGER update_user_modify_time BEFORE UPDATE 47 | ON inventory.product FOR EACH ROW EXECUTE PROCEDURE 48 | update_modify_time_column(); 49 | 50 | -- insert data 51 | insert into inventory.product(pid,pname,pprice) values 52 | ('1','prodcut-001',125.12), 53 | ('2','prodcut-002',225.31); 54 | 55 | -- create order table 56 | drop table if exists inventory.user_order; 57 | create table if not exists inventory.user_order 58 | ( 59 | id serial, 60 | oid varchar(155) not null, 61 | uid int , 62 | pid int , 63 | onum int , 64 | create_time timestamp default CURRENT_TIMESTAMP not null, 65 | modify_time timestamp default CURRENT_TIMESTAMP not null, 66 | primary key (id) 67 | ); 68 | 69 | CREATE TRIGGER update_user_modify_time BEFORE UPDATE 70 | ON inventory.user_order FOR EACH ROW EXECUTE PROCEDURE 71 | update_modify_time_column(); 72 | 73 | -- insert data 74 | insert into user_order(oid,uid,pid,onum) values 75 | ('o10001',1,1,100), 76 | ('o10002',1,2,30), 77 | ('o10001',2,1,22), 78 | ('o10002',2,2,16); 79 | 80 | -- select data 81 | select * from user; 82 | select * from product; 83 | select * from user_order; 84 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/ingest.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | 3 | -- config: easy_sql.flink_tables_file_path=ods.flink_tables.json 4 | -- config: easy_sql.func_file_path=ingest_funcs.py 5 | -- config: easy_sql.etl_type=streaming 6 | 7 | -- config: flink.cmd=-pyexec python3 8 | -- config: flink.cmd=-pyclientexec python3 9 | -- config: flink.cmd=-t remote 10 | -- config: flink.execution.checkpointing.interval=3s 11 | -- config: flink.pipeline.jars=../lib/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;../lib/flink/jars/hudi-flink1.15-bundle-0.12.2.jar;sales/ods/easysql-example-ingest.jar;sales/ods/jars/flink-connector-jdbc-1.15.2.jar;sales/ods/jars/postgresql-42.2.14.jar 12 | -- config: flink.pipeline.name=sales.ingest 13 | 14 | -- target=variables 15 | select 16 | 'append' as __save_mode__ 17 | , 'inventory.user,inventory.product,inventory.user_order' as tables_ 18 | 19 | -- target=func.ingest_cdc_pg(${__backend__}, db_pg, connector_cdc, ${tables_}, sales) 20 | 
-------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/ingest.test.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | 3 | -- config: easy_sql.flink_tables_file_path=ods.flink_tables.json 4 | -- config: easy_sql.etl_type=batch 5 | 6 | -- config: flink.cmd=-pyexec python3 7 | -- config: flink.cmd=-pyclientexec python3 8 | -- config: flink.cmd=-t remote 9 | -- config: flink.execution.checkpointing.interval=3s 10 | -- config: flink.pipeline.jars=../lib/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;../lib/flink/jars/hudi-flink1.15-bundle-0.12.2.jar 11 | 12 | -- target=variables 13 | select DATE_FORMAT(now(), 'yyyy-MM-dd') as TODAY; 14 | 15 | -- target=func.exec_sql_in_source(${__step__}, db_pg, connector_jdbc) 16 | -- prepare data to ingest 17 | drop schema if exists ingest_test_sales cascade; 18 | create schema ingest_test_sales; 19 | create table ingest_test_sales.order (id int, product_id int, created_at timestamp, updated_at timestamp, primary key (id)); 20 | create table ingest_test_sales.product (id int, name text, category text, created_at timestamp, updated_at timestamp, primary key (id)); 21 | insert into ingest_test_sales.product values (1, 'p1', 'c1', '${TODAY} 00:00:00', '${TODAY} 00:00:00'), (2, 'p2', 'c2', '${TODAY} 00:00:01', '${TODAY} 00:00:01'); 22 | insert into ingest_test_sales.order values (1, 1, '${TODAY} 00:00:01', '${TODAY} 00:00:01'), (2, 1, '${TODAY} 00:00:01', '${TODAY} 00:00:01'), (3, 1, '${TODAY} 00:00:01', '${TODAY} 00:00:01'); 23 | insert into ingest_test_sales.order values (4, 2, '${TODAY} 00:00:01', '${TODAY} 00:00:01'), (5, 2, '${TODAY} 00:00:01', '${TODAY} 00:00:01'); 24 | 25 | -- target=func.test_run_etl(${__config__}, ingest.sql) 26 | 27 | -- target=func.sleep(10) 28 | 29 | 30 | -- target=check.ensure_product_data_ingested 31 | select 32 | 2 as expected 33 | , count(1) as actual 34 | from ods_rt_sales.ingest_test_sales_product 35 | 36 | -- target=check.ensure_order_data_ingested 37 | select 38 | 5 as expected 39 | , count(1) as actual 40 | from ods_rt_sales.ingest_test_sales_order 41 | 42 | 43 | -- target=func.exec_sql_in_source(${__step__}, db_pg, connector_jdbc) 44 | -- prepare data to ingest 45 | insert into ingest_test_sales.product values (3, 'p3', 'c3', '${TODAY} 00:00:00', '${TODAY} 00:00:00'); 46 | insert into ingest_test_sales.order values (6, 2, '${TODAY} 00:00:01', '${TODAY} 00:00:01'); 47 | 48 | -- target=func.sleep(5) 49 | 50 | 51 | -- target=check.ensure_product_data_ingested 52 | select 53 | 3 as expected 54 | , count(1) as actual 55 | from ods_rt_sales.ingest_test_sales_product 56 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/ingest_funcs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | from typing import TYPE_CHECKING, Dict 5 | 6 | from easy_sql.sql_processor.backend.base import SaveMode, TableMeta 7 | 8 | if TYPE_CHECKING: 9 | from easy_sql.sql_processor.backend import FlinkBackend 10 | 11 | 12 | __all__ = ["ingest_cdc_pg"] 13 | 14 | 15 | def ingest_cdc_pg(backend: FlinkBackend, db: str, connector: str, table_list: str, domain: str): 16 | db_config = backend.flink_tables_config.database(db) 17 | if not db_config: 18 | raise Exception("Db not configured: " + db) 19 | connector_config = backend.flink_tables_config.connector(db_config, 
connector_name=connector) 20 | if not connector_config: 21 | raise Exception(f"Connector {connector} not configured for db {db}") 22 | target_tables = {table: f'ods_{domain}.{domain}_{table.split(".")[1]}' for table in table_list.split(",")} 23 | table_with_fields_list = [ 24 | { 25 | "name": table, 26 | "schemaRefTableName": target_table, 27 | "fields": backend.flink_tables_config.table_fields(target_table, ["_di"]), 28 | } 29 | for table, target_table in target_tables.items() 30 | ] 31 | 32 | backend.register_tables() 33 | 34 | from py4j.java_gateway import java_import 35 | from pyflink.java_gateway import get_gateway 36 | 37 | gw = get_gateway() 38 | java_import(gw.jvm, "com.easysql.example.Sources") 39 | readPgCDC = eval( 40 | "gw.jvm.com.easysql.example.Sources.readPgCDC", 41 | { 42 | "gw": gw, 43 | }, 44 | ) 45 | _j_env = backend.flink_stream_env._j_stream_execution_environment # type: ignore 46 | result_tables: Dict[str, str] = readPgCDC( 47 | _j_env, backend.flink._j_tenv, connector_config["options"], json.dumps(table_with_fields_list) 48 | ) 49 | 50 | ingest_tables = { 51 | f'ods_{domain}.{domain}_{table.split(".")[1]}': read_temp_table 52 | for table, read_temp_table in result_tables.items() 53 | } 54 | for hudi_table, read_temp_table in ingest_tables.items(): 55 | backend.exec_native_sql_query(f"select * from {read_temp_table}").print_schema() 56 | table_with_partition = backend.exec_native_sql_query( 57 | f"select *, from_unixtime(_op_ts / 1000, 'yyyyMMdd') as _di from {read_temp_table}" 58 | ) 59 | backend.flink.create_temporary_view(read_temp_table, table_with_partition) 60 | assert "." not in read_temp_table 61 | backend.save_table(TableMeta(read_temp_table), TableMeta(hudi_table), SaveMode.append) 62 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/ingest_hudi.sql: -------------------------------------------------------------------------------- 1 | -- config: easy_sql.spark_submit=spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2 2 | -- config: easy_sql.func_file_path=ingest_hudi_funcs.py 3 | -- config: easy_sql.etl_type=streaming 4 | 5 | -- config: spark.serializer=org.apache.spark.serializer.KryoSerializer 6 | -- config: spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension 7 | -- config: spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog 8 | 9 | -- target=func.read_kafka(pgcdc) 10 | -- target=func.write_hudi(pgcdc) 11 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/ingest_hudi.test.sql: -------------------------------------------------------------------------------- 1 | -- config: easy_sql.spark_submit=spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2 2 | -- config: easy_sql.func_file_path=ingest_hudi_funcs.py 3 | -- config: easy_sql.etl_type=batch 4 | 5 | -- config: spark.serializer=org.apache.spark.serializer.KryoSerializer 6 | -- config: spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension 7 | -- config: spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog 8 | 9 | -- target=func.read_hudi(user) 10 | -- target=log.sample_user 11 | select * from user order by _dt; 12 | -------------------------------------------------------------------------------- 
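
The ingest_cdc_pg function above derives Hudi target table names from the source tables with a simple convention: schema.table in Postgres becomes ods_<domain>.<domain>_<table>. A standalone sketch of that mapping, using the table list from ingest.sql and mirroring the dict comprehension in ingest_funcs.py:

# Mirrors the target-table naming used in ingest_cdc_pg above.
table_list = "inventory.user,inventory.product,inventory.user_order"
domain = "sales"
target_tables = {t: f'ods_{domain}.{domain}_{t.split(".")[1]}' for t in table_list.split(",")}
print(target_tables)
# {'inventory.user': 'ods_sales.sales_user',
#  'inventory.product': 'ods_sales.sales_product',
#  'inventory.user_order': 'ods_sales.sales_user_order'}

These are exactly the table names declared under ods_sales in ods.flink_tables.json below.
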
/examples/rtdw/workflow/sales/ods/ods.flink_tables.json: -------------------------------------------------------------------------------- 1 | { 2 | "catalogs": [ 3 | { 4 | "name": "myhiveCatalog", 5 | "type": "hive", 6 | "hive-conf-dir": "test/flink/flink_hive_conf" 7 | }, 8 | { 9 | "name": "myhiveCatalog_1", 10 | "type": "hive", 11 | "hive-conf-dir": "test/flink/flink_hive_conf" 12 | }, 13 | { 14 | "name": "testpg", 15 | "type": "jdbc", 16 | "default-database": "postgres", 17 | "username": "postgres", 18 | "password": "123456", 19 | "base-url": "jdbc:postgresql://testpg:15432" 20 | } 21 | ], 22 | "databases": [ 23 | { 24 | "name": "db_pg", 25 | "connectors": [ 26 | { 27 | "name": "connector_jdbc", 28 | "options": { 29 | "connector": "jdbc", 30 | "url": "jdbc:postgresql://testpg:15432/postgres", 31 | "username": "postgres", 32 | "password": "123456" 33 | } 34 | }, 35 | { 36 | "name": "connector_cdc", 37 | "options": { 38 | "connector": "postgres-cdc", 39 | "hostname": "testpg", 40 | "port": "15432", 41 | "database-name": "postgres", 42 | "slot.name": "cdc_slot", 43 | "decoding.plugin.name": "pgoutput", 44 | "slot.drop.on.stop": "true", 45 | "schema-name": "sample", 46 | "username": "postgres", 47 | "password": "123456" 48 | } 49 | } 50 | ] 51 | }, 52 | { 53 | "name": "ods_sales", 54 | "connectors": [ 55 | { 56 | "name": "connector_hudi", 57 | "options": { 58 | "connector": "hudi", 59 | "path": "/tmp/hudi-flink-test/ods_sales.db", 60 | "table.type": "COPY_ON_WRITE", 61 | "precombine.field": "_op_ts", 62 | "changelog.enabled": true, 63 | "compaction.async.enabled": false 64 | } 65 | } 66 | ], 67 | "tables": [ 68 | { 69 | "name": "sales_user", 70 | "connector": { 71 | "name": "connector_hudi" 72 | }, 73 | "partition_by": ["_di"], 74 | "schema": [ 75 | "id INT NOT NULL PRIMARY KEY NOT ENFORCED", 76 | "name VARCHAR", 77 | "device_model VARCHAR", 78 | "email VARCHAR", 79 | "phone VARCHAR", 80 | "create_time timestamp", 81 | "modify_time timestamp", 82 | "_di INT", 83 | "_op_ts BIGINT" 84 | ] 85 | }, 86 | { 87 | "name": "sales_product", 88 | "connector": { 89 | "name": "connector_hudi" 90 | }, 91 | "partition_by": ["_di"], 92 | "schema": [ 93 | "pid INT NOT NULL PRIMARY KEY NOT ENFORCED", 94 | "pname VARCHAR", 95 | "pprice decimal", 96 | "phone VARCHAR", 97 | "create_time timestamp", 98 | "modify_time timestamp", 99 | "_di INT", 100 | "_op_ts BIGINT" 101 | ] 102 | }, 103 | { 104 | "name": "sales_user_order", 105 | "connector": { 106 | "name": "connector_hudi" 107 | }, 108 | "partition_by": ["_di"], 109 | "schema": [ 110 | "id INT NOT NULL PRIMARY KEY NOT ENFORCED", 111 | "oid VARCHAR", 112 | "uid INT", 113 | "pid INT", 114 | "onum INT", 115 | "create_time timestamp", 116 | "modify_time timestamp", 117 | "_di INT", 118 | "_op_ts BIGINT" 119 | ] 120 | } 121 | ] 122 | } 123 | ], 124 | "table_list": ["a"], 125 | "schema_list": ["a"] 126 | } 127 | -------------------------------------------------------------------------------- /examples/rtdw/workflow/sales/ods/register-pg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "inventory-connector", 3 | "config": { 4 | "connector.class": "io.debezium.connector.postgresql.PostgresConnector", 5 | "tasks.max": "1", 6 | "database.hostname": "testpg", 7 | "database.port": "15432", 8 | "database.user": "postgres", 9 | "database.password": "123456", 10 | "database.dbname": "postgres", 11 | "database.server.name": "dbserver1", 12 | "schema.include.list": "inventory" 13 | } 14 | } 15 | 
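
For reference, ods.flink_tables.json above keeps catalogs at the top level, and databases as a list in which each entry holds its connectors and, optionally, its table schemas; backend.flink_tables_config in ingest_funcs.py reads the same structure. A minimal sketch that walks the file with plain json (not Easy SQL's own loader):

import json

# List connectors and tables per database declared in ods.flink_tables.json.
with open("examples/rtdw/workflow/sales/ods/ods.flink_tables.json") as f:
    cfg = json.load(f)

for db in cfg["databases"]:
    connectors = [c["name"] for c in db.get("connectors", [])]
    tables = [t["name"] for t in db.get("tables", [])]
    print(db["name"], "connectors:", connectors, "tables:", tables)
# db_pg connectors: ['connector_jdbc', 'connector_cdc'] tables: []
# ods_sales connectors: ['connector_hudi'] tables: ['sales_user', 'sales_product', 'sales_user_order']
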
-------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [repositories] 2 | [repositories.aliyun] 3 | url = "https://mirrors.aliyun.com/pypi/simple/" 4 | 5 | [repositories.testpypi] 6 | url = "https://test.pypi.org/legacy/" 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "easy_sql-easy_sql" 3 | version = "1.1.0" 4 | description = "A library developed to ease the data ETL development process." 5 | authors = ["Easy SQL from Thoughtworks "] 6 | license = "Apache-2.0" 7 | readme = "README.md" 8 | repository = "https://github.com/easysql/easy_sql" 9 | homepage = "https://easy-sql.readthedocs.io" 10 | 11 | classifiers = [ 12 | "Programming Language :: Python :: 3.8", 13 | "License :: OSI Approved :: Apache Software License", 14 | "Operating System :: MacOS", 15 | "Operating System :: POSIX :: Linux", 16 | ] 17 | packages = [ 18 | { include = "easy_sql" }, 19 | ] 20 | 21 | [tool.poetry.urls] 22 | "Bug Tracker" = "https://github.com/easysql/easy_sql/issues" 23 | 24 | [tool.poetry.dependencies] 25 | python = "^3.7" 26 | click = {version = "^8.1.3", optional = true} 27 | regex = {version = "^2022.7.25", optional = true} 28 | colorlog = {version = "^6.6.0", optional = true} 29 | sqlfluff = {version = "~1.4.5", optional = true} 30 | SQLAlchemy = {version = "^1.4.40", optional = true} 31 | clickhouse-driver = {version = "^0.2.4", optional = true} 32 | clickhouse-sqlalchemy = {version = "^0.2.1", optional = true} 33 | psycopg2 = {version = "^2.9.3", optional = true} 34 | pyodps = {version = "^0.11.2.1", optional = true} 35 | pyspark = [{version = ">=2.3.0, != 3.1.1, != 3.1.2, != 3.1.3, !=3.2.0, != 3.2.1", optional = true}] 36 | numpy = {version="~1.21.4", python=">=3.7,<3.11", optional=true} 37 | pandas = {version="~1.3", python=">=3.7.1", optional=true} 38 | apache-flink = {version = "^1.17.0", optional = true} 39 | ydata-profiling = {version = "^4.2.0", optional = true, python = ">=3.8,<3.12"} 40 | pyyaml = {version = "^6.0", optional = true} 41 | pymongo = "^3.8.0" 42 | 43 | [tool.poetry.group.test.dependencies] 44 | pytest = "^7.1.2" 45 | coverage = "^6.4.3" 46 | openpyxl = "^3.0.10" 47 | 48 | [tool.poetry.group.dev.dependencies] 49 | pre-commit = "^2.20.0" 50 | flake8 = {version = "^6.0.0", python = ">=3.8.1"} 51 | flake8-bugbear = {version = "^23.5.9", python = ">=3.8.1"} 52 | flake8-comprehensions = "^3.12.0" 53 | flake8-simplify = "^0.20.0" 54 | flake8-type-checking = {version = "^2.4.0", python = ">=3.8"} 55 | 56 | [tool.poetry.extras] 57 | cli = ["click"] 58 | linter = ["sqlfluff","colorlog","regex"] 59 | spark = ["pyspark"] 60 | pg = ["SQLAlchemy", "psycopg2"] 61 | clickhouse = ["SQLAlchemy","clickhouse-driver","clickhouse-sqlalchemy"] 62 | maxcompute = ["pyodps"] 63 | flink = ["apache-flink", "pyyaml"] 64 | ydata-profiling=["ydata-profiling"] 65 | 66 | [tool.isort] 67 | profile = "black" 68 | src_paths = ["easy_sql"] 69 | 70 | [tool.black] 71 | line-length = 120 72 | preview = true 73 | 74 | [tool.pytest.ini_options] 75 | testpaths = [ 76 | "easy_sql", 77 | ] 78 | python_files = [ 79 | "*_itest.py", 80 | "*_test.py", 81 | ] 82 | 83 | [build-system] 84 | requires = ["poetry-core>=1.0.0"] 85 | build-backend = "poetry.core.masonry.api" 86 | 
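
The optional dependencies above are grouped into install extras (cli, linter, spark, pg, clickhouse, maxcompute, flink, ydata-profiling), which is what makes installs like pip install 'easy-sql-easy-sql[spark,pg,clickhouse,cli]' in test/Dockerfile work. A small sketch that prints the extras table straight from pyproject.toml (assumes Python 3.11+ for the stdlib tomllib; older versions can use the third-party tomli package, which exposes the same API):

import tomllib  # Python 3.11+; on older versions: import tomli as tomllib

# Show which optional packages each install extra pulls in.
with open("pyproject.toml", "rb") as f:
    pyproject = tomllib.load(f)

for extra, packages in pyproject["tool"]["poetry"]["extras"].items():
    print(f"{extra}: {', '.join(packages)}")
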
-------------------------------------------------------------------------------- /requirements-all.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 ; python_version >= "3.7" and python_version < "4.0" 2 | attrs==22.1.0 ; python_version >= "3.7" and python_version < "4.0" 3 | backports-cached-property==1.0.2 ; python_version >= "3.7" and python_version < "3.8" 4 | backports-zoneinfo==0.2.1 ; python_version >= "3.7" and python_version < "3.9" 5 | certifi==2022.6.15 ; python_version >= "3.7" and python_version < "4" 6 | chardet==5.0.0 ; python_version >= "3.7" and python_version < "4.0" 7 | charset-normalizer==2.1.1 ; python_version >= "3.7" and python_version < "4" 8 | click==8.1.3 ; python_version >= "3.7" and python_version < "4.0" 9 | clickhouse-driver==0.2.4 ; python_version >= "3.7" and python_version < "4" 10 | clickhouse-sqlalchemy==0.2.2 ; python_version >= "3.7" and python_version < "4" 11 | colorama==0.4.5 ; python_version >= "3.7" and python_version < "4.0" 12 | colorlog==6.7.0 ; python_version >= "3.7" and python_version < "4.0" 13 | diff-cover==6.5.1 ; python_version >= "3.7" and python_version < "4.0" 14 | greenlet==1.1.3 ; python_version >= "3.7" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0" 15 | idna==3.3 ; python_version >= "3.7" and python_version < "4" 16 | importlib-metadata==4.2.0 ; python_version >= "3.7" and python_version < "3.8" 17 | iniconfig==1.1.1 ; python_version >= "3.7" and python_version < "4.0" 18 | jinja2==3.1.2 ; python_version >= "3.7" and python_version < "4.0" 19 | markupsafe==2.1.1 ; python_version >= "3.7" and python_version < "4.0" 20 | packaging==21.3 ; python_version >= "3.7" and python_version < "4.0" 21 | pathspec==0.10.1 ; python_version >= "3.7" and python_version < "4.0" 22 | pluggy==1.0.0 ; python_version >= "3.7" and python_version < "4.0" 23 | psycopg2==2.9.3 ; python_version >= "3.7" and python_version < "4.0" 24 | py4j==0.10.9.5 ; python_version >= "3.7" and python_version < "4.0" 25 | py==1.11.0 ; python_version >= "3.7" and python_version < "4.0" 26 | pygments==2.13.0 ; python_version >= "3.7" and python_version < "4.0" 27 | pyodps==0.11.2.1 ; python_version >= "3.7" and python_version < "4.0" 28 | pyparsing==3.0.9 ; python_version >= "3.7" and python_version < "4.0" 29 | pyspark==3.3.0 ; python_version >= "3.7" and python_version < "4.0" 30 | pytest==7.1.3 ; python_version >= "3.7" and python_version < "4.0" 31 | pytz-deprecation-shim==0.1.0.post0 ; python_version >= "3.7" and python_version < "4" 32 | pytz==2022.2.1 ; python_version >= "3.7" and python_version < "4" 33 | pyyaml==6.0 ; python_version >= "3.7" and python_version < "4.0" 34 | regex==2022.8.17 ; python_version >= "3.7" and python_version < "4.0" 35 | requests==2.28.1 ; python_version >= "3.7" and python_version < "4" 36 | setuptools==65.3.0 ; python_version >= "3.7" and python_version < "3.8" 37 | sqlalchemy==1.4.40 ; python_version >= "3.7" and python_version < "4.0" 38 | sqlfluff==1.2.1 ; python_version >= "3.7" and python_version < "4.0" 39 | tblib==1.7.0 ; python_version >= "3.7" and python_version < "4.0" 40 | toml==0.10.2 ; python_version >= "3.7" and python_version < "4.0" 41 | tomli==2.0.1 ; python_version >= "3.7" and python_version < "4.0" 42 | tqdm==4.64.0 ; python_version >= "3.7" and python_version < 
"4.0" 43 | typing-extensions==4.3.0 ; python_version >= "3.7" and python_version < "4.0" 44 | tzdata==2022.2 ; python_version >= "3.7" and python_version < "4" 45 | tzlocal==4.2 ; python_version >= "3.7" and python_version < "4" 46 | urllib3==1.26.12 ; python_version >= "3.7" and python_version < "4" 47 | zipp==3.8.1 ; python_version >= "3.7" and python_version < "3.8" 48 | -------------------------------------------------------------------------------- /test/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nikolaik/python-nodejs:python3.8-nodejs12 2 | 3 | RUN apt-get update && apt-get install -y vim wget openjdk-11-jdk zip unzip lsof less 4 | 5 | WORKDIR /tmp 6 | 7 | ADD sample_etl.spark.sql /tmp 8 | ADD sample_etl.postgres.sql /tmp 9 | ADD sample_etl.clickhouse.sql /tmp 10 | 11 | RUN python3 -m pip install 'easy-sql-easy-sql[spark,pg,clickhouse,cli]' 12 | 13 | ARG PG_URL= 14 | ARG CLICKHOUSE_URL= 15 | 16 | RUN bash -c "$(python3 -m easy_sql.data_process -f sample_etl.spark.sql -p)" 17 | RUN PG_URL=$PG_URL python3 -m easy_sql.data_process -f sample_etl.postgres.sql 18 | RUN CLICKHOUSE_URL=$CLICKHOUSE_URL python3 -m easy_sql.data_process -f sample_etl.clickhouse.sql 19 | -------------------------------------------------------------------------------- /test/customized_func/customized_func.py: -------------------------------------------------------------------------------- 1 | __all__ = ["count_partitions"] 2 | 3 | 4 | def count_partitions(table_name: str) -> int: 5 | from pyspark.sql import SparkSession 6 | 7 | spark: SparkSession = SparkSession.builder.getOrCreate() 8 | partitions = spark.sql(f"show partitions {table_name}").collect() 9 | return len(partitions) 10 | -------------------------------------------------------------------------------- /test/customized_func/etl_with_customized_func.sql: -------------------------------------------------------------------------------- 1 | -- backend: spark 2 | -- config: easy_sql.func_file_path=customized_func.py 3 | 4 | -- target=action.define_table 5 | create table some_table partitioned by (pt) as 6 | select * from ( 7 | select 1 as a, 2 as b, 1 as pt 8 | union 9 | select 1 as a, 2 as b, 2 as pt 10 | ) t 11 | 12 | -- target=log.partition_count 13 | select ${count_partitions(some_table)} as partition_count 14 | -------------------------------------------------------------------------------- /test/doc/debugging.sql: -------------------------------------------------------------------------------- 1 | -- prepare-sql: drop database if exists sample cascade 2 | -- prepare-sql: create database sample 3 | -- prepare-sql: create table sample.order_table as select 1 as id, '1' as val 4 | -- prepare-sql: create table sample.order_table_after_joined as select 1 as id, '1' as val 5 | 6 | -- target=variables 7 | select 8 | 3 as c 9 | 10 | -- target=log.i_would_like_to_log_something 11 | select 12 | 1 as a 13 | , 2 as b 14 | , ${c} as c 15 | 16 | -- target=log.order_count 17 | select 18 | count(1) 19 | from sample.order_table 20 | 21 | -- target=check.order_count_must_be_equal_after_joined_product 22 | select 23 | (select count(1) from sample.order_table) as expected 24 | , (select count(1) from sample.order_table_after_joined) as actual 25 | 26 | -- target=check.equal(${c}, 3) 27 | -------------------------------------------------------------------------------- /test/doc/test_sqlfulff.sql: -------------------------------------------------------------------------------- 1 | SELECT a+b AS foo, 2 | c AS bar 
from my_table where name = {{ test_name }}; -- noqa: L014,L034 3 | -------------------------------------------------------------------------------- /test/doc/variables.sql: -------------------------------------------------------------------------------- 1 | -- target=variables 2 | select 1 as a, '2' as b 3 | 4 | -- target=variables 5 | select 6 | ${a} as a 7 | , ${b} as b 8 | , 1${a} as a1 9 | , ${a} + ${b} as ab 10 | 11 | -- target=log.variables 12 | select ${a} as a, ${b} as b, ${a1} as a1, ${ab} as ab 13 | -------------------------------------------------------------------------------- /test/etl_test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/test/etl_test.xlsx -------------------------------------------------------------------------------- /test/flink/flink_hive_conf/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | hive.metastore.uris 6 | thrift://localhost:9083 7 | 8 | 9 | -------------------------------------------------------------------------------- /test/sample_data_process.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | from easy_sql.sql_processor import SqlProcessor 6 | 7 | 8 | def run_spark_etl(): 9 | from easy_sql.sql_processor.backend import SparkBackend 10 | 11 | spark = SparkSession.builder.enableHiveSupport().getOrCreate() 12 | backend = SparkBackend(spark) 13 | sql = """ 14 | -- target=log.some_log 15 | select 1 as a 16 | """ 17 | sql_processor = SqlProcessor(backend, sql) 18 | sql_processor.run() 19 | 20 | 21 | def run_postgres_etl(): 22 | from easy_sql.sql_processor.backend.rdb import RdbBackend 23 | 24 | backend = RdbBackend(os.environ["PG_URL"]) 25 | sql = """ 26 | -- target=log.some_log 27 | select 1 as a 28 | """ 29 | sql_processor = SqlProcessor(backend, sql) 30 | sql_processor.run() 31 | 32 | 33 | def run_clickhouse_etl(): 34 | from easy_sql.sql_processor.backend.rdb import RdbBackend 35 | 36 | backend = RdbBackend(os.environ["CLICKHOUSE_URL"]) 37 | sql = """ 38 | -- target=log.some_log 39 | select 1 as a 40 | """ 41 | sql_processor = SqlProcessor(backend, sql) 42 | sql_processor.run() 43 | 44 | 45 | if __name__ == "__main__": 46 | run_spark_etl() 47 | run_postgres_etl() 48 | run_clickhouse_etl() 49 | -------------------------------------------------------------------------------- /test/sample_etl.clickhouse.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "completed": 1, 4 | "default_col_type": "string", 5 | "func_file_paths": [], 6 | "includes": {}, 7 | "inputs": [ 8 | { 9 | "column_types": "[\"int\", \"String\"]", 10 | "columns": "[\"id\", \"val\"]", 11 | "name": "sample.test", 12 | "value_descriptions": [ 13 | "Some sample data for testing" 14 | ], 15 | "values": [ 16 | "[1, \"1\"]" 17 | ] 18 | } 19 | ], 20 | "missed_fields": [], 21 | "name": "test for sample etl", 22 | "outputs": [ 23 | { 24 | "column_types": "[\"int\", \"String\"]", 25 | "columns": "[\"id\", \"val\"]", 26 | "name": "sample.result", 27 | "value_descriptions": [], 28 | "values": [ 29 | "[1, \"1\"]", 30 | "[1, \"2\"]" 31 | ] 32 | } 33 | ], 34 | "simple_sql_name": "sample_etl.clickhouse.sql", 35 | "sql_file_content": null, 36 | "sql_file_path": "test/sample_etl.clickhouse.sql", 37 | "udf_file_paths": [], 38 | "vars": {} 39 | } 40 | ] 41 | 
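
The sample_etl.*.json files in test/ (such as sample_etl.clickhouse.json above) describe test cases for the corresponding sample ETLs: each case names the ETL file, its input tables with typed sample rows, and the expected output rows, with a matching .xlsx file next to each one. A minimal sketch that summarizes one case with plain json (column lists are stored as JSON-encoded strings, hence the nested json.loads):

import json

# Summarize the test case described in test/sample_etl.clickhouse.json.
with open("test/sample_etl.clickhouse.json") as f:
    cases = json.load(f)

for case in cases:
    print(case["name"], "->", case["sql_file_path"])
    for table in case["inputs"]:
        print("  input:   ", table["name"], json.loads(table["columns"]), table["values"])
    for table in case["outputs"]:
        print("  expected:", table["name"], table["values"])
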
-------------------------------------------------------------------------------- /test/sample_etl.clickhouse.sql: -------------------------------------------------------------------------------- 1 | -- backend: clickhouse 2 | -- prepare-sql: drop database if exists sample 3 | -- prepare-sql: create database sample 4 | -- prepare-sql: create table sample.test engine MergeTree() order by tuple() as select 1 as id, '1' as val 5 | 6 | -- target=variables 7 | select 1 as __create_output_table__ 8 | 9 | -- target=variables 10 | select 1 as a 11 | 12 | -- target=log.a 13 | select '${a}' as a 14 | 15 | -- target=log.test_log 16 | select 1 as some_log 17 | 18 | -- target=check.should_equal 19 | select 1 as actual, 1 as expected 20 | 21 | -- target=temp.result 22 | select 23 | ${a} as id, cast(${a} + 1 as text) as val 24 | union all 25 | select id, val from sample.test 26 | 27 | -- target=output.sample.result 28 | select * from result 29 | 30 | -- target=log.sample_result 31 | select * from result 32 | -------------------------------------------------------------------------------- /test/sample_etl.clickhouse.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/test/sample_etl.clickhouse.xlsx -------------------------------------------------------------------------------- /test/sample_etl.flink.hive.postgres.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | -- config: easy_sql.flink_tables_file_path=sample_etl.flink_tables_file_hive.yml 3 | -- inputs: db_pg.source_1 4 | 5 | -- target=variables 6 | select 2 as a 7 | 8 | -- target=log.a 9 | select '${a}' as a 10 | 11 | -- target=log.test_log 12 | select 1 as some_log 13 | 14 | -- target=check.should_equal 15 | select 1 as actual, 1 as expected 16 | 17 | -- target=temp.result_view 18 | select 19 | ${a} as id, 20 | '2' as val 21 | union all 22 | select id, val from myhiveCatalog.default.hive_table 23 | union all 24 | select id, val from db_pg.source_1 25 | 26 | -- target=output.myhiveCatalog.default.hive_out_table 27 | select id, val from result_view 28 | 29 | -- target=log.sample_result 30 | select * from result_view 31 | -------------------------------------------------------------------------------- /test/sample_etl.flink.hive.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | -- config: easy_sql.flink_tables_file_path=sample_etl.flink_tables_file_hive.yml 3 | -- config: flink.cmd=-t local 4 | -- config: flink.cmd=--parallelism 2 5 | -- config: flink.python.fn-execution.bundle.size=1000 6 | -- config: flink.python.client.executable=python 7 | -- config: flink.jobmanager.memory.process.size=1024m 8 | -- config: flink.taskmanager.memory.process.size=4096m 9 | 10 | -- target=variables 11 | select 2 as a 12 | 13 | -- target=log.a 14 | select '${a}' as a 15 | 16 | -- target=log.test_log 17 | select 1 as some_log 18 | 19 | -- target=check.should_equal 20 | select 1 as actual, 1 as expected 21 | 22 | -- target=temp.result_view 23 | select 24 | ${a} as id, 25 | '2' as val 26 | union all 27 | select id, val from myhiveCatalog.default.hive_table 28 | 29 | -- target=output.myhiveCatalog_1.default.hive_out_table 30 | select id, val from result_view 31 | 32 | -- target=log.sample_result 33 | select * from result_view 34 | -------------------------------------------------------------------------------- 
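
Each of the sample ETL files here can be submitted through the easy_sql.data_process entry point with the -f flag, as done in test/Dockerfile. A sketch of the same invocation from Python via subprocess, using the local-mode Flink + Hive sample above (assumes the flink extra is installed and a Hive metastore is reachable as configured in test/flink/flink_hive_conf/hive-site.xml):

import subprocess
import sys

# Equivalent to: python3 -m easy_sql.data_process -f test/sample_etl.flink.hive.sql
subprocess.run(
    [sys.executable, "-m", "easy_sql.data_process", "-f", "test/sample_etl.flink.hive.sql"],
    check=True,
)
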
/test/sample_etl.flink.hudi-agg.sql: -------------------------------------------------------------------------------- 1 | -- Preparation: refer sample_etl.flink.postgres-hudi.sql 2 | -- 3 | -- Verification: 4 | -- 1. verify there are two rows in hudi table: 5 | -- - start sql client: /usr/local/lib/python3.8/site-packages/pyflink/bin/sql-client.sh embedded -j test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar shell 6 | -- - emit sql: create table hudi_agg (val varchar NOT NULL PRIMARY KEY NOT ENFORCED, val_count bigint) WITH ( 7 | -- 'connector' = 'hudi' , 'path' = '/tmp/hudi-flink-test/db_hudi.db/target_hudi_agg' , 'table.type' = 'MERGE_ON_READ' , 'changelog.enabled' = 'True' , 'compaction.async.enabled' = 'False' 8 | -- ); 9 | -- - emit sql: select * from hudi_agg; 10 | -- 2. insert data into sample.test and check if it aggregates correctly in hudi table 11 | -- 12 | -- Cleanup: 13 | -- 1. cancel applicaiton from flink dashboard (http://localhost:8081/) 14 | 15 | 16 | -- backend: flink 17 | 18 | -- config: easy_sql.flink_tables_file_path=test/sample_etl.flink_tables_file.yml 19 | -- config: easy_sql.etl_type=streaming 20 | 21 | -- config: flink.cmd=-pyexec python3 22 | -- config: flink.cmd=-pyclientexec python3 23 | -- config: flink.cmd=-t remote 24 | -- config: flink.execution.checkpointing.interval=3s 25 | -- config: flink.pipeline.jars=test/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar 26 | 27 | -- prepare-sql: drop schema if exists sample cascade 28 | -- prepare-sql: create schema sample 29 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 30 | 31 | -- inputs: db_pg.source_cdc 32 | -- add db_pg.target_1 below to allow the prepare-sql command to execute against. 33 | -- outputs: db_hudi.target_hudi_agg, db_pg.target_1 34 | 35 | -- target=variables 36 | select 37 | 'append' as __save_mode__ 38 | 39 | -- target=temp.result_view 40 | select 41 | 2 as id 42 | ,'2' as val 43 | union all 44 | select id, val from db_pg.source_cdc 45 | 46 | -- target=output.db_hudi.target_hudi_agg 47 | select val, count(*) as val_count from result_view group by val 48 | -------------------------------------------------------------------------------- /test/sample_etl.flink.postgres-cdc.multi-sink.sql: -------------------------------------------------------------------------------- 1 | -- Preparation: 2 | -- 1. start a local flink cluster: your/site-packages/path/pyflink/bin/start-cluster.sh 3 | -- 2. ensure postgres started with configuration: `wal_level=logical` (in file /var/lib/postgresql/data/postgresql.conf) 4 | -- 3. use remote mode to run flink application: configure `flink.cmd=-t remote` (already done below) 5 | -- 6 | -- Verification: 7 | -- 1. verify there are two rows in postgres table public.output_table 8 | -- 2. verify there are two rows in postgres table public.output_table_agg 9 | -- 3. insert data into sample.test and check if it shows up in public.output_table and aggregates correctly in public.output_table_agg 10 | -- 11 | -- Cleanup: 12 | -- 1. 
cancel applicaiton from flink dashboard (http://localhost:8081/) 13 | 14 | 15 | -- backend: flink 16 | 17 | -- config: easy_sql.flink_tables_file_path=test/sample_etl.flink_tables_file.yml 18 | -- config: easy_sql.etl_type=streaming 19 | -- config: easy_sql.prepare_sql_connector=connector_1 20 | 21 | -- config: flink.cmd=-pyexec python3 22 | -- config: flink.cmd=-pyclientexec python3 23 | -- config: flink.cmd=-t remote 24 | -- config: flink.pipeline.jars=test/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar;test/flink/jars/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar;test/flink/jars/postgresql-42.2.14.jar;test/flink/jars/flink-connector-jdbc-1.15.1.jar 25 | 26 | -- inputs: db_pg.source_cdc 27 | -- outputs: db_pg.target_1, db_pg.target_agg 28 | 29 | -- prepare-sql: drop schema if exists sample cascade 30 | -- prepare-sql: create schema sample 31 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 32 | -- prepare-sql: drop table if exists public.output_table 33 | -- prepare-sql: create table public.output_table (id int4 PRIMARY KEY, val text) 34 | -- prepare-sql: drop table if exists public.output_table_agg 35 | -- prepare-sql: create table public.output_table_agg (val text PRIMARY KEY, count_val bigint) 36 | 37 | -- target=variables 38 | select 39 | 'append' as __save_mode__ 40 | 41 | -- target=variables 42 | select 2 as a 43 | 44 | -- target=log.a 45 | select '${a}' as a 46 | 47 | -- target=log.test_log 48 | select 1 as some_log 49 | 50 | -- target=check.should_equal 51 | select 1 as actual, 1 as expected 52 | 53 | -- target=temp.result_view 54 | select 55 | ${a} as id, 56 | '2' as val 57 | union all 58 | select id, val from db_pg.source_cdc 59 | 60 | -- target=output.db_pg.target_1 61 | select id, val from result_view 62 | 63 | -- target=output.db_pg.target_agg 64 | select val, count(1) as count_val from result_view group by val 65 | 66 | -- target=func.execute_streaming_inserts() 67 | -- if there are multiple inserts and we call the function above, these inserts will be merged into one job and share streams 68 | -- it takes the optimization method here: https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/dev/table/sql/insert/ 69 | 70 | -- target=log.db_pg__target_1__count 71 | select count(*) from db_pg.target_1 72 | -------------------------------------------------------------------------------- /test/sample_etl.flink.postgres-cdc.sql: -------------------------------------------------------------------------------- 1 | -- Preparation: 2 | -- 1. start a local flink cluster: your/site-packages/path/pyflink/bin/start-cluster.sh 3 | -- 2. ensure postgres started with configuration: `wal_level=logical` (in file /var/lib/postgresql/data/postgresql.conf) 4 | -- 3. use remote mode to run flink application: configure `flink.cmd=-t remote` (already done below) 5 | -- 6 | -- Verification: 7 | -- 1. verify there are two rows in postgres table public.output_table 8 | -- 2. insert data into sample.test and check if it shows up in public.output_table 9 | -- 10 | -- Cleanup: 11 | -- 1. 
cancel applicaiton from flink dashboard (http://localhost:8081/) 12 | 13 | 14 | -- backend: flink 15 | 16 | -- config: easy_sql.flink_tables_file_path=test/sample_etl.flink_tables_file.yml 17 | -- config: easy_sql.etl_type=streaming 18 | -- config: easy_sql.prepare_sql_connector=connector_1 19 | 20 | -- config: flink.cmd=-pyexec python3 21 | -- config: flink.cmd=-pyclientexec python3 22 | -- config: flink.cmd=-t remote 23 | -- config: flink.pipeline.jars=test/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar;test/flink/jars/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar;test/flink/jars/postgresql-42.2.14.jar;test/flink/jars/flink-connector-jdbc-1.15.1.jar 24 | 25 | -- inputs: db_pg.source_cdc 26 | -- outputs: db_pg.target_1 27 | 28 | -- prepare-sql: drop schema if exists sample cascade 29 | -- prepare-sql: create schema sample 30 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 31 | -- prepare-sql: drop table if exists public.output_table 32 | -- prepare-sql: create table public.output_table (id int4 PRIMARY KEY, val text) 33 | 34 | -- target=variables 35 | select 36 | 'append' as __save_mode__ 37 | 38 | -- target=variables 39 | select 2 as a 40 | 41 | -- target=log.a 42 | select '${a}' as a 43 | 44 | -- target=log.test_log 45 | select 1 as some_log 46 | 47 | -- target=check.should_equal 48 | select 1 as actual, 1 as expected 49 | 50 | -- target=temp.result_view 51 | select 52 | ${a} as id, 53 | '2' as val 54 | union all 55 | select id, val from db_pg.source_cdc 56 | 57 | -- target=output.db_pg.target_1 58 | select id, val from result_view 59 | 60 | -- target=log.db_pg__target_1 61 | select * from db_pg.target_1 62 | -------------------------------------------------------------------------------- /test/sample_etl.flink.postgres-hudi.sql: -------------------------------------------------------------------------------- 1 | -- Preparation: 2 | -- 1. download a hadoop release: wget https://dlcdn.apache.org/hadoop/common/hadoop-3.3.5/hadoop-3.3.5.tar.gz 3 | -- 2. set hadoop classpath: tar xf hadoop-3.3.5.tar.gz && export HADOOP_CLASSPATH=$($(pwd)/hadoop-3.3.5/bin/hadoop classpath) 4 | -- 3. start a local flink cluster: your/site-packages/path/pyflink/bin/start-cluster.sh 5 | -- 4. ensure postgres started with configuration: `wal_level=logical` (in file /var/lib/postgresql/data/postgresql.conf) 6 | -- 5. use remote mode to run flink application: configure `flink.cmd=-t remote` (already done below) 7 | -- 8 | -- Verification: 9 | -- 1. verify there are two rows in hudi table /tmp/hudi-flink-test: 10 | -- echo 'drop table if exists hudi_table;create table hudi_table using hudi location "/tmp/hudi-flink-test/db_hudi.db/target_hudi";select * from hudi_table;' | \ 11 | -- spark-sql --packages org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2 \ 12 | -- --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \ 13 | -- --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \ 14 | -- --conf 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog' \ 15 | -- --conf 'spark.driver.extraJavaOptions="-Dderby.system.home=/tmp/spark-warehouse-metastore-hudi -Dderby.stream.error.file=/tmp/spark-warehouse-metastore-hudi.log"' 16 | -- 2. insert data into sample.test and check if it shows up in the hudi table 17 | -- 18 | -- Cleanup: 19 | -- 1. 
cancel applicaiton from flink dashboard (http://localhost:8081/) 20 | 21 | 22 | -- backend: flink 23 | 24 | -- config: easy_sql.flink_tables_file_path=test/sample_etl.flink_tables_file.yml 25 | -- config: easy_sql.etl_type=streaming 26 | -- config: easy_sql.prepare_sql_connector=connector_1 27 | 28 | -- config: flink.cmd=-pyexec python3 29 | -- config: flink.cmd=-pyclientexec python3 30 | -- config: flink.cmd=-t remote 31 | -- config: flink.execution.checkpointing.interval=3s 32 | -- config: flink.pipeline.jars=test/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar;test/flink/jars/flink-sql-connector-hive-3.1.2_2.12-1.15.1.jar;test/flink/jars/postgresql-42.2.14.jar;test/flink/jars/flink-connector-jdbc-1.15.1.jar 33 | 34 | -- inputs: db_pg.source_cdc 35 | -- add db_pg.target_1 below to allow the prepare-sql command to execute against. 36 | -- outputs: db_hudi.target_hudi, db_pg.target_1 37 | 38 | -- prepare-sql: drop schema if exists sample cascade 39 | -- prepare-sql: create schema sample 40 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 41 | 42 | -- target=variables 43 | select 44 | 'append' as __save_mode__ 45 | 46 | -- target=temp.result_view 47 | select 48 | 2 as id, 49 | '2' as val 50 | union all 51 | select id, val from db_pg.source_cdc 52 | 53 | -- target=output.db_hudi.target_hudi 54 | select id, val from result_view 55 | 56 | -- target=func.execute_streaming_inserts() 57 | -- trigger execution of inserts manually, or it will be triggered at the end of the job and the query of db_hudi.target_hudi fails. 58 | 59 | -- hack below as we didn't prepared the hudi table 60 | -- target1=log.db_hudi__target_hudi 61 | -- select * from db_hudi.target_hudi 62 | -------------------------------------------------------------------------------- /test/sample_etl.flink.postgres.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | 3 | -- config: easy_sql.flink_tables_file_path=test/sample_etl.flink_tables_file.yml 4 | -- config: easy_sql.prepare_sql_connector=connector_1 5 | 6 | -- config: flink.cmd=-pyexec python3 7 | -- config: flink.cmd=-pyclientexec python3 8 | -- config: flink.cmd=-t local 9 | -- config: flink.pipeline.jars=test/flink/jars/flink-sql-connector-postgres-cdc-2.3.0.jar;test/flink/jars/hudi-flink1.15-bundle-0.12.2.jar;test/flink/jars/postgresql-42.2.14.jar;test/flink/jars/flink-connector-jdbc-1.15.1.jar 10 | 11 | -- inputs: db_pg.source_1, db_pg.target_1 12 | -- outputs: db_pg.target_1 13 | 14 | -- prepare-sql: drop schema if exists sample cascade 15 | -- prepare-sql: create schema sample 16 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 17 | -- prepare-sql: drop table if exists public.output_table 18 | -- prepare-sql: create table public.output_table (id int4 PRIMARY KEY, val text) 19 | 20 | -- target=variables 21 | select 22 | 'append' as __save_mode__ 23 | 24 | -- target=variables 25 | select 2 as a 26 | 27 | -- target=log.a 28 | select '${a}' as a 29 | 30 | -- target=log.test_log 31 | select 1 as some_log 32 | 33 | -- target=check.should_equal 34 | select 1 as actual, 1 as expected 35 | 36 | -- target=temp.result_view 37 | select 38 | ${a} as id, 39 | '2' as val 40 | union all 41 | select id, val from db_pg.source_1 42 | 43 | -- target=output.db_pg.target_1 44 | select id, val from result_view 45 | 46 | -- target=log.sample_result 47 | select * from result_view 48 | 
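
The Flink samples above all point easy_sql.flink_tables_file_path at the YAML file that follows, which declares connectors plus per-catalog, per-database table schemas. A minimal sketch that lists the declared tables with plain PyYAML (pyyaml ships with the flink extra; this is not Easy SQL's own loader):

import yaml

# List the tables declared in test/sample_etl.flink_tables_file.yml
# and the connector each one uses.
with open("test/sample_etl.flink_tables_file.yml") as f:
    cfg = yaml.safe_load(f)

for db_name, db in cfg["catalogs"]["default_catalog"]["databases"].items():
    for table_name, table in db["tables"].items():
        print(f"{db_name}.{table_name}: connector={table['connector']}")
# db_pg.source_1: connector=connector_1
# db_pg.source_cdc: connector=connector_cdc
# db_pg.target_1: connector=connector_1
# db_pg.target_agg: connector=connector_1
# db_hudi.target_hudi: connector=connector_hudi
# db_hudi.target_hudi_agg: connector=connector_hudi
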
-------------------------------------------------------------------------------- /test/sample_etl.flink_tables_file.yml: -------------------------------------------------------------------------------- 1 | connectors: 2 | connector_1: 3 | options: | 4 | 'connector' = 'jdbc', 5 | 'url' = 'jdbc:postgresql://localhost:5432/postgres', 6 | 'username' = 'postgres', 7 | 'password' = '123456' 8 | connector_cdc: 9 | options: | 10 | 'connector' = 'postgres-cdc', 11 | 'hostname' = 'localhost', 12 | 'port' = '5432', 13 | 'username' = 'postgres', 14 | 'password' = '123456', 15 | 'database-name' = 'postgres', 16 | 'schema-name' = 'sample', 17 | 'decoding.plugin.name' = 'pgoutput' 18 | connector_hudi: 19 | options: | 20 | 'connector' = 'hudi', 21 | 'path' = '/tmp/hudi-flink-test', 22 | 'table.type' = 'MERGE_ON_READ', 23 | 'changelog.enabled' = 'true', 24 | 'compaction.async.enabled' = 'false' 25 | catalogs: 26 | default_catalog: 27 | databases: 28 | db_pg: 29 | tables: 30 | source_1: 31 | connector: connector_1 32 | options: | 33 | 'table-name' = 'sample.test' 34 | partition_by: "id, val" 35 | schema: | 36 | `id` INT, 37 | val VARCHAR, 38 | PRIMARY KEY (id) NOT ENFORCED 39 | source_cdc: 40 | connector: connector_cdc 41 | options: | 42 | 'table-name' = 'test' 43 | partition_by: "id, val" 44 | schema: | 45 | `id` INT, 46 | val VARCHAR, 47 | PRIMARY KEY (id) NOT ENFORCED 48 | target_1: 49 | connector: connector_1 50 | options: | 51 | 'table-name' = 'output_table' 52 | schema: | 53 | `id` INT, 54 | val VARCHAR, 55 | PRIMARY KEY (id) NOT ENFORCED 56 | target_agg: 57 | connector: connector_1 58 | options: | 59 | 'table-name' = 'output_table_agg' 60 | schema: | 61 | count_val BIGINT, 62 | val VARCHAR, 63 | PRIMARY KEY (val) NOT ENFORCED 64 | db_hudi: 65 | tables: 66 | target_hudi: 67 | connector: connector_hudi 68 | schema: | 69 | id INT NOT NULL PRIMARY KEY NOT ENFORCED, 70 | val VARCHAR 71 | target_hudi_agg: 72 | connector: connector_hudi 73 | schema: | 74 | val VARCHAR NOT NULL PRIMARY KEY NOT ENFORCED, 75 | val_count BIGINT NOT NULL 76 | -------------------------------------------------------------------------------- /test/sample_etl.flink_tables_file_hive.yml: -------------------------------------------------------------------------------- 1 | connectors: 2 | connector_1: 3 | options: | 4 | 'connector' = 'jdbc', 5 | 'url' = 'jdbc:postgresql://localhost:5432/postgres', 6 | 'username' = 'postgres', 7 | 'password' = '123456' 8 | connector_cdc: 9 | options: | 10 | 'connector' = 'postgres-cdc', 11 | 'hostname' = 'localhost', 12 | 'port' = '5432', 13 | 'username' = 'postgres', 14 | 'password' = '123456', 15 | 'database-name' = 'postgres', 16 | 'schema-name' = 'sample', 17 | 'decoding.plugin.name' = 'pgoutput' 18 | connector_hudi: 19 | options: | 20 | 'connector' = 'hudi', 21 | 'path' = '/tmp/hudi-flink-test', 22 | 'table.type': 'MERGE_ON_READ', 23 | 'changelog.enabled': 'true', 24 | 'compaction.async.enabled': 'false' 25 | catalogs: 26 | myhiveCatalog: 27 | options: | 28 | 'type' = 'hive', 29 | 'hive-conf-dir' = 'test/flink/flink_hive_conf' 30 | databases: 31 | db_pg: 32 | tables: 33 | source_1: 34 | connector: connector_1 35 | options: | 36 | 'table-name' = 'sample.test' 37 | partition_by: "id, val" 38 | schema: | 39 | `id` INT, 40 | val VARCHAR, 41 | PRIMARY KEY (id) NOT ENFORCED 42 | source_cdc: 43 | connector: connector_cdc 44 | options: | 45 | 'table-name' = 'test' 46 | partition_by: "id, val" 47 | schema: | 48 | `id` INT, 49 | val VARCHAR, 50 | PRIMARY KEY (id) NOT ENFORCED 51 | target_1: 52 | connector: 
connector_1 53 | options: | 54 | 'table-name' = 'output_table' 55 | partition_by: "id, val" 56 | schema: | 57 | `id` INT, 58 | val VARCHAR, 59 | PRIMARY KEY (id) NOT ENFORCED 60 | target_agg: 61 | connector: connector_1 62 | options: | 63 | 'table-name' = 'output_table_agg' 64 | schema: | 65 | count_val BIGINT, 66 | val VARCHAR, 67 | PRIMARY KEY (val) NOT ENFORCED 68 | db_hudi: 69 | tables: 70 | target_hudi: 71 | connector: connector_hudi 72 | schema: | 73 | id INT NOT NULL PRIMARY KEY NOT ENFORCED, 74 | val VARCHAR 75 | target_hudi_agg: 76 | connector: connector_hudi 77 | schema: | 78 | val VARCHAR NOT NULL PRIMARY KEY NOT ENFORCED, 79 | val_count BIGINT NOT NULL 80 | -------------------------------------------------------------------------------- /test/sample_etl.postgres.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "completed": 1, 4 | "default_col_type": "string", 5 | "func_file_paths": [], 6 | "includes": {}, 7 | "inputs": [ 8 | { 9 | "column_types": "[\"int\", \"text\"]", 10 | "columns": "[\"id\", \"val\"]", 11 | "name": "sample.test", 12 | "value_descriptions": [ 13 | "Some sample data for testing" 14 | ], 15 | "values": [ 16 | "[1, \"1\"]" 17 | ] 18 | } 19 | ], 20 | "missed_fields": [], 21 | "name": "test for sample etl", 22 | "outputs": [ 23 | { 24 | "column_types": "[\"int\", \"text\"]", 25 | "columns": "[\"id\", \"val\"]", 26 | "name": "sample.result", 27 | "value_descriptions": [], 28 | "values": [ 29 | "[1, \"1\"]", 30 | "[1, \"2\"]" 31 | ] 32 | } 33 | ], 34 | "simple_sql_name": "sample_etl.postgres.sql", 35 | "sql_file_content": null, 36 | "sql_file_path": "test/sample_etl.postgres.sql", 37 | "udf_file_paths": [], 38 | "vars": {} 39 | } 40 | ] 41 | -------------------------------------------------------------------------------- /test/sample_etl.postgres.sql: -------------------------------------------------------------------------------- 1 | -- backend: postgres 2 | -- prepare-sql: drop schema if exists sample cascade 3 | -- prepare-sql: create schema sample 4 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 5 | 6 | -- target=variables 7 | select true as __create_output_table__ 8 | 9 | -- target=variables 10 | select 1 as a 11 | 12 | -- target=log.a 13 | select '${a}' as a 14 | 15 | -- target=log.test_log 16 | select 1 as some_log 17 | 18 | -- target=check.should_equal 19 | select 1 as actual, 1 as expected 20 | 21 | -- target=temp.result 22 | select 23 | ${a} as id, cast(${a} + 1 as text) as val 24 | union all 25 | select id, val from sample.test 26 | 27 | -- target=output.sample.result 28 | select * from result 29 | 30 | -- target=log.sample_result 31 | select * from result 32 | -------------------------------------------------------------------------------- /test/sample_etl.postgres.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/test/sample_etl.postgres.xlsx -------------------------------------------------------------------------------- /test/sample_etl.spark.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "completed": 1, 4 | "default_col_type": "string", 5 | "func_file_paths": [], 6 | "includes": {}, 7 | "inputs": [ 8 | { 9 | "column_types": "[\"int\", \"string\"]", 10 | "columns": "[\"id\", \"val\"]", 11 | "name": "sample.test", 12 | "value_descriptions": [ 13 | "Some sample data for testing" 14 | ], 15 | 
"values": [ 16 | "[1, \"1\"]" 17 | ] 18 | } 19 | ], 20 | "missed_fields": [], 21 | "name": "test for sample etl", 22 | "outputs": [ 23 | { 24 | "column_types": "[\"int\", \"string\"]", 25 | "columns": "[\"id\", \"val\"]", 26 | "name": "sample.result", 27 | "value_descriptions": [], 28 | "values": [ 29 | "[1, \"1\"]", 30 | "[1, \"2\"]" 31 | ] 32 | } 33 | ], 34 | "simple_sql_name": "sample_etl.spark.sql", 35 | "sql_file_content": null, 36 | "sql_file_path": "test/sample_etl.spark.sql", 37 | "udf_file_paths": [], 38 | "vars": {} 39 | } 40 | ] 41 | -------------------------------------------------------------------------------- /test/sample_etl.spark.sql: -------------------------------------------------------------------------------- 1 | -- prepare-sql: drop database if exists sample cascade 2 | -- prepare-sql: create database sample 3 | -- prepare-sql: create table sample.test as select 1 as id, '1' as val 4 | 5 | -- target=variables 6 | select true as __create_output_table__ 7 | 8 | -- target=variables 9 | select 1 as a 10 | 11 | -- target=log.a 12 | select '${a}' as a 13 | 14 | -- target=log.test_log 15 | select 1 as some_log 16 | 17 | -- target=check.should_equal 18 | select 1 as actual, 1 as expected 19 | 20 | -- target=temp.result 21 | select 22 | ${a} as id, ${a} + 1 as val 23 | union all 24 | select id, val from sample.test 25 | 26 | -- target=output.sample.result 27 | select * from result 28 | 29 | -- target=log.sample_result 30 | select * from result 31 | -------------------------------------------------------------------------------- /test/sample_etl.spark.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/test/sample_etl.spark.xlsx -------------------------------------------------------------------------------- /test/sample_etl.syntax.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/test/sample_etl.syntax.xlsx -------------------------------------------------------------------------------- /test/sample_etl_wps.syntax.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/easysql/easy_sql/b568542617942f347579ff872d976fd2175aa071/test/sample_etl_wps.syntax.xlsx -------------------------------------------------------------------------------- /test/udf/clickhouse/etl_with_udf.sql: -------------------------------------------------------------------------------- 1 | -- backend: clickhouse 2 | -- config: easy_sql.udf_file_path=udf.py 3 | 4 | -- target=log.test_udf 5 | select translate('abcad', 'a', '') as translated_str 6 | -------------------------------------------------------------------------------- /test/udf/clickhouse/udf.py: -------------------------------------------------------------------------------- 1 | def translate(): 2 | return "CREATE FUNCTION IF NOT EXISTS translate AS (input, from, to) -> replaceAll(input, from, to)" 3 | -------------------------------------------------------------------------------- /test/udf/flink-python/etl_with_udf.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | -- config: easy_sql.udf_file_path=udf.py 3 | 4 | -- target=log.test_udf 5 | select test_func(1, 2) as sum_value 6 | -------------------------------------------------------------------------------- 
/test/udf/flink-python/udf.py: -------------------------------------------------------------------------------- 1 | from pyflink.table import DataTypes 2 | from pyflink.table.udf import udf 3 | 4 | __all__ = ["test_func"] 5 | 6 | 7 | @udf(result_type=DataTypes.BIGINT()) 8 | def test_func(a: int, b: int) -> int: 9 | return a + b 10 | -------------------------------------------------------------------------------- /test/udf/flink-scala/.gitignore: -------------------------------------------------------------------------------- 1 | classes/ 2 | *.jar 3 | -------------------------------------------------------------------------------- /test/udf/flink-scala/Makefile: -------------------------------------------------------------------------------- 1 | SCALA_BIN=/usr/local/bin 2 | SCALA_CP="/Users/yuewu/.pyenv/versions/3.8.13/lib/python3.8/site-packages/pyflink/lib/*" 3 | 4 | jar: 5 | - rm -r classes 6 | mkdir -pv classes 7 | ${SCALA_BIN}/scalac -nobootcp -cp ${SCALA_CP} -d classes your/company/*.scala 8 | cd classes && jar -cvf ../udf.jar . 9 | -------------------------------------------------------------------------------- /test/udf/flink-scala/etl_with_udf.sql: -------------------------------------------------------------------------------- 1 | -- backend: flink 2 | -- config: flink.cmd=--jarfile udf.jar 3 | -- config: easy_sql.scala_udf_initializer=your.company.udfs 4 | 5 | -- target=log.test_udf 6 | select test_func(1, 2) as sum_value 7 | -------------------------------------------------------------------------------- /test/udf/flink-scala/your/company/udfs.scala: -------------------------------------------------------------------------------- 1 | package your.company 2 | 3 | import org.apache.flink.table.api._ 4 | import org.apache.flink.table.functions.ScalarFunction 5 | 6 | class TestFunction extends ScalarFunction { 7 | def eval(a: Integer, b: Integer): Integer = { 8 | a + b + 10 9 | } 10 | } 11 | 12 | object udfs { 13 | def initUdfs(flink: TableEnvironment) { 14 | flink.createTemporarySystemFunction("test_func", classOf[TestFunction]) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /test/udf/spark-python/etl_with_udf.sql: -------------------------------------------------------------------------------- 1 | -- backend: spark 2 | -- config: easy_sql.udf_file_path=udf.py 3 | 4 | -- target=log.test_udf 5 | select string_set(array("a", "a", "b")) as stringset 6 | -------------------------------------------------------------------------------- /test/udf/spark-python/udf.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | __all__ = ["string_set"] 4 | 5 | 6 | def string_set(string_arr: List[str]) -> List[str]: 7 | return list(set(string_arr)) 8 | -------------------------------------------------------------------------------- /test/udf/spark-scala/.gitignore: -------------------------------------------------------------------------------- 1 | classes/ 2 | *.jar 3 | -------------------------------------------------------------------------------- /test/udf/spark-scala/Makefile: -------------------------------------------------------------------------------- 1 | SCALA_BIN=~/dev/sdks/scala-2.12.10/bin 2 | SCALA_CP="/usr/local/lib/python3.8/site-packages/pyspark/jars/*" 3 | 4 | jar: 5 | - rm -r classes 6 | mkdir -pv classes 7 | ${SCALA_BIN}/scalac -nobootcp -cp ${SCALA_CP} -d classes your/company/*.scala 8 | cd classes && jar -cvf ../udf.jar . 
9 | -------------------------------------------------------------------------------- /test/udf/spark-scala/etl_with_udf.sql: -------------------------------------------------------------------------------- 1 | -- backend: spark 2 | -- config: spark.jars=udf.jar 3 | -- config: easy_sql.scala_udf_initializer=your.company.udfs 4 | 5 | -- target=log.test_udf 6 | select string_set(array("a", "a", "b")) as stringset 7 | -------------------------------------------------------------------------------- /test/udf/spark-scala/your/company/udfs.scala: -------------------------------------------------------------------------------- 1 | package your.company 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.udf 5 | import org.apache.spark.sql.types._ 6 | 7 | object udfs { 8 | def initUdfs(spark: SparkSession) { 9 | val string_set = udf((s: Seq[String]) => s.filter(_ != null).toSet.toArray) 10 | spark.udf.register("string_set", string_set) 11 | } 12 | } 13 | --------------------------------------------------------------------------------
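
For comparison with the Scala UDF above: the Python variant in test/udf/spark-python/udf.py is a plain function that Easy SQL registers when easy_sql.udf_file_path is set. Outside Easy SQL, the same function could be registered directly with PySpark, roughly as sketched below (not how the library wires it up internally):

from typing import List

from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType


def string_set(string_arr: List[str]) -> List[str]:
    # Same implementation as test/udf/spark-python/udf.py: de-duplicate a string array.
    return list(set(string_arr))


spark = SparkSession.builder.getOrCreate()
spark.udf.register("string_set", string_set, ArrayType(StringType()))
spark.sql('select string_set(array("a", "a", "b")) as stringset').show()
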