├── .github └── workflows │ ├── build.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── .goreleaser.yml ├── LICENSE ├── Makefile ├── README.md ├── entrypoint.sh ├── hook └── .gitkeep └── task └── bq2bq ├── Dockerfile ├── README.md ├── compiler.go ├── executor ├── .coveragerc ├── .gitignore ├── .gitlab-ci.yml ├── Dockerfile ├── README.md ├── bumblebee │ ├── __init__.py │ ├── bigquery_service.py │ ├── bq2bq.py │ ├── config.py │ ├── datehelper.py │ ├── filesystem.py │ ├── handler.py │ ├── loader.py │ ├── log.py │ ├── query.py │ ├── transformation.py │ ├── version.py │ ├── window.py │ └── writer.py ├── example.py ├── main.py ├── requirements.txt ├── run_coverage.sh ├── run_tests.sh ├── samples │ └── tasks │ │ ├── allow_field_addition │ │ └── basic │ │ │ ├── properties.cfg │ │ │ └── query.sql │ │ ├── delete │ │ ├── properties.cfg │ │ └── query.sql │ │ ├── dml │ │ ├── properties.cfg │ │ └── query.sql │ │ ├── drop │ │ ├── properties.cfg │ │ └── query.sql │ │ ├── legacy │ │ ├── not_use_spillover │ │ │ ├── properties.cfg │ │ │ ├── query.sql │ │ │ └── spillover_date.sql │ │ └── use_spillover │ │ │ ├── properties.cfg │ │ │ ├── query.sql │ │ │ └── spillover_date.sql │ │ ├── non_partitioned_append │ │ ├── properties.cfg │ │ └── query.sql │ │ ├── partition_append │ │ ├── properties.cfg │ │ └── query.sql │ │ ├── partition_by_column │ │ ├── properties.cfg │ │ └── query.sql │ │ ├── partition_by_column_load_timestamp │ │ ├── properties.cfg │ │ └── query.sql │ │ ├── partition_by_ingestiontime │ │ ├── properties.cfg │ │ └── query.sql │ │ ├── replace_all │ │ └── basic │ │ │ ├── properties.cfg │ │ │ └── query.sql │ │ ├── replace_merge │ │ ├── auto │ │ │ ├── properties.cfg │ │ │ └── query.sql │ │ └── with_filter │ │ │ ├── properties.cfg │ │ │ └── query.sql │ │ ├── select │ │ ├── federated_table │ │ │ ├── properties.cfg │ │ │ └── query.sql │ │ ├── script │ │ │ ├── properties.cfg │ │ │ └── query.sql │ │ └── without_dependency │ │ │ ├── properties.cfg │ │ │ └── query.sql │ │ └── weekly_partitioned │ │ ├── properties.cfg │ │ └── query.sql ├── setup.py └── tests │ ├── sample_config │ └── in │ │ └── query.sql │ ├── test_config.py │ ├── test_query.py │ ├── test_transformation.py │ └── test_window.py ├── factory.go ├── go.mod ├── go.sum ├── main.go ├── main_test.go ├── optimus-plugin-bq2bq.yaml ├── telemetry.go └── validate.go /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v2 15 | with: 16 | fetch-depth: 0 17 | - name: Set up Go 18 | uses: actions/setup-go@v2 19 | with: 20 | go-version: '1.18' 21 | - name: Run GoReleaser for branch [Main] 22 | uses: goreleaser/goreleaser-action@v2.6.1 23 | with: 24 | distribution: goreleaser 25 | version: latest 26 | args: -f .goreleaser.yml --snapshot --rm-dist -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | workflow_dispatch: 8 | 9 | jobs: 10 | release: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v2 15 | with: 16 | fetch-depth: 0 17 | - name: Set up Go 18 | uses: actions/setup-go@v2 19 | with: 20 | go-version: '1.18' 21 | - name: Login to DockerHub 22 | uses: 
docker/login-action@v1 23 | with: 24 | registry: docker.io 25 | username: ${{ secrets.DOCKERHUB_USERNAME }} 26 | password: ${{ secrets.DOCKERHUB_TOKEN }} 27 | - name: Run GoReleaser [Main] 28 | uses: goreleaser/goreleaser-action@v2.6.1 29 | with: 30 | distribution: goreleaser 31 | version: latest 32 | args: -f .goreleaser.yml --rm-dist 33 | env: 34 | GITHUB_TOKEN: ${{ secrets.GH_PAT }} 35 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | jobs: 11 | executor_bq2bq: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v2 16 | with: 17 | fetch-depth: 0 18 | - name: Set up Python 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: '3.8' 22 | - name: test executer 23 | run: | 24 | cd ./task/bq2bq/executor 25 | chmod +x ./run_coverage.sh 26 | ./run_coverage.sh -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | # vendor/ 16 | .idea 17 | .DS_Store 18 | 19 | dist 20 | 21 | temp 22 | docker-compose.yml -------------------------------------------------------------------------------- /.goreleaser.yml: -------------------------------------------------------------------------------- 1 | before: 2 | hooks: 3 | - make test 4 | builds: 5 | - dir: ./task/bq2bq 6 | main: . 7 | id: "bq2bq" 8 | binary: "optimus-bq2bq_{{.Os}}_{{.Arch}}" 9 | ldflags: 10 | - -s -w -X main.Version={{.Version}} 11 | goos: 12 | - linux 13 | - darwin 14 | - windows 15 | goarch: 16 | - amd64 17 | - arm64 18 | env: 19 | - CGO_ENABLED=0 20 | archives: 21 | - replacements: 22 | darwin: macos 23 | linux: linux 24 | windows: windows 25 | 386: i386 26 | amd64: x86_64 27 | format_overrides: 28 | - goos: windows 29 | format: zip 30 | files: 31 | - src : ./task/bq2bq/optimus-plugin-bq2bq.yaml 32 | dst : "." 
33 | strip_parent: true 34 | 35 | release: 36 | prerelease: auto 37 | draft: true 38 | checksum: 39 | name_template: 'checksums.txt' 40 | snapshot: 41 | name_template: "{{.Version}}" 42 | changelog: 43 | sort: asc 44 | filters: 45 | exclude: 46 | - '^docs:' 47 | - '^test:' 48 | - '^chore:' 49 | - '^build:' 50 | dockers: 51 | - goos: linux 52 | goarch: amd64 53 | image_templates: 54 | - "docker.io/odpf/optimus-task-bq2bq-executor:latest" 55 | - "docker.io/odpf/optimus-task-bq2bq-executor:{{ .Version }}" 56 | dockerfile: ./task/bq2bq/executor/Dockerfile 57 | extra_files: 58 | - task/bq2bq/executor 59 | 60 | brews: 61 | - name: optimus-plugins-odpf 62 | tap: 63 | owner: odpf 64 | name: taps 65 | license: "Apache 2.0" 66 | folder: Formula 67 | description: "Optimus Plugins for warehouse" 68 | skip_upload: auto 69 | dependencies: 70 | - odpf/taps/optimus 71 | commit_author: 72 | name: github-actions[bot] 73 | email: 41898282+github-actions[bot]@users.noreply.github.com 74 | install: | 75 | bin.install Dir["optimus-*"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build check fmt test vet help 2 | .DEFAULT_GOAL := help 3 | GOVERSION := $(shell go version | cut -d ' ' -f 3 | cut -d '.' 
-f 2) 4 | SHELL := /usr/bin/env bash 5 | ROOT := $(shell pwd) 6 | TASKS := $(shell ls ${ROOT}/task) 7 | HOOKS := $(shell ls ${ROOT}/hook) 8 | 9 | build: test build-gorelease ## build all 10 | @echo " > build finished" 11 | 12 | build-gorelease: ## build everything with goreleaser 13 | @echo " > building binaries" 14 | goreleaser --snapshot --rm-dist 15 | 16 | install: ## install plugin to optimus directory 17 | mkdir -p ~/.optimus/plugins 18 | cp ./dist/bq2bq_darwin_amd64/* ~/.optimus/plugins/ 19 | 20 | clean: ## clean binaries 21 | rm -rf ./dist 22 | 23 | fmt: ## Run FMT 24 | @for target in ${TASKS}; do \ 25 | cd ${ROOT}/task/$${target}; go fmt . ; go mod tidy; \ 26 | done 27 | @for target in ${HOOKS}; do \ 28 | cd ${ROOT}/hook/$${target}; go fmt . ; go mod tidy; \ 29 | done 30 | 31 | test: ## Run tests 32 | @for target in ${TASKS}; do \ 33 | cd ${ROOT}/task/$${target}; go vet . ; go test . -race; \ 34 | done 35 | @for target in ${HOOKS}; do \ 36 | cd ${ROOT}/hook/$${target}; go vet . ; go test . -race; \ 37 | done 38 | 39 | help: 40 | @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Transformers 2 | 3 | [![test workflow](https://github.com/odpf/transformers/actions/workflows/test.yml/badge.svg)](test) 4 | [![build workflow](https://github.com/odpf/transformers/actions/workflows/build.yml/badge.svg)](build) 5 | 6 | Optimus's transformation plugins are implementations of the Task and Hook interfaces that allow 7 | execution of arbitrary jobs in Optimus. 8 | 9 | ## To install plugins via Homebrew 10 | ```shell 11 | brew tap odpf/taps 12 | brew install optimus-plugins-odpf 13 | ``` 14 | 15 | ## To install plugins via shell 16 | 17 | ```shell 18 | curl -sL ${PLUGIN_RELEASE_URL} | tar xvz 19 | chmod +x optimus-* 20 | mv optimus-* /usr/bin/ 21 | ``` -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # wait for a few seconds to prepare the scheduler for the run 3 | sleep 5 4 | 5 | # get optimus version 6 | echo "-- optimus client version" 7 | /opt/optimus version 8 | 9 | # get resources 10 | echo "-- initializing optimus assets" 11 | /opt/optimus job run-input "$JOB_NAME" --project-name \ 12 | "$PROJECT" --output-dir "$JOB_DIR" \ 13 | --type "$INSTANCE_TYPE" --name "$INSTANCE_NAME" \ 14 | --scheduled-at "$SCHEDULED_AT" --host "$OPTIMUS_HOST" 15 | 16 | # TODO: this doesn't support using backquote signs in env vars, fix it 17 | echo "-- exporting env" 18 | set -o allexport 19 | source "$JOB_DIR/in/.env" 20 | set +o allexport 21 | 22 | echo "-- current envs" 23 | printenv 24 | 25 | echo "-- exporting env with secret" 26 | set -o allexport 27 | source "$JOB_DIR/in/.secret" 28 | set +o allexport 29 | 30 | echo "-- running unit" 31 | exec $(eval echo "$@") 32 | -------------------------------------------------------------------------------- /hook/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raystack/transformers/b3c74a5906940af2b953576bb651a45e14e7653f/hook/.gitkeep -------------------------------------------------------------------------------- /task/bq2bq/Dockerfile: 
-------------------------------------------------------------------------------- 1 | ARG VERSION 2 | FROM docker.io/odpf/optimus-task-bq2bq-executor:${VERSION} 3 | 4 | ARG OPTIMUS_RELEASE_URL 5 | ENV GOOGLE_APPLICATION_CREDENTIALS /tmp/auth.json 6 | 7 | RUN apk add curl tar 8 | RUN mkdir -p /opt 9 | RUN curl -sL ${OPTIMUS_RELEASE_URL} | tar xvz optimus 10 | RUN mv optimus /opt/optimus || true 11 | RUN chmod +x /opt/optimus 12 | 13 | COPY ./entrypoint.sh /opt/entrypoint.sh 14 | RUN chmod +x /opt/entrypoint.sh 15 | 16 | ENTRYPOINT ["/opt/entrypoint.sh"] 17 | CMD ["python3", "/opt/bumblebee/main.py"] -------------------------------------------------------------------------------- /task/bq2bq/README.md: -------------------------------------------------------------------------------- 1 | # Bigquery SQL transformation 2 | 3 | Optimus plugin that supports variety of load methods to execute SQL transformations. 4 | Features 5 | - Automatic dependency resolution 6 | - Append to a partition 7 | - Replace table partition 8 | - Merge statements 9 | - BQ Scripts 10 | 11 | #### Load Method 12 | 13 | The way data loaded to destination table depends on the partition configuration of the destination tables 14 | 15 | | Load Method | No Partition | Partitioned Table | 16 | | -------------|------------------------------------------------------------------------------------------------| -------------------------------------------------------------------------------------------| 17 | | APPEND | Append new records to destination table | Append new records to destination table per partition based on localised start_time | 18 | | MERGE | Load the data using DML Merge statement, all of the load logic lies on DML merge statement | Load the data using DML Merge statement, all of the load logic lies on DML merge statement | 19 | | REPLACE | Truncate/Clean the table before insert new records | Clean records in destination partition before insert new record to new partition | 20 | | REPLACE_MERGE| Doesn't work for non partitioned tables and partitioned tables with ingestion time | Same as REPLACE but uses Merge query to emulate replace | 21 | | REPLACE_ALL | Truncate/Clean the table before insert new records, use this instead of REPLACE for aggregation| Clean records in destination partition before insert new record to new partition | 22 | 23 | Note: if `REPLACE` load method is used and window size greater than the partition delta, 24 | it is assumed the table is partitioned with `DAY` at the moment. 25 | This will split the query into multiple queries executing for each partition one by one. 26 | 27 | Note: `REPLACE_MERGE` is experimental and might not work properly for deeply 28 | nested structs, it is advised to test it before using in production 29 | -------------------------------------------------------------------------------- /task/bq2bq/compiler.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "strings" 7 | "text/template" 8 | "time" 9 | ) 10 | 11 | const ( 12 | ISOTimeFormate = time.RFC3339 13 | ISODateLayout = "2006-01-02" 14 | ) 15 | 16 | type Compiler struct { 17 | baseTemplate *template.Template 18 | } 19 | 20 | func NewCompiler() *Compiler { 21 | baseTemplate := template. 22 | New("bq2bq_template_compiler"). 
23 | Funcs(map[string]any{ 24 | "Date": dateFn, 25 | }) 26 | 27 | return &Compiler{ 28 | baseTemplate: baseTemplate, 29 | } 30 | } 31 | 32 | func (c *Compiler) Compile(templateMap map[string]string, context map[string]any) (map[string]string, error) { 33 | rendered := map[string]string{} 34 | 35 | for name, content := range templateMap { 36 | tmpl, err := c.baseTemplate.New(name).Parse(content) 37 | if err != nil { 38 | return nil, fmt.Errorf("unable to parse template content: %w", err) 39 | } 40 | 41 | var buf bytes.Buffer 42 | err = tmpl.Execute(&buf, context) 43 | if err != nil { 44 | return nil, fmt.Errorf("unable to render template: %w", err) 45 | } 46 | rendered[name] = strings.TrimSpace(buf.String()) 47 | } 48 | return rendered, nil 49 | } 50 | 51 | func dateFn(timeStr string) (string, error) { 52 | t, err := time.Parse(ISOTimeFormate, timeStr) 53 | if err != nil { 54 | return "", err 55 | } 56 | return t.Format(ISODateLayout), nil 57 | } 58 | -------------------------------------------------------------------------------- /task/bq2bq/executor/.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = bumblebee 4 | 5 | [report] 6 | fail_under = 50 7 | exclude_lines = 8 | if self.debug: 9 | pragma: no cover 10 | raise NotImplementedError 11 | if __name__ == .__main__.: 12 | ignore_errors = True 13 | omit = 14 | tests/* -------------------------------------------------------------------------------- /task/bq2bq/executor/.gitignore: -------------------------------------------------------------------------------- 1 | # Temporary and binary files 2 | .DS_Store 3 | *~ 4 | *.py[cod] 5 | *.so 6 | *.cfg 7 | !.isort.cfg 8 | !setup.cfg 9 | !properties.cfg 10 | *.orig 11 | *.log 12 | *.pot 13 | __pycache__ 14 | .cache/* 15 | .*.swp 16 | */.ipynb_checkpoints/* 17 | *.pyc 18 | build/ 19 | 20 | # Project files 21 | .ropeproject 22 | .project 23 | .pydevproject 24 | .settings 25 | .idea 26 | tags 27 | 28 | # Package files 29 | *.egg 30 | *.eggs/ 31 | .installed.cfg 32 | *.egg-info 33 | 34 | # Unittest and coverage 35 | htmlcov/* 36 | .coverage 37 | .tox 38 | junit.xml 39 | coverage.xml 40 | .pytest_cache/ 41 | 42 | # Build and docs folder/files 43 | build/* 44 | dist/* 45 | sdist/* 46 | docs/api/* 47 | docs/_rst/* 48 | docs/_build/* 49 | cover/* 50 | MANIFEST 51 | return.json 52 | 53 | # Per-project virtualenvs 54 | .venv*/ 55 | venv/ 56 | 57 | # Vscode 58 | .vscode 59 | 60 | auth.json 61 | /samples/tasks/onetime -------------------------------------------------------------------------------- /task/bq2bq/executor/.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - test 3 | - publish 4 | 5 | before_script: 6 | - export IMAGE_TAG="${CI_COMMIT_TAG:-$CI_COMMIT_SHA}" 7 | 8 | test: 9 | stage: test 10 | image: "python:3.8-alpine" 11 | coverage: '/coverage \d+%/' 12 | script: 13 | - apk add build-base 14 | - pip install -r requirements-test.txt 15 | - coverage run setup.py test 16 | - echo "coverage $(coverage report | awk '{print $6}' | tail -n 1)" 17 | - coverage report 18 | 19 | publish: 20 | stage: publish 21 | script: 22 | - export IMAGE="de-${CI_PROJECT_NAME}" 23 | - export ARTIFACTORY_IMAGE="docker.io/odpf/${IMAGE}" 24 | - docker build -t ${ARTIFACTORY_IMAGE}:${IMAGE_TAG} -t ${ARTIFACTORY_IMAGE}:latest . 
25 | - docker push ${ARTIFACTORY_IMAGE}:${IMAGE_TAG} 26 | - docker push ${ARTIFACTORY_IMAGE}:latest 27 | tags: 28 | - package 29 | -------------------------------------------------------------------------------- /task/bq2bq/executor/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-alpine 2 | 3 | WORKDIR /opt/bumblebee 4 | 5 | COPY task/bq2bq/executor . 6 | RUN ["pip", "install", "-r", "requirements.txt"] 7 | 8 | ENTRYPOINT [ "python3", "/opt/bumblebee/main.py"] 9 | -------------------------------------------------------------------------------- /task/bq2bq/executor/README.md: -------------------------------------------------------------------------------- 1 | Bumblebee 2 | ========= 3 | Python library for transforming data in Bigquery 4 | 5 | ## Features 6 | 7 | * cfg file based configuration, and bigquery Standard SQL syntax for transformation 8 | * HOURLY, DAILY, WEEKLY, MONTHLY transformation window 9 | * Support APPEND, REPLACE and MERGE load method 10 | * Support transformation for partitioned tables such as partition by ingestion time (default) and partition by column 11 | * Support datetime input in different timezone 12 | * dry run 13 | * Support Bigquery DML Merge statement to handle spillover case 14 | 15 | ## Setup: 16 | 17 | ### Python Environment/Virtualenv 18 | 19 | Install virtualenv : 20 | * `pip install virtualenv` 21 | * `virtualenv -p python3.7 venv` 22 | 23 | Activate virtualenv : 24 | `source venv/bin/activate` 25 | 26 | Exit virtualenv : 27 | `deactivate` 28 | 29 | Run tests: 30 | ``` 31 | python3 -m unittest tests/test_transformation.py 32 | ``` 33 | 34 | ### Test using docker 35 | Run tests in a container: (this ensures that tests run in exact same environment) 36 | ```bash 37 | docker-compose build && docker-compose run --entrypoint="./run_tests.sh" bumblebee 38 | ``` 39 | 40 | Run locally: 41 | ```bash 42 | docker-compose up --build 43 | ``` 44 | 45 | 46 | ## How to use 47 | 48 | ### Install the library 49 | 50 | * `git clone` this repo 51 | * Go to repo the repo root directory 52 | * run `pip install .` or `python setup.py install` 53 | 54 | ### Task Files 55 | 56 | file that needed to do transformation 57 | 58 | * properties.cfg - file that contains the transform and load configuration 59 | * query.sql - file that contains the transformation query 60 | 61 | #### properties.cfg 62 | 63 | | Config Name | Description | Values | 64 | | ----------------------- |-----------------------------------------------------------------------------------------------------------------| ------------------------------------| 65 | | `PROJECT` | google cloud platform project id of the destination bigquery table | ... | 66 | | `DATASET` | bigquery dataset name of the destination table | ... | 67 | | `TABLE` | the table name of the destination table | ... 
| 68 | | `TASK_WINDOW` | window of transformation, to provide dstart and dend macros values for sql transformation | HOURLY, DAILY, WEEKLY, MONTHLY | 69 | | `TIMEZONE` | timezone of transformation, a timezone that the input datetime will be translated to in tz database name format | UTC, Asia/Jakarta, America/New_York | 70 | | `LOAD_METHOD` | method to load data to the destination tables | APPEND, REPLACE, MERGE | 71 | 72 | 73 | Example of `properties.cfg` config : 74 | 75 | ```ini 76 | [DESTINATION] 77 | PROJECT="gcp-project-id" 78 | DATASET="dataset" 79 | TABLE="table_name" 80 | 81 | [TRANSFORMATION] 82 | WINDOW_SIZE = 24h 83 | WINDOW_OFFSET = 0 84 | WINDOW_TRUNCATE_UPTO = d 85 | TIMEZONE="Asia/Jakarta" 86 | 87 | [LOAD] 88 | LOAD_METHOD="REPLACE" 89 | ``` 90 | 91 | 92 | #### Code examples 93 | 94 | ```#!/usr/bin/env python3 95 | import bumblebee 96 | from datetime import datetime 97 | 98 | def main(): 99 | task_path = "samples/tasks/dml" 100 | files_folders = os.listdir(task_path) 101 | files = list(filter(lambda f: not os.path.isfile(f) ,files_folders)) 102 | 103 | bumblebee.bq2bq_v2(files,datetime(2019, 3, 22),False) 104 | 105 | if __name__ == "__main__": 106 | main() 107 | ``` 108 | 109 | #### SQL macros 110 | 111 | * `__destination_table__` - full qualified table name used in DML statement 112 | * `__execution_time__` - Can be replaced in place of CURRENT_TIMESTAMP() macro of BQ 113 | * `__dstart__` - start date/datetime of the window 114 | * `__dend__` - end date/datetime of the window 115 | 116 | The value of `dstart` and `dend` depends on `TASK_WINDOW` config in `properties.cfg` file 117 | 118 | | Window Name | dstart | dend | 119 | | ------------- |--------------------------------------------------------------------| ---------------------------------------------------------------------| 120 | | DAILY | The current date taken from input, for example 2019-01-01 | The next day after dstart date 2019-01-02 | 121 | | WEEKLY | Start of the week date for example : 2019-04-01 | End date of the week , for example : 2019-04-07 | 122 | | MONTHLY | Start of the month date, example : 2019-01-01 | End date of the month, for example : 2019-01-31 | 123 | | HOURLY | Datetime of the start of the hour, for example 2019-01-01 01:00:00 | Datetime the start of the next hour, for example 2019-01-01 02:00:00 | 124 | 125 | SQL transformation query : 126 | 127 | ```sql 128 | select count(1) as count, date(created_time) as dt 129 | from `project.dataset.tablename` 130 | where date(created_time) >= '__dstart__' and date(booking_creation_time) < '__dend__' 131 | group by dt 132 | ``` 133 | 134 | Rendered SQL for DAILY window : 135 | 136 | ```sql 137 | select count(1) as count, date(created_time) as dt 138 | from `project.dataset.tablename` 139 | where date(created_time) >= '2019-01-01' and date(booking_creation_time) < '2019-01-02' 140 | group by dt 141 | ``` 142 | 143 | Rendered SQL for HOURLY window : 144 | the value of `dstart` and `dend` is YYYY-mm-dd HH:MM:SS formatted datetime 145 | 146 | ```sql 147 | select count(1) as count, date(created_time) as dt 148 | from `project.dataset.tablename` 149 | where date(created_time) >= '2019-01-01 06:00:00' and date(booking_creation_time) < '2019-01-01 07:00:00' 150 | group by dt 151 | ``` 152 | 153 | destination_table macros : 154 | 155 | ```sql 156 | MERGE `__destination_table__` S 157 | using 158 | ( 159 | select count(1) as count, date(created_time) as dt 160 | from `project.dataset.tablename` 161 | where date(created_time) >= '__dstart__' and 
date(created_time) < '__dend__' 162 | group by dt 163 | ) N 164 | on S.date = N.date 165 | WHEN MATCHED then 166 | UPDATE SET `count` = N.count 167 | when not matched then 168 | INSERT (`date`, `count`) VALUES(N.date, N.count) 169 | ``` 170 | 171 | ### Standard SQL 172 | 173 | SQL select statement example : 174 | 175 | ```sql 176 | select count(1) as count, date(created_time) as dt 177 | from `project.dataset.tablename` 178 | where date(created_time) >= '__dstart__' and date(booking_creation_time) < '__dend__' 179 | group by dt 180 | ``` 181 | 182 | DML Merge statement example : 183 | 184 | ```sql 185 | MERGE `__destination_table__` S 186 | using 187 | ( 188 | select count(1) as count, date(created_time) as dt 189 | from `project.dataset.tablename` 190 | where date(created_time) >= '__dstart__' and date(created_time) < '__dend__' 191 | group by dt 192 | ) N 193 | on S.date = N.date 194 | WHEN MATCHED then 195 | UPDATE SET `count` = N.count 196 | when not matched then 197 | INSERT (`date`, `count`) VALUES(N.date, N.count) 198 | ``` 199 | 200 | #### Load Method 201 | 202 | The way data loaded to destination table depends on the partition configuration of the destination tables 203 | 204 | | Load Method | No Partition | Partitioned Table | 205 | | -------------|------------------------------------------------------------------------------------------------| -------------------------------------------------------------------------------------------| 206 | | APPEND | Append new records to destination table | Append new records to destination table per partition based on localised start_time | 207 | | REPLACE | Truncate/Clean the table before insert new records | Clean records in destination partition before insert new record to new partition | 208 | | MERGE | Load the data using DML Merge statement, all of the load logic lies on DML merge statement | Load the data using DML Merge statement, all of the load logic lies on DML merge statement | -------------------------------------------------------------------------------- /task/bq2bq/executor/bumblebee/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raystack/transformers/b3c74a5906940af2b953576bb651a45e14e7653f/task/bq2bq/executor/bumblebee/__init__.py -------------------------------------------------------------------------------- /task/bq2bq/executor/bumblebee/bigquery_service.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | import os 4 | from abc import ABC, abstractmethod 5 | 6 | import google as google 7 | from google.api_core.exceptions import BadRequest, Forbidden 8 | from google.cloud import bigquery 9 | from google.cloud.bigquery.job import QueryJobConfig, CreateDisposition 10 | from google.cloud.bigquery.schema import _parse_schema_resource 11 | from google.cloud.bigquery.table import TimePartitioningType, TimePartitioning, TableReference, Table 12 | from google.cloud.exceptions import GoogleCloudError 13 | 14 | from bumblebee.config import TaskConfigFromEnv 15 | from bumblebee.log import get_logger 16 | 17 | logger = get_logger(__name__) 18 | 19 | SERVICE_ACCOUNT_VAR = "BQ_SERVICE_ACCOUNT" 20 | SERVICE_ACCOUNT_TYPE = "service_account" 21 | 22 | 23 | class BaseBigqueryService(ABC): 24 | 25 | @abstractmethod 26 | def execute_query(self, query): 27 | pass 28 | 29 | @abstractmethod 30 | def transform_load(self, 31 | query, 32 | source_project_id=None, 33 | destination_table=None, 34 | 
write_disposition=None, 35 | create_disposition=CreateDisposition.CREATE_NEVER, 36 | allow_field_addition=False): 37 | pass 38 | 39 | @abstractmethod 40 | def create_table(self, full_table_name, schema_file, 41 | partitioning_type=TimePartitioningType.DAY, 42 | partitioning_field=None): 43 | pass 44 | 45 | @abstractmethod 46 | def delete_table(self, full_table_name): 47 | pass 48 | 49 | @abstractmethod 50 | def get_table(self, full_table_name): 51 | pass 52 | 53 | 54 | class BigqueryService(BaseBigqueryService): 55 | 56 | def __init__(self, client, labels, writer, on_job_finish = None): 57 | """ 58 | 59 | :rtype: 60 | """ 61 | self.client = client 62 | self.labels = labels 63 | self.writer = writer 64 | self.on_job_finish = on_job_finish 65 | 66 | def execute_query(self, query): 67 | query_job_config = QueryJobConfig() 68 | query_job_config.use_legacy_sql = False 69 | query_job_config.labels = self.labels 70 | 71 | if query is None or len(query) == 0: 72 | raise ValueError("query must not be Empty") 73 | 74 | logger.info("executing query") 75 | query_job = self.client.query(query=query, 76 | job_config=query_job_config) 77 | logger.info("Job {} is initially in state {} of {} project".format(query_job.job_id, query_job.state, 78 | query_job.project)) 79 | try: 80 | query_job.result() 81 | except (GoogleCloudError, Forbidden, BadRequest) as ex: 82 | self.writer.write("error", ex.message) 83 | logger.error(ex) 84 | sys.exit(1) 85 | 86 | logger.info("Job {} is finally in state {} of {} project".format(query_job.job_id, query_job.state, 87 | query_job.project)) 88 | logger.info("Bytes processed: {}, Affected Rows: {}, Bytes billed: {}".format(query_job.estimated_bytes_processed, 89 | query_job.num_dml_affected_rows, 90 | query_job.total_bytes_billed)) 91 | logger.info("Job labels {}".format(query_job._configuration.labels)) 92 | 93 | if self.on_job_finish is not None: 94 | self.on_job_finish(query_job) 95 | 96 | def transform_load(self, 97 | query, 98 | source_project_id=None, 99 | destination_table=None, 100 | write_disposition=None, 101 | create_disposition=CreateDisposition.CREATE_NEVER, 102 | allow_field_addition=False): 103 | if query is None or len(query) == 0: 104 | raise ValueError("query must not be Empty") 105 | 106 | query_job_config = QueryJobConfig() 107 | query_job_config.create_disposition = create_disposition 108 | query_job_config.write_disposition = write_disposition 109 | query_job_config.use_legacy_sql = False 110 | query_job_config.labels = self.labels 111 | if allow_field_addition: 112 | query_job_config.schema_update_options = [ 113 | bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION, 114 | bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION 115 | ] 116 | 117 | if destination_table is not None: 118 | table_ref = TableReference.from_string(destination_table) 119 | query_job_config.destination = table_ref 120 | 121 | logger.info("transform load") 122 | query_job = self.client.query(query=query, job_config=query_job_config) 123 | logger.info("Job {} is initially in state {} of {} project".format(query_job.job_id, query_job.state, 124 | query_job.project)) 125 | 126 | try: 127 | query_job.result() 128 | except (GoogleCloudError, Forbidden, BadRequest) as ex: 129 | self.writer.write("error", ex.message) 130 | logger.error(ex) 131 | sys.exit(1) 132 | 133 | logger.info("Job {} is finally in state {} of {} project".format(query_job.job_id, query_job.state, 134 | query_job.project)) 135 | logger.info("Bytes processed: {}, Stats: {} {}".format(query_job.estimated_bytes_processed, 136 
| query_job.num_dml_affected_rows, 137 | query_job.total_bytes_billed)) 138 | logger.info("Job labels {}".format(query_job._configuration.labels)) 139 | 140 | if self.on_job_finish is not None: 141 | self.on_job_finish(query_job) 142 | 143 | def create_table(self, full_table_name, schema_file, 144 | partitioning_type=TimePartitioningType.DAY, 145 | partitioning_field=None): 146 | with open(schema_file, 'r') as file: 147 | schema_json = json.load(file) 148 | table_schema = _parse_schema_resource({'fields': schema_json}) 149 | 150 | table_ref = TableReference.from_string(full_table_name) 151 | 152 | bigquery_table = bigquery.Table(table_ref, table_schema) 153 | bigquery_table.time_partitioning = TimePartitioning(type_=partitioning_type, 154 | field=partitioning_field) 155 | 156 | self.client.create_table(bigquery_table) 157 | 158 | def delete_table(self, full_table_name): 159 | table_ref = TableReference.from_string(full_table_name) 160 | self.client.delete_table(bigquery.Table(table_ref)) 161 | 162 | def get_table(self, full_table_name): 163 | table_ref = TableReference.from_string(full_table_name) 164 | return self.client.get_table(table_ref) 165 | 166 | 167 | def create_bigquery_service(task_config: TaskConfigFromEnv, labels, writer, on_job_finish = None): 168 | if writer is None: 169 | writer = writer.StdWriter() 170 | 171 | credentials = _get_bigquery_credentials() 172 | default_query_job_config = QueryJobConfig() 173 | default_query_job_config.priority = task_config.query_priority 174 | default_query_job_config.allow_field_addition = task_config.allow_field_addition 175 | client = bigquery.Client(project=task_config.execution_project, credentials=credentials, default_query_job_config=default_query_job_config) 176 | return BigqueryService(client, labels, writer, on_job_finish=on_job_finish) 177 | 178 | 179 | def _get_bigquery_credentials(): 180 | """Gets credentials from the BQ_SERVICE_ACCOUNT environment var else GOOGLE_APPLICATION_CREDENTIALS for file path.""" 181 | scope = ('https://www.googleapis.com/auth/bigquery', 182 | 'https://www.googleapis.com/auth/cloud-platform', 183 | 'https://www.googleapis.com/auth/drive') 184 | account = os.environ.get(SERVICE_ACCOUNT_VAR) 185 | creds = _load_credentials_from_var(account, scope) 186 | if creds is not None: 187 | return creds 188 | credentials, _ = google.auth.default(scopes=scope) 189 | return credentials 190 | 191 | 192 | def _load_credentials_from_var(account_str, scopes=None): 193 | """Loads Google credentials from an environment variable. 194 | The credentials file must be a service account key. 195 | """ 196 | if account_str is None: 197 | return None 198 | 199 | try: 200 | info = json.loads(account_str) 201 | except ValueError: 202 | return None 203 | 204 | # The type key should indicate that the file is either a service account 205 | # credentials file or an authorized user credentials file. 
206 | credential_type = info.get("type") 207 | 208 | if credential_type == SERVICE_ACCOUNT_TYPE: 209 | from google.oauth2 import service_account 210 | 211 | try: 212 | credentials = service_account.Credentials.from_service_account_info(info, scopes=scopes) 213 | except ValueError: 214 | return None 215 | return credentials 216 | 217 | else: 218 | return None 219 | 220 | 221 | class DummyService(BaseBigqueryService): 222 | 223 | def execute_query(self, query): 224 | logger.info("execute query : {}".format(query)) 225 | return [] 226 | 227 | def transform_load(self, query, source_project_id=None, destination_table=None, write_disposition=None, 228 | create_disposition=CreateDisposition.CREATE_NEVER, allow_field_addition=False): 229 | log = """ transform and load with config : 230 | {} 231 | {} 232 | {} 233 | {}""".format(query, source_project_id, destination_table, write_disposition) 234 | logger.info(log) 235 | 236 | def create_table(self, full_table_name, schema_file, partitioning_type=TimePartitioningType.DAY, 237 | partitioning_field=None): 238 | log = """ create table with config : 239 | {} 240 | {} 241 | {} 242 | {}""".format(full_table_name, schema_file, partitioning_type, partitioning_field) 243 | logger.info(log) 244 | 245 | def delete_table(self, full_table_name): 246 | logger.info("delete table: {}".format(full_table_name)) 247 | 248 | def get_table(self, full_table_name): 249 | return Table.from_string(full_table_name) 250 | -------------------------------------------------------------------------------- /task/bq2bq/executor/bumblebee/bq2bq.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from bumblebee.bigquery_service import create_bigquery_service, DummyService 4 | from bumblebee.config import TaskConfigFromEnv, TaskConfigFromFile 5 | from bumblebee.config import TaskFiles 6 | from bumblebee.filesystem import FileSystem 7 | from bumblebee.log import get_logger 8 | from bumblebee.transformation import Transformation 9 | from bumblebee.version import VERSION 10 | from bumblebee.writer import JsonWriter 11 | 12 | logger = get_logger(__name__) 13 | 14 | 15 | def bq2bq(properties_file: str, 16 | query_file: str, 17 | spillover_query_file: str, 18 | dstart: datetime, 19 | dend: datetime, 20 | execution_time: datetime, 21 | dry_run: bool = False, 22 | labels: dict = {}, 23 | output_on: str = './return.json', 24 | on_job_finish = None, 25 | ): 26 | 27 | logger.info("Using bumblebee version: {}".format(VERSION)) 28 | 29 | job_labels = base_job_Labels.copy() 30 | job_labels.update(labels) 31 | writer = JsonWriter(output_on) 32 | 33 | task_files = TaskFiles(FileSystem(), [query_file, spillover_query_file, properties_file]) 34 | if task_files.properties_cfg is not None: 35 | task_config = TaskConfigFromFile(task_files.properties_cfg) 36 | else: 37 | task_config = TaskConfigFromEnv() 38 | 39 | bigquery_service = DummyService() 40 | if not dry_run: 41 | bigquery_service = create_bigquery_service(task_config, job_labels, writer, on_job_finish=on_job_finish) 42 | 43 | transformation = Transformation(bigquery_service, 44 | task_config, 45 | task_files.query, 46 | task_files.spillover_query, 47 | dstart, 48 | dend, 49 | execution_time, 50 | dry_run) 51 | transformation.transform() 52 | 53 | 54 | base_job_Labels = { 55 | "lifecycle": "process", 56 | "component": "worker", 57 | "alias": "bumblebee" 58 | } 59 | -------------------------------------------------------------------------------- 
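The `bq2bq()` entrypoint above ties the executor together: it loads the task files, builds a `TaskConfig` from `properties.cfg` (or environment variables), picks a real or dummy BigQuery service, and runs a `Transformation`. Below is a minimal sketch of calling it directly for local experimentation; the sample task path, dates, and labels are illustrative assumptions (any task folder containing `query.sql` and `properties.cfg` would do), and `dry_run=True` keeps execution on `DummyService` so no BigQuery job is submitted.

```python
# Hypothetical local invocation of the bq2bq entrypoint in bumblebee/bq2bq.py.
# Paths, dates, and labels are examples only; run from task/bq2bq/executor so the
# sample task files resolve.
from datetime import datetime, timezone

from bumblebee.bq2bq import bq2bq

bq2bq(
    properties_file="samples/tasks/dml/properties.cfg",
    query_file="samples/tasks/dml/query.sql",
    spillover_query_file=None,                        # this sample task has no spillover query
    dstart=datetime(2019, 3, 22, tzinfo=timezone.utc),
    dend=datetime(2019, 3, 23, tzinfo=timezone.utc),
    execution_time=datetime.now(timezone.utc),
    dry_run=True,                                      # DummyService: queries are logged, not run
    labels={"environment": "local"},                   # merged into base_job_Labels
    output_on="./return.json",
)
```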
/task/bq2bq/executor/bumblebee/config.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | from datetime import timedelta 4 | from enum import Enum 5 | from typing import List 6 | from typing import Optional 7 | import configparser 8 | import iso8601 9 | import pytz 10 | from google.cloud.bigquery.job import WriteDisposition, QueryPriority 11 | from abc import ABC 12 | from abc import abstractmethod 13 | 14 | from bumblebee.datehelper import parse_duration 15 | from bumblebee.filesystem import FileSystem 16 | from bumblebee.log import get_logger 17 | 18 | logger = get_logger(__name__) 19 | 20 | 21 | def get_env_config(name, default=None, raise_if_empty=False): 22 | val = os.environ.get(name, default=default) 23 | if not raise_if_empty: 24 | return val 25 | if val == "" or val is None: 26 | raise AssertionError("config '{}' must be provided".format(name)) 27 | return val 28 | 29 | 30 | def parse_date(date) -> datetime: 31 | return iso8601.parse_date(date) 32 | 33 | 34 | class LoadMethod(Enum): 35 | """ 36 | Bigquery Query load method 37 | """ 38 | APPEND = "APPEND" 39 | 40 | REPLACE = "REPLACE" 41 | 42 | REPLACE_MERGE = "REPLACE_MERGE" 43 | 44 | REPLACE_ALL = "REPLACE_ALL" 45 | 46 | MERGE = "MERGE" 47 | 48 | @property 49 | def write_disposition(self): 50 | if self == LoadMethod.APPEND: 51 | return WriteDisposition.WRITE_APPEND 52 | elif self == LoadMethod.REPLACE or self == LoadMethod.REPLACE_MERGE or self == LoadMethod.REPLACE_ALL: 53 | return WriteDisposition.WRITE_TRUNCATE 54 | else: 55 | raise Exception("write disposition is only for APPEND and REPLACE load method") 56 | 57 | 58 | class TaskConfig(ABC): 59 | 60 | @property 61 | @abstractmethod 62 | def destination_project(self) -> str: 63 | pass 64 | 65 | @property 66 | @abstractmethod 67 | def destination_dataset(self) -> str: 68 | pass 69 | 70 | @property 71 | @abstractmethod 72 | def destination_table_name(self) -> str: 73 | pass 74 | 75 | @property 76 | def destination_table(self) -> str: 77 | return "{}.{}.{}".format(self.destination_project, self.destination_dataset, self.destination_table_name) 78 | 79 | @property 80 | @abstractmethod 81 | def sql_type(self) -> str: 82 | pass 83 | 84 | @property 85 | @abstractmethod 86 | def load_method(self) -> LoadMethod: 87 | pass 88 | 89 | @property 90 | @abstractmethod 91 | def timezone(self): 92 | pass 93 | 94 | @property 95 | @abstractmethod 96 | def use_spillover(self) -> bool: 97 | pass 98 | 99 | @property 100 | @abstractmethod 101 | def concurrency(self) -> int: 102 | pass 103 | 104 | @property 105 | @abstractmethod 106 | def filter_expression(self) -> str: 107 | pass 108 | 109 | @abstractmethod 110 | def print(self): 111 | pass 112 | 113 | 114 | class TaskConfigFromEnv(TaskConfig): 115 | 116 | def __init__(self): 117 | self._destination_project = get_env_config("PROJECT", raise_if_empty=True) 118 | self._execution_project = get_env_config("EXECUTION_PROJECT", default=self._destination_project) 119 | self._destination_dataset = get_env_config("DATASET", raise_if_empty=True) 120 | self._destination_table_name = get_env_config("TABLE", raise_if_empty=True) 121 | self._sql_type = get_env_config("SQL_TYPE", raise_if_empty=True) 122 | self._filter_expression = get_env_config("PARTITION_FILTER", default=None) 123 | self._query_priority = get_env_config("QUERY_PRIORITY", default="INTERACTIVE") 124 | self._load_method = LoadMethod[get_env_config("LOAD_METHOD", raise_if_empty=True)] 125 | self._timezone = 
_validate_timezone_exist(get_env_config("TIMEZONE", default="UTC")) 126 | self._use_spillover = _bool_from_str(get_env_config("USE_SPILLOVER", default="true")) 127 | self._concurrency = _validate_greater_than_zero(int(get_env_config("CONCURRENCY", default=1))) 128 | self._allow_field_addition = _bool_from_str(get_env_config("ALLOW_FIELD_ADDITION", default="false")) 129 | 130 | @property 131 | def destination_project(self) -> str: 132 | return self._destination_project 133 | 134 | @property 135 | def execution_project(self) -> str: 136 | return self._execution_project 137 | 138 | @property 139 | def destination_dataset(self) -> str: 140 | return self._destination_dataset 141 | 142 | @property 143 | def allow_field_addition(self) -> bool: 144 | return self._allow_field_addition 145 | 146 | @property 147 | def destination_table_name(self) -> str: 148 | return self._destination_table_name 149 | 150 | @property 151 | def filter_expression(self) -> int: 152 | return self._filter_expression 153 | 154 | @property 155 | def sql_type(self) -> str: 156 | return self._sql_type 157 | 158 | @property 159 | def query_priority(self): 160 | if self._query_priority == 'BATCH': 161 | return QueryPriority.BATCH 162 | else: 163 | return QueryPriority.INTERACTIVE 164 | 165 | @property 166 | def load_method(self): 167 | return self._load_method 168 | 169 | @property 170 | def use_spillover(self) -> bool: 171 | return self._use_spillover 172 | 173 | @property 174 | def timezone(self): 175 | return self._timezone 176 | 177 | @property 178 | def concurrency(self) -> int: 179 | return self._concurrency 180 | 181 | def print(self): 182 | logger.info("task config:\n{}".format( 183 | "\n".join([ 184 | "destination: {}".format(self.destination_table), 185 | "load method: {}".format(self.load_method), 186 | "timezone: {}".format(self.timezone), 187 | "partition_filter: {}".format(self.filter_expression), 188 | ]) 189 | )) 190 | 191 | def __str__(self) -> str: 192 | return str(self.__dict__) 193 | 194 | 195 | class AppConfig: 196 | """generates config from environment variables for app""" 197 | 198 | DEFAULT_XCOM_PATH = "/airflow/xcom/return.json" 199 | DEFAULT_JOB_DIR = "/data" 200 | JOB_INPUT_SUBDIR = "in" 201 | JOB_OUTPUT_SUBDIR = "out" 202 | 203 | def __init__(self): 204 | self.sql_file: Optional[str] = None 205 | self.properties_file: Optional[str] = None 206 | self.spillover_sql_file: Optional[str] = None 207 | self.dstart: datetime = None 208 | self.dend: datetime = None 209 | self.execution_time: datetime = None 210 | self.dry_run = self._is_dry_run(get_env_config("DRY_RUN", "false")) 211 | self.job_labels = self._get_job_labels(get_env_config("JOB_LABELS", default="owner=optimus")) 212 | self.xcom_path = get_env_config("XCOM_PATH", self.DEFAULT_XCOM_PATH) 213 | 214 | self._parse_datetime_vars() 215 | self._parse_specs_dir() 216 | 217 | def _parse_datetime_vars(self): 218 | dstart = get_env_config("DSTART", raise_if_empty=True) 219 | dend = get_env_config("DEND", raise_if_empty=True) 220 | default_execution_time = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat() 221 | try: 222 | self.execution_time = parse_date(get_env_config("EXECUTION_TIME", default_execution_time)) 223 | self.dstart = parse_date(dstart) 224 | self.dend = parse_date(dend) 225 | except iso8601.ParseError: 226 | logger.error( 227 | "dstart/dend/execution-time should be YYYY-mm-dd or date time iso8601 format YYYY-mm-ddTHH:MM:SSZ") 228 | raise 229 | 230 | def _parse_specs_dir(self): 231 | dir = get_env_config("JOB_DIR", 
default=self.DEFAULT_JOB_DIR) 232 | dir = "{}/{}".format(dir, self.JOB_INPUT_SUBDIR) 233 | for dirpath, _, files in os.walk(dir): 234 | for filename in files: 235 | filepath = os.path.join(dirpath, filename) 236 | if filename == 'query.sql': 237 | self.sql_file = filepath 238 | elif filename == 'spillover_date.sql': 239 | self.spillover_sql_file = filepath 240 | 241 | def _is_dry_run(self, input_config) -> bool: 242 | if input_config.lower() in ["true", "1", "yes", "y"]: 243 | logger.info("Bumblebee is running in dry-run mode") 244 | return True 245 | else: 246 | return False 247 | 248 | def _get_job_labels(self, input_config) -> dict: 249 | job_labels_dict = {} 250 | assert input_config not in ["", None], "JOB_LABELS must be provided in k1=v1,k2=v2 format" 251 | 252 | label_sep = "," 253 | key_value_sep = "=" 254 | for label_pair in input_config.split(label_sep): 255 | key_value = label_pair.split(key_value_sep) 256 | assert key_value[0] != "", "label name cannot be empty in JOB_LABELS" 257 | assert key_value[1] != "", "label value cannot be empty in JOB_LABELS" 258 | job_labels_dict[key_value[0]] = key_value[1] 259 | 260 | return job_labels_dict 261 | 262 | 263 | class TaskFiles: 264 | def __init__(self, fs: FileSystem, files: List): 265 | self.fs = fs 266 | 267 | fileset = self._read_all_files(files) 268 | 269 | self.query = fileset['query.sql'] 270 | 271 | self.properties_cfg = None 272 | if 'properties.cfg' in fileset.keys(): 273 | self.properties_cfg = fileset['properties.cfg'] 274 | 275 | self.spillover_query = None 276 | if 'spillover_date.sql' in fileset.keys(): 277 | self.spillover_query = fileset['spillover_date.sql'] 278 | 279 | def _read_all_files(self, files): 280 | fileset = {} 281 | for file in files: 282 | if file is not None and self.fs.exist(file): 283 | content = self.fs.read(file) 284 | filename = self.fs.basename(file) 285 | fileset[filename] = content 286 | return fileset 287 | 288 | 289 | def _validate_greater_than_zero(val: int): 290 | if val > 0: 291 | return val 292 | raise Exception("value should be integer and greater than 0") 293 | 294 | 295 | def _validate_timezone_exist(timezone_name: str): 296 | pytz.timezone(timezone_name) 297 | return timezone_name 298 | 299 | 300 | def _validate_not_empty(val_str: str): 301 | if isinstance(val_str, str) and len(val_str) > 0: 302 | return val_str 303 | else: 304 | raise Exception("value should not be empty") 305 | 306 | 307 | def _validate_window_size(val: str): 308 | if parse_duration(val) == timedelta(seconds=0): 309 | raise ValueError("invalid window size: {}".format(val)) 310 | return val 311 | 312 | 313 | def _bool_from_str(bool_str: str) -> bool: 314 | if bool_str.lower() == "true": 315 | return True 316 | elif bool_str.lower() == "false": 317 | return False 318 | raise Exception("value should be a string true or false") 319 | 320 | 321 | class TaskConfigFromFile(TaskConfig): 322 | 323 | def __init__(self, raw_properties): 324 | 325 | config = configparser.ConfigParser(allow_no_value=True) 326 | config.optionxform = str 327 | config.read_string(raw_properties) 328 | 329 | self._properties = {} 330 | for section in config.sections(): 331 | for key in config[section]: 332 | self._properties[key] = config[section][key] 333 | 334 | self._destination_table_name = _validate_not_empty(self._get_property("TABLE")) 335 | self._destination_project = _validate_not_empty(self._get_property("PROJECT")) 336 | self._execution_project = _validate_not_empty(self._get_property_or_default("EXECUTION_PROJECT", 
self._destination_project)) 337 | self._destination_dataset = _validate_not_empty(self._get_property("DATASET")) 338 | 339 | self._window_size = _validate_window_size(self._get_property("WINDOW_SIZE")) 340 | self._window_offset = self._get_property("WINDOW_OFFSET") 341 | self._window_truncate_upto = self._get_property("WINDOW_TRUNCATE_UPTO") 342 | 343 | self._filter_expression = self._get_property_or_default("PARTITION_FILTER", None) 344 | self._query_priority = self._get_property_or_default("QUERY_PRIORITY", "INTERACTIVE") 345 | self._load_method = LoadMethod[self._get_property("LOAD_METHOD")] 346 | self._timezone = _validate_timezone_exist(self._get_property_or_default("TIMEZONE", "UTC")) 347 | 348 | self._use_spillover = _bool_from_str(self._get_property_or_default("USE_SPILLOVER", "true")) 349 | self._concurrency = _validate_greater_than_zero(int(self._get_property_or_default("CONCURRENCY", 1))) 350 | self._allow_field_addition = _bool_from_str(self._get_property_or_default("ALLOW_FIELD_ADDITION", "false")) 351 | 352 | @property 353 | def sql_type(self) -> str: 354 | return "STANDARD" 355 | 356 | @property 357 | def destination_dataset(self): 358 | return self._destination_dataset 359 | 360 | @property 361 | def destination_project(self): 362 | return self._destination_project 363 | 364 | @property 365 | def execution_project(self): 366 | return self._execution_project 367 | 368 | @property 369 | def destination_table_name(self): 370 | return self._destination_table_name 371 | 372 | @property 373 | def window_size(self): 374 | return self._window_size 375 | 376 | @property 377 | def window_offset(self): 378 | return self._window_offset 379 | 380 | @property 381 | def window_truncate_upto(self): 382 | return self._window_truncate_upto 383 | 384 | @property 385 | def timezone(self): 386 | return self._timezone 387 | 388 | @property 389 | def query_priority(self): 390 | if self._query_priority == 'BATCH': 391 | return QueryPriority.BATCH 392 | else: 393 | return QueryPriority.INTERACTIVE 394 | 395 | @property 396 | def load_method(self): 397 | return self._load_method 398 | 399 | @property 400 | def use_spillover(self) -> bool: 401 | return self._use_spillover 402 | 403 | @property 404 | def concurrency(self) -> int: 405 | return self._concurrency 406 | 407 | @property 408 | def filter_expression(self) -> str: 409 | return self._filter_expression 410 | 411 | @property 412 | def allow_field_addition(self) -> bool: 413 | return self._allow_field_addition 414 | 415 | def print(self): 416 | logger.info("task config:\n{}".format( 417 | "\n".join([ 418 | "destination: {}".format(self.destination_table), 419 | "load method: {}".format(self.load_method), 420 | "timezone: {}".format(self.timezone), 421 | "partition_filter: {}".format(self.filter_expression), 422 | "spillover: {}".format(self.use_spillover), 423 | ]) 424 | )) 425 | 426 | def _get_property(self, key): 427 | return self._properties[key].strip('"') 428 | 429 | def _get_property_or_default(self, key: str, default: str): 430 | if key in self._properties: 431 | return self._properties[key].strip('"') 432 | return default 433 | 434 | def __str__(self) -> str: 435 | return str(self.__dict__) 436 | -------------------------------------------------------------------------------- /task/bq2bq/executor/bumblebee/datehelper.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime, date, timedelta 3 | from pytimeparse import parse 4 | import pytz 5 | 6 | def 
parse_duration(time_str): 7 | """ 8 | :param time_str: example 1d, 2h 9 | :return: timedelta object 10 | """ 11 | if time_str == "" or time_str == "0": 12 | return timedelta(seconds=0) 13 | return timedelta(seconds=parse(time_str)) 14 | 15 | def localise_datetime(datetimeobj: datetime, tzname: str): 16 | """ 17 | :param datetimeobj: 18 | :param tzname: 19 | :return: 20 | """ 21 | local_timezone = pytz.timezone(tzname) 22 | if datetimeobj.tzinfo is None: 23 | return local_timezone.localize(datetimeobj) 24 | else: 25 | return datetimeobj.astimezone(local_timezone) 26 | -------------------------------------------------------------------------------- /task/bq2bq/executor/bumblebee/filesystem.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class FileSystem: 5 | def read(self, file): 6 | if file is not None: 7 | with open(file, 'r') as f: 8 | return f.read() 9 | 10 | def exist(self,path): 11 | return os.path.exists(path) 12 | 13 | def basename(self,path): 14 | return os.path.basename(path) -------------------------------------------------------------------------------- /task/bq2bq/executor/bumblebee/handler.py: -------------------------------------------------------------------------------- 1 | class BigqueryJobHandler: 2 | def __init__(self) -> None: 3 | self._sum_slot_millis = 0 4 | self._sum_total_bytes_processed = 0 5 | 6 | def handle_job_finish(self, job) -> None: 7 | self._sum_slot_millis += job.slot_millis 8 | self._sum_total_bytes_processed += job.total_bytes_processed 9 | 10 | def get_sum_slot_millis(self) -> int: 11 | return self._sum_slot_millis 12 | 13 | def get_sum_total_bytes_processed(self) -> int: 14 | return self._sum_total_bytes_processed 15 | -------------------------------------------------------------------------------- /task/bq2bq/executor/bumblebee/loader.py: -------------------------------------------------------------------------------- 1 | from bumblebee.bigquery_service import BigqueryService 2 | from datetime import datetime 3 | 4 | from abc import ABC 5 | from abc import abstractmethod 6 | from bumblebee.config import LoadMethod 7 | 8 | class BaseLoader(ABC): 9 | 10 | @abstractmethod 11 | def load(self, query): 12 | pass 13 | 14 | 15 | class PartitionLoader(BaseLoader): 16 | 17 | def __init__(self, bigquery_service, destination: str, load_method: LoadMethod, partition: datetime, allow_field_addition=False): 18 | self.bigquery_service = bigquery_service 19 | self.destination_name = destination 20 | self.load_method = load_method 21 | self.partition_date = partition 22 | self.allow_field_addition = allow_field_addition 23 | 24 | def load(self, query): 25 | partition_date_str = self.partition_date.strftime("%Y%m%d") 26 | load_destination = "{}${}".format(self.destination_name, partition_date_str) 27 | write_disposition = self.load_method.write_disposition 28 | allow_field_addition = self.allow_field_addition 29 | return self.bigquery_service.transform_load(query=query, 30 | write_disposition=write_disposition, 31 | destination_table=load_destination, 32 | allow_field_addition=allow_field_addition) 33 | 34 | 35 | class TableLoader(BaseLoader): 36 | 37 | def __init__(self, bigquery_service, destination: str, load_method: LoadMethod, allow_field_addition=False): 38 | self.bigquery_service = bigquery_service 39 | self.full_table_name = destination 40 | self.load_method = load_method 41 | self.allow_field_addition = allow_field_addition 42 | 43 | def load(self, query): 44 | return 
self.bigquery_service.transform_load(query=query, 45 | write_disposition=self.load_method.write_disposition, 46 | destination_table=self.full_table_name, 47 | allow_field_addition=self.allow_field_addition) 48 | 49 | 50 | class DMLLoader(BaseLoader): 51 | def __init__(self,bigquery_service: BigqueryService, destination: str): 52 | self.bigquery_service = bigquery_service 53 | self.full_table_name = destination 54 | 55 | def load(self,query): 56 | return self.bigquery_service.execute_query(query) 57 | -------------------------------------------------------------------------------- /task/bq2bq/executor/bumblebee/log.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | 4 | 5 | def get_logger(name: str): 6 | logger = logging.getLogger(name) 7 | logformat = "[%(asctime)s] %(levelname)s:%(name)s: %(message)s" 8 | logging.basicConfig(level=logging.INFO, stream=sys.stdout, 9 | format=logformat, datefmt="%Y-%m-%d %H:%M:%S") 10 | 11 | return logger 12 | -------------------------------------------------------------------------------- /task/bq2bq/executor/bumblebee/query.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datetime import timedelta 3 | from bumblebee.window import Window 4 | import sqlparse 5 | 6 | MERGE_AUTO_REPLACE_SCRIPT_QUERY_TEMPLATE = """\ 7 | -- Optimus generated 8 | DECLARE partitions ARRAY; 9 | 10 | {header} 11 | 12 | CREATE TEMP TABLE `opt__partitions` AS ( 13 | {sql_query} 14 | ); 15 | 16 | SET (partitions) = ( 17 | SELECT AS STRUCT 18 | array_agg(DISTINCT DATE(`{partition_column_name}`)) 19 | FROM opt__partitions 20 | ); 21 | 22 | MERGE INTO 23 | `{destination_table}` AS target 24 | USING 25 | ( 26 | Select * from `opt__partitions` 27 | ) AS source 28 | ON FALSE 29 | WHEN NOT MATCHED BY SOURCE AND DATE(`{partition_column_name}`) IN UNNEST(partitions) 30 | THEN DELETE 31 | WHEN NOT MATCHED THEN INSERT 32 | ( 33 | {destination_columns} 34 | ) 35 | VALUES 36 | ( 37 | {source_columns} 38 | ); 39 | """ 40 | 41 | MERGE_REPLACE_WITH_FILTER_QUERY_TEMPLATE = """\ 42 | -- Optimus generated 43 | {header} 44 | 45 | MERGE INTO 46 | `{destination_table}` AS target 47 | USING 48 | ( 49 | {sql_query} 50 | ) AS source 51 | ON FALSE 52 | WHEN NOT MATCHED BY SOURCE AND {filter_expression} 53 | THEN DELETE 54 | WHEN NOT MATCHED THEN INSERT 55 | ( 56 | {destination_columns} 57 | ) 58 | VALUES 59 | ( 60 | {source_columns} 61 | ); 62 | """ 63 | 64 | 65 | class QueryParameter(dict): 66 | def __init__(self): 67 | super().__init__() 68 | 69 | 70 | class WindowParameter(QueryParameter): 71 | def __init__(self, window: Window): 72 | super().__init__() 73 | dstart_regex = r"__dstart__" 74 | dend_regex = r"__dend__" 75 | 76 | # convert time to date if hour is no required 77 | params = { 78 | dstart_regex: window.start.strftime("%Y-%m-%d"), 79 | dend_regex: window.end.strftime("%Y-%m-%d"), 80 | } 81 | if window.size < timedelta(seconds=60 * 60 * 24) or window.truncate_upto == "h": 82 | params = { 83 | dstart_regex: window.start.strftime("%Y-%m-%d %H:%M:%S"), 84 | dend_regex: window.end.strftime("%Y-%m-%d %H:%M:%S"), 85 | } 86 | 87 | self.update(params) 88 | 89 | 90 | class DestinationParameter(QueryParameter): 91 | def __init__(self, full_table_name): 92 | super().__init__() 93 | self.update({r"(__destination_table__)": full_table_name}) 94 | 95 | 96 | class ExecutionParameter(QueryParameter): 97 | def __init__(self, exec_time): 98 | super().__init__() 99 | 
self.update({r"(__execution_time__)": exec_time.strftime('%Y-%m-%dT%H:%M:%S.%f')}) 100 | 101 | 102 | # this should be deprecated as optimus does the macro conversion now 103 | class Query(str): 104 | 105 | def replace_param(self, param_kv): 106 | temp = self 107 | for key, value in param_kv.items(): 108 | temp = re.sub(key, value, temp, 0, re.MULTILINE) 109 | return Query(temp) 110 | 111 | def apply_parameter(self, query_parameter: QueryParameter): 112 | return self.replace_param(query_parameter) 113 | 114 | def print(self): 115 | print(self) 116 | 117 | def print_with_logger(self, log): 118 | log.info("sql transformation query:\n{}".format(self)) 119 | 120 | 121 | class MergeReplaceQuery(str): 122 | 123 | def from_filter(self, destination_table: str, destination_columns: list, source_columns: list, filter: str): 124 | prepared_destination_columns = self.prepare_column_names(destination_columns) 125 | prepared_source_columns = self.prepare_column_names(source_columns) 126 | header, body = self.parsed_sql() 127 | 128 | q = MERGE_REPLACE_WITH_FILTER_QUERY_TEMPLATE.format(header="\n".join(header), sql_query=body, 129 | destination_table=destination_table, 130 | destination_columns=",".join(prepared_destination_columns), 131 | source_columns=",".join(prepared_source_columns), 132 | filter_expression=filter) 133 | return MergeReplaceQuery(q) 134 | 135 | def auto(self, destination_table: str, destination_columns: list, source_columns: list, 136 | partition_column_name: str, partition_column_type: str): 137 | prepared_destination_columns = self.prepare_column_names(destination_columns) 138 | prepared_source_columns = self.prepare_column_names(source_columns) 139 | header, body = self.parsed_sql() 140 | 141 | q = MERGE_AUTO_REPLACE_SCRIPT_QUERY_TEMPLATE.format(header="\n".join(header), sql_query=body, 142 | destination_table=destination_table, 143 | destination_columns=",".join(prepared_destination_columns), 144 | source_columns=",".join(prepared_source_columns), 145 | partition_column_name=partition_column_name, 146 | partition_column_type=partition_column_type) 147 | return MergeReplaceQuery(q) 148 | 149 | def prepare_column_names(self, colums): 150 | prepared_columns = [] 151 | for col in colums: 152 | prepared_columns.append("`{}`".format(col)) 153 | return prepared_columns 154 | 155 | def parsed_sql(self): 156 | headers = [] # create function queries 157 | body = [] # with/statement queries 158 | 159 | # split into multiple queries seperated my semicolons 160 | queries = sqlparse.split(self) 161 | if len(queries) == 1: 162 | return headers, queries[0] 163 | 164 | for query in queries: 165 | # parse will provide a AST 166 | # read through all tokens if any of them is a DDL stmt 167 | tokens = sqlparse.parse(query)[0].tokens 168 | is_ddl = False 169 | for token in tokens: 170 | if token.ttype == sqlparse.tokens.Keyword.DDL: 171 | is_ddl = True 172 | break 173 | if is_ddl: 174 | headers.append(query) 175 | else: 176 | body.append(query) 177 | if len(body) != 1: 178 | raise Exception("invalid replace query, should have exactly one DML/CTE statements") 179 | return headers, body[0] 180 | 181 | def print(self): 182 | print(self) 183 | 184 | def print_with_logger(self, log): 185 | log.info("sql transformation query:\n{}".format(self)) 186 | -------------------------------------------------------------------------------- /task/bq2bq/executor/bumblebee/transformation.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import concurrent 3 | import 
math 4 | from datetime import datetime, timedelta 5 | from typing import List, TypeVar 6 | 7 | from bumblebee.bigquery_service import BigqueryService 8 | from bumblebee.config import LoadMethod, TaskConfig 9 | from bumblebee.datehelper import localise_datetime 10 | from bumblebee.loader import BaseLoader, TableLoader, DMLLoader, PartitionLoader 11 | from bumblebee.log import get_logger 12 | from bumblebee.query import Query, DestinationParameter, WindowParameter, ExecutionParameter, MergeReplaceQuery 13 | from bumblebee.window import WindowFactory, Window, CustomWindow 14 | from concurrent.futures import ThreadPoolExecutor 15 | from google.cloud.bigquery.table import TimePartitioningType 16 | 17 | logger = get_logger(__name__) 18 | 19 | OPTIMUS_QUERY_BREAK_MARKER = "--*--optimus-break-marker--*--" 20 | 21 | 22 | class Transformation: 23 | def __init__(self, 24 | bigquery_service: BigqueryService, 25 | task_config: TaskConfig, 26 | sql_query: str, 27 | spillover_query: str, 28 | dstart: datetime, 29 | dend: datetime, 30 | execution_time: datetime, 31 | dry_run: bool): 32 | self.bigquery_service = bigquery_service 33 | self.task_config = task_config 34 | self.sql_query = sql_query 35 | self.dstart = dstart 36 | self.dend = dend 37 | self.execution_time = execution_time 38 | self.dry_run = dry_run 39 | 40 | def transform(self): 41 | self.task_config.print() 42 | 43 | localised_dstart = localise_datetime(self.dstart, self.task_config.timezone) 44 | localised_dend = localise_datetime(self.dend, self.task_config.timezone) 45 | localised_execution_time = localise_datetime(self.execution_time, self.task_config.timezone) 46 | logger.info("localized dstart: {}".format(localised_dstart)) 47 | logger.info("localized dend: {}".format(localised_dend)) 48 | logger.info("localized execution time: {}".format(localised_execution_time)) 49 | 50 | if self.task_config.load_method is LoadMethod.MERGE: 51 | task = DMLBasedTransformation(self.bigquery_service, 52 | self.task_config, 53 | self.sql_query, 54 | localised_dstart, 55 | localised_dend, 56 | self.dry_run, 57 | localised_execution_time) 58 | task.execute() 59 | elif self.task_config.load_method is LoadMethod.APPEND: 60 | transformation = TableTransformation(self.bigquery_service, 61 | self.task_config, 62 | self.sql_query, 63 | localised_dstart, 64 | localised_dend, 65 | self.dry_run, 66 | localised_execution_time) 67 | transformation.transform() 68 | elif self.task_config.load_method is LoadMethod.REPLACE: 69 | # query bq and check if table is partitioned 70 | bq_destination_table = self.bigquery_service.get_table(self.task_config.destination_table) 71 | if bq_destination_table.time_partitioning is None: 72 | task_queries = self.sql_query.split(OPTIMUS_QUERY_BREAK_MARKER) 73 | transformation = TableTransformation(self.bigquery_service, 74 | self.task_config, 75 | task_queries[0], 76 | self.dstart, 77 | self.dend, 78 | self.dry_run, 79 | localised_execution_time) 80 | elif bq_destination_table.partitioning_type == "DAY": 81 | partition_strategy = timedelta(days=1) 82 | 83 | # queries where source data/partition directly map with destination partitions 84 | transformation = MultiPartitionTransformation(self.bigquery_service, 85 | self.task_config, 86 | self.sql_query, 87 | self.dstart, self.dend, 88 | self.dry_run, 89 | localised_execution_time, 90 | partition_strategy, 91 | self.task_config.concurrency) 92 | else: 93 | raise Exception("unable to generate a transformation for request, unsupported partition strategy") 94 | transformation.transform() 95 | 
elif self.task_config.load_method is LoadMethod.REPLACE_MERGE: 96 | # query bq and check if table is partitioned 97 | bq_destination_table = self.bigquery_service.get_table(self.task_config.destination_table) 98 | if bq_destination_table.time_partitioning is None and bq_destination_table.range_partitioning is None: 99 | transformation = TableTransformation(self.bigquery_service, 100 | self.task_config, 101 | self.sql_query, 102 | localised_dstart, 103 | localised_dend, 104 | self.dry_run, 105 | localised_execution_time) 106 | else: 107 | partition_type = bq_destination_table.time_partitioning if bq_destination_table.time_partitioning is not None \ 108 | else bq_destination_table.range_partitioning 109 | 110 | # ingestion time partitioned has field as None 111 | if partition_type.field is None and self.task_config.filter_expression is None: 112 | raise Exception("partition filter is required for tables partitioned with INGESTION TIME, " 113 | "for eg: date(`_PARTITIONTIME`) >= date('{{.DSTART}}') AND date(`_PARTITIONTIME`) < date('{{.DEND}}')" 114 | "") 115 | 116 | partition_column_name = partition_type.field 117 | partition_column_type = "DATE" 118 | 119 | table_columns = [] 120 | for field in bq_destination_table.schema: 121 | table_columns.append(field.name) 122 | if field.name == partition_column_name: 123 | partition_column_type = field.field_type 124 | 125 | logger.info("table columns: {}, partitioned on: {}".format(table_columns, partition_column_name, 126 | partition_column_type)) 127 | 128 | transformation = MergeReplaceTransformation(self.bigquery_service, 129 | self.task_config, 130 | self.sql_query, 131 | table_columns, 132 | partition_column_name, 133 | partition_column_type, 134 | self.dry_run, 135 | localised_execution_time 136 | ) 137 | 138 | transformation.transform() 139 | elif self.task_config.load_method is LoadMethod.REPLACE_ALL: 140 | # query bq and check if table is partitioned 141 | bq_destination_table = self.bigquery_service.get_table(self.task_config.destination_table) 142 | if bq_destination_table.time_partitioning is None and bq_destination_table.range_partitioning is None: 143 | task_queries = self.sql_query.split(OPTIMUS_QUERY_BREAK_MARKER) 144 | transformation = TableTransformation(self.bigquery_service, 145 | self.task_config, 146 | task_queries[0], 147 | self.dstart, 148 | self.dend, 149 | self.dry_run, 150 | localised_execution_time) 151 | else: 152 | # queries where source data/partition map with start date as destination partition 153 | transformation = SinglePartitionTransformation(self.bigquery_service, 154 | self.task_config, 155 | self.sql_query, 156 | self.dstart, self.dend, 157 | self.dry_run, 158 | localised_execution_time,) 159 | transformation.transform() 160 | else: 161 | raise Exception("unsupported load method {}".format(self.task_config.load_method)) 162 | 163 | 164 | class DMLBasedTransformation: 165 | def __init__(self, bigquery_service, 166 | task_config: TaskConfig, 167 | task_query: str, 168 | dstart: datetime, 169 | dend: datetime, 170 | dry_run: bool, 171 | execution_time: datetime): 172 | self.loader = DMLLoader(bigquery_service, task_config.destination_table) 173 | self.query = task_query 174 | self.dry_run = dry_run 175 | self.destination_table = task_config.destination_table 176 | self.execution_time = execution_time 177 | self.window = CustomWindow(dstart, dend) 178 | 179 | def execute(self): 180 | logger.info("starting DML transformation job") 181 | 182 | execution_parameter = ExecutionParameter(self.execution_time) 183 | 
destination_parameter = DestinationParameter(self.destination_table) 184 | window_parameter = WindowParameter(self.window) 185 | 186 | query_parameters = [destination_parameter, window_parameter, execution_parameter] 187 | 188 | query = Query(self.query) 189 | for parameter in query_parameters: 190 | query = query.apply_parameter(parameter) 191 | query.print_with_logger(logger) 192 | 193 | result = None 194 | 195 | if not self.dry_run: 196 | result = self.loader.load(query) 197 | 198 | logger.info(result) 199 | logger.info("finished") 200 | 201 | 202 | class TableTransformation: 203 | """ 204 | Query transformation effects whole non partitioned table 205 | """ 206 | 207 | def __init__(self, bigquery_service: BigqueryService, 208 | task_config: TaskConfig, 209 | task_query: str, 210 | dstart: datetime, 211 | dend: datetime, 212 | dry_run: bool, 213 | execution_time: datetime): 214 | self.bigquery_service = bigquery_service 215 | self.task_config = task_config 216 | self.task_query = task_query 217 | self.dry_run = dry_run 218 | self.window = CustomWindow(dstart, dend) 219 | self.execution_time = execution_time 220 | 221 | def transform(self): 222 | loader = TableLoader(self.bigquery_service, self.task_config.destination_table, self.task_config.load_method, 223 | self.task_config.allow_field_addition) 224 | logger.info("create transformation for table") 225 | 226 | task = PartitionTransformation(self.task_config, 227 | loader, 228 | self.task_query, 229 | self.window, 230 | self.dry_run, 231 | self.execution_time) 232 | task.execute() 233 | 234 | 235 | class SinglePartitionTransformation: 236 | """ 237 | Query transformation effects only a single partition 238 | 239 | queries like aggregate where source partitions don't 240 | directly map to destination partitions 241 | """ 242 | 243 | def __init__(self, bigquery_service: BigqueryService, 244 | task_config: TaskConfig, 245 | task_query: str, 246 | dstart: datetime, 247 | dend: datetime, 248 | dry_run: bool, 249 | execution_time: datetime): 250 | self.bigquery_service = bigquery_service 251 | self.task_config = task_config 252 | self.task_query = task_query 253 | 254 | self.dry_run = dry_run 255 | self.window = CustomWindow(dstart, dend) 256 | self.execution_time = execution_time 257 | 258 | def transform(self): 259 | destination_partition = self.window.start 260 | loader = PartitionLoader(self.bigquery_service, self.task_config.destination_table, 261 | self.task_config.load_method, destination_partition) 262 | logger.info("create transformation for partition: {}".format(destination_partition)) 263 | 264 | task = PartitionTransformation(self.task_config, 265 | loader, 266 | self.task_query, 267 | self.window, 268 | self.dry_run, 269 | self.execution_time) 270 | task.execute() 271 | 272 | 273 | class PartitionTransformation: 274 | def __init__(self, 275 | task_config: TaskConfig, 276 | loader: BaseLoader, 277 | query: str, 278 | window: Window, 279 | dry_run: bool, 280 | execution_time: datetime): 281 | self.dry_run = dry_run 282 | self.loader = loader 283 | 284 | destination_parameter = DestinationParameter(task_config.destination_table) 285 | window_parameter = WindowParameter(window) 286 | execution_parameter = ExecutionParameter(execution_time) 287 | 288 | self.query = Query(query).apply_parameter(window_parameter).apply_parameter( 289 | execution_parameter).apply_parameter(destination_parameter) 290 | 291 | def execute(self): 292 | logger.info("start transformation job") 293 | self.query.print_with_logger(logger) 294 | 295 | result = None 
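        # (annotation added for this review, not part of the original source)
        # When dry_run is set, the loader below is never invoked: the rendered query
        # is only logged via print_with_logger, `result` stays None, and the trailing
        # logger.info(result) therefore just logs "None".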
296 | if not self.dry_run: 297 | result = self.loader.load(self.query) 298 | 299 | logger.info(result) 300 | logger.info("finished") 301 | 302 | async def async_execute(self): 303 | self.execute() 304 | 305 | 306 | class MergeReplaceTransformation: 307 | """ 308 | Query replaces the effected partitions. Partitions can be derived either via a filter expression 309 | or executing a query to a temporary table and running a distinct query. 310 | - Using a filter expression is cheaper and faster 311 | - Auto partition resolution actually consumes double the size of requested query 312 | """ 313 | 314 | def __init__(self, bigquery_service: BigqueryService, 315 | task_config: TaskConfig, 316 | task_query: str, 317 | table_columns: list, 318 | partition_column_name: str, 319 | partition_column_type: str, 320 | dry_run: bool, 321 | execution_time: datetime 322 | ): 323 | self.loader = DMLLoader(bigquery_service, task_config.destination_table) 324 | self.task_query = task_query 325 | self.task_config = task_config 326 | self.dry_run = dry_run 327 | self.execution_time = execution_time 328 | self.filter_expression = task_config.filter_expression 329 | self.table_columns = table_columns 330 | self.partition_column_name = partition_column_name 331 | self.partition_column_type = partition_column_type 332 | 333 | def transform(self): 334 | execution_parameter = ExecutionParameter(self.execution_time) 335 | destination_parameter = DestinationParameter(self.task_config.destination_table) 336 | 337 | query_parameters = [destination_parameter, execution_parameter] 338 | 339 | query = Query(self.task_query) 340 | for parameter in query_parameters: 341 | query = query.apply_parameter(parameter) 342 | query.print_with_logger(logger) 343 | 344 | if self.filter_expression is not None: 345 | logger.info("running with filter expression set {}".format(self.filter_expression)) 346 | query = MergeReplaceQuery(query).from_filter(self.task_config.destination_table, self.table_columns, 347 | self.table_columns, self.filter_expression) 348 | else: 349 | logger.info("running with auto partition filter") 350 | query = MergeReplaceQuery(query).auto(self.task_config.destination_table, self.table_columns, 351 | self.table_columns, self.partition_column_name, 352 | self.partition_column_type) 353 | query.print_with_logger(logger) 354 | 355 | result = None 356 | if not self.dry_run: 357 | result = self.loader.load(query) 358 | 359 | logger.info("finished {}".format(result.total_rows)) 360 | 361 | 362 | class MultiPartitionTransformation: 363 | """ 364 | Query transformation effects multiple partitions 365 | 366 | queries where source data/partition directly map with destination partitions 367 | """ 368 | 369 | def __init__(self, bigquery_service: BigqueryService, 370 | task_config: TaskConfig, 371 | task_query: str, 372 | dstart: datetime, 373 | dend: datetime, 374 | dry_run: bool, 375 | execution_time: datetime, 376 | partition_delta: timedelta, 377 | concurrency: int 378 | ): 379 | self.bigquery_service = bigquery_service 380 | self.task_query = task_query 381 | self.task_config = task_config 382 | self.dry_run = dry_run 383 | self.execution_time = execution_time 384 | self.window = CustomWindow(dstart, dend) 385 | self.concurrency = concurrency 386 | self.partition_delta = partition_delta 387 | 388 | def transform(self): 389 | datetime_list = [] 390 | execute_for = self.window.start 391 | 392 | # tables are partitioned for day 393 | # iterate from start to end for each partition 394 | while execute_for < self.window.end: 395 
| datetime_list.append(execute_for) 396 | execute_for += self.partition_delta 397 | 398 | # break query file 399 | task_queries = self.task_query.split(OPTIMUS_QUERY_BREAK_MARKER) 400 | if len(task_queries) < len(datetime_list): 401 | raise Exception( 402 | "query needs to be broken using {}, {} query found, needed {}\n{}".format(OPTIMUS_QUERY_BREAK_MARKER, 403 | len(task_queries), 404 | len(datetime_list), 405 | self.task_query)) 406 | 407 | tasks = [] 408 | query_index = 0 409 | for partition_time in datetime_list: 410 | task_window = WindowFactory.create_window_with_time(partition_time, partition_time + self.partition_delta) 411 | destination_partition = task_window.end - self.partition_delta 412 | 413 | logger.info("create transformation for partition: {}".format(destination_partition)) 414 | task_loader = PartitionLoader(self.bigquery_service, self.task_config.destination_table, 415 | self.task_config.load_method, destination_partition, 416 | self.task_config.allow_field_addition) 417 | 418 | task = PartitionTransformation(self.task_config, 419 | task_loader, 420 | task_queries[query_index], 421 | task_window, 422 | self.dry_run, 423 | self.execution_time) 424 | tasks.append(task) 425 | query_index += 1 426 | 427 | executor = ConcurrentTaskExecutor(self.concurrency) 428 | executor.execute(tasks) 429 | 430 | 431 | class LegacySpilloverTransformation: 432 | """ 433 | Query transformation effects multiple partitions 434 | """ 435 | 436 | def __init__(self, 437 | bigquery_service: BigqueryService, 438 | task_config: TaskConfig, 439 | sql_query: str, 440 | spillover_query: str, 441 | start_time: datetime, 442 | dry_run: bool, 443 | execution_time: datetime): 444 | self.bigquery_service = bigquery_service 445 | self.task_config = task_config 446 | self.sql_query = sql_query 447 | self.spillover_query = spillover_query 448 | self.dry_run = dry_run 449 | self.start_time = start_time 450 | self.execution_time = execution_time 451 | 452 | self.concurrency = self.task_config.concurrency 453 | 454 | def transform(self): 455 | datetime_list = [] 456 | default_datetime = [self.start_time] 457 | datetime_list.extend(default_datetime) 458 | 459 | if self.task_config.use_spillover: 460 | spillover = SpilloverDatetimes(self.bigquery_service, 461 | self.spillover_query, 462 | self.task_config, 463 | self.start_time, 464 | self.dry_run, 465 | self.execution_time) 466 | spillover_datetimes = spillover.collect_datetimes() 467 | datetime_list.extend(spillover_datetimes) 468 | 469 | datetime_list = distinct_list(datetime_list) 470 | 471 | tasks = [] 472 | for partition_time in datetime_list: 473 | logger.info("create transformation for partition: {}".format(partition_time)) 474 | loader = PartitionLoader(self.bigquery_service, self.task_config.destination_table, 475 | self.task_config.load_method, partition_time) 476 | 477 | task = PartitionTransformation(self.task_config, 478 | loader, 479 | self.sql_query, 480 | self.window, 481 | self.dry_run, 482 | self.execution_time) 483 | tasks.append(task) 484 | 485 | executor = ConcurrentTaskExecutor(self.concurrency) 486 | executor.execute(tasks) 487 | 488 | 489 | class SpilloverDatetimes: 490 | def __init__(self, bigquery_service: BigqueryService, 491 | query: str, 492 | task_config: TaskConfig, 493 | dstart: datetime, 494 | dend: datetime, 495 | dry_run: bool, 496 | execution_time: datetime): 497 | self.bigquery_service = bigquery_service 498 | self.query = query 499 | self.timezone = task_config.timezone 500 | self.execution_time = execution_time 501 | 
self.dry_run = dry_run 502 | self.destination_table = task_config.destination_table 503 | self.window = CustomWindow(dstart, dend) 504 | 505 | def collect_datetimes(self): 506 | window_parameter = WindowParameter(self.window) 507 | execution_parameter = ExecutionParameter(self.execution_time) 508 | destination_parameter = DestinationParameter(self.destination_table) 509 | 510 | query = Query(self.query) 511 | query = query.apply_parameter(window_parameter).apply_parameter(execution_parameter).apply_parameter( 512 | destination_parameter) 513 | query.print() 514 | 515 | results = None 516 | if not self.dry_run: 517 | results = self.bigquery_service.execute_query(query) 518 | 519 | dates = [row[0] for row in results] 520 | datetimes = [datetime.combine(d, datetime.min.time()) for d in dates] 521 | localised_datetimes = [localise_datetime(dtime, self.timezone) for dtime in datetimes] 522 | return localised_datetimes 523 | 524 | 525 | T = TypeVar('T') 526 | 527 | 528 | class ConcurrentTaskExecutor: 529 | def __init__(self, concurrency: int): 530 | self.concurrency = concurrency 531 | 532 | def execute(self, tasks: List[T]): 533 | if tasks is not None and len(tasks) > 0: 534 | self._concurrent_execute_task(tasks, self.concurrency) 535 | 536 | def execute_task(self, task): 537 | return task.execute() 538 | 539 | # TODO: future should check for task exception 540 | def _concurrent_execute_task(self, tasks, concurrency: int): 541 | 542 | with ThreadPoolExecutor(concurrency) as executor: 543 | futures = {executor.submit(self.execute_task, task): task for task in tasks} 544 | for future in concurrent.futures.as_completed(futures): 545 | future.result() 546 | 547 | 548 | 549 | def distinct_list(a_list: List) -> List: 550 | d = dict() 551 | for item in a_list: 552 | d[item] = 0 553 | 554 | result = [] 555 | for key in d.keys(): 556 | result.append(key) 557 | 558 | return result 559 | 560 | 561 | def split_list(a_list: List, size: int) -> List[List]: 562 | l = a_list.copy() 563 | 564 | z = [] 565 | by = int(math.ceil(len(l) / size)) 566 | 567 | for i in range(by): 568 | x = [] 569 | for j in range(size): 570 | if len(l) > 0: 571 | a = l.pop() 572 | x.append(a) 573 | z.append(x) 574 | return z 575 | -------------------------------------------------------------------------------- /task/bq2bq/executor/bumblebee/version.py: -------------------------------------------------------------------------------- 1 | VERSION = "0.0.2" 2 | -------------------------------------------------------------------------------- /task/bq2bq/executor/bumblebee/window.py: -------------------------------------------------------------------------------- 1 | import calendar 2 | from abc import ABC 3 | from abc import abstractmethod 4 | from datetime import datetime, timedelta 5 | from bumblebee.datehelper import parse_duration 6 | 7 | 8 | class BaseWindow(ABC): 9 | 10 | @property 11 | @abstractmethod 12 | def start(self): 13 | pass 14 | 15 | @property 16 | @abstractmethod 17 | def end(self): 18 | pass 19 | 20 | @property 21 | @abstractmethod 22 | def size(self): 23 | pass 24 | 25 | @property 26 | @abstractmethod 27 | def offset(self): 28 | pass 29 | 30 | @property 31 | @abstractmethod 32 | def truncate_upto(self): 33 | pass 34 | 35 | 36 | class Window(BaseWindow): 37 | """ 38 | provide start and end property from window_type 39 | 40 | """ 41 | @property 42 | def start(self): 43 | return self._start 44 | 45 | @property 46 | def end(self): 47 | return self._end 48 | 49 | @property 50 | def size(self): 51 | return self._size 52 | 53 | 
@property 54 | def offset(self): 55 | return self._offset 56 | 57 | @property 58 | def truncate_upto(self): 59 | return self._truncate_upto 60 | 61 | 62 | class WindowFactory: 63 | """ 64 | Generate window that determine start_time and end_time 65 | """ 66 | @staticmethod 67 | def create_window(end_time: datetime, window_size: str, window_offset: str, window_truncate_upto: str): 68 | return XWindow(end_time, window_size.lower(), window_offset.lower(), window_truncate_upto.lower()) 69 | 70 | @staticmethod 71 | def create_window_with_time(start_time: datetime, end_time: datetime): 72 | return CustomWindow(start_time, end_time) 73 | 74 | 75 | class CustomWindow(Window): 76 | """ 77 | Generate window based on already computed start and end time 78 | """ 79 | def __init__(self, start_time: datetime, end_time: datetime): 80 | self._offset = parse_duration("0") 81 | self._size = timedelta(seconds=(end_time - start_time).total_seconds()) 82 | 83 | self._start = start_time 84 | self._end = end_time 85 | self._truncate_upto = "" 86 | 87 | 88 | class XWindow(Window): 89 | """ 90 | Generate window based on user config inputs 91 | """ 92 | def __init__(self, end_time: datetime, window_size: str, window_offset: str, window_truncate_upto: str): 93 | floating_end = end_time 94 | 95 | # apply truncation 96 | if window_truncate_upto == "h": 97 | # remove time upto hours 98 | floating_end = floating_end.replace(minute=0, second=0, microsecond=0) 99 | elif window_truncate_upto == "d": 100 | # remove time upto days 101 | floating_end = floating_end.replace(hour=0, minute=0, second=0, microsecond=0) 102 | elif window_truncate_upto == "w": 103 | # remove time upto days 104 | # get week lists for current month 105 | week_matrix_per_month = calendar.Calendar().monthdatescalendar(end_time.year, end_time.month) 106 | # find week where current day lies 107 | current_week = None 108 | for week in week_matrix_per_month: 109 | for day in week: 110 | if day == end_time.date(): 111 | current_week = week 112 | 113 | floating_end = datetime.combine(current_week[6], end_time.min.time()) 114 | floating_end = floating_end.replace(tzinfo=end_time.tzinfo) 115 | elif window_truncate_upto == "" or window_truncate_upto == "0": 116 | # do nothing 117 | floating_end = floating_end 118 | else: 119 | raise Exception("unsupported truncate method: {}".format(window_truncate_upto)) 120 | 121 | # generate shift & length 122 | self._offset = parse_duration(window_offset) 123 | self._size = parse_duration(window_size) 124 | 125 | self._end = floating_end + self._offset 126 | self._start = self._end - self._size 127 | self._truncate_upto = window_truncate_upto 128 | pass 129 | 130 | 131 | class MonthlyWindow(Window): 132 | """ 133 | @Deprecated 134 | - Not Supported at the moment 135 | Monthly window returns start time of the month and end time of the month 136 | """ 137 | 138 | def __init__(self,start_time): 139 | if start_time.date().day != 1: 140 | raise Exception("for {} start_time should be in the start date of the month".format(start_time)) 141 | 142 | cal = calendar.Calendar() 143 | fullweekdates = cal.monthdatescalendar(start_time.year,start_time.month) 144 | 145 | dates = [] 146 | for week in fullweekdates: 147 | for date in week: 148 | if date.month == start_time.month: 149 | dates.append(date) 150 | 151 | self._start = self._to_datetime(dates[0]) 152 | self._end = self._to_datetime(dates[len(dates)-1]) 153 | 154 | def _to_datetime(self,date): 155 | return datetime.combine(date, datetime.min.time()) 156 | 
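For reference, a minimal usage sketch (not a file in this repository) of how the window classes above resolve a task window from the WINDOW_SIZE / WINDOW_OFFSET / WINDOW_TRUNCATE_UPTO settings seen in the sample properties.cfg files; it assumes the bumblebee package is importable as-is:

from datetime import datetime, timezone
from bumblebee.window import WindowFactory

scheduled_at = datetime(2021, 9, 15, 7, 30, tzinfo=timezone.utc)

# 1d window truncated to the day with no offset: the end floats down to
# 2021-09-15 00:00 UTC and the start becomes end - 1 day = 2021-09-14 00:00 UTC.
daily = WindowFactory.create_window(scheduled_at, "1d", "0", "d")
print(daily.start, daily.end)

# 168h window shifted back by 24h, mirroring the weekly_partitioned sample config.
weekly = WindowFactory.create_window(scheduled_at, "168h", "-24h", "d")
print(weekly.start, weekly.end)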
-------------------------------------------------------------------------------- /task/bq2bq/executor/bumblebee/writer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | from abc import ABC 5 | from abc import abstractmethod 6 | 7 | from bumblebee.log import get_logger 8 | 9 | logger = get_logger(__name__) 10 | 11 | 12 | class BaseWriter(ABC): 13 | 14 | @abstractmethod 15 | def write(self, key: str, value: str): 16 | pass 17 | 18 | 19 | class JsonWriter(BaseWriter): 20 | 21 | def __init__(self, filepath: str): 22 | self.filepath = filepath 23 | # create dir if not already exists 24 | pathlib.Path(os.path.dirname(self.filepath)).mkdir(parents=True, exist_ok=True) 25 | return 26 | 27 | def write(self, key: str, value: str): 28 | try: 29 | data_file = open(self.filepath, 'r') 30 | data = json.load(data_file) 31 | data_file.close() 32 | except FileNotFoundError as e: 33 | # do nothing, fresh file 34 | data = {} 35 | 36 | data[key] = value 37 | logger.debug("{}: {}".format(key, value)) 38 | 39 | with open(self.filepath, 'w') as the_file: 40 | json.dump(data, the_file) 41 | 42 | logger.info(data) 43 | return 44 | 45 | 46 | class StdWriter(BaseWriter): 47 | def write(self, key: str, value: str): 48 | logger.info("{}: {}".format(key, value)) 49 | return 50 | -------------------------------------------------------------------------------- /task/bq2bq/executor/example.py: -------------------------------------------------------------------------------- 1 | from bumblebee.bq2bq import bq2bq 2 | from datetime import datetime, timezone 3 | import os 4 | 5 | execution_date = datetime.utcnow() 6 | 7 | 8 | def use_spillover(): 9 | bq2bq( 10 | "./samples/tasks/legacy/use_spillover/properties.cfg", 11 | "./samples/tasks/legacy/use_spillover/query.sql", 12 | "./samples/tasks/legacy/use_spillover/spillover_date.sql", 13 | datetime(2019, 5, 6), datetime(2019, 5, 7), execution_date) 14 | 15 | 16 | def not_use_spillover(): 17 | bq2bq( 18 | "./samples/tasks/legacy/not_use_spillover/properties.cfg", 19 | "./samples/tasks/legacy/not_use_spillover/query.sql", 20 | "./samples/tasks/legacy/not_use_spillover/spillover_date.sql", 21 | datetime(2019, 5, 6), datetime(2019, 5, 7), execution_date) 22 | 23 | 24 | def partition_by_column(): 25 | bq2bq( 26 | "./samples/tasks/partition_by_column/properties.cfg", 27 | "./samples/tasks/partition_by_column/query.sql", 28 | "./samples/tasks/partition_by_column/spillover_date.sql", 29 | datetime(2019, 3, 24), datetime(2019, 3, 25, 4), execution_date) 30 | 31 | 32 | def partition_by_ingestiontime(): 33 | bq2bq( 34 | "./samples/tasks/partition_by_ingestiontime/properties.cfg", 35 | "./samples/tasks/partition_by_ingestiontime/query.sql", 36 | "./samples/tasks/partition_by_ingestiontime/spillover_date.sql", 37 | datetime(2019, 3, 21), datetime(2019, 3, 22, 4), execution_date) 38 | 39 | 40 | def dml(): 41 | bq2bq( 42 | "./samples/tasks/dml/properties.cfg", 43 | "./samples/tasks/dml/query.sql", 44 | None, 45 | datetime(2019, 4, 9), 46 | datetime(2020, 4, 10), 47 | execution_date, 48 | False) 49 | 50 | 51 | def dml_dry_run(): 52 | bq2bq( 53 | "./samples/tasks/dml/properties.cfg", 54 | "./samples/tasks/dml/query.sql", 55 | None, 56 | datetime(2019, 4, 9), 57 | datetime(2020, 4, 10), 58 | execution_date, 59 | True) 60 | 61 | 62 | def select_append_query(): 63 | bq2bq( 64 | "./samples/tasks/select/select/properties.cfg", 65 | "./samples/tasks/select/select/query.sql", 66 | None, 67 | datetime(2020, 5, 25, 
4), 68 | datetime(2020, 5, 26, 4), 69 | execution_date, 70 | False, 71 | { 72 | "deployment": "local-test-run", 73 | "org": "local", 74 | "landscape": "test", 75 | "environment": "local" 76 | } 77 | ) 78 | 79 | 80 | def delete_from_query(): 81 | bq2bq( 82 | "./samples/tasks/delete/properties.cfg", 83 | "./samples/tasks/delete/query.sql", 84 | None, 85 | datetime(2020, 5, 25, 4), 86 | datetime(2020, 5, 26, 4), 87 | execution_date, 88 | True, 89 | { 90 | "deployment": "local-test-run", 91 | "org": "local", 92 | "landscape": "test", 93 | "environment": "local" 94 | } 95 | ) 96 | 97 | 98 | def query_script(): 99 | bq2bq( 100 | "samples/tasks/select/script/properties.cfg", 101 | "samples/tasks/select/script/query.sql", 102 | None, 103 | datetime(2020, 7, 8), 104 | datetime(2020, 7, 9, 6), 105 | execution_date, 106 | False 107 | ) 108 | 109 | 110 | def drop_table(): 111 | bq2bq( 112 | "./samples/tasks/drop/properties.cfg", 113 | "./samples/tasks/drop/query.sql", 114 | None, 115 | datetime(2020, 7, 8), 116 | datetime(2020, 7, 9, 6), 117 | execution_date, 118 | False 119 | ) 120 | 121 | 122 | def non_partitioned_append(): 123 | bq2bq( 124 | "./samples/tasks/non_partitioned_append/properties.cfg", 125 | "./samples/tasks/non_partitioned_append/query.sql", 126 | None, 127 | datetime(2020, 7, 8, 6), 128 | datetime(2020, 7, 9, 6), 129 | execution_date, 130 | False 131 | ) 132 | 133 | 134 | def partition_by_column_load_timestamp(): 135 | bq2bq( 136 | "./samples/tasks/partition_by_column_load_timestamp/properties.cfg", 137 | "./samples/tasks/partition_by_column_load_timestamp/query.sql", 138 | None, 139 | datetime(2020, 7, 8, 6), 140 | datetime(2020, 7, 9, 6), 141 | execution_date, 142 | False 143 | ) 144 | 145 | 146 | def weekly_partitioned(): 147 | bq2bq( 148 | "./samples/tasks/weekly_partitioned/properties.cfg", 149 | "./samples/tasks/weekly_partitioned/query.sql", 150 | None, 151 | datetime(2020, 8, 25, 6), 152 | datetime(2020, 9, 1, 6), 153 | execution_date, 154 | False 155 | ) 156 | 157 | 158 | def select_federated_table_from_gsheet(): 159 | bq2bq( 160 | "./samples/tasks/select/federated_table/properties.cfg", 161 | "./samples/tasks/select/federated_table/query.sql", 162 | None, 163 | datetime(2019, 12, 1, 19), 164 | datetime(2019, 12, 2, 19), 165 | execution_date, 166 | False 167 | ) 168 | 169 | 170 | def partition_append(): 171 | bq2bq( 172 | "./samples/tasks/partition_append/properties.cfg", 173 | "./samples/tasks/partition_append/query.sql", 174 | None, 175 | datetime(2020, 8, 25, 6), 176 | datetime(2020, 8, 26, 6), 177 | execution_date, 178 | False 179 | ) 180 | 181 | 182 | def replace_merge(): 183 | bq2bq( 184 | "samples/tasks/replace_merge/auto/properties.cfg", 185 | "samples/tasks/replace_merge/auto/query.sql", 186 | None, 187 | datetime(2020, 12, 5, 1), 188 | datetime(2020, 12, 5, 1), 189 | execution_date, 190 | False 191 | ) 192 | 193 | 194 | def replace_all(): 195 | bq2bq( 196 | "samples/tasks/replace_all/basic/properties.cfg", 197 | "samples/tasks/replace_all/basic/query.sql", 198 | None, 199 | datetime(2021, 9, 1, 1), 200 | datetime(2021, 9, 30, 1), 201 | execution_date, 202 | False 203 | ) 204 | 205 | 206 | def allow_field_addition(): 207 | bq2bq( 208 | DEFAULT_PATH + "/samples/tasks/allow_field_addition/basic/properties.cfg", 209 | DEFAULT_PATH + "/samples/tasks/allow_field_addition/basic/query.sql", 210 | None, 211 | datetime(2021, 9, 1, 1), 212 | datetime(2021, 9, 2, 1), 213 | execution_date, 214 | False 215 | ) 216 | 217 | 218 | DEFAULT_PATH = 
os.path.dirname(os.path.realpath(__file__)) 219 | allow_field_addition() -------------------------------------------------------------------------------- /task/bq2bq/executor/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json 4 | import os 5 | from bumblebee import bq2bq, log 6 | from bumblebee.config import AppConfig 7 | from bumblebee.handler import BigqueryJobHandler 8 | import pathlib 9 | 10 | if __name__ == "__main__": 11 | logger = log.get_logger(__name__) 12 | job_handler = BigqueryJobHandler() 13 | 14 | app_config = AppConfig() 15 | xcom_data = {'execution_time': app_config.execution_time.strftime('%Y-%m-%dT%H:%M:%S.%f')} 16 | logger.info("prepared xcom data: {} at: {}".format(xcom_data, app_config.xcom_path)) 17 | 18 | bq2bq.bq2bq( 19 | None, 20 | app_config.sql_file, 21 | app_config.spillover_sql_file, 22 | app_config.dstart, 23 | app_config.dend, 24 | app_config.execution_time, 25 | app_config.dry_run, 26 | app_config.job_labels, 27 | app_config.xcom_path, 28 | on_job_finish = job_handler.handle_job_finish, 29 | ) 30 | 31 | xcom_data['monitoring'] = { 32 | 'slot_millis': job_handler.get_sum_slot_millis(), 33 | 'total_bytes_processed': job_handler.get_sum_total_bytes_processed() 34 | } 35 | 36 | pathlib.Path(os.path.dirname(app_config.xcom_path)).mkdir(parents=True, exist_ok=True) 37 | # will be returned by xcom operator 38 | with open(app_config.xcom_path, 'w') as the_file: 39 | json.dump(xcom_data, the_file) 40 | -------------------------------------------------------------------------------- /task/bq2bq/executor/requirements.txt: -------------------------------------------------------------------------------- 1 | cachetools==4.1.1 2 | certifi==2020.6.20 3 | chardet==3.0.4 4 | google==3.0.0 5 | google-api-core==1.21.0 6 | google-auth==1.18.0 7 | google-cloud-bigquery==1.25.0 8 | google-cloud-core==1.3.0 9 | google-resumable-media==0.5.1 10 | googleapis-common-protos==1.52.0 11 | idna==2.10 12 | iso8601==0.1.12 13 | protobuf==3.12.2 14 | pyasn1==0.4.8 15 | pyasn1-modules==0.2.8 16 | pytimeparse==1.1.8 17 | pytz==2020.1 18 | requests==2.25.1 19 | rsa==4.7 20 | six==1.15.0 21 | tzlocal==2.1 22 | urllib3==1.26.5 23 | sqlparse==0.4.2 24 | -------------------------------------------------------------------------------- /task/bq2bq/executor/run_coverage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -e 2 | 3 | pip install -r requirements.txt 4 | pip install coverage 5 | coverage run setup.py test 6 | echo "coverage $(coverage report | awk '{print $6}' | tail -n 1)" 7 | coverage report 8 | -------------------------------------------------------------------------------- /task/bq2bq/executor/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -e 2 | 3 | pip install -r requirements.txt 4 | python -m unittest discover tests/ 5 | -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/allow_field_addition/basic/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-data-gojek-id-mart" 3 | DATASET="playground" 4 | TABLE="allow_field_addition_test" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 720h 8 | WINDOW_OFFSET = 0 9 | WINDOW_TRUNCATE_UPTO = M 10 | 11 | [LOAD] 12 | LOAD_METHOD="REPLACE" 13 | ALLOW_FIELD_ADDITION=TRUE 
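A hedged sketch of how a property like ALLOW_FIELD_ADDITION=TRUE above is typically forwarded to BigQuery. This is not the repository's bigquery_service.py (whose implementation is not shown here and may differ); it only illustrates the schema_update_options mechanism of google-cloud-bigquery, and it assumes the conventional mapping of the REPLACE load method to WRITE_TRUNCATE:

from google.cloud import bigquery

def build_job_config(allow_field_addition: bool) -> bigquery.QueryJobConfig:
    job_config = bigquery.QueryJobConfig()
    # destination assembled from the PROJECT/DATASET/TABLE values in properties.cfg above
    job_config.destination = bigquery.TableReference.from_string(
        "g-data-gojek-id-mart.playground.allow_field_addition_test")
    # WRITE_TRUNCATE assumed here for the REPLACE load method
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    if allow_field_addition:
        # lets the query result add new columns to the destination table's schema
        job_config.schema_update_options = [bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION]
    return job_config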
-------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/allow_field_addition/basic/query.sql: -------------------------------------------------------------------------------- 1 | select 2 | "hakai" as hakai, 3 | "rasengan" as rasengan, 4 | "over" as overs, 5 | "allow_field_addition" as test_column, 6 | TIMESTAMP ('2021-09-01T01:02:03') as `event_timestamp` 7 | -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/delete/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="sample_select_dependency_level_2" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 1d 8 | WINDOW_OFFSET = 0 9 | WINDOW_TRUNCATE_UPTO = d 10 | 11 | [LOAD] 12 | LOAD_METHOD="MERGE" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/delete/query.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM `__destination_table__` WHERE load_timestamp = "2020-06-02 02:00:41.634266 UTC" 2 | -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/dml/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="test_booking_count_dml" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 1d 8 | WINDOW_OFFSET = 0 9 | WINDOW_TRUNCATE_UPTO = d 10 | 11 | [LOAD] 12 | LOAD_METHOD="MERGE" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/dml/query.sql: -------------------------------------------------------------------------------- 1 | MERGE `destination_table` S 2 | using 3 | ( 4 | select count(1) as count, date(booking_creation_time) as date 5 | from `g-project.playground.booking_log` 6 | where date(booking_creation_time) >= 'dstart' and date(booking_creation_time) < 'dend' 7 | group by date 8 | ) N 9 | on S.date = N.date 10 | WHEN MATCHED then 11 | UPDATE SET `count` = N.count 12 | when not matched then 13 | INSERT (`date`, `count`) VALUES(N.date, N.count) 14 | -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/drop/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="sample_select_copy" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 1d 8 | WINDOW_OFFSET = 0 9 | WINDOW_TRUNCATE_UPTO = d 10 | TIMEZONE="UTC" 11 | 12 | [LOAD] 13 | LOAD_METHOD="MERGE" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/drop/query.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE `__destination_table__` -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/legacy/not_use_spillover/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="test_booking_count" 5 | 6 | [TRANSFORMATION] 7 | TASK_WINDOW="DAILY" 8 | USE_SPILLOVER="FALSE" 9 | CONCURRENCY=1 10 | 11 | [LOAD] 12 | LOAD_METHOD="REPLACE" 
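For context, a small illustrative sketch (not a file in this repository) of how TaskConfigFromFile in bumblebee/config.py flattens a properties.cfg like the one above: every section's keys are merged into a single dictionary and the surrounding double quotes are stripped when a value is read:

import configparser

raw_properties = """
[DESTINATION]
PROJECT="g-project"
DATASET="playground"
TABLE="test_booking_count"

[LOAD]
LOAD_METHOD="REPLACE"
"""

config = configparser.ConfigParser(allow_no_value=True)
config.optionxform = str  # keep keys case-sensitive, exactly as config.py does
config.read_string(raw_properties)

properties = {}
for section in config.sections():
    for key in config[section]:
        properties[key] = config[section][key]

print(properties["LOAD_METHOD"].strip('"'))  # -> REPLACE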
-------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/legacy/not_use_spillover/query.sql: -------------------------------------------------------------------------------- 1 | select count(1) as count, date(booking_creation_time) as date 2 | from `g-project.playground.booking_log` 3 | where date(booking_creation_time) >= 'dstart' and date(booking_creation_time) < 'dend' 4 | group by date -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/legacy/not_use_spillover/spillover_date.sql: -------------------------------------------------------------------------------- 1 | SELECT 'dstart' -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/legacy/use_spillover/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="test_booking_count" 5 | 6 | [TRANSFORMATION] 7 | TASK_WINDOW="DAILY" 8 | TIMEZONE="Asia/Jakarta" 9 | USE_SPILLOVER="TRUE" 10 | CONCURRENCY=5 11 | 12 | [LOAD] 13 | LOAD_METHOD="REPLACE" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/legacy/use_spillover/query.sql: -------------------------------------------------------------------------------- 1 | select count(1) as count, date(booking_creation_time) as date 2 | from `g-project.playground.booking_log` 3 | where date(booking_creation_time) >= 'dstart' and date(booking_creation_time) < 'dend' 4 | group by date -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/legacy/use_spillover/spillover_date.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | DISTINCT DATE(_partitiontime) 3 | FROM 4 | `g-project.integration.fd_booking_all` a 5 | WHERE 6 | _partitiontime >=TIMESTAMP(DATE_SUB(current_date,INTERVAL 30 day)) 7 | AND _partitiontime < TIMESTAMP(current_date('Asia/Jakarta')) 8 | AND DATE(load_time,'Asia/Jakarta') = 'dstart' 9 | order by 1 -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/non_partitioned_append/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="sample_select" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 1d 8 | WINDOW_OFFSET = 0 9 | WINDOW_TRUNCATE_UPTO = d 10 | TIMEZONE="UTC" 11 | 12 | [LOAD] 13 | LOAD_METHOD="APPEND" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/non_partitioned_append/query.sql: -------------------------------------------------------------------------------- 1 | select 2 | "beerus" as hakai, 3 | "naruto" as rasengan, 4 | EXTRACT(DAY FROM CURRENT_TIMESTAMP()) + 8000 as `over`, 5 | CAST("__execution_time__" AS TIMESTAMP) as `load_timestamp`; -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/partition_append/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="sample_select_partitioned" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 3d 8 | WINDOW_OFFSET = 0 9 | WINDOW_TRUNCATE_UPTO = h 10 | 
TIMEZONE="UTC" 11 | 12 | [LOAD] 13 | LOAD_METHOD="APPEND" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/partition_append/query.sql: -------------------------------------------------------------------------------- 1 | SELECT hakai, rasengan, `over`, load_timestamp as event_timestamp 2 | FROM `g-project.playground.sample_select` 3 | WHERE CAST(load_timestamp AS DATETIME) >= CAST('dstart' AS DATETIME) and CAST(load_timestamp AS DATETIME) < CAST('dend' AS DATETIME) -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/partition_by_column/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="test_booking_count_column_partitioned" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 1d 8 | WINDOW_OFFSET = 0 9 | WINDOW_TRUNCATE_UPTO = d 10 | TIMEZONE="Asia/Jakarta" 11 | 12 | [LOAD] 13 | LOAD_METHOD="REPLACE" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/partition_by_column/query.sql: -------------------------------------------------------------------------------- 1 | select count(1) as count, date(booking_creation_time) as date 2 | from `g-project.playground.booking_log` 3 | where date(booking_creation_time) >= 'dstart' and date(booking_creation_time) < 'dend' 4 | group by date -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/partition_by_column_load_timestamp/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="sample_select_partitioned" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 2d 8 | WINDOW_OFFSET = 1d 9 | WINDOW_TRUNCATE_UPTO = d 10 | USE_SPILLOVER=true 11 | TIMEZONE="UTC" 12 | 13 | [LOAD] 14 | LOAD_METHOD="REPLACE" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/partition_by_column_load_timestamp/query.sql: -------------------------------------------------------------------------------- 1 | SELECT hakai, rasengan, `over`, load_timestamp as event_timestamp 2 | FROM `g-project.playground.sample_select` 3 | WHERE DATE(load_timestamp) >= 'dstart' and date(load_timestamp) < 'dend' -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/partition_by_ingestiontime/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="test_booking_count" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 1d 8 | WINDOW_OFFSET = 0 9 | WINDOW_TRUNCATE_UPTO = d 10 | TIMEZONE="Asia/Jakarta" 11 | 12 | [LOAD] 13 | LOAD_METHOD="REPLACE" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/partition_by_ingestiontime/query.sql: -------------------------------------------------------------------------------- 1 | select count(1) as count, date(booking_creation_time) as date 2 | from `g-project.playground.booking_log` 3 | where date(booking_creation_time) >= 'dstart' and date(booking_creation_time) < 'dend' 4 | group by date -------------------------------------------------------------------------------- 
/task/bq2bq/executor/samples/tasks/replace_all/basic/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="sample_replace_monthly" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 720h 8 | WINDOW_OFFSET = 0 9 | WINDOW_TRUNCATE_UPTO = M 10 | 11 | [LOAD] 12 | LOAD_METHOD="REPLACE_ALL" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/replace_all/basic/query.sql: -------------------------------------------------------------------------------- 1 | select 2 | `hakai`, 3 | `rasengan`, 4 | `over`, 5 | TIMESTAMP ('2021-09-01T01:02:03') as `event_timestamp` 6 | from 7 | `g-project.playground.sample_select` 8 | WHERE 9 | DATE(`load_timestamp`) >= DATE('2021-09-01') 10 | AND DATE(`load_timestamp`) < DATE('2021-09-30') -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/replace_merge/auto/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="sample_replace" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 1h 8 | WINDOW_OFFSET = 0 9 | WINDOW_TRUNCATE_UPTO = d 10 | 11 | [LOAD] 12 | LOAD_METHOD="REPLACE_MERGE" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/replace_merge/auto/query.sql: -------------------------------------------------------------------------------- 1 | select 2 | `hakai`, 3 | `rasengan`, 4 | `over`, 5 | `load_timestamp` as `event_timestamp` 6 | from 7 | `g-project.playground.sample_select` 8 | WHERE 9 | DATE(`load_timestamp`) >= DATE('2020-08-04') 10 | AND DATE(`load_timestamp`) < DATE('2020-08-08') -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/replace_merge/with_filter/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="sample_replace" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 1h 8 | WINDOW_OFFSET = 0 9 | WINDOW_TRUNCATE_UPTO = d 10 | 11 | [LOAD] 12 | LOAD_METHOD="REPLACE_MERGE" 13 | PARTITION_FILTER="DATE(`event_timestamp`) >= DATE('2020-08-04') 14 | AND DATE(`event_timestamp`) < DATE('2020-08-08')" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/replace_merge/with_filter/query.sql: -------------------------------------------------------------------------------- 1 | select 2 | `hakai`, 3 | `rasengan`, 4 | `over`, 5 | `load_timestamp` as `event_timestamp` 6 | from 7 | `g-project.playground.sample_select` 8 | WHERE 9 | DATE(`load_timestamp`) >= DATE('2020-08-04') 10 | AND DATE(`load_timestamp`) < DATE('2020-08-08') -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/select/federated_table/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="gsheet_log" 5 | SQL_TYPE="STANDARD" 6 | 7 | [TRANSFORMATION] 8 | WINDOW_SIZE= 48h 9 | WINDOW_OFFSET= 24h 10 | WINDOW_TRUNCATE_UPTO = d 11 | TIMEZONE="UTC" 12 | 13 | [LOAD] 14 | LOAD_METHOD="REPLACE" 15 | -------------------------------------------------------------------------------- 
/task/bq2bq/executor/samples/tasks/select/federated_table/query.sql: -------------------------------------------------------------------------------- 1 | CREATE TEMP FUNCTION standardRule(fieldContent STRING, rules ARRAY) 2 | RETURNS STRING 3 | LANGUAGE js AS """ 4 | return standardizedRule(fieldContent, rules) 5 | """ 6 | OPTIONS (library="gs://bi_playground_eoch7goo/project/bq_lib/standardizedRule.js"); 7 | 8 | WITH 9 | dedup_source AS ( 10 | SELECT DISTINCT 11 | method, 12 | type, 13 | accuracy, 14 | total_sample 15 | FROM 16 | `g-project.playground.gsheet_log` 17 | ) 18 | 19 | SELECT 20 | standardRule(method, ['cleanup']) AS method_name, 21 | standardRule(type, ['cleanup']) AS type, 22 | accuracy, 23 | CAST(total_sample AS NUMERIC) AS total_sample_count 24 | FROM dedup_source 25 | -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/select/script/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="sample_select_dependency_level_1" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 1d 8 | WINDOW_OFFSET = 0 9 | WINDOW_TRUNCATE_UPTO = d 10 | TIMEZONE="UTC" 11 | 12 | [LOAD] 13 | LOAD_METHOD="MERGE" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/select/script/query.sql: -------------------------------------------------------------------------------- 1 | DECLARE power INT64; 2 | SET power = 9001; 3 | 4 | WITH simple_sel as ( 5 | SELECT * from `g-project.playground.sample_select` 6 | WHERE `over` = 9001 7 | ) 8 | select * from simple_sel; -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/select/without_dependency/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="sample_select" 5 | 6 | [TRANSFORMATION] 7 | WINDOW_SIZE = 1d 8 | WINDOW_OFFSET = 0 9 | WINDOW_TRUNCATE_UPTO = d 10 | TIMEZONE="UTC" 11 | 12 | [LOAD] 13 | LOAD_METHOD="APPEND" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/select/without_dependency/query.sql: -------------------------------------------------------------------------------- 1 | select 2 | "beerus" as hakai, 3 | "naruto" as rasengan, 4 | 9001 as `over`, 5 | CAST("__execution_time__" AS TIMESTAMP) as `load_timestamp`; -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/weekly_partitioned/properties.cfg: -------------------------------------------------------------------------------- 1 | [DESTINATION] 2 | PROJECT="g-project" 3 | DATASET="playground" 4 | TABLE="count_weekly" 5 | SQL_TYPE="STANDARD" 6 | 7 | [TRANSFORMATION] 8 | WINDOW_SIZE = 168h 9 | WINDOW_OFFSET = -24h 10 | WINDOW_TRUNCATE_UPTO = d 11 | USE_SPILLOVER = false 12 | TIMEZONE="UTC" 13 | 14 | [LOAD] 15 | LOAD_METHOD="REPLACE" -------------------------------------------------------------------------------- /task/bq2bq/executor/samples/tasks/weekly_partitioned/query.sql: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT 2 | DATE_TRUNC(DATE(created_timestamp,'Asia/Jakarta'), WEEK(MONDAY)) AS week_start_date, 3 | CAST(count(order_no) AS NUMERIC) as order_count, 4 | CURRENT_TIMESTAMP() AS load_timestamp, 5 
| TIMESTAMP("__execution_time__") AS last_modified_timestamp 6 | FROM 7 | `g-project.playground.twomonths_data` 8 | WHERE 9 | DATE(created_timestamp,'Asia/Jakarta') >= DATE('dstart') 10 | AND DATE(created_timestamp,'Asia/Jakarta') < DATE('dend') 11 | AND LOWER(latest_status_name) = 'completed' 12 | GROUP BY 13 | 1 -------------------------------------------------------------------------------- /task/bq2bq/executor/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | with open("./bumblebee/version.py") as fp: 5 | exec(fp.read()) 6 | 7 | 8 | requirements = [] 9 | with open('requirements.txt') as f: 10 | requirements = f.read().splitlines() 11 | 12 | if __name__ == "__main__": 13 | setup( 14 | name="bumblebee", 15 | version=VERSION, 16 | author="ODPF", 17 | author_email="thekushsharma@gmail.com", 18 | description="BigQuery to BigQuery Transformation client", 19 | packages=find_packages(), 20 | install_requires=requirements, 21 | test_suite='nose.collector', 22 | tests_require=['nose','coverage'] 23 | ) -------------------------------------------------------------------------------- /task/bq2bq/executor/tests/sample_config/in/query.sql: -------------------------------------------------------------------------------- 1 | select 2 | "beerus" as hakai, 3 | "naruto" as rasengan, 4 | EXTRACT(DAY FROM CURRENT_TIMESTAMP()) + 8000 as `over`, 5 | CAST("__execution_time__" AS TIMESTAMP) as `load_timestamp`; 6 | -------------------------------------------------------------------------------- /task/bq2bq/executor/tests/test_config.py: -------------------------------------------------------------------------------- 1 | 2 | import pytz 3 | from pytz.exceptions import UnknownTimeZoneError 4 | 5 | from unittest import TestCase 6 | from unittest.mock import MagicMock,call 7 | 8 | from datetime import datetime 9 | import iso8601 10 | from bumblebee.config import LoadMethod, TaskFiles, AppConfig, TaskConfigFromEnv, parse_date 11 | from bumblebee.datehelper import localise_datetime 12 | from bumblebee.filesystem import FileSystem 13 | 14 | from google.cloud.bigquery.job import WriteDisposition 15 | import os 16 | 17 | 18 | class TestAppConfig(TestCase): 19 | def set_vars_with_default(self, dt): 20 | os.environ['DSTART'] = dt 21 | os.environ['DEND'] = dt 22 | os.environ['EXECUTION_TIME'] = dt 23 | os.environ['DRY_RUN'] = "true" 24 | os.environ['JOB_LABELS'] = "environment=integration,lifecycle=process" 25 | os.environ['JOB_DIR'] = "./tests/sample_config" 26 | os.environ['PROJECT'] = "project-id" 27 | os.environ['DATASET'] = "dataset" 28 | os.environ['TABLE'] = "sample_select" 29 | os.environ['SQL_TYPE'] = "STANDARD" 30 | os.environ['LOAD_METHOD'] = "APPEND" 31 | os.environ['TASK_WINDOW'] = "DAILY" 32 | os.environ['TIMEZONE'] = "UTC" 33 | os.environ['USE_SPILLOVER'] = "false" 34 | os.environ['CONCURRENCY'] = "1" 35 | 36 | def test_should_parse_args(self): 37 | dt = "2020-11-30T19:32:25.680530" 38 | self.set_vars_with_default(dt) 39 | 40 | opts = AppConfig() 41 | self.assertEqual(opts.dry_run, True) 42 | self.assertEqual(opts.dstart, parse_date(dt)) 43 | self.assertEqual(opts.dend, parse_date(dt)) 44 | self.assertEqual(opts.execution_time, parse_date(dt)) 45 | self.assertEqual(opts.job_labels, {"environment": "integration", "lifecycle": "process"}) 46 | self.assertEqual("./tests/sample_config/in/query.sql", opts.sql_file) 47 | self.assertEqual(None, opts.spillover_sql_file) 48 | 49 | def 
test_should_fail_for_invalid_date(self): 50 | dt = "2020-invalid-date" 51 | self.set_vars_with_default(dt) 52 | 53 | with self.assertRaises(iso8601.ParseError) as e: 54 | AppConfig() 55 | self.assertEqual("Unable to parse date string '2020-invalid-date'", str(e.exception)) 56 | 57 | def test_should_fail_for_empty_dstart(self): 58 | dt = "2020-11-30T19:32:25.680530" 59 | self.set_vars_with_default(dt) 60 | os.environ['DSTART'] = "" 61 | 62 | with self.assertRaises(AssertionError) as e: 63 | AppConfig() 64 | self.assertEqual("config 'DSTART' must be provided", str(e.exception)) 65 | 66 | def test_should_fail_for_missing_dstart(self): 67 | dt = "2020-11-30T19:32:25.680530" 68 | self.set_vars_with_default(dt) 69 | del os.environ['DSTART'] 70 | 71 | with self.assertRaises(AssertionError) as e: 72 | AppConfig() 73 | self.assertEqual("config 'DSTART' must be provided", str(e.exception)) 74 | 75 | def test_should_fail_for_missing_label_value(self): 76 | dt = "2020-11-30T19:32:25.680530" 77 | self.set_vars_with_default(dt) 78 | os.environ['JOB_LABELS'] = "environment=integration,lifecycle=" 79 | 80 | with self.assertRaises(AssertionError) as e: 81 | AppConfig() 82 | self.assertEqual("label value cannot be empty in JOB_LABELS", str(e.exception)) 83 | 84 | def test_should_fail_for_missing_label_key_name(self): 85 | dt = "2020-11-30T19:32:25.680530" 86 | self.set_vars_with_default(dt) 87 | os.environ['JOB_LABELS'] = "environment=integration,=some_value" 88 | 89 | with self.assertRaises(AssertionError) as e: 90 | AppConfig() 91 | self.assertEqual("label name cannot be empty in JOB_LABELS", str(e.exception)) 92 | 93 | def test_dry_run_should_be_false_by_default(self): 94 | dt = "2020-11-30T19:32:25.680530" 95 | self.set_vars_with_default(dt) 96 | del os.environ['DRY_RUN'] 97 | 98 | c = AppConfig() 99 | self.assertEqual(False, c.dry_run) 100 | 101 | 102 | class TestConfig(TestCase): 103 | def set_vars_with_default(self, tz: str = "UTC"): 104 | os.environ['PROJECT'] = "project-id" 105 | os.environ['DATASET'] = "dataset" 106 | os.environ['TABLE'] = "sample_select" 107 | os.environ['SQL_TYPE'] = "STANDARD" 108 | os.environ['LOAD_METHOD'] = "APPEND" 109 | os.environ['TASK_WINDOW'] = "DAILY" 110 | os.environ['TIMEZONE'] = tz 111 | os.environ['USE_SPILLOVER'] = "false" 112 | os.environ['CONCURRENCY'] = "1" 113 | 114 | def setUp(self): 115 | self.timezone = pytz.timezone("Asia/Jakarta") 116 | 117 | def test_localise_datetime(self): 118 | tzname = "Asia/Jakarta" 119 | start_time = datetime(2019, 1, 1) 120 | localised_start_time = localise_datetime(start_time, tzname) 121 | 122 | expected_start_time = self.timezone.localize(datetime(year=2019, month=1, day=1)) 123 | self.assertEqual(expected_start_time,localised_start_time) 124 | 125 | def test_localise_datetime_utc(self): 126 | utc = pytz.timezone("UTC") 127 | start_time = utc.localize(datetime(2019, 1, 1, 17)) 128 | 129 | localized_start_time = localise_datetime(start_time, "Asia/Jakarta") 130 | 131 | expected = self.timezone.localize(datetime(year=2019, month=1, day=2)) 132 | self.assertEqual(expected, localized_start_time) 133 | 134 | def test_timezone_config(self): 135 | tz = "Asia/Jakarta" 136 | self.set_vars_with_default(tz) 137 | 138 | task_config = TaskConfigFromEnv() 139 | 140 | self.assertEqual(task_config.timezone, tz) 141 | 142 | def test_invalid_timezone_trigger_exception(self): 143 | tz = "xxwdw" 144 | self.set_vars_with_default(tz) 145 | 146 | with self.assertRaises(UnknownTimeZoneError) as ex: 147 | TaskConfigFromEnv() 148 | 149 | 
self.assertTrue(tz in str(ex.exception)) 150 | 151 | def test_concurrency(self): 152 | self.set_vars_with_default() 153 | os.environ['CONCURRENCY'] = "2" 154 | 155 | config = TaskConfigFromEnv() 156 | 157 | self.assertEqual(config.concurrency, 2) 158 | 159 | def test_concurrency_should_not_zero_exception(self): 160 | self.set_vars_with_default() 161 | os.environ['CONCURRENCY'] = "0" 162 | 163 | with self.assertRaises(Exception) as ex: 164 | TaskConfigFromEnv() 165 | 166 | self.assertTrue('value should be integer and greater than 0' in str(ex.exception)) 167 | 168 | def test_empty_destination_exception(self): 169 | properties = """[DESTINATION] 170 | PROJECT="" 171 | DATASET="" 172 | TABLE="" 173 | SQL_TYPE="STANDARD" #LEGACY/STANDARD 174 | 175 | [TRANSFORMATION] 176 | WINDOW_SIZE="1d" 177 | WINDOW_OFFSET="" 178 | WINDOW_TRUNCATE_UPTO="d" 179 | TIMEZONE="Asia/Jakarta" 180 | USE_SPILLOVER="TRUE" 181 | CONCURRENCY=0 182 | 183 | [LOAD] 184 | LOAD_METHOD="REPLACE" 185 | """ 186 | 187 | self.set_vars_with_default() 188 | os.environ['PROJECT'] = "" 189 | os.environ['DATASET'] = "" 190 | os.environ['TABLE'] = "" 191 | 192 | with self.assertRaises(AssertionError) as ex: 193 | TaskConfigFromEnv() 194 | 195 | self.assertEqual("config 'PROJECT' must be provided", str(ex.exception)) 196 | 197 | def test_allow_field_addition(self): 198 | self.set_vars_with_default() 199 | os.environ['ALLOW_FIELD_ADDITION'] = 'true' 200 | 201 | config = TaskConfigFromEnv() 202 | self.assertEqual(True, config.allow_field_addition) 203 | del os.environ['ALLOW_FIELD_ADDITION'] 204 | 205 | def test_allow_field_addition_should_be_false_by_default(self): 206 | self.set_vars_with_default() 207 | 208 | config = TaskConfigFromEnv() 209 | self.assertEqual(False, config.allow_field_addition) 210 | 211 | 212 | class TestTaskFiles(TestCase): 213 | 214 | def test_task_files_without_spillover_query(self): 215 | fs = FileSystem() 216 | fs.exist = MagicMock(return_value=True) 217 | fs.read = MagicMock(return_value="content") 218 | 219 | query_sql_file = "./booking/query.sql" 220 | files = [query_sql_file] 221 | 222 | task_files = TaskFiles(fs, files) 223 | 224 | fs.exist.assert_has_calls([call(query_sql_file)]) 225 | fs.read.assert_has_calls([call(query_sql_file)]) 226 | 227 | self.assertEqual(task_files.query, "content") 228 | self.assertEqual(task_files.spillover_query, None) 229 | 230 | def test_task_files_with_spillover_query(self): 231 | fs = FileSystem() 232 | fs.exist = MagicMock(return_value=True) 233 | fs.read = MagicMock(return_value="content") 234 | 235 | query_sql_file = "./booking/query.sql" 236 | spillover_sql_file = "./booking/spillover_date.sql" 237 | files = [query_sql_file, spillover_sql_file] 238 | 239 | task_files = TaskFiles(fs, files) 240 | 241 | fs.exist.assert_has_calls([call(query_sql_file)]) 242 | fs.read.assert_has_calls([call(query_sql_file)]) 243 | 244 | self.assertEqual(task_files.query, "content") 245 | self.assertEqual(task_files.spillover_query, "content") 246 | 247 | 248 | class TestLoadMethod(TestCase): 249 | 250 | def test_write_disposition(self): 251 | load_method = LoadMethod.APPEND 252 | expected_write_disposition = WriteDisposition.WRITE_APPEND 253 | self.assertEqual(load_method.write_disposition, expected_write_disposition) 254 | -------------------------------------------------------------------------------- /task/bq2bq/executor/tests/test_query.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from bumblebee.query 
import Query, WindowParameter, ExecutionParameter, DestinationParameter 4 | from bumblebee.window import WindowFactory 5 | from datetime import datetime, timedelta 6 | 7 | 8 | class test_Query(TestCase): 9 | def setUp(self): 10 | self.scheduled_at = datetime(2020, 7, 8, 4) 11 | self.scheduled_next_at = datetime(2020, 7, 9, 4) 12 | 13 | def test_should_replace_dstart_and_dend_with_date(self): 14 | 15 | params = { 16 | '__dstart__': '2019-01-01', 17 | '__dend__': '2019-01-02' 18 | } 19 | 20 | query = Query("select * from table where date => '__dstart__' and date < '__dend__'") 21 | result = query.replace_param(params) 22 | 23 | self.assertEqual("select * from table where date => '2019-01-01' and date < '2019-01-02'",result) 24 | 25 | def test_should_replace_destination_table_and_execution_date(self): 26 | window_parameter = WindowParameter(WindowFactory.create_window_with_time(self.scheduled_at, self.scheduled_at + timedelta(days=1))) 27 | execution_parameter = ExecutionParameter(self.scheduled_at) 28 | destination_parameter = DestinationParameter("table") 29 | 30 | query = Query("select * from `__destination_table__` where date => '__execution_time__' and date < '__dend__'") 31 | result = query.apply_parameter(window_parameter).apply_parameter(execution_parameter)\ 32 | .apply_parameter(destination_parameter) 33 | 34 | self.assertEqual("select * from `table` where date => '2020-07-08T04:00:00.000000' and date < '2020-07-09'", result) 35 | 36 | def test_apply_window(self): 37 | start_time = datetime(2019,1,1) 38 | window_parameter = WindowParameter(WindowFactory.create_window_with_time(start_time, start_time + timedelta(days=1))) 39 | 40 | query = Query("select * from table where date => '__dstart__' and date < '__dend__'") 41 | result = query.apply_parameter(window_parameter) 42 | self.assertEqual("select * from table where date => '2019-01-01' and date < '2019-01-02'", result) 43 | 44 | query = Query("select * from table where date => '__dstart__' and date < '__dend__'") 45 | result = query.apply_parameter(window_parameter) 46 | self.assertEqual("select * from table where date => '2019-01-01' and date < '2019-01-02'", result) 47 | 48 | def test_valid_hour_size_in_window_parameter(self): 49 | window = WindowFactory.create_window(self.scheduled_next_at, "2h", "0", "h") 50 | window_parameter = WindowParameter(window) 51 | 52 | query = Query("select * from table where date => '__dstart__' and date < '__dend__' and tt < '__dstart__'") 53 | result = query.replace_param(window_parameter) 54 | 55 | self.assertEqual("select * from table where date => '2020-07-09 02:00:00' and date < '2020-07-09 04:00:00' and tt < '2020-07-09 02:00:00'",result) 56 | 57 | def test_dend_should_not_be_replaced_inside_the_word(self): 58 | window = WindowFactory.create_window(self.scheduled_next_at, "2h", "0", "h") 59 | window_parameter = WindowParameter(window) 60 | 61 | query = Query("select * from table where date => adstarta && event > __execution_time__") 62 | result = query.replace_param(window_parameter) 63 | 64 | self.assertEqual("select * from table where date => adstarta && event > __execution_time__",result) 65 | -------------------------------------------------------------------------------- /task/bq2bq/executor/tests/test_transformation.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, mock 2 | from unittest.mock import MagicMock, PropertyMock, call 3 | 4 | from bumblebee.transformation import * 5 | from bumblebee.config import 
TaskConfigFromEnv 6 | 7 | from google.cloud.bigquery.job import WriteDisposition 8 | from google.cloud.bigquery.table import TimePartitioningType 9 | import os 10 | 11 | 12 | def set_vars_with_default( 13 | tz="UTC", 14 | project="bq_project", 15 | dataset="playground_dev", 16 | table="abcd", 17 | load_method="REPLACE", 18 | ): 19 | os.environ['PROJECT'] = project 20 | os.environ['DATASET'] = dataset 21 | os.environ['TABLE'] = table 22 | os.environ['SQL_TYPE'] = "STANDARD" 23 | os.environ['LOAD_METHOD'] = load_method 24 | os.environ['TASK_WINDOW'] = "DAILY" 25 | os.environ['TIMEZONE'] = tz 26 | os.environ['USE_SPILLOVER'] = "false" 27 | os.environ['CONCURRENCY'] = "1" 28 | 29 | 30 | class TestTransformationTask(TestCase): 31 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 32 | def test_partition_transform_execute(self, BigqueryServiceMock): 33 | query = """select count(1) from table where date >= '__dstart__' and date < '__dend__' 34 | """ 35 | 36 | set_vars_with_default() 37 | task_config = TaskConfigFromEnv() 38 | localized_end_time = localise_datetime(datetime(2019, 1, 2), task_config.timezone) 39 | localized_execution_time = localise_datetime(datetime(2019, 1, 1), task_config.timezone) 40 | 41 | bigquery_service = BigqueryServiceMock() 42 | window = WindowFactory.create_window_with_time(localized_end_time - timedelta(days=1), localized_end_time) 43 | loader = PartitionLoader(bigquery_service, task_config.destination_table, 44 | task_config.load_method, window.start) 45 | task = PartitionTransformation(task_config, loader, query, window, False, localized_execution_time) 46 | task.execute() 47 | 48 | final_query = """select count(1) from table where date >= '2019-01-01' and date < '2019-01-02' 49 | """ 50 | 51 | bigquery_service.transform_load.assert_called_with(query=final_query, 52 | write_disposition=WriteDisposition.WRITE_TRUNCATE, 53 | destination_table="bq_project.playground_dev.abcd$20190101", 54 | allow_field_addition=False) 55 | 56 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 57 | def test_table_transform(self, BigqueryServiceMock): 58 | query = """select count(1) from table where date >= '__dstart__' and date < '__dend__' 59 | """ 60 | 61 | properties = """[DESTINATION] 62 | PROJECT="bq_project" 63 | DATASET="playground_dev" 64 | TABLE="abcd" 65 | SQL_TYPE="STANDARD" #LEGACY/STANDARD 66 | 67 | [TRANSFORMATION] 68 | WINDOW_SIZE = 1d 69 | WINDOW_OFFSET = 1d 70 | WINDOW_TRUNCATE_UPTO = d 71 | TIMEZONE="UTC" 72 | 73 | [LOAD] 74 | LOAD_METHOD="REPLACE" 75 | """ 76 | set_vars_with_default() 77 | 78 | task_config = TaskConfigFromEnv() 79 | localized_start_time = localise_datetime(datetime(2019, 1, 2), task_config.timezone) 80 | localized_end_time = localise_datetime(datetime(2019, 1, 3), task_config.timezone) 81 | localized_execution_time = localise_datetime(datetime(2019, 1, 1), task_config.timezone) 82 | 83 | bigquery_service = BigqueryServiceMock() 84 | task = TableTransformation(bigquery_service, task_config, query, localized_start_time, localized_end_time, False, localized_execution_time) 85 | task.transform() 86 | 87 | final_query = """select count(1) from table where date >= '2019-01-02' and date < '2019-01-03' 88 | """ 89 | 90 | bigquery_service.transform_load.assert_called_with(query=final_query, 91 | write_disposition=WriteDisposition.WRITE_TRUNCATE, 92 | destination_table="bq_project.playground_dev.abcd", 93 | allow_field_addition=False) 94 | 95 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 96 | def 
test_single_partition_transform_1d_window_0_offset_without_spillover(self, BigqueryServiceMock): 97 | query = """select count(1) from table where date >= '__dstart__' and date < '__dend__' 98 | """ 99 | set_vars_with_default() 100 | task_config = TaskConfigFromEnv() 101 | localized_start_time = localise_datetime(datetime(2019, 1, 1), task_config.timezone) 102 | localized_end_time = localise_datetime(datetime(2019, 1, 2), task_config.timezone) 103 | localized_execution_time = localise_datetime(datetime(2019, 1, 1), task_config.timezone) 104 | 105 | bigquery_service = BigqueryServiceMock() 106 | task = SinglePartitionTransformation(bigquery_service, task_config, query, localized_start_time, 107 | localized_end_time, False, localized_execution_time) 108 | task.transform() 109 | 110 | final_query = """select count(1) from table where date >= '2019-01-01' and date < '2019-01-02' 111 | """ 112 | 113 | bigquery_service.transform_load.assert_called_with(query=final_query, 114 | write_disposition=WriteDisposition.WRITE_TRUNCATE, 115 | destination_table="bq_project.playground_dev.abcd$20190101", 116 | allow_field_addition=False) 117 | 118 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 119 | def test_single_partition_transform_2d_window_24h_offset_without_spillover(self, BigqueryServiceMock): 120 | query = """select count(1) from table where date >= '__dstart__' and date < '__dend__'""" 121 | 122 | set_vars_with_default() 123 | task_config = TaskConfigFromEnv() 124 | localized_start_time = localise_datetime(datetime(2019, 1, 4), task_config.timezone) 125 | localized_end_time = localise_datetime(datetime(2019, 1, 6), task_config.timezone) 126 | localized_execution_time = localise_datetime(datetime(2019, 1, 4), task_config.timezone) 127 | 128 | bigquery_service = BigqueryServiceMock() 129 | task = SinglePartitionTransformation(bigquery_service, task_config, query, localized_start_time, 130 | localized_end_time, False, localized_execution_time) 131 | task.transform() 132 | 133 | final_query = """select count(1) from table where date >= '2019-01-04' and date < '2019-01-06'""" 134 | 135 | bigquery_service.transform_load.assert_called_with(query=final_query, 136 | write_disposition=WriteDisposition.WRITE_TRUNCATE, 137 | destination_table="bq_project.playground_dev.abcd$20190104", 138 | allow_field_addition=False) 139 | 140 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 141 | def test_single_partition_transform_7d_window_without_spillover(self, BigqueryServiceMock): 142 | query = """select count(1) from table where date >= '__dstart__' and date < '__dend__'""" 143 | 144 | properties = """ 145 | [DESTINATION] 146 | PROJECT="bq_project" 147 | DATASET="playground_dev" 148 | TABLE="abcd" 149 | SQL_TYPE="STANDARD" #LEGACY/STANDARD 150 | 151 | [TRANSFORMATION] 152 | WINDOW_SIZE = 7d 153 | WINDOW_OFFSET = 0 154 | WINDOW_TRUNCATE_UPTO = d 155 | TIMEZONE="UTC" 156 | 157 | [LOAD] 158 | LOAD_METHOD="REPLACE" 159 | """ 160 | 161 | set_vars_with_default() 162 | task_config = TaskConfigFromEnv() 163 | localized_start_time = localise_datetime(datetime(2019, 1, 3), task_config.timezone) 164 | localized_end_time = localise_datetime(datetime(2019, 1, 10), task_config.timezone) 165 | localized_execution_time = localise_datetime(datetime(2019, 1, 3), task_config.timezone) 166 | 167 | bigquery_service = BigqueryServiceMock() 168 | task = SinglePartitionTransformation(bigquery_service, task_config, query, localized_start_time, 169 | localized_end_time, False, localized_execution_time) 170 | 
task.transform() 171 | 172 | final_query = """select count(1) from table where date >= '2019-01-03' and date < '2019-01-10'""" 173 | 174 | bigquery_service.transform_load.assert_called_with(query=final_query, 175 | write_disposition=WriteDisposition.WRITE_TRUNCATE, 176 | destination_table="bq_project.playground_dev.abcd$20190103", 177 | allow_field_addition=False) 178 | 179 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 180 | def test_single_partition_transform_2d_with_spillover(self, BigqueryServiceMock): 181 | query = "select count(1) from table where date >= '2019-01-03' and date < '2019-01-04'\n" + OPTIMUS_QUERY_BREAK_MARKER+ "\n" 182 | query += "select count(1) from table where date >= '2019-01-04' and date < '2019-01-05'" 183 | 184 | set_vars_with_default() 185 | task_config = TaskConfigFromEnv() 186 | localized_start_time = localise_datetime(datetime(2019, 1, 3), task_config.timezone) 187 | localized_end_time = localise_datetime(datetime(2019, 1, 5), task_config.timezone) 188 | localized_execution_time = localise_datetime(datetime(2019, 1, 5), task_config.timezone) 189 | 190 | bigquery_service = BigqueryServiceMock() 191 | task = MultiPartitionTransformation(bigquery_service, task_config, query, localized_start_time, 192 | localized_end_time, False, localized_execution_time, timedelta(days=1), 1) 193 | task.transform() 194 | 195 | final_query_1 = """select count(1) from table where date >= '2019-01-03' and date < '2019-01-04'\n""" 196 | final_query_2 = """\nselect count(1) from table where date >= '2019-01-04' and date < '2019-01-05'""" 197 | 198 | calls = [call(query=final_query_1, write_disposition=WriteDisposition.WRITE_TRUNCATE, 199 | destination_table="bq_project.playground_dev.abcd$20190103", 200 | allow_field_addition=False), 201 | call(query=final_query_2, write_disposition=WriteDisposition.WRITE_TRUNCATE, 202 | destination_table="bq_project.playground_dev.abcd$20190104", 203 | allow_field_addition=False)] 204 | bigquery_service.transform_load.assert_has_calls(calls, any_order=True) 205 | self.assertEqual(len(bigquery_service.transform_load.call_args_list), len(calls)) 206 | 207 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 208 | def test_dml_transform(self, BigqueryServiceMock): 209 | query = """select count(1) from table where date >= '__dstart__' and date < '__dend__'""" 210 | 211 | set_vars_with_default() 212 | task_config = TaskConfigFromEnv() 213 | localized_start_time = localise_datetime(datetime(2019, 1, 2), task_config.timezone) 214 | localized_end_time = localise_datetime(datetime(2019, 1, 3), task_config.timezone) 215 | localized_execution_time = localise_datetime(datetime(2019, 1, 1), task_config.timezone) 216 | 217 | bigquery_service = BigqueryServiceMock() 218 | task = DMLBasedTransformation(bigquery_service, task_config, query, localized_start_time, localized_end_time, False, 219 | localized_execution_time) 220 | task.execute() 221 | 222 | final_query = """select count(1) from table where date >= '2019-01-02' and date < '2019-01-03'""" 223 | bigquery_service.execute_query.assert_called_with(final_query) 224 | 225 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 226 | def test_execute_dry_run(self, BigqueryServiceMock): 227 | query = """select count(1) from table where date >= '__dstart__' and date < '__dend__'""" 228 | 229 | set_vars_with_default() 230 | task_config = TaskConfigFromEnv() 231 | localized_start_time = localise_datetime(datetime(2019, 1, 1), task_config.timezone) 232 | localized_end_time = 
localise_datetime(datetime(2019, 1, 1), task_config.timezone) 233 | localized_execution_time = localise_datetime(datetime(2019, 1, 1), task_config.timezone) 234 | 235 | bigquery_service = BigqueryServiceMock() 236 | dry_run = True 237 | task = TableTransformation(bigquery_service, task_config, query, localized_start_time, 238 | localized_end_time, dry_run, localized_execution_time) 239 | task.transform() 240 | bigquery_service.transform_load.assert_not_called() 241 | 242 | 243 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 244 | def test_allow_field_addition(self, BigqueryServiceMock): 245 | query = """select count(1) from table where date >= '__dstart__' and date < '__dend__'""" 246 | 247 | set_vars_with_default() 248 | os.environ['ALLOW_FIELD_ADDITION'] = 'true' 249 | task_config = TaskConfigFromEnv() 250 | del os.environ['ALLOW_FIELD_ADDITION'] 251 | 252 | localized_start_time = localise_datetime(datetime(2019, 1, 1), task_config.timezone) 253 | localized_end_time = localise_datetime(datetime(2019, 1, 2), task_config.timezone) 254 | localized_execution_time = localise_datetime(datetime(2019, 1, 1), task_config.timezone) 255 | 256 | bigquery_service = BigqueryServiceMock() 257 | task = TableTransformation(bigquery_service, task_config, query, localized_start_time, 258 | localized_end_time, False, localized_execution_time) 259 | task.transform() 260 | 261 | final_query = """select count(1) from table where date >= '2019-01-01' and date < '2019-01-02'""" 262 | bigquery_service.transform_load.assert_called_with(query=final_query, 263 | write_disposition=WriteDisposition.WRITE_TRUNCATE, 264 | destination_table="bq_project.playground_dev.abcd", 265 | allow_field_addition=True) 266 | 267 | 268 | class TestTransformation(TestCase): 269 | 270 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 271 | def test_should_run_dml_merge_statements(self, BigqueryServiceMock): 272 | query = """select count(1) from table where date >= '__dstart__' and date < '__dend__'""" 273 | 274 | set_vars_with_default(load_method="MERGE") 275 | task_config = TaskConfigFromEnv() 276 | 277 | end_time = datetime(2019, 2, 2) 278 | execution_time = datetime(2019, 2, 2) 279 | 280 | bigquery_service = BigqueryServiceMock() 281 | transformation = Transformation(bigquery_service, 282 | task_config, 283 | query, 284 | None, 285 | end_time - timedelta(days=1), 286 | end_time, 287 | execution_time, 288 | False) 289 | transformation.transform() 290 | 291 | final_query = """select count(1) from table where date >= '2019-02-01' and date < '2019-02-02'""" 292 | bigquery_service.execute_query.assert_called_with(final_query) 293 | 294 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 295 | def test_should_run_table_task(self, BigqueryServiceMock): 296 | query = """select count(1) from table where date >= '__dstart__' and date < '__dend__'""" 297 | 298 | set_vars_with_default(load_method="APPEND") 299 | task_config = TaskConfigFromEnv() 300 | 301 | end_time = datetime(2019, 2, 2) 302 | execution_time = datetime(2019, 2, 2) 303 | 304 | bigquery_service = BigqueryServiceMock() 305 | 306 | def get_table_mock(table_name): 307 | if table_name == 'bq_project.playground_dev.abcd': 308 | table_mock = MagicMock() 309 | type(table_mock).time_partitioning = None 310 | return table_mock 311 | 312 | bigquery_service.get_table = MagicMock(side_effect=get_table_mock) 313 | 314 | transformation = Transformation(bigquery_service, 315 | task_config, 316 | query, 317 | None, 318 | end_time - timedelta(days=1), 319 | end_time, 
320 | execution_time, 321 | False) 322 | transformation.transform() 323 | 324 | final_query = """select count(1) from table where date >= '2019-02-01' and date < '2019-02-02'""" 325 | bigquery_service.transform_load.assert_called_with(query=final_query, 326 | write_disposition=WriteDisposition.WRITE_APPEND, 327 | destination_table="bq_project.playground_dev.abcd", 328 | allow_field_addition=False) 329 | 330 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 331 | def test_table_transform_with_merge_load_method_and_non_partitioned_destination(self, BigqueryServiceMock): 332 | query = "select count(1) from table where date >= '2019-01-03' and date < '2019-01-04'\n" + OPTIMUS_QUERY_BREAK_MARKER + "\n" 333 | query += "select count(1) from table where date >= '2019-01-04' and date < '2019-01-05'" 334 | 335 | set_vars_with_default() 336 | task_config = TaskConfigFromEnv() 337 | 338 | end_time = datetime(2019, 1, 5) 339 | execution_time = datetime(2019, 2, 2) 340 | 341 | bigquery_service = BigqueryServiceMock() 342 | 343 | def get_table_mock(table_name): 344 | if table_name == 'bq_project.playground_dev.abcd': 345 | table_mock = MagicMock() 346 | type(table_mock).time_partitioning = None 347 | return table_mock 348 | 349 | bigquery_service.get_table = MagicMock(side_effect=get_table_mock) 350 | 351 | transformation = Transformation(bigquery_service, 352 | task_config, 353 | query, 354 | None, 355 | end_time - timedelta(days=2), 356 | end_time, 357 | execution_time, 358 | False) 359 | transformation.transform() 360 | 361 | final_query = """select count(1) from table where date >= '2019-01-03' and date < '2019-01-04'\n""" 362 | bigquery_service.transform_load.assert_called_with(query=final_query, 363 | write_disposition=WriteDisposition.WRITE_TRUNCATE, 364 | destination_table="bq_project.playground_dev.abcd", 365 | allow_field_addition=False) 366 | 367 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 368 | def test_should_run_partition_task_on_field(self, BigqueryServiceMock): 369 | query = """select count(1) from table where date >= '__dstart__' and date < '__dend__'""" 370 | 371 | set_vars_with_default(load_method="REPLACE_MERGE") 372 | task_config = TaskConfigFromEnv() 373 | 374 | end_time = datetime(2019, 2, 2) 375 | execution_time = datetime(2019, 2, 2) 376 | 377 | bigquery_service = BigqueryServiceMock() 378 | 379 | def get_table_mock(table_name): 380 | if table_name == 'bq_project.playground_dev.abcd': 381 | time_partitioning = MagicMock() 382 | type(time_partitioning).field = PropertyMock(return_value="event_timestamp") 383 | type(time_partitioning).field_type = PropertyMock(return_value="TIMESTAMP") 384 | type(time_partitioning).type_ = PropertyMock(return_value=TimePartitioningType.DAY) 385 | 386 | table_mock = MagicMock() 387 | type(table_mock).time_partitioning = PropertyMock(return_value=time_partitioning) 388 | type(table_mock).partitioning_type = "DAY" 389 | return table_mock 390 | 391 | bigquery_service.get_table = MagicMock(side_effect=get_table_mock) 392 | 393 | transformation = Transformation(bigquery_service, 394 | task_config, 395 | query, 396 | None, 397 | end_time - timedelta(days=1), 398 | end_time, 399 | execution_time, 400 | False) 401 | transformation.transform() 402 | 403 | final_query = """-- Optimus generated\nDECLARE partitions ARRAY;\n\n\n\nCREATE TEMP TABLE `opt__partitions` AS (\n select count(1) from table where date >= '__dstart__' and date < '__dend__'\n);\n\nSET (partitions) = (\n SELECT AS STRUCT\n array_agg(DISTINCT 
DATE(`event_timestamp`))\n FROM opt__partitions\n);\n\nMERGE INTO\n `bq_project.playground_dev.abcd` AS target\nUSING\n (\n Select * from `opt__partitions`\n ) AS source\nON FALSE\nWHEN NOT MATCHED BY SOURCE AND DATE(`event_timestamp`) IN UNNEST(partitions)\nTHEN DELETE\nWHEN NOT MATCHED THEN INSERT\n (\n \n )\nVALUES\n (\n \n );\n""" 404 | bigquery_service.execute_query.assert_called_with(final_query) 405 | 406 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 407 | def test_should_fail_if_partition_task_for_ingestion_time_without_filter_in_REPLACE_MERGE(self, BigqueryServiceMock): 408 | query = """select count(1) from table where date >= '__dstart__' and date < '__dend__'""" 409 | 410 | set_vars_with_default(load_method="REPLACE_MERGE") 411 | task_config = TaskConfigFromEnv() 412 | 413 | end_time = datetime(2019, 2, 2) 414 | execution_time = datetime(2019, 2, 2) 415 | 416 | bigquery_service = BigqueryServiceMock() 417 | 418 | def get_table_mock(table_name): 419 | if table_name == 'bq_project.playground_dev.abcd': 420 | time_partitioning = MagicMock() 421 | type(time_partitioning).field = PropertyMock(return_value=None) 422 | type(time_partitioning).type_ = PropertyMock(return_value=TimePartitioningType.DAY) 423 | 424 | table_mock = MagicMock() 425 | type(table_mock).time_partitioning = PropertyMock(return_value=time_partitioning) 426 | type(table_mock).partitioning_type = "DAY" 427 | return table_mock 428 | 429 | bigquery_service.get_table = MagicMock(side_effect=get_table_mock) 430 | transformation = Transformation(bigquery_service, 431 | task_config, 432 | query, 433 | None, 434 | end_time - timedelta(days=1), 435 | end_time, 436 | execution_time, 437 | False) 438 | with self.assertRaises(Exception) as ex: 439 | transformation.transform() 440 | self.assertTrue("partition filter is required" in str(ex.exception)) 441 | 442 | @mock.patch("bumblebee.bigquery_service.BigqueryService") 443 | def test_execute_table_not_found_raise_exception(self, BigqueryServiceMock): 444 | query = """select count(1) from table where date >= '__dstart__' and date < '__dend__'""" 445 | 446 | set_vars_with_default() 447 | task_config = TaskConfigFromEnv() 448 | 449 | end_time = datetime(2019, 2, 2) 450 | execution_time = datetime(2019, 2, 2) 451 | 452 | bigquery_service = BigqueryServiceMock() 453 | 454 | def get_table_mock(table_name): 455 | raise Exception("{} table not found".format(table_name)) 456 | bigquery_service.get_table = MagicMock(side_effect=get_table_mock) 457 | 458 | transformation = Transformation(bigquery_service, 459 | task_config, 460 | query, 461 | None, 462 | end_time - timedelta(days=1), 463 | end_time, 464 | execution_time, 465 | False) 466 | 467 | 468 | with self.assertRaises(Exception) as ex: 469 | transformation.transform() 470 | self.assertTrue("table not found" in str(ex.exception)) 471 | 472 | class TestBulkExecutor(TestCase): 473 | 474 | def test_bulk_executor(self): 475 | timezone = "Asia/Jakarta" 476 | concurrency = 10 477 | 478 | start_time = localise_datetime(datetime(2019, 1, 1), timezone) 479 | next_day = start_time + timedelta(days=1) 480 | 481 | datetime_ranges = [start_time, next_day] 482 | 483 | def execute_mock(): 484 | print("a") 485 | 486 | task_mock = MagicMock() 487 | task_mock.execute = MagicMock(side_effect=execute_mock) 488 | 489 | tasks = [ task_mock for dt in datetime_ranges] 490 | 491 | executor = ConcurrentTaskExecutor(concurrency) 492 | executor.execute(tasks) 493 | 494 | task_mock.execute.assert_has_calls([call(), call()]) 
-------------------------------------------------------------------------------- /task/bq2bq/executor/tests/test_window.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from datetime import datetime 4 | from datetime import timedelta 5 | 6 | from bumblebee.window import WindowFactory 7 | 8 | class test_Window(TestCase): 9 | 10 | def setUp(self): 11 | self.scheduled_at = datetime(2020, 7, 8, 4) 12 | self.scheduled_next_at = datetime(2020, 7, 9, 4) 13 | 14 | def test_provide_date_one_and_date_zero_with_window_size(self): 15 | window = WindowFactory.create_window(self.scheduled_next_at, "24h", "", "") 16 | 17 | day_one = self.scheduled_next_at 18 | day_zero = day_one - timedelta(days=1) 19 | 20 | self.assertEqual(window.start, day_zero) 21 | self.assertEqual(window.end, day_one) 22 | 23 | def test_valid_offset_in_window(self): 24 | window = WindowFactory.create_window(self.scheduled_next_at, "2d", "1d", "") 25 | 26 | day_one = self.scheduled_next_at + timedelta(days=1) 27 | day_zero = day_one - timedelta(days=2) 28 | 29 | self.assertEqual(window.start, day_zero) 30 | self.assertEqual(window.end, day_one) 31 | 32 | def test_valid_negative_offset_in_window(self): 33 | window = WindowFactory.create_window(self.scheduled_next_at, "2d", "-24h", "") 34 | 35 | day_one = self.scheduled_next_at + timedelta(days=-1) 36 | day_zero = day_one - timedelta(days=2) 37 | 38 | self.assertEqual(window.start, day_zero) 39 | self.assertEqual(window.end, day_one) 40 | 41 | def test_valid_hour_size_in_window(self): 42 | window = WindowFactory.create_window(self.scheduled_next_at, "2h", "0", "h") 43 | 44 | day_one = datetime(2020, 7, 9, 4) 45 | day_zero = datetime(2020, 7, 9, 4) - timedelta(hours=2) 46 | 47 | self.assertEqual(window.start, day_zero) 48 | self.assertEqual(window.end, day_one) 49 | 50 | def test_valid_truncation_in_window(self): 51 | window = WindowFactory.create_window(self.scheduled_next_at, "2d", "1d", "d") 52 | 53 | day_one = datetime(2020, 7, 9) + timedelta(days=1) 54 | day_zero = day_one - timedelta(days=2) 55 | 56 | self.assertEqual(window.start, day_zero) 57 | self.assertEqual(window.end, day_one) 58 | 59 | def test_valid_week_and_hour_notation(self): 60 | window = WindowFactory.create_window(self.scheduled_next_at, "1w", "24h", "d") 61 | 62 | day_one = datetime(2020, 7, 9) + timedelta(days=1) 63 | day_zero = day_one - timedelta(days=7) 64 | 65 | self.assertEqual(window.start, day_zero) 66 | self.assertEqual(window.end, day_one) 67 | 68 | def test_valid_week_truncation(self): 69 | window = WindowFactory.create_window(self.scheduled_next_at, "1w", "0", "w") 70 | 71 | day_zero = datetime(2020, 7, 5) 72 | day_one = datetime(2020, 7, 12) 73 | 74 | self.assertEqual(window.start, day_zero) 75 | self.assertEqual(window.end, day_one) 76 | 77 | def test_valid_week_from_tuesday_to_tuesday(self): 78 | window = WindowFactory.create_window(self.scheduled_next_at, "1w", "2d", "w") 79 | 80 | day_zero = datetime(2020, 7, 7) 81 | day_one = datetime(2020, 7, 14) 82 | 83 | self.assertEqual(window.start, day_zero) 84 | self.assertEqual(window.end, day_one) 85 | 86 | 87 | -------------------------------------------------------------------------------- /task/bq2bq/factory.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "fmt" 7 | "sync" 8 | 9 | "google.golang.org/api/drive/v2" 10 | 11 | "google.golang.org/api/option" 12 | 13 | 
"cloud.google.com/go/bigquery" 14 | "github.com/googleapis/google-cloud-go-testing/bigquery/bqiface" 15 | "golang.org/x/oauth2/google" 16 | 17 | storageV1 "google.golang.org/api/storage/v1" 18 | ) 19 | 20 | const ( 21 | MaxBQClientReuse = 5 22 | ) 23 | 24 | type DefaultBQClientFactory struct { 25 | cachedClient bqiface.Client 26 | cachedCred *google.Credentials 27 | timesUsed int 28 | mu sync.Mutex 29 | } 30 | 31 | func (fac *DefaultBQClientFactory) New(ctx context.Context, svcAccount string) (bqiface.Client, error) { 32 | fac.mu.Lock() 33 | defer fac.mu.Unlock() 34 | 35 | cred, err := google.CredentialsFromJSON(ctx, []byte(svcAccount), 36 | bigquery.Scope, storageV1.CloudPlatformScope, drive.DriveScope) 37 | if err != nil { 38 | return nil, fmt.Errorf("failed to read secret: %w", err) 39 | } 40 | 41 | // check if cached client can be reused 42 | if fac.cachedCred != nil && fac.cachedClient != nil && fac.timesUsed == MaxBQClientReuse && 43 | bytes.Equal(cred.JSON, fac.cachedCred.JSON) { 44 | fac.timesUsed++ 45 | return fac.cachedClient, nil 46 | } 47 | 48 | client, err := bigquery.NewClient(ctx, cred.ProjectID, option.WithCredentials(cred)) 49 | if err != nil { 50 | return nil, fmt.Errorf("failed to create BQ client: %w", err) 51 | } 52 | 53 | fac.cachedCred = cred 54 | fac.cachedClient = bqiface.AdaptClient(client) 55 | fac.timesUsed = 1 56 | return fac.cachedClient, nil 57 | } 58 | -------------------------------------------------------------------------------- /task/bq2bq/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/odpf/transformers/task/bq2bq 2 | 3 | go 1.18 4 | 5 | require ( 6 | cloud.google.com/go/bigquery v1.44.0 7 | github.com/AlecAivazis/survey/v2 v2.3.6 8 | github.com/googleapis/google-cloud-go-testing v0.0.0-20210719221736-1c9a4c676720 9 | github.com/hashicorp/go-hclog v1.2.0 10 | github.com/mitchellh/hashstructure/v2 v2.0.2 11 | github.com/odpf/optimus v0.6.0-rc.3 12 | github.com/odpf/optimus/sdk v0.0.0-20230104093625-9b6abe3fe8d3 13 | github.com/patrickmn/go-cache v2.1.0+incompatible 14 | github.com/spf13/cast v1.4.1 15 | github.com/stretchr/testify v1.8.1 16 | go.opentelemetry.io/otel v1.7.0 17 | go.opentelemetry.io/otel/exporters/jaeger v1.0.1 18 | go.opentelemetry.io/otel/sdk v1.3.0 19 | go.opentelemetry.io/otel/trace v1.7.0 20 | golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783 21 | golang.org/x/sync v0.1.0 22 | google.golang.org/api v0.103.0 23 | ) 24 | 25 | require ( 26 | cloud.google.com/go v0.105.0 // indirect 27 | cloud.google.com/go/compute v1.12.1 // indirect 28 | cloud.google.com/go/compute/metadata v0.2.1 // indirect 29 | cloud.google.com/go/iam v0.7.0 // indirect 30 | cloud.google.com/go/storage v1.27.0 // indirect 31 | github.com/aws/aws-sdk-go v1.43.31 // indirect 32 | github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d // indirect 33 | github.com/davecgh/go-spew v1.1.1 // indirect 34 | github.com/fatih/color v1.13.0 // indirect 35 | github.com/fsnotify/fsnotify v1.5.1 // indirect 36 | github.com/go-logr/logr v1.2.3 // indirect 37 | github.com/go-logr/stdr v1.2.2 // indirect 38 | github.com/go-ozzo/ozzo-validation/v4 v4.3.0 // indirect 39 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 40 | github.com/golang/protobuf v1.5.2 // indirect 41 | github.com/google/go-cmp v0.5.9 // indirect 42 | github.com/google/martian/v3 v3.3.2 // indirect 43 | github.com/google/uuid v1.3.0 // indirect 44 | github.com/googleapis/enterprise-certificate-proxy v0.2.0 // 
indirect 45 | github.com/googleapis/gax-go/v2 v2.7.0 // indirect 46 | github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 // indirect 47 | github.com/grpc-ecosystem/grpc-gateway/v2 v2.10.0 // indirect 48 | github.com/hashicorp/go-cleanhttp v0.5.2 // indirect 49 | github.com/hashicorp/go-getter v1.6.2 // indirect 50 | github.com/hashicorp/go-plugin v1.4.3 // indirect 51 | github.com/hashicorp/go-safetemp v1.0.0 // indirect 52 | github.com/hashicorp/go-version v1.3.0 // indirect 53 | github.com/hashicorp/hcl v1.0.0 // indirect 54 | github.com/hashicorp/yamux v0.0.0-20211028200310-0bc27b27de87 // indirect 55 | github.com/jeremywohl/flatten v1.0.1 // indirect 56 | github.com/jmespath/go-jmespath v0.4.0 // indirect 57 | github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 // indirect 58 | github.com/klauspost/compress v1.15.1 // indirect 59 | github.com/kr/pretty v0.3.0 // indirect 60 | github.com/magiconair/properties v1.8.5 // indirect 61 | github.com/mattn/go-colorable v0.1.12 // indirect 62 | github.com/mattn/go-isatty v0.0.16 // indirect 63 | github.com/mcuadros/go-defaults v1.2.0 // indirect 64 | github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d // indirect 65 | github.com/mitchellh/go-homedir v1.1.0 // indirect 66 | github.com/mitchellh/go-testing-interface v1.14.1 // indirect 67 | github.com/mitchellh/mapstructure v1.4.3 // indirect 68 | github.com/odpf/salt v0.0.0-20220614042821-c5613a78b4d6 // indirect 69 | github.com/oklog/run v1.1.0 // indirect 70 | github.com/pelletier/go-toml v1.9.3 // indirect 71 | github.com/pmezard/go-difflib v1.0.0 // indirect 72 | github.com/robfig/cron/v3 v3.0.1 // indirect 73 | github.com/spf13/afero v1.8.2 // indirect 74 | github.com/spf13/jwalterweatherman v1.1.0 // indirect 75 | github.com/spf13/pflag v1.0.5 // indirect 76 | github.com/spf13/viper v1.8.1 // indirect 77 | github.com/stretchr/objx v0.5.0 // indirect 78 | github.com/subosito/gotenv v1.2.0 // indirect 79 | github.com/ulikunitz/xz v0.5.8 // indirect 80 | go.opencensus.io v0.24.0 // indirect 81 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.32.0 // indirect 82 | golang.org/x/net v0.0.0-20221014081412-f15817d10f9b // indirect 83 | golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab // indirect 84 | golang.org/x/term v0.0.0-20220411215600-e5f449aeb171 // indirect 85 | golang.org/x/text v0.4.0 // indirect 86 | golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect 87 | google.golang.org/appengine v1.6.7 // indirect 88 | google.golang.org/genproto v0.0.0-20221117204609-8f9c96812029 // indirect 89 | google.golang.org/grpc v1.50.1 // indirect 90 | google.golang.org/protobuf v1.28.1 // indirect 91 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect 92 | gopkg.in/ini.v1 v1.62.0 // indirect 93 | gopkg.in/yaml.v2 v2.4.0 // indirect 94 | gopkg.in/yaml.v3 v3.0.1 // indirect 95 | ) 96 | -------------------------------------------------------------------------------- /task/bq2bq/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "flag" 7 | "fmt" 8 | "regexp" 9 | "strings" 10 | "sync" 11 | "time" 12 | 13 | "cloud.google.com/go/bigquery" 14 | "github.com/googleapis/google-cloud-go-testing/bigquery/bqiface" 15 | "github.com/hashicorp/go-hclog" 16 | "github.com/mitchellh/hashstructure/v2" 17 | oplugin "github.com/odpf/optimus/plugin" 18 | "github.com/odpf/optimus/sdk/plugin" 19 | "github.com/patrickmn/go-cache" 20 | 
"github.com/spf13/cast" 21 | "go.opentelemetry.io/otel/attribute" 22 | "golang.org/x/sync/errgroup" 23 | ) 24 | 25 | const ( 26 | ConfigKeyDstart = "DSTART" 27 | ConfigKeyDend = "DEND" 28 | 29 | dataTypeEnv = "env" 30 | dataTypeFile = "file" 31 | destinationTypeBigquery = "bigquery" 32 | scheduledAtTimeLayout = time.RFC3339 33 | ) 34 | 35 | var ( 36 | Name = "bq2bq" 37 | 38 | // Version should be injected while building 39 | Version = "dev" 40 | 41 | tableDestinationPatterns = regexp.MustCompile("" + 42 | "(?i)(?:FROM)\\s*(?:/\\*\\s*([a-zA-Z0-9@_-]*)\\s*\\*/)?\\s+`?([\\w-]+)\\.([\\w-]+)\\.([\\w-]+)`?" + 43 | "|" + 44 | "(?i)(?:JOIN)\\s*(?:/\\*\\s*([a-zA-Z0-9@_-]*)\\s*\\*/)?\\s+`?([\\w-]+)\\.([\\w-]+)\\.([\\w-]+)`?" + 45 | "|" + 46 | "(?i)(?:WITH)\\s*(?:/\\*\\s*([a-zA-Z0-9@_-]*)\\s*\\*/)?\\s+`?([\\w-]+)\\.([\\w-]+)\\.([\\w-]+)`?\\s+(?:AS)") 47 | 48 | queryCommentPatterns = regexp.MustCompile("(--.*)|(((/\\*)+?[\\w\\W]*?(\\*/)+))") 49 | helperPattern = regexp.MustCompile("(\\/\\*\\s*(@[a-zA-Z0-9_-]+)\\s*\\*\\/)") 50 | 51 | QueryFileName = "query.sql" 52 | 53 | BqServiceAccount = "BQ_SERVICE_ACCOUNT" 54 | 55 | TimeoutDuration = time.Second * 180 56 | MaxBQApiRetries = 3 57 | FakeSelectStmt = "SELECT * from `%s` WHERE FALSE LIMIT 1" 58 | 59 | CacheTTL = time.Hour * 24 60 | CacheCleanUp = time.Hour * 6 61 | ErrCacheNotFound = errors.New("item not found") 62 | 63 | LoadMethod = "LOAD_METHOD" 64 | LoadMethodReplace = "REPLACE" 65 | 66 | QueryFileReplaceBreakMarker = "\n--*--optimus-break-marker--*--\n" 67 | 68 | _ plugin.DependencyResolverMod = &BQ2BQ{} 69 | ) 70 | 71 | type ClientFactory interface { 72 | New(ctx context.Context, svcAccount string) (bqiface.Client, error) 73 | } 74 | 75 | type BQ2BQ struct { 76 | ClientFac ClientFactory 77 | mu sync.Mutex 78 | C *cache.Cache 79 | Compiler *Compiler 80 | 81 | logger hclog.Logger 82 | } 83 | 84 | func (*BQ2BQ) GetName(_ context.Context) (string, error) { 85 | return Name, nil 86 | } 87 | 88 | func (b *BQ2BQ) CompileAssets(ctx context.Context, req plugin.CompileAssetsRequest) (*plugin.CompileAssetsResponse, error) { 89 | method, ok := req.Config.Get(LoadMethod) 90 | if !ok || method.Value != LoadMethodReplace { 91 | return &plugin.CompileAssetsResponse{ 92 | Assets: req.Assets, 93 | }, nil 94 | } 95 | 96 | // partition window in range 97 | instanceFileMap := map[string]string{} 98 | instanceEnvMap := map[string]interface{}{} 99 | if req.InstanceData != nil { 100 | for _, jobRunData := range req.InstanceData { 101 | switch jobRunData.Type { 102 | case dataTypeFile: 103 | instanceFileMap[jobRunData.Name] = jobRunData.Value 104 | case dataTypeEnv: 105 | instanceEnvMap[jobRunData.Name] = jobRunData.Value 106 | } 107 | } 108 | } 109 | 110 | // TODO: making few assumptions here, should be documented 111 | // assume destination table is time partitioned 112 | // assume table is partitioned as DAY 113 | partitionDelta := time.Hour * 24 114 | 115 | // find destination partitions 116 | var destinationsPartitions []struct { 117 | start time.Time 118 | end time.Time 119 | } 120 | dstart := req.StartTime 121 | dend := req.EndTime 122 | for currentPart := dstart; currentPart.Before(dend); currentPart = currentPart.Add(partitionDelta) { 123 | destinationsPartitions = append(destinationsPartitions, struct { 124 | start time.Time 125 | end time.Time 126 | }{ 127 | start: currentPart, 128 | end: currentPart.Add(partitionDelta), 129 | }) 130 | } 131 | 132 | // check if window size is greater than partition delta(a DAY), if not do nothing 133 | if dend.Sub(dstart) <= 
partitionDelta { 134 | return &plugin.CompileAssetsResponse{ 135 | Assets: req.Assets, 136 | }, nil 137 | } 138 | 139 | var parsedQueries []string 140 | var err error 141 | 142 | compiledAssetMap := map[string]string{} 143 | for _, asset := range req.Assets { 144 | compiledAssetMap[asset.Name] = asset.Value 145 | } 146 | // append job spec assets to list of files need to write 147 | fileMap := mergeStringMap(instanceFileMap, compiledAssetMap) 148 | for _, part := range destinationsPartitions { 149 | instanceEnvMap[ConfigKeyDstart] = part.start.Format(scheduledAtTimeLayout) 150 | instanceEnvMap[ConfigKeyDend] = part.end.Format(scheduledAtTimeLayout) 151 | if compiledAssetMap, err = b.Compiler.Compile(fileMap, instanceEnvMap); err != nil { 152 | return &plugin.CompileAssetsResponse{}, err 153 | } 154 | parsedQueries = append(parsedQueries, compiledAssetMap[QueryFileName]) 155 | } 156 | compiledAssetMap[QueryFileName] = strings.Join(parsedQueries, QueryFileReplaceBreakMarker) 157 | 158 | taskAssets := plugin.Assets{} 159 | for name, val := range compiledAssetMap { 160 | taskAssets = append(taskAssets, plugin.Asset{ 161 | Name: name, 162 | Value: val, 163 | }) 164 | } 165 | return &plugin.CompileAssetsResponse{ 166 | Assets: taskAssets, 167 | }, nil 168 | } 169 | 170 | func mergeStringMap(mp1, mp2 map[string]string) (mp3 map[string]string) { 171 | mp3 = make(map[string]string) 172 | for k, v := range mp1 { 173 | mp3[k] = v 174 | } 175 | for k, v := range mp2 { 176 | mp3[k] = v 177 | } 178 | return mp3 179 | } 180 | 181 | // GenerateDestination uses config details to build target table 182 | // this format should match with GenerateDependencies output 183 | func (b *BQ2BQ) GenerateDestination(ctx context.Context, request plugin.GenerateDestinationRequest) (*plugin.GenerateDestinationResponse, error) { 184 | _, span := StartChildSpan(ctx, "GenerateDestination") 185 | defer span.End() 186 | 187 | proj, ok1 := request.Config.Get("PROJECT") 188 | dataset, ok2 := request.Config.Get("DATASET") 189 | tab, ok3 := request.Config.Get("TABLE") 190 | if ok1 && ok2 && ok3 { 191 | return &plugin.GenerateDestinationResponse{ 192 | Destination: fmt.Sprintf("%s:%s.%s", proj.Value, dataset.Value, tab.Value), 193 | Type: destinationTypeBigquery, 194 | }, nil 195 | } 196 | return nil, errors.New("missing config key required to generate destination") 197 | } 198 | 199 | // GenerateDependencies uses assets to find out the source tables of this 200 | // transformation. 201 | // Try using BQ APIs to search for referenced tables. This work for Select stmts 202 | // but not for Merge/Scripts, for them use regex based search and then create 203 | // fake select stmts. Fake statements help finding actual referenced tables in 204 | // case regex based table is a view & not actually a source table. 
Because this 205 | // fn should generate the actual source as dependency 206 | // BQ2BQ dependencies are BQ tables in format "project:dataset.table" 207 | func (b *BQ2BQ) GenerateDependencies(ctx context.Context, request plugin.GenerateDependenciesRequest) (response *plugin.GenerateDependenciesResponse, err error) { 208 | spanCtx, span := StartChildSpan(ctx, "GenerateDependencies") 209 | defer span.End() 210 | 211 | response = &plugin.GenerateDependenciesResponse{} 212 | response.Dependencies = []string{} 213 | 214 | // check if exists in cache 215 | if cachedResponse, err := b.IsCached(request); err == nil { 216 | // cache ready 217 | span.AddEvent("Request found in cache") 218 | return cachedResponse, nil 219 | } else if err != ErrCacheNotFound { 220 | return nil, err 221 | } 222 | 223 | var svcAcc string 224 | accConfig, ok := request.Config.Get(BqServiceAccount) 225 | if !ok || len(accConfig.Value) == 0 { 226 | span.AddEvent("Required secret BQ_SERVICE_ACCOUNT not found in config") 227 | return response, fmt.Errorf("secret BQ_SERVICE_ACCOUNT required to generate dependencies not found for %s", Name) 228 | } else { 229 | svcAcc = accConfig.Value 230 | } 231 | 232 | queryData, ok := request.Assets.Get(QueryFileName) 233 | if !ok { 234 | return nil, errors.New("empty sql file") 235 | } 236 | 237 | selfTable, err := b.GenerateDestination(spanCtx, plugin.GenerateDestinationRequest{ 238 | Config: request.Config, 239 | Assets: request.Assets, 240 | }) 241 | if err != nil { 242 | return response, err 243 | } 244 | 245 | // first parse sql statement to find dependencies and ignored tables 246 | parsedDependencies, ignoredDependencies, err := b.FindDependenciesWithRegex(spanCtx, queryData.Value, selfTable.Destination) 247 | if err != nil { 248 | return response, err 249 | } 250 | 251 | timeoutCtx, cancel := context.WithTimeout(ctx, TimeoutDuration) 252 | defer cancel() 253 | 254 | // try to resolve referenced tables for ignoredDependencies 255 | var ignoredDependenciesReferencedTables []string 256 | for _, tableName := range ignoredDependencies { 257 | // ignore the tables with : 258 | if strings.Contains(tableName, ":") { // project:dataset.table 259 | continue 260 | } 261 | // find referenced tables and add it to ignoredDependenciesReferencedTables 262 | fakeQuery := fmt.Sprintf(FakeSelectStmt, tableName) 263 | deps, err := b.FindDependenciesWithRetryableDryRun(timeoutCtx, fakeQuery, svcAcc) 264 | if err != nil { 265 | return response, err 266 | } 267 | ignoredDependenciesReferencedTables = append(ignoredDependenciesReferencedTables, deps...) 268 | } 269 | ignoredDependencies = append(ignoredDependencies, ignoredDependenciesReferencedTables...) 270 | 271 | // try to resolve referenced tables directly from BQ APIs 272 | response.Dependencies, err = b.FindDependenciesWithRetryableDryRun(spanCtx, queryData.Value, svcAcc) 273 | if err != nil { 274 | // SQL query with reference to destination table such as DML and self joins will have dependency 275 | // cycle on dry run since the table might not be available yet. We check the error from BQ 276 | // to ignore if the error message contains destination table not found. 
277 | if !strings.Contains(err.Error(), fmt.Sprintf("Not found: Table %s was not found", selfTable.Destination)) { 278 | return response, err 279 | } 280 | } 281 | 282 | if len(response.Dependencies) == 0 { 283 | span.AddEvent("Unable to get dependencies, query tables on regex") 284 | // stmt could be a BQ script; find table names using regex and create 285 | // fake SELECT statements to find the actual referenced tables 286 | 287 | resultChan := make(chan []string) 288 | eg, apiCtx := errgroup.WithContext(spanCtx) // it will stop executing further after the first error 289 | for _, tableName := range parsedDependencies { 290 | fakeQuery := fmt.Sprintf(FakeSelectStmt, tableName) 291 | // find dependencies in parallel 292 | eg.Go(func() error { 293 | // prepare dummy query 294 | deps, err := b.FindDependenciesWithRetryableDryRun(spanCtx, fakeQuery, svcAcc) 295 | if err != nil { 296 | return err 297 | } 298 | select { 299 | case resultChan <- deps: 300 | return nil 301 | // requests to be cancelled 302 | case <-apiCtx.Done(): 303 | return apiCtx.Err() 304 | } 305 | }) 306 | } 307 | 308 | go func() { 309 | // once all workers are done, stop waiting for results 310 | eg.Wait() 311 | close(resultChan) 312 | }() 313 | 314 | // accumulate results 315 | for dep := range resultChan { 316 | response.Dependencies = append(response.Dependencies, dep...) 317 | } 318 | 319 | // check if the wait finished because of an error 320 | if err := eg.Wait(); err != nil { 321 | return response, err 322 | } 323 | } 324 | 325 | response.Dependencies = removeString(response.Dependencies, selfTable.Destination) 326 | 327 | // before returning, remove ignored tables 328 | for _, ignored := range ignoredDependencies { 329 | response.Dependencies = removeString(response.Dependencies, ignored) 330 | } 331 | 332 | // before returning, wrap dependencies with the datastore type 333 | dedupDependency := make(map[string]int) 334 | for _, dependency := range response.Dependencies { 335 | dedupDependency[fmt.Sprintf(plugin.DestinationURNFormat, selfTable.Type, dependency)] = 0 336 | } 337 | var dependencies []string 338 | for dependency := range dedupDependency { 339 | dependencies = append(dependencies, dependency) 340 | } 341 | response.Dependencies = dependencies 342 | 343 | b.Cache(request, response) 344 | return response, nil 345 | } 346 | 347 | // FindDependenciesWithRegex looks for table patterns in the SQL query to find 348 | // source tables. 349 | // Task destination is required to avoid dependency cycles. 350 | // 351 | // we look for certain patterns in the query source code 352 | // in particular, we look for the following constructs 353 | // * from {table} ... 354 | // * join {table} ... 355 | // * with {table} as ... 356 | // where {table} => {project}.{dataset}.{name} 357 | // for `from` and `join` we build an optimus.Table object and 358 | // store its name in a set. For `with` queries we store the name in 359 | // a separate set called `pseudoTables` that is used for filtering 360 | // out tables from `from`/`join` matches. 361 | // the algorithm roughly locates all from/join clauses and filters them 362 | // out when they refer to a known pseudo table (since with clauses come before 363 | // either `from` or `join` clauses, they are matched first). 364 | // notice that only clauses that end in "." delimited sequences 365 | // are matched (for instance: foo.bar.baz, but not foo.bar). 366 | // This helps weed out pseudo tables since most of the time 367 | // they're a single sequence of characters. 
But on the other hand 368 | // this also means that otherwise valid reference to "dataset.table" 369 | // will not be recognised. 370 | func (b *BQ2BQ) FindDependenciesWithRegex(ctx context.Context, queryString string, destination string) ([]string, []string, error) { 371 | _, span := StartChildSpan(ctx, "FindDependenciesWithRegex") 372 | defer span.End() 373 | 374 | tablesFound := make(map[string]bool) 375 | pseudoTables := make(map[string]bool) 376 | var tablesIgnored []string 377 | 378 | // we mark destination as a pseudo table to avoid a dependency 379 | // cycle. This is for supporting DML queries that may also refer 380 | // to themselves. 381 | 382 | pseudoTables[destination] = true 383 | 384 | // remove comments from query 385 | matches := queryCommentPatterns.FindAllStringSubmatch(queryString, -1) 386 | for _, match := range matches { 387 | helperToken := match[2] 388 | 389 | // check if its a helper 390 | if helperPattern.MatchString(helperToken) { 391 | continue 392 | } 393 | 394 | // replace full match 395 | queryString = strings.ReplaceAll(queryString, match[0], " ") 396 | } 397 | 398 | matches = tableDestinationPatterns.FindAllStringSubmatch(queryString, -1) 399 | for _, match := range matches { 400 | var projectIdx, datasetIdx, nameIdx, ignoreUpstreamIdx int 401 | tokens := strings.Fields(match[0]) 402 | clause := strings.ToLower(tokens[0]) 403 | 404 | switch clause { 405 | case "from": 406 | ignoreUpstreamIdx, projectIdx, datasetIdx, nameIdx = 1, 2, 3, 4 407 | case "join": 408 | ignoreUpstreamIdx, projectIdx, datasetIdx, nameIdx = 5, 6, 7, 8 409 | case "with": 410 | ignoreUpstreamIdx, projectIdx, datasetIdx, nameIdx = 9, 10, 11, 12 411 | } 412 | 413 | tableName := createTableName(match[projectIdx], match[datasetIdx], match[nameIdx]) 414 | 415 | // if upstream is ignored, don't treat it as source 416 | if strings.TrimSpace(match[ignoreUpstreamIdx]) == "@ignoreupstream" { 417 | // make sure to handle both the conventions 418 | tablesIgnored = append(tablesIgnored, tableName) 419 | tablesIgnored = append(tablesIgnored, createTableNameWithColon(match[projectIdx], match[datasetIdx], match[nameIdx])) 420 | continue 421 | } 422 | 423 | if clause == "with" { 424 | pseudoTables[tableName] = true 425 | } else { 426 | tablesFound[tableName] = true 427 | } 428 | } 429 | var tables []string 430 | for table := range tablesFound { 431 | if pseudoTables[table] { 432 | continue 433 | } 434 | tables = append(tables, table) 435 | } 436 | return tables, tablesIgnored, nil 437 | } 438 | 439 | func (b *BQ2BQ) FindDependenciesWithRetryableDryRun(ctx context.Context, query, svcAccSecret string) ([]string, error) { 440 | spanCtx, span := StartChildSpan(ctx, "FindDependenciesWithRetryableDryRun") 441 | defer span.End() 442 | 443 | for try := 1; try <= MaxBQApiRetries; try++ { 444 | client, err := b.ClientFac.New(spanCtx, svcAccSecret) 445 | if err != nil { 446 | return nil, fmt.Errorf("failed to create bigquery client: %v", err) 447 | } 448 | deps, err := b.FindDependenciesWithDryRun(spanCtx, client, query) 449 | if err != nil { 450 | if strings.Contains(err.Error(), "net/http: TLS handshake timeout") || 451 | strings.Contains(err.Error(), "unexpected EOF") || 452 | strings.Contains(err.Error(), "i/o timeout") || 453 | strings.Contains(err.Error(), "connection reset by peer") { 454 | // retry 455 | continue 456 | } 457 | 458 | return nil, err 459 | } 460 | return deps, nil 461 | } 462 | return nil, errors.New("bigquery api retries exhausted") 463 | } 464 | 465 | func (b *BQ2BQ) 
FindDependenciesWithDryRun(ctx context.Context, client bqiface.Client, query string) ([]string, error) { 466 | spanCtx, span := StartChildSpan(ctx, "FindDependenciesWithDryRun") 467 | defer span.End() 468 | span.SetAttributes(attribute.String("kind", "client"), attribute.String("client.type", "bigquery")) 469 | 470 | q := client.Query(query) 471 | q.SetQueryConfig(bqiface.QueryConfig{ 472 | QueryConfig: bigquery.QueryConfig{ 473 | Q: query, 474 | DryRun: true, 475 | }, 476 | }) 477 | 478 | job, err := q.Run(spanCtx) 479 | if err != nil { 480 | return nil, fmt.Errorf("query run: %w", err) 481 | } 482 | // Dry run is not asynchronous, so get the latest status and statistics. 483 | status := job.LastStatus() 484 | if err := status.Err(); err != nil { 485 | return nil, fmt.Errorf("query status: %w", err) 486 | } 487 | 488 | details, ok := status.Statistics.Details.(*bigquery.QueryStatistics) 489 | if !ok { 490 | return nil, errors.New("failed to cast to Query Statistics") 491 | } 492 | 493 | tables := []string{} 494 | for _, tab := range details.ReferencedTables { 495 | tables = append(tables, tab.FullyQualifiedName()) 496 | } 497 | return tables, nil 498 | } 499 | 500 | func createTableName(proj, dataset, table string) string { 501 | return fmt.Sprintf("%s.%s.%s", proj, dataset, table) 502 | } 503 | 504 | func createTableNameWithColon(proj, dataset, table string) string { 505 | return fmt.Sprintf("%s:%s.%s", proj, dataset, table) 506 | } 507 | 508 | func removeString(s []string, match string) []string { 509 | if len(s) == 0 { 510 | return s 511 | } 512 | idx := -1 513 | for i, tab := range s { 514 | if tab == match { 515 | idx = i 516 | break 517 | } 518 | } 519 | // not found 520 | if idx < 0 { 521 | return s 522 | } 523 | s[len(s)-1], s[idx] = s[idx], s[len(s)-1] 524 | return s[:len(s)-1] 525 | } 526 | 527 | func (b *BQ2BQ) IsCached(request plugin.GenerateDependenciesRequest) (*plugin.GenerateDependenciesResponse, error) { 528 | if b.C == nil { 529 | return nil, ErrCacheNotFound 530 | } 531 | b.mu.Lock() 532 | defer b.mu.Unlock() 533 | requestHash, err := hashstructure.Hash(request, hashstructure.FormatV2, nil) 534 | if err != nil { 535 | return nil, err 536 | } 537 | hashString := cast.ToString(requestHash) 538 | if item, ok := b.C.Get(hashString); ok { 539 | return item.(*plugin.GenerateDependenciesResponse), nil 540 | } 541 | return nil, ErrCacheNotFound 542 | } 543 | 544 | func (b *BQ2BQ) Cache(request plugin.GenerateDependenciesRequest, response *plugin.GenerateDependenciesResponse) error { 545 | if b.C == nil { 546 | return nil 547 | } 548 | b.mu.Lock() 549 | defer b.mu.Unlock() 550 | requestHash, err := hashstructure.Hash(request, hashstructure.FormatV2, nil) 551 | if err != nil { 552 | return err 553 | } 554 | hashString := cast.ToString(requestHash) 555 | b.C.Set(hashString, response, cache.DefaultExpiration) 556 | return nil 557 | } 558 | 559 | func main() { 560 | var tracingAddr string 561 | flag.StringVar(&tracingAddr, "t", "", "endpoint for traces collector") 562 | flag.Parse() 563 | 564 | var cleanupFunc func() 565 | oplugin.Serve(func(log hclog.Logger) interface{} { 566 | var err error 567 | log.Info("Telemetry setup with", tracingAddr) 568 | cleanupFunc, err = InitTelemetry(log, tracingAddr) 569 | if err != nil { 570 | log.Warn("Error while telemetry init") 571 | } 572 | 573 | return &BQ2BQ{ 574 | ClientFac: &DefaultBQClientFactory{}, 575 | C: cache.New(CacheTTL, CacheCleanUp), 576 | Compiler: NewCompiler(), 577 | logger: log, 578 | } 579 | }) 580 | cleanupFunc() 581 | } 582 
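For clarity, here is a minimal, hypothetical test-style sketch (not part of the repository) showing what the regex based extraction in main.go above yields for a query that uses the @ignoreupstream helper. It assumes a file such as bq2bq_example_test.go living in the same package main; the project, dataset, and table names are invented for illustration.

// bq2bq_example_test.go (hypothetical file; names below are illustrative only)
package main

import (
	"context"
	"fmt"
)

func ExampleBQ2BQ_FindDependenciesWithRegex() {
	query := "SELECT * FROM `sample-project.playground.customers` c " +
		"JOIN /* @ignoreupstream */ `sample-project.playground.audit_log` a ON c.id = a.id"
	b := &BQ2BQ{}
	// the destination is passed in the same "project:dataset.table" format
	// produced by GenerateDestination, to avoid self-dependency cycles
	deps, ignored, _ := b.FindDependenciesWithRegex(context.Background(), query, "sample-project:playground.output")
	fmt.Println(deps)    // [sample-project.playground.customers]
	fmt.Println(ignored) // [sample-project.playground.audit_log sample-project:playground.audit_log]
}

The ignored table is reported in both dot and colon conventions, which matches how the ignored list is later subtracted from the dry-run results.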
| -------------------------------------------------------------------------------- /task/bq2bq/optimus-plugin-bq2bq.yaml: -------------------------------------------------------------------------------- 1 | name: bq2bq 2 | description: BigQuery to BigQuery transformation task 3 | plugintype: task 4 | pluginversion: 0.3.2 # update this with expected tag before release 5 | image: docker.io/odpf/optimus-task-bq2bq-executor:0.3.2 6 | entrypoint: 7 | script: "python3 /opt/bumblebee/main.py" 8 | questions: 9 | - name: PROJECT 10 | prompt: Project ID 11 | help: Destination bigquery project ID 12 | regexp: ^[a-zA-Z0-9_\-]+$ 13 | validationerror: invalid name (can only contain characters A-Z (in either case), 14 | 0-9, hyphen(-) or underscore (_)) 15 | minlength: 3 16 | - name: DATASET 17 | prompt: Dataset Name 18 | help: Destination bigquery dataset ID 19 | regexp: ^[a-zA-Z0-9_\-]+$ 20 | validationerror: invalid name (can only contain characters A-Z (in either case), 21 | 0-9, hyphen(-) or underscore (_)) 22 | minlength: 3 23 | - name: TABLE 24 | prompt: Table ID 25 | help: Destination bigquery table ID 26 | regexp: ^[a-zA-Z0-9_-]+$ 27 | validationerror: invalid table name (can only contain characters A-Z (in either 28 | case), 0-9, hyphen(-) or underscore (_)) 29 | minlength: 3 30 | maxlength: 1024 31 | - name: LOAD_METHOD 32 | prompt: Load method to use on destination 33 | help: | 34 | APPEND - Append to existing table 35 | REPLACE - Deletes existing partitions and inserts the result of the select query 36 | MERGE - DML statements, BQ scripts 37 | REPLACE_MERGE - [Experimental] Advanced replace using merge query 38 | default: APPEND 39 | multiselect: 40 | - APPEND 41 | - REPLACE 42 | - MERGE 43 | - REPLACE_MERGE 44 | - REPLACE_ALL 45 | subquestions: 46 | - ifvalue: REPLACE_MERGE 47 | questions: 48 | - name: PARTITION_FILTER 49 | prompt: Partition filter expression 50 | help: "\nWhere condition over partitioned column used to delete existing partitions\nin 51 | destination table. 
These partitions will be replaced with sql query result.\nLeave 52 | empty for optimus to automatically figure this out although it will be \nfaster 53 | and cheaper to provide the exact condition.\nfor example: DATE(event_timestamp) 54 | >= \"{{ .DSTART|Date }}\" AND DATE(event_timestamp) < \"{{ .DEND|Date }}\"" 55 | required: true 56 | defaultconfig: 57 | - name: SQL_TYPE 58 | value: STANDARD 59 | defaultassets: 60 | - name: query.sql 61 | value: |- 62 | -- SQL query goes here 63 | 64 | Select * from "project.dataset.table"; 65 | -------------------------------------------------------------------------------- /task/bq2bq/telemetry.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/hashicorp/go-hclog" 7 | "go.opentelemetry.io/otel" 8 | "go.opentelemetry.io/otel/exporters/jaeger" 9 | "go.opentelemetry.io/otel/propagation" 10 | "go.opentelemetry.io/otel/sdk/resource" 11 | tracesdk "go.opentelemetry.io/otel/sdk/trace" 12 | semconv "go.opentelemetry.io/otel/semconv/v1.4.0" 13 | "go.opentelemetry.io/otel/trace" 14 | ) 15 | 16 | func InitTelemetry(l hclog.Logger, tracingAddr string) (func(), error) { 17 | var tp *tracesdk.TracerProvider 18 | var err error 19 | if tracingAddr != "" { 20 | l.Info("enabling jaeger traces", "addr", tracingAddr) 21 | tp, err = tracerProvider(tracingAddr) 22 | if err != nil { 23 | return nil, err 24 | } 25 | 26 | // Register our TracerProvider as the global so any imported 27 | // instrumentation in the future will default to using it. 28 | otel.SetTracerProvider(tp) 29 | 30 | // Required to receive trace info from upstream 31 | otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(propagation.TraceContext{}, propagation.Baggage{})) 32 | } 33 | 34 | return func() { 35 | if tp != nil { 36 | if err = tp.Shutdown(context.Background()); err != nil { 37 | l.Warn("failed to shutdown trace provider", "err", err) 38 | } 39 | } 40 | }, nil 41 | } 42 | 43 | // tracerProvider returns an OpenTelemetry TracerProvider configured to use 44 | // the Jaeger exporter that will send spans to the provided url. The returned 45 | // TracerProvider will also use a Resource configured with all the information 46 | // about the application. 
47 | func tracerProvider(url string) (*tracesdk.TracerProvider, error) { 48 | // create the Jaeger exporter 49 | jaegerExporter, err := jaeger.New(jaeger.WithCollectorEndpoint(jaeger.WithEndpoint(url))) 50 | if err != nil { 51 | return nil, err 52 | } 53 | tp := tracesdk.NewTracerProvider( 54 | // Always be sure to batch in production 55 | tracesdk.WithBatcher(jaegerExporter), 56 | 57 | // Record information about this application in a Resource 58 | tracesdk.WithResource(resource.NewWithAttributes( 59 | semconv.SchemaURL, 60 | semconv.ServiceNameKey.String(Name), 61 | semconv.ServiceVersionKey.String(Version), 62 | )), 63 | ) 64 | 65 | return tp, nil 66 | } 67 | 68 | func StartChildSpan(ctx context.Context, name string) (context.Context, trace.Span) { 69 | tracer := otel.Tracer("bq2bq") 70 | 71 | return tracer.Start(ctx, name) 72 | } 73 | -------------------------------------------------------------------------------- /task/bq2bq/validate.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "regexp" 7 | 8 | "github.com/AlecAivazis/survey/v2" 9 | ) 10 | 11 | // vFactory is a validator factory; the name is abbreviated so that 12 | // the global instance can be called 'ValidatorFactory' 13 | type vFactory struct{} 14 | 15 | func (f *vFactory) NewFromRegex(re, message string) survey.Validator { 16 | var regex = regexp.MustCompile(re) 17 | return func(v interface{}) error { 18 | k := reflect.ValueOf(v).Kind() 19 | if k != reflect.String { 20 | return fmt.Errorf("was expecting a string, got %s", k.String()) 21 | } 22 | val := v.(string) 23 | if !regex.Match([]byte(val)) { 24 | return fmt.Errorf(message) 25 | } 26 | return nil 27 | } 28 | } 29 | 30 | var ValidatorFactory = new(vFactory) 31 | --------------------------------------------------------------------------------
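To show how the validator factory in validate.go lines up with the question spec in optimus-plugin-bq2bq.yaml, here is a small hypothetical usage sketch. The askTableName helper is invented for illustration; its regexp, error message, and minimum length mirror the TABLE question above, and it relies only on survey/v2 calls (AskOne, WithValidator, MinLength, Input) that the plugin already depends on.

// Hypothetical helper (not part of the plugin), built on ValidatorFactory above.
package main

import "github.com/AlecAivazis/survey/v2"

func askTableName() (string, error) {
	// regexp and message mirror the TABLE question in optimus-plugin-bq2bq.yaml
	validate := ValidatorFactory.NewFromRegex(
		`^[a-zA-Z0-9_-]+$`,
		"invalid table name (can only contain characters A-Z (in either case), 0-9, hyphen(-) or underscore (_))",
	)
	var table string
	err := survey.AskOne(
		&survey.Input{Message: "Table ID", Help: "Destination bigquery table ID"},
		&table,
		survey.WithValidator(validate),
		survey.WithValidator(survey.MinLength(3)), // minlength: 3
	)
	return table, err
}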