├── .github └── workflows │ ├── ci.yaml │ └── release.yaml ├── .gitignore ├── LICENSE ├── README.md ├── cmd ├── main.go ├── main_test.go └── postgres_test.go ├── config ├── conf.json ├── conf_test.json ├── conf_test_oracle.json ├── config.go ├── config_test.go └── config_test_mssql.json ├── go.mod ├── go.sum ├── ingester ├── ingest_databend.go └── stats.go ├── source ├── mysql.go ├── oracle.go ├── postgres.go ├── postgres_test.go ├── source.go ├── source_test.go ├── sql_server.go └── stats.go ├── tools └── confgenerotor │ ├── README.md │ └── conf_generate.go ├── utils └── testutils │ └── postgres.go └── worker ├── stats.go └── worker.go /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | jobs: 10 | ci: 11 | runs-on: ubuntu-latest 12 | services: 13 | databend: 14 | image: docker.io/datafuselabs/databend 15 | env: 16 | QUERY_DEFAULT_USER: databend 17 | QUERY_DEFAULT_PASSWORD: databend 18 | MINIO_ENABLED: true 19 | ports: 20 | - 8000:8000 21 | - 9000:9000 22 | mysql: 23 | image: mysql:5.7 24 | env: 25 | MYSQL_ROOT_PASSWORD: 123456 26 | MYSQL_DATABASE: default 27 | ports: 28 | - 3306:3306 29 | options: --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=10 30 | oracle: 31 | image: wnameless/oracle-xe-11g-r2 32 | env: 33 | ORACLE_SID: XE 34 | ports: 35 | - 49161:1521 36 | mssql: 37 | image: mcr.microsoft.com/azure-sql-edge:latest 38 | env: 39 | ACCEPT_EULA: Y 40 | MSSQL_SA_PASSWORD: "Password1234!" 41 | ports: 42 | - 1433:1433 43 | 44 | steps: 45 | - name: Checkout 46 | uses: actions/checkout@v2 47 | 48 | - name: Verify Service Running 49 | run: | 50 | sleep 60 51 | curl -v http://localhost:8000/v1/health 52 | 53 | - name: Cache Instant Client 54 | id: cache-instant-client 55 | uses: actions/cache@v3 56 | with: 57 | path: /opt/oracle/instantclient_21_4 58 | key: oracle-instant-client-21.4.0.0.0-basic-linux-x64 59 | 60 | - name: Install Oracle Instant Client 61 | run: | 62 | wget https://download.oracle.com/otn_software/linux/instantclient/214000/instantclient-basic-linux.x64-21.4.0.0.0dbru.zip 63 | wget https://download.oracle.com/otn_software/linux/instantclient/214000/instantclient-sqlplus-linux.x64-21.4.0.0.0dbru.zip 64 | mkdir -p /opt/oracle 65 | unzip -d /opt/oracle instantclient-basic-linux.x64-21.4.0.0.0dbru.zip 66 | unzip -d /opt/oracle instantclient-sqlplus-linux.x64-21.4.0.0.0dbru.zip 67 | export LD_LIBRARY_PATH=/opt/oracle/instantclient_21_4:$LD_LIBRARY_PATH 68 | export PATH=$LD_LIBRARY_PATH:$PATH 69 | rm instantclient-basic-linux.x64-21.4.0.0.0dbru.zip 70 | rm instantclient-sqlplus-linux.x64-21.4.0.0.0dbru.zip 71 | 72 | # - name: Create Oracle User 73 | # env: 74 | # TEST_DATABEND_DSN: "http://databend:databend@localhost:8000" 75 | # LD_LIBRARY_PATH: /opt/oracle/instantclient_21_4:$LD_LIBRARY_PATH 76 | # PATH: /opt/oracle/instantclient_21_4:$LD_LIBRARY_PATH:$PATH 77 | # run: | 78 | # sqlplus system/oracle@localhost:49161/XE << EOF 79 | # CREATE USER a IDENTIFIED BY 123 DEFAULT TABLESPACE USERS; 80 | # GRANT DBA TO a; 81 | # EXIT 82 | # EOF 83 | 84 | - name: Test 85 | env: 86 | TEST_DATABEND_DSN: "http://databend:databend@localhost:8000" 87 | LD_LIBRARY_PATH: /opt/oracle/instantclient_21_4:$LD_LIBRARY_PATH 88 | run: | 89 | go test -v -p 1 -cover ./... 
-------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | release: 5 | types: 6 | - created 7 | 8 | env: 9 | GO_VERSION: '1.21' 10 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 11 | 12 | jobs: 13 | build_and_upload: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | TARGETS: [ linux/amd64, darwin/amd64, windows/amd64, linux/arm64, darwin/arm64 ] 18 | env: 19 | GO_BUILD_ENV: GO111MODULE=on CGO_ENABLED=0 20 | DIST_DIRS: find * -type d -exec 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Setup Go 24 | uses: actions/setup-go@v4 25 | with: 26 | go-version: ${{ env.GO_VERSION }} 27 | - name: Get release 28 | id: get_release 29 | uses: bruceadams/get-release@v1.2.2 30 | - name: Get matrix 31 | id: get_matrix 32 | run: | 33 | TARGETS=${{matrix.TARGETS}} 34 | echo "OS=${TARGETS%/*}" >> $GITHUB_OUTPUT 35 | echo "ARCH=${TARGETS#*/}" >> $GITHUB_OUTPUT 36 | - name: Build 37 | run: | 38 | cd cmd && \ 39 | ${{ env.GO_BUILD_ENV }} GOOS=${{ steps.get_matrix.outputs.OS }} GOARCH=${{ steps.get_matrix.outputs.ARCH }} \ 40 | go build \ 41 | -o _bin/db-archiver/${{ steps.get_matrix.outputs.OS }}-${{ steps.get_matrix.outputs.ARCH }}/db-archiver 42 | - name: Compress 43 | run: | 44 | cd cmd/_bin/db-archiver && \ 45 | ${{ env.DIST_DIRS }} cp ../../../LICENSE {} \; && \ 46 | ${{ env.DIST_DIRS }} cp ../../../README.md {} \; && \ 47 | ${{ env.DIST_DIRS }} tar -zcf db-archiver-{}-${{ github.ref_name }}.tar.gz {} \; && \ 48 | cd .. && \ 49 | sha256sum db-archiver/db-archiver-*-${{ github.ref_name }}.tar.gz >> sha256-${{ steps.get_matrix.outputs.OS }}-${{ steps.get_matrix.outputs.ARCH }}.txt 50 | - name: Delete existing asset 51 | uses: actions/github-script@v6 52 | with: 53 | github-token: ${{secrets.GITHUB_TOKEN}} 54 | script: | 55 | const fs = require('fs'); 56 | const release = await github.rest.repos.getReleaseByTag({ 57 | owner: context.repo.owner, 58 | repo: context.repo.repo, 59 | tag: '${{ github.ref_name }}' 60 | }); 61 | const asset_name = 'db-archiver-${{ steps.get_matrix.outputs.OS }}-${{ steps.get_matrix.outputs.ARCH }}-${{ github.ref_name }}.tar.gz'; 62 | for (const asset of release.data.assets) { 63 | if (asset.name === asset_name) { 64 | await github.rest.repos.deleteReleaseAsset({ 65 | owner: context.repo.owner, 66 | repo: context.repo.repo, 67 | asset_id: asset.id 68 | }); 69 | console.log(`Deleted existing asset: ${asset_name}`); 70 | break; 71 | } 72 | } 73 | - name: Upload db-archiver tar.gz 74 | uses: actions/upload-release-asset@v1.0.2 75 | with: 76 | upload_url: ${{ steps.get_release.outputs.upload_url }} 77 | asset_path: cmd/_bin/db-archiver/db-archiver-${{ steps.get_matrix.outputs.OS }}-${{ steps.get_matrix.outputs.ARCH }}-${{ github.ref_name }}.tar.gz 78 | asset_name: db-archiver-${{ steps.get_matrix.outputs.OS }}-${{ steps.get_matrix.outputs.ARCH }}-${{ github.ref_name }}.tar.gz 79 | asset_content_type: application/gzip 80 | - name: Post sha256 81 | uses: actions/upload-artifact@v4 82 | with: 83 | name: sha256sums-${{ steps.get_matrix.outputs.OS }}-${{ steps.get_matrix.outputs.ARCH }} 84 | path: cmd/_bin/sha256-${{ steps.get_matrix.outputs.OS }}-${{ steps.get_matrix.outputs.ARCH }}.txt 85 | retention-days: 1 86 | 87 | upload-sha256sums: 88 | needs: build_and_upload 89 | runs-on: ubuntu-latest 90 | steps: 91 | - name: Get release 92 | id: get_release 93 | uses: bruceadams/get-release@v1.2.2 94 | - name: Download 
sha256sums 95 | uses: actions/download-artifact@v4 96 | with: 97 | pattern: sha256sums-* 98 | merge-multiple: true 99 | - name: Combine sha256sums 100 | run: | 101 | cat sha256-*.txt > sha256sums.txt 102 | - name: Delete existing sha256sums 103 | uses: actions/github-script@v6 104 | with: 105 | github-token: ${{secrets.GITHUB_TOKEN}} 106 | script: | 107 | const release = await github.rest.repos.getReleaseByTag({ 108 | owner: context.repo.owner, 109 | repo: context.repo.repo, 110 | tag: '${{ github.ref_name }}' 111 | }); 112 | for (const asset of release.data.assets) { 113 | if (asset.name === 'sha256sums.txt') { 114 | await github.rest.repos.deleteReleaseAsset({ 115 | owner: context.repo.owner, 116 | repo: context.repo.repo, 117 | asset_id: asset.id 118 | }); 119 | console.log('Deleted existing sha256sums.txt'); 120 | break; 121 | } 122 | } 123 | - name: Upload Checksums 124 | uses: actions/upload-release-asset@v1.0.2 125 | with: 126 | upload_url: ${{ steps.get_release.outputs.upload_url }} 127 | asset_path: sha256sums.txt 128 | asset_name: sha256sums.txt 129 | asset_content_type: text/plain -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | .idea 3 | *.exe 4 | *.exe~ 5 | *.dll 6 | *.so 7 | *.dylib 8 | *.log 9 | 10 | # Test binary, built with `go test -c` 11 | *.test 12 | 13 | # Output of the go coverage tool, specifically when used with LiteIDE 14 | *.out 15 | 16 | # Dependency directories (remove the comment below to include it) 17 | # vendor/ 18 | bend-ingest-kafka 19 | 20 | front/node_modules 21 | front/.DS_Store 22 | front/dist 23 | front/dist-ssr 24 | front/*.local 25 | front/src/.umi 26 | front/src/.umi-production 27 | front/yarn-error.log 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # db-archiver 2 | A simple tool to archive databases to Databend. 
3 | 4 | ## Supported data sources 5 | | DataSources | Supported | 6 | |:------------|:-----------:| 7 | | MySQL | Yes | 8 | | PostgreSQL | Yes | 9 | | TiDB | Yes | 10 | | SQL Server | Yes | 11 | | Oracle | Coming soon | 12 | | CSV | Coming soon | 13 | | NDJSON | Coming soon | 14 | 15 | 16 | ## Installation 17 | Download the binary from the [release page](https://github.com/databendcloud/db-archiver/releases) according to your architecture. 18 | 19 | ## Usage 20 | 21 | Configure your database and Databend connections in `config/conf.json`: 22 | ```json 23 | { 24 | "sourceHost": "127.0.0.1", 25 | "sourcePort": 3306, 26 | "sourceUser": "root", 27 | "sourcePass": "123456", 28 | "sourceDbTables": ["mydb.*@table.*"], 29 | "sourceQuery": "select * from mydb.my_table", 30 | "sourceWhereCondition": "id < 100", 31 | "sourceSplitKey": "id", 32 | "databendDSN": "http://username:password@host:port", 33 | "databendTable": "testSync.my_table", 34 | "batchSize": 20000, 35 | "batchMaxInterval": 30, 36 | "workers": 1, 37 | "copyPurge": true, 38 | "copyForce": false, 39 | "disableVariantCheck": true, 40 | "userStage": "~", 41 | "deleteAfterSync": false 42 | } 43 | 44 | ``` 45 | 46 | Run the tool and start your sync: 47 | ```bash 48 | ./db-archiver -f conf.json 49 | ``` 50 | 51 | The log output: 52 | ``` 53 | INFO[0000] Starting worker 54 | 2024/06/25 11:35:37 ingest 2 rows (0.565646 rows/s), 64 bytes (18.100678 bytes/s) 55 | 2024/06/25 11:35:38 ingest 1 rows (0.556652 rows/s), 33 bytes (17.812853 bytes/s) 56 | 2024/06/25 11:35:38 ingest 2 rows (0.551906 rows/s), 65 bytes (17.660995 bytes/s) 57 | 2024/06/25 11:35:38 ingest 2 rows (0.531644 rows/s), 64 bytes (17.012600 bytes/s) 58 | 2024/06/25 11:35:38 ingest 2 rows (0.531768 rows/s), 64 bytes (17.016584 bytes/s) 59 | ``` 60 | 61 | 62 | ## Parameter References 63 | | Parameter | Description | Default | example | required | 64 | |----------------------|-------------------------------------------------------------------------------------------------------|----------|-------------------------------|----------| 65 | | sourceHost | source host | | | true | 66 | | sourcePort | source port | 3306 | 3306 | true | 67 | | sourceUser | source user | | | true | 68 | | sourcePass | source password | | | true | 69 | | sourceDB | source database | | | false | 70 | | sourceTable | source table | | | false | 71 | | sourceDbTables | source db tables | [] | [db.*@table.*,mydb.*.table.*] | false | 72 | | sourceQuery | the SQL query used to fetch data | | | false | 73 | | sourceWhereCondition | the WHERE condition used to limit the query | | | false | 74 | | sourceSplitKey | the column name used to split the data; must be an integer-type primary key | no | "id" | false | 75 | | sourceSplitTimeKey | the column name used to split the data by time; must be a time type | no | "t1" | false | 76 | | timeSplitUnit | the unit of the time split; can be `minute`, `hour`, or `day` | "minute" | "day" | false | 77 | | databendDSN | databend dsn | no | "http://localhost:8000" | true | 78 | | databendTable | databend table | no | "db1.tbl" | true | 79 | | batchSize | the number of rows to sync in one batch | 1000 | 1000 | false | 80 | | copyPurge | copy purge, refer to https://docs.databend.com/sql/sql-commands/dml/dml-copy-into-table#copy-options | false | false | false | 81 | | copyForce | copy force | false | false | false | 82 | | disableVariantCheck | disable variant check | false | false | false | 83 | | userStage | user external stage name | ~ | ~ | false | 84 | 85 | NOTE: 86 | 87 | 1. To reduce the load on the source server, set `sourceSplitKey` to the primary key of the source table; its type must be integer. The tool will split the data by `sourceSplitKey` and sync it to Databend in parallel. 88 | The `sourceSplitTimeKey` is used to split the data by a time column. At least one of `sourceSplitTimeKey` and `sourceSplitKey` must be set. 89 | 90 | 2. `sourceDbTables` is used to sync data from multiple tables. The format is `db.*@table.*` or `db.table.*`, where `.*` is a regex pattern. `db.*@table.*` matches every table whose name matches the regex `table.*` in every database whose name matches the regex `db.*` (a minimal sketch of this matching is shown after these notes). 91 | 92 | 3. `sourceDbTables` has a higher priority than `sourceTable` and `sourceDB`. If `sourceDbTables` is set, `sourceTable` is ignored. 93 | 94 | 4. Both the `database` and `table` parts support regex patterns. 95 | 96 | 5. If you set `sourceDbTables`, there is no need to set `sourceQuery`. In other words, `sourceDbTables` takes priority over `sourceQuery`. 97 | 98 | 6. The `copyPurge`, `copyForce`, and `disableVariantCheck` options are described in this [doc](https://docs.databend.com/sql/sql-commands/dml/dml-copy-into-table#copy-options). 99 | 100 | 
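Below is a minimal, self-contained sketch of how the `db.*@table.*` matching described in note 2 can work. It is illustrative only, not the tool's actual implementation: the `matchDbTables` helper is hypothetical, and the `db.table.*` form is omitted for brevity.

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// matchDbTables filters candidate (database, table) pairs against one
// "db.*@table.*" pattern, following the semantics described in the notes
// above. Hypothetical helper; only the "@" form is handled here.
func matchDbTables(pattern string, candidates map[string][]string) map[string][]string {
	parts := strings.SplitN(pattern, "@", 2)
	if len(parts) != 2 {
		return nil
	}
	// Anchor both regexes so "db" does not also match "db_backup".
	dbRe := regexp.MustCompile("^" + parts[0] + "$")
	tableRe := regexp.MustCompile("^" + parts[1] + "$")

	matched := make(map[string][]string)
	for db, tables := range candidates {
		if !dbRe.MatchString(db) {
			continue
		}
		for _, table := range tables {
			if tableRe.MatchString(table) {
				matched[db] = append(matched[db], table)
			}
		}
	}
	return matched
}

func main() {
	all := map[string][]string{
		"db1":   {"test_table1", "other"},
		"db2":   {"test_table2"},
		"sales": {"orders"},
	}
	// Prints map[db1:[test_table1] db2:[test_table2]].
	fmt.Println(matchDbTables("db.*@test_table.*", all))
}
```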
101 | ## Two modes 102 | ### Sync data according to the `sourceSplitKey` 103 | If your source table has an integer primary key, you can set `sourceSplitKey` to sync the data in parallel. The tool will split the data by `sourceSplitKey` and sync it to Databend in parallel. 104 | It is the highest-performance mode; a sketch of the range splitting follows the example below. 105 | The example of the `conf.json`: 106 | ```json 107 | { 108 | "sourceHost": "0.0.0.0", 109 | "sourcePort": 3306, 110 | "sourceUser": "root", 111 | "sourcePass": "123456", 112 | "sourceDB": "mydb", 113 | "sourceTable": "my_table", 114 | "sourceQuery": "select * from mydb.my_table", 115 | "sourceWhereCondition": "id < 100", 116 | "sourceSplitKey": "id", 117 | "databendDSN": "https://cloudapp:password@tn3ftqihs--medium-p8at.gw.aws-us-east-2.default.databend.com:443", 118 | "databendTable": "testSync.my_table", 119 | "batchSize": 2, 120 | "batchMaxInterval": 30, 121 | "workers": 1, 122 | "copyPurge": false, 123 | "copyForce": false, 124 | "disableVariantCheck": false, 125 | "userStage": "~", 126 | "deleteAfterSync": false, 127 | "maxThread": 10 128 | } 129 | ``` 130 | 
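As a rough illustration of this mode, the sketch below slices an integer key range into batch-sized windows that independent workers could fetch in parallel. The helper name, the batch arithmetic, and the `WHERE` format are assumptions for illustration, not the tool's actual code.

```go
package main

import "fmt"

// splitByKey slices the inclusive key range [minID, maxID] into half-open
// batches of batchSize rows each; each batch can then be fetched by an
// independent worker. Sketch only: the real tool derives the key range from
// the source table (e.g. via MIN/MAX queries on sourceSplitKey).
func splitByKey(minID, maxID, batchSize int64) [][2]int64 {
	var ranges [][2]int64
	for lo := minID; lo <= maxID; lo += batchSize {
		ranges = append(ranges, [2]int64{lo, lo + batchSize})
	}
	return ranges
}

func main() {
	// With minID=1, maxID=100 and batchSize=25, four batches are produced.
	for _, r := range splitByKey(1, 100, 25) {
		fmt.Printf("WHERE id >= %d AND id < %d\n", r[0], r[1])
	}
}
```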
131 | ### Sync data according to the `sourceSplitTimeKey` 132 | If your source table has a time column, you can set `sourceSplitTimeKey` to sync the data in parallel. The tool will split the data by `sourceSplitTimeKey` and sync it to Databend in parallel. 133 | `sourceSplitTimeKey` must be set together with `timeSplitUnit`, which can be `minute`, `hour`, or `day` and determines the size of the time windows used to split the data (a sketch of the windowing follows the note below). 134 | The example of the `conf.json`: 135 | ```json 136 | { 137 | "sourceHost": "127.0.0.1", 138 | "sourcePort": 3306, 139 | "sourceUser": "root", 140 | "sourcePass": "12345678", 141 | "sourceDB": "mydb", 142 | "sourceTable": "test_table1", 143 | "sourceQuery": "select * from mydb.test_table1", 144 | "sourceWhereCondition": "t1 >= '2024-06-01' and t1 < '2024-07-01'", 145 | "sourceSplitKey": "", 146 | "sourceSplitTimeKey": "t1", 147 | "timeSplitUnit": "hour", 148 | "databendDSN": "https://cloudapp:password@tn3ftqihs--medium-p8at.gw.aws-us-east-2.default.databend.com:443", 149 | "databendTable": "default.test_table1", 150 | "batchSize": 10000, 151 | "batchMaxInterval": 30, 152 | "copyPurge": true, 153 | "copyForce": false, 154 | "disableVariantCheck": true, 155 | "userStage": "~", 156 | "deleteAfterSync": false, 157 | "maxThread": 10 158 | } 159 | ``` 160 | NOTE: 161 | 162 | 1. If you set `sourceSplitTimeKey`, the `sourceWhereCondition` must have the format `t > xx and t < yy`.
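To make the windowing concrete, here is a minimal sketch that expands a time range into `timeSplitUnit`-sized windows, each mapped to a `t1 >= ... and t1 < ...` condition. It is written under assumed semantics; the helper name and window arithmetic are illustrative, not the tool's actual implementation.

```go
package main

import (
	"fmt"
	"time"
)

// splitByTime expands the half-open range [start, end) into windows of one
// timeSplitUnit each, clamping the last window to end. Illustrative sketch
// only; unit handling in the real tool may differ.
func splitByTime(start, end time.Time, unit string) [][2]time.Time {
	step := map[string]time.Duration{
		"minute": time.Minute,
		"hour":   time.Hour,
		"day":    24 * time.Hour,
	}[unit]
	if step == 0 {
		return nil // unknown unit
	}

	var windows [][2]time.Time
	for lo := start; lo.Before(end); lo = lo.Add(step) {
		hi := lo.Add(step)
		if hi.After(end) {
			hi = end
		}
		windows = append(windows, [2]time.Time{lo, hi})
	}
	return windows
}

func main() {
	start := time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC)
	end := time.Date(2024, 6, 1, 3, 0, 0, 0, time.UTC)
	// Prints three one-hour windows as WHERE-style conditions.
	for _, w := range splitByTime(start, end, "hour") {
		fmt.Printf("t1 >= '%s' and t1 < '%s'\n",
			w[0].Format("2006-01-02 15:04:05"), w[1].Format("2006-01-02 15:04:05"))
	}
}
```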
163 | 164 | 165 | NOTE: The `mysql-go` driver handles the BOOL type as TINYINT(1), so you need to use `TINYINT` in Databend to store bool values. -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "fmt" 7 | "net/http" 8 | _ "net/http/pprof" 9 | "os" 10 | "os/signal" 11 | "syscall" 12 | "time" 13 | 14 | "github.com/sirupsen/logrus" 15 | 16 | "github.com/databendcloud/db-archiver/config" 17 | "github.com/databendcloud/db-archiver/ingester" 18 | "github.com/databendcloud/db-archiver/source" 19 | "github.com/databendcloud/db-archiver/worker" 20 | ) 21 | 22 | func main() { 23 | go func() { 24 | http.ListenAndServe("localhost:6060", nil) 25 | }() 26 | start := fmt.Sprintf("start time: %s", time.Now().Format("2006-01-02 15:04:05")) 27 | fmt.Println(start) 28 | startTime := time.Now() 29 | ctx, cancel := context.WithCancel(context.Background()) 30 | go func() { 31 | sigch := make(chan os.Signal, 1) 32 | signal.Notify(sigch, syscall.SIGQUIT, syscall.SIGTERM) 33 | <-sigch 34 | cancel() 35 | }() 36 | 37 | configFile := flag.String("f", "", "Path to the configuration file") 38 | flag.Parse() 39 | 40 | if *configFile == "" { 41 | *configFile = "config/conf.json" 42 | if _, err := os.Stat(*configFile); os.IsNotExist(err) { 43 | fmt.Printf("json config file does not exist, you can use -f to specify it. Example: ./dbarchiver -f conf.json \n") 44 | os.Exit(1) 45 | } 46 | } 47 | cfg := parseConfigWithFile(*configFile) 48 | ig := ingester.NewDatabendIngester(cfg) 49 | src, err := source.NewSource(cfg) 50 | if err != nil { 51 | panic(err) 52 | } 53 | 54 | dbTables := make(map[string][]string) 55 | if len(cfg.SourceDbTables) != 0 { 56 | dbTables, err = src.GetDbTablesAccordingToSourceDbTables() 57 | if err != nil { 58 | panic(err) 59 | } 60 | } else { 61 | dbName := fmt.Sprintf("^%s$", cfg.SourceDB) 62 | dbs, err := src.GetDatabasesAccordingToSourceDbRegex(dbName) 63 | if err != nil { 64 | panic(err) 65 | } 66 | tableName := fmt.Sprintf("^%s$", cfg.SourceTable) 67 | dbTables, err = src.GetTablesAccordingToSourceTableRegex(tableName, dbs) 68 | if err != nil { 69 | panic(err) 70 | } 71 | } 72 | 73 | w := &worker.Worker{Cfg: cfg, Ig: ig, Src: src, Name: "dbarchiver"} 74 | syncedCount, err := w.Ig.GetAllSyncedCount() 75 | if err != nil || syncedCount != 0 { 76 | if syncedCount != 0 { 77 | logrus.Errorf("syncedCount is not 0, already ingested %d rows", syncedCount) 78 | return 79 | } 80 | logrus.Errorf("pre-check failed: %v", err) 81 | return 82 | } 83 | for db, tables := range dbTables { 84 | for _, table := range tables { 85 | logrus.Infof("Start worker %s.%s", db, table) 86 | db := db 87 | table := table 88 | cfgCopy := *cfg 89 | cfgCopy.SourceDB = db 90 | cfgCopy.SourceTable = table 91 | ig := ingester.NewDatabendIngester(&cfgCopy) 92 | src, err := source.NewSource(&cfgCopy) 93 | if err != nil { 94 | panic(err) 95 | } 96 | // adjust batch size according to source db table 97 | cfgCopy.BatchSize = src.AdjustBatchSizeAccordingToSourceDbTable() 98 | w := worker.NewWorker(&cfgCopy, fmt.Sprintf("%s.%s", db, table), ig, src) 99 | w.Run(ctx) 100 | } 101 | } 102 | targetCount, sourceCount, workerCorrect := w.IsWorkerCorrect() 103 | 104 | if workerCorrect { 105 | logrus.Infof("Worker %s finished and data correct, source data count is %d,"+ 106 | " target data count is %d", w.Name, sourceCount, targetCount) 107 | } else { 108 | logrus.Errorf("Worker %s finished and data incorrect, source data count is %d,"+ 109 | " but databend data count is %d", w.Name, sourceCount, targetCount) 110 | } 111 | 112 | if w.Cfg.DeleteAfterSync && workerCorrect { 113 | err := w.Src.DeleteAfterSync() 114 | if err != nil { 115 | logrus.Errorf("DeleteAfterSync failed: %v, please do it manually", err) 116 | } 117 | } 118 | endTime := fmt.Sprintf("end time: %s", time.Now().Format("2006-01-02 15:04:05")) 119 | fmt.Println(endTime) 120 | fmt.Println(fmt.Sprintf("total time: %s", time.Since(startTime))) 121 | } 122 | 123 | func parseConfigWithFile(configFile string) *config.Config { 124 | cfg, err := config.LoadConfig(configFile) 125 | if err != nil { 126 | panic(err) 127 | } 128 | return cfg 129 | } 130 | -------------------------------------------------------------------------------- /cmd/main_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "database/sql" 6 | "fmt" 7 | "log" 8 | "net/url" 9 | "sync" 10 | "testing" 11 | "time" 12 | 13 | _ "github.com/go-sql-driver/mysql" 14 | go_ora "github.com/sijms/go-ora/v2" 15 | "github.com/test-go/testify/assert" 16 | 17 | cfg "github.com/databendcloud/db-archiver/config" 18 | "github.com/databendcloud/db-archiver/ingester" 19 | "github.com/databendcloud/db-archiver/source" 20 | "github.com/databendcloud/db-archiver/worker" 21 | 22 | _ "github.com/datafuselabs/databend-go" 23 | ) 24 | 25 | func TestMultipleDbTablesWorkflow(t *testing.T) { 26 | { 27 | fmt.Println("=== TEST MYSQL SOURCE ===") 28 | prepareMySQLDbxTablex() 29 | prepareDatabend("test_table2", "http://databend:databend@localhost:8000") 30 | 31 | testConfig := prepareMySQLMultipleConfig() 32 | startTime := time.Now() 33 | 34 | src, err := source.NewSource(testConfig) 35 | assert.NoError(t, err) 36 | dbTables, err := src.GetDbTablesAccordingToSourceDbTables() 37 | assert.NoError(t, err) 38 | for db, tables := range dbTables { 39 | for _, table := range tables { 40 | db := db 41 | table := table 42 | cfgCopy := *testConfig 43 | cfgCopy.SourceDB = db 44 | cfgCopy.SourceTable = table 45 | ig := ingester.NewDatabendIngester(&cfgCopy) 46 | src, err := source.NewSource(&cfgCopy) 47 | assert.NoError(t, err) 48 | w := worker.NewWorker(&cfgCopy, fmt.Sprintf("%s.%s", db, table), ig, src) 49 | w.Run(context.Background()) 50 | } 51 | } 52 | endTime := fmt.Sprintf("end time: %s", time.Now().Format("2006-01-02 15:04:05")) 53 | fmt.Println(endTime) 54 | fmt.Println(fmt.Sprintf("total time: %s", time.Since(startTime))) 
55 | err = checkTargetTable("test_table2", 15) 56 | assert.NoError(t, err) 57 | } 58 | 59 | } 60 | 61 | func TestOracleMultiTableWorkflow(t *testing.T) { 62 | t.Skip("skip test") 63 | fmt.Println("=== TEST ORACLE SOURCE ===") 64 | prepareOracleDbxTablex() 65 | truncateDatabend("test_table2", "http://databend:databend@localhost:8000") 66 | prepareDatabend("test_table2", "http://databend:databend@localhost:8000") 67 | 68 | testConfig := prepareOracleMultipleConfig() 69 | startTime := time.Now() 70 | 71 | src, err := source.NewSource(testConfig) 72 | assert.NoError(t, err) 73 | dbTables, err := src.GetDbTablesAccordingToSourceDbTables() 74 | assert.NoError(t, err) 75 | for db, tables := range dbTables { 76 | for _, table := range tables { 77 | db := db 78 | table := table 79 | cfgCopy := *testConfig 80 | cfgCopy.SourceDB = db 81 | cfgCopy.SourceTable = table 82 | ig := ingester.NewDatabendIngester(&cfgCopy) 83 | src, err := source.NewSource(&cfgCopy) 84 | assert.NoError(t, err) 85 | w := worker.NewWorker(&cfgCopy, fmt.Sprintf("%s.%s", db, table), ig, src) 86 | w.Run(context.Background()) 87 | } 88 | } 89 | endTime := fmt.Sprintf("end time: %s", time.Now().Format("2006-01-02 15:04:05")) 90 | fmt.Println(endTime) 91 | fmt.Println(fmt.Sprintf("total time: %s", time.Since(startTime))) 92 | err = checkTargetTable("test_table2", 15) 93 | assert.NoError(t, err) 94 | } 95 | 96 | func TestMySQLWorkFlow(t *testing.T) { 97 | { 98 | fmt.Println("=== TEST MYSQL SOURCE ===") 99 | prepareMysql() 100 | prepareDatabend("test_table", "http://databend:databend@localhost:8000") 101 | testConfig := prepareTestConfig() 102 | startTime := time.Now() 103 | 104 | src, err := source.NewSource(testConfig) 105 | if err != nil { 106 | panic(err) 107 | } 108 | wg := sync.WaitGroup{} 109 | dbs, err := src.GetDatabasesAccordingToSourceDbRegex(testConfig.SourceDB) 110 | if err != nil { 111 | panic(err) 112 | } 113 | dbTables, err := src.GetTablesAccordingToSourceTableRegex(testConfig.SourceTable, dbs) 114 | if err != nil { 115 | panic(err) 116 | } 117 | for db, tables := range dbTables { 118 | for _, table := range tables { 119 | wg.Add(1) 120 | db := db 121 | table := table 122 | go func(cfg *cfg.Config, db, table string) { 123 | cfgCopy := *testConfig 124 | cfgCopy.SourceTable = table 125 | cfgCopy.SourceDB = db 126 | ig := ingester.NewDatabendIngester(&cfgCopy) 127 | src, err := source.NewSource(&cfgCopy) 128 | assert.NoError(t, err) 129 | w := worker.NewWorker(&cfgCopy, fmt.Sprintf("%s.%s", db, table), ig, src) 130 | w.Run(context.Background()) 131 | wg.Done() 132 | }(testConfig, db, table) 133 | } 134 | } 135 | wg.Wait() 136 | endTime := fmt.Sprintf("end time: %s", time.Now().Format("2006-01-02 15:04:05")) 137 | fmt.Println(endTime) 138 | fmt.Println(fmt.Sprintf("total time: %s", time.Since(startTime))) 139 | 140 | err = checkTargetTable("test_table", 20) 141 | assert.NoError(t, err) 142 | } 143 | } 144 | 145 | func TestMssqlWorkflow(t *testing.T) { 146 | fmt.Println("=== TEST MSSQL SOURCE ===") 147 | prepareSQLServer() 148 | truncateDatabend("test_table", "http://databend:databend@localhost:8000") 149 | prepareDatabend("test_table", "http://databend:databend@localhost:8000") 150 | testConfig := prepareSqlServerTestConfig() 151 | startTime := time.Now() 152 | 153 | src, err := source.NewSource(testConfig) 154 | if err != nil { 155 | panic(err) 156 | } 157 | wg := sync.WaitGroup{} 158 | dbs, err := src.GetDatabasesAccordingToSourceDbRegex(testConfig.SourceDB) 159 | if err != nil { 160 | panic(err) 161 | } 162 | 
log.Printf("dbs: %v", dbs) 163 | dbTables, err := src.GetTablesAccordingToSourceTableRegex(testConfig.SourceTable, dbs) 164 | if err != nil { 165 | panic(err) 166 | } 167 | log.Printf("dbTables: %v", dbTables) 168 | for db, tables := range dbTables { 169 | for _, table := range tables { 170 | wg.Add(1) 171 | db := db 172 | table := table 173 | go func(cfg *cfg.Config, db, table string) { 174 | cfgCopy := *testConfig 175 | cfgCopy.SourceTable = table 176 | cfgCopy.SourceDB = db 177 | ig := ingester.NewDatabendIngester(&cfgCopy) 178 | src, err := source.NewSource(&cfgCopy) 179 | assert.NoError(t, err) 180 | w := worker.NewWorker(&cfgCopy, fmt.Sprintf("%s.%s", db, table), ig, src) 181 | w.Run(context.Background()) 182 | wg.Done() 183 | }(testConfig, db, table) 184 | } 185 | } 186 | wg.Wait() 187 | endTime := fmt.Sprintf("end time: %s", time.Now().Format("2006-01-02 15:04:05")) 188 | fmt.Println(endTime) 189 | fmt.Println(fmt.Sprintf("total time: %s", time.Since(startTime))) 190 | 191 | err = checkTargetTable("test_table", 10) 192 | assert.NoError(t, err) 193 | } 194 | 195 | func TestMssqlTimeKeyWorkflow(t *testing.T) { 196 | fmt.Println("=== TEST MSSQL SOURCE WITH TIME KEY ===") 197 | prepareSQLServer() 198 | truncateDatabend("test_table", "http://databend:databend@localhost:8000") 199 | prepareDatabend("test_table", "http://databend:databend@localhost:8000") 200 | testConfig := prepareSqlServerTimeKeyTestConfig() 201 | startTime := time.Now() 202 | 203 | src, err := source.NewSource(testConfig) 204 | if err != nil { 205 | panic(err) 206 | } 207 | wg := sync.WaitGroup{} 208 | dbs, err := src.GetDatabasesAccordingToSourceDbRegex(testConfig.SourceDB) 209 | if err != nil { 210 | panic(err) 211 | } 212 | log.Printf("dbs: %v", dbs) 213 | dbTables, err := src.GetTablesAccordingToSourceTableRegex(testConfig.SourceTable, dbs) 214 | if err != nil { 215 | panic(err) 216 | } 217 | log.Printf("dbTables: %v", dbTables) 218 | for db, tables := range dbTables { 219 | for _, table := range tables { 220 | wg.Add(1) 221 | db := db 222 | table := table 223 | go func(cfg *cfg.Config, db, table string) { 224 | cfgCopy := *testConfig 225 | cfgCopy.SourceTable = table 226 | cfgCopy.SourceDB = db 227 | ig := ingester.NewDatabendIngester(&cfgCopy) 228 | src, err := source.NewSource(&cfgCopy) 229 | assert.NoError(t, err) 230 | w := worker.NewWorker(&cfgCopy, fmt.Sprintf("%s.%s", db, table), ig, src) 231 | w.Run(context.Background()) 232 | wg.Done() 233 | }(testConfig, db, table) 234 | } 235 | } 236 | wg.Wait() 237 | endTime := fmt.Sprintf("end time: %s", time.Now().Format("2006-01-02 15:04:05")) 238 | fmt.Println(endTime) 239 | fmt.Println(fmt.Sprintf("total time: %s", time.Since(startTime))) 240 | 241 | err = checkTargetTable("test_table", 10) 242 | assert.NoError(t, err) 243 | } 244 | 245 | func TestSimpleOracleWorkflow(t *testing.T) { 246 | t.Skip("skip test") 247 | fmt.Println("=== TEST ORACLE SOURCE ===") 248 | prepareOracle() 249 | truncateDatabend("test_table", "http://databend:databend@localhost:8000") 250 | prepareDatabend("test_table", "http://databend:databend@localhost:8000") 251 | testConfig := prepareOracleTestConfig() 252 | startTime := time.Now() 253 | 254 | src, err := source.NewSource(testConfig) 255 | if err != nil { 256 | panic(err) 257 | } 258 | wg := sync.WaitGroup{} 259 | dbs, err := src.GetDatabasesAccordingToSourceDbRegex(testConfig.SourceDB) 260 | if err != nil { 261 | panic(err) 262 | } 263 | dbTables, err := src.GetTablesAccordingToSourceTableRegex(testConfig.SourceTable, dbs) 264 | if err 
!= nil { 265 | panic(err) 266 | } 267 | for db, tables := range dbTables { 268 | for _, table := range tables { 269 | wg.Add(1) 270 | db := db 271 | table := table 272 | go func(cfg *cfg.Config, db, table string) { 273 | cfgCopy := *testConfig 274 | cfgCopy.SourceTable = table 275 | cfgCopy.SourceDB = db 276 | ig := ingester.NewDatabendIngester(&cfgCopy) 277 | src, err := source.NewSource(&cfgCopy) 278 | assert.NoError(t, err) 279 | w := worker.NewWorker(&cfgCopy, fmt.Sprintf("%s.%s", db, table), ig, src) 280 | w.Run(context.Background()) 281 | wg.Done() 282 | }(testConfig, db, table) 283 | } 284 | } 285 | wg.Wait() 286 | endTime := fmt.Sprintf("end time: %s", time.Now().Format("2006-01-02 15:04:05")) 287 | fmt.Println(endTime) 288 | fmt.Println(fmt.Sprintf("total time: %s", time.Since(startTime))) 289 | 290 | // mydb1 10 rows , mydb2 5 rows, mydb 10 rows 291 | err = checkTargetTable("test_table", 25) 292 | assert.NoError(t, err) 293 | } 294 | 295 | func prepareMySQLDbxTablex() { 296 | db, err := sql.Open("mysql", "root:123456@tcp(127.0.0.1:3306)/mysql") 297 | if err != nil { 298 | log.Fatal(err) 299 | } 300 | defer db.Close() 301 | 302 | db.Exec("create database if not exists db1") 303 | db.Exec("create database if not exists db2") 304 | db.Exec(` 305 | CREATE TABLE db1.test_table1 ( 306 | id BIGINT UNSIGNED PRIMARY KEY, 307 | int_col INT, 308 | varchar_col VARCHAR(255), 309 | float_col FLOAT, 310 | bool_col BOOL, 311 | de decimal(18,6), 312 | date_col DATE, 313 | datetime_col DATETIME, 314 | timestamp_col TIMESTAMP 315 | ) 316 | `) 317 | db.Exec(` 318 | CREATE TABLE db2.test_table2 ( 319 | id BIGINT UNSIGNED PRIMARY KEY, 320 | int_col INT, 321 | varchar_col VARCHAR(255), 322 | float_col FLOAT, 323 | bool_col BOOL, 324 | de decimal(18,6), 325 | date_col DATE, 326 | datetime_col DATETIME, 327 | timestamp_col TIMESTAMP 328 | ) 329 | `) 330 | for i := 1; i <= 10; i++ { 331 | _, err = db.Exec(` 332 | INSERT INTO db1.test_table1 333 | (id, int_col, varchar_col, float_col, de, bool_col, date_col, datetime_col, timestamp_col) 334 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 335 | `, i, i, fmt.Sprintf("varchar %d", i), float64(i), i%2 == 0, 1.1, "2022-01-01", "2022-01-01 00:00:00", "2024-06-30 20:00:00") 336 | if err != nil { 337 | log.Fatal(err) 338 | } 339 | } 340 | 341 | for i := 1; i <= 5; i++ { 342 | _, err = db.Exec(` 343 | INSERT INTO db2.test_table2 344 | (id, int_col, varchar_col, float_col, de, bool_col, date_col, datetime_col, timestamp_col) 345 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
346 | `, i+1, i, fmt.Sprintf("varchar %d", i), float64(i), i%2 == 0, 1.1, "2022-01-01", "2022-01-01 00:00:00", "2024-06-30 20:00:00") 347 | if err != nil { 348 | log.Fatal(err) 349 | } 350 | } 351 | } 352 | 353 | func prepareOracleDbxTablex() { 354 | server := "localhost" 355 | port := 49161 356 | serviceName := "XE" 357 | username := "a" 358 | password := "123" 359 | 360 | // Build the connection string using go-ora 361 | connStr := go_ora.BuildUrl(server, port, serviceName, username, password, nil) 362 | 363 | // Open the database connection 364 | db, err := sql.Open("oracle", connStr) 365 | if err != nil { 366 | log.Fatalf("Failed to open connection: %v", err) 367 | } 368 | defer db.Close() 369 | 370 | _, err = db.Exec("BEGIN EXECUTE IMMEDIATE 'DROP USER mydb1 CASCADE'; EXCEPTION WHEN OTHERS THEN IF SQLCODE = -1918 THEN NULL; ELSE RAISE; END IF; END;") 371 | if err != nil { 372 | log.Fatal(err) 373 | } 374 | _, err = db.Exec("CREATE USER mydb1 IDENTIFIED BY mydb DEFAULT TABLESPACE USERS") 375 | if err != nil { 376 | log.Fatal(err) 377 | } 378 | 379 | _, err = db.Exec("GRANT DBA TO mydb1") 380 | if err != nil { 381 | log.Fatal(err) 382 | } 383 | 384 | db.Exec(` 385 | CREATE TABLE mydb1.test_table1 ( 386 | id NUMBER(10) PRIMARY KEY, 387 | int_col NUMBER(10), 388 | varchar_col VARCHAR2(255), 389 | float_col NUMBER, 390 | bool_col NUMBER(1) CHECK (bool_col IN (0, 1)), 391 | de NUMBER(18,6), 392 | date_col DATE, 393 | datetime_col TIMESTAMP, 394 | timestamp_col TIMESTAMP 395 | ) 396 | `) 397 | 398 | _, err = db.Exec("BEGIN EXECUTE IMMEDIATE 'DROP USER mydb2 CASCADE'; EXCEPTION WHEN OTHERS THEN IF SQLCODE = -1918 THEN NULL; ELSE RAISE; END IF; END;") 399 | if err != nil { 400 | log.Fatal(err) 401 | } 402 | _, err = db.Exec("CREATE USER mydb2 IDENTIFIED BY mydb DEFAULT TABLESPACE USERS") 403 | if err != nil { 404 | log.Fatal(err) 405 | } 406 | 407 | _, err = db.Exec("GRANT DBA TO mydb2") 408 | if err != nil { 409 | log.Fatal(err) 410 | } 411 | 412 | db.Exec(` 413 | CREATE TABLE mydb2.test_table2 ( 414 | id NUMBER(10) PRIMARY KEY, 415 | int_col NUMBER(10), 416 | varchar_col VARCHAR2(255), 417 | float_col NUMBER, 418 | bool_col NUMBER(1) CHECK (bool_col IN (0, 1)), 419 | de NUMBER(18,6), 420 | date_col DATE, 421 | datetime_col TIMESTAMP, 422 | timestamp_col TIMESTAMP 423 | ) 424 | `) 425 | for i := 1; i <= 10; i++ { 426 | insert := fmt.Sprintf("INSERT INTO mydb1.test_table1 "+ 427 | "(id, int_col, varchar_col, float_col, de, bool_col, date_col, datetime_col, timestamp_col) "+ 428 | "values (%d, %d, '%s', %f, %d, %f,%s,%s,%s)", i, i, fmt.Sprintf("varchar %d", i), float64(i), i%2, 1.1, "TO_DATE('2022-01-01', 'YYYY-MM-DD')", "TO_TIMESTAMP('2022-01-01 00:00:00','YYYY-MM-DD HH24:MI:SS')", "TO_TIMESTAMP('2024-06-30 20:00:00', 'YYYY-MM-DD HH24:MI:SS')") 429 | _, err = db.Exec(insert) 430 | if err != nil { 431 | log.Fatal(err) 432 | } 433 | } 434 | 435 | for i := 1; i <= 5; i++ { 436 | insert := fmt.Sprintf("INSERT INTO mydb2.test_table2 "+ 437 | "(id, int_col, varchar_col, float_col, de, bool_col, date_col, datetime_col, timestamp_col) "+ 438 | "values (%d, %d, '%s', %f, %d, %f,%s,%s,%s)", i, i, fmt.Sprintf("varchar %d", i), float64(i), i%2, 1.1, "TO_DATE('2022-01-01', 'YYYY-MM-DD')", "TO_TIMESTAMP('2022-01-01 00:00:00','YYYY-MM-DD HH24:MI:SS')", "TO_TIMESTAMP('2024-06-30 20:00:00', 'YYYY-MM-DD HH24:MI:SS')") 439 | _, err = db.Exec(insert) 440 | if err != nil { 441 | log.Fatal(err) 442 | } 443 | } 444 | } 445 | 446 | func prepareMysql() { 447 | db, err := sql.Open("mysql", 
"root:123456@tcp(127.0.0.1:3306)/mysql") 448 | if err != nil { 449 | log.Fatal(err) 450 | } 451 | defer db.Close() 452 | db.Exec("Create database if not exists mydb") 453 | db.Exec("drop table if exists mydb.test_table") 454 | 455 | // Create table 456 | _, err = db.Exec(` 457 | CREATE TABLE mydb.test_table ( 458 | id BIGINT UNSIGNED PRIMARY KEY, 459 | int_col INT, 460 | varchar_col VARCHAR(255), 461 | float_col FLOAT, 462 | bool_col BOOL, 463 | de decimal(18,6), 464 | date_col DATE, 465 | datetime_col DATETIME, 466 | timestamp_col TIMESTAMP 467 | ) 468 | `) 469 | // need to test the TIME type in mysql 470 | if err != nil { 471 | log.Fatal(err) 472 | } 473 | 474 | // Insert data 475 | for i := 1; i <= 10; i++ { 476 | _, err = db.Exec(` 477 | INSERT INTO mydb.test_table 478 | (id, int_col, varchar_col, float_col, de, bool_col, date_col, datetime_col, timestamp_col) 479 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 480 | `, i, i, fmt.Sprintf("varchar %d", i), float64(i), i%2 == 0, 1.1, "2022-01-01", "2022-01-01 00:00:00", "2024-06-30 20:00:00") 481 | if err != nil { 482 | log.Fatal(err) 483 | } 484 | } 485 | 486 | for i := 1; i <= 10; i++ { 487 | var intCol sql.NullInt64 488 | var varcharCol sql.NullString 489 | var timeCol sql.NullTime 490 | if i%2 == 0 { 491 | intCol = sql.NullInt64{Int64: int64(i), Valid: true} 492 | varcharCol = sql.NullString{String: fmt.Sprintf("varchar %d", i), Valid: true} 493 | timeCol = sql.NullTime{Time: time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC), Valid: true} 494 | } else { 495 | intCol = sql.NullInt64{Valid: false} 496 | varcharCol = sql.NullString{Valid: false} 497 | timeCol = sql.NullTime{Valid: false} 498 | } 499 | _, err = db.Exec(` 500 | INSERT INTO mydb.test_table 501 | (id, int_col, varchar_col, float_col, de, bool_col, date_col, datetime_col, timestamp_col) 502 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
503 | `, i*11, intCol, varcharCol, float64(i), i%2 == 0, 1.1, "2022-01-01", "2022-01-01 00:00:00", timeCol) 504 | if err != nil { 505 | log.Fatal(err) 506 | } 507 | } 508 | } 509 | 510 | func prepareOracle() { 511 | server := "localhost" 512 | port := 49161 513 | serviceName := "XE" 514 | username := "a" 515 | password := "123" 516 | 517 | connStr := go_ora.BuildUrl(server, port, serviceName, username, password, nil) 518 | 519 | db, err := sql.Open("oracle", connStr) 520 | if err != nil { 521 | log.Fatalf("Failed to open connection: %v", err) 522 | } 523 | defer db.Close() 524 | _, err = db.Exec("BEGIN EXECUTE IMMEDIATE 'DROP USER mydb CASCADE'; EXCEPTION WHEN OTHERS THEN IF SQLCODE = -1918 THEN NULL; ELSE RAISE; END IF; END;") 525 | if err != nil { 526 | log.Fatal(err) 527 | } 528 | _, err = db.Exec(fmt.Sprintf("CREATE USER mydb IDENTIFIED BY mydb DEFAULT TABLESPACE USERS")) 529 | if err != nil { 530 | log.Fatal(err) 531 | } 532 | 533 | _, err = db.Exec(fmt.Sprintf("GRANT DBA TO mydb")) 534 | if err != nil { 535 | log.Fatal(err) 536 | } 537 | 538 | // Create table 539 | _, err = db.Exec(` 540 | CREATE TABLE mydb.test_table ( 541 | id NUMBER(10) PRIMARY KEY, 542 | int_col NUMBER(10), 543 | varchar_col VARCHAR2(255), 544 | float_col NUMBER, 545 | bool_col NUMBER(1) CHECK (bool_col IN (0, 1)), 546 | de NUMBER(18,6), 547 | date_col DATE, 548 | datetime_col TIMESTAMP, 549 | timestamp_col TIMESTAMP 550 | ) 551 | `) 552 | 553 | if err != nil { 554 | log.Fatal(err) 555 | } 556 | 557 | // Insert data 558 | for i := 1; i <= 10; i++ { 559 | insert := fmt.Sprintf("INSERT INTO mydb.test_table "+ 560 | "(id, int_col, varchar_col, float_col, de, bool_col, date_col, datetime_col, timestamp_col) "+ 561 | "values (%d, %d, '%s', %f, %d, %f,%s,%s,%s)", i, i, fmt.Sprintf("varchar %d", i), float64(i), i%2, 1.1, "TO_DATE('2022-01-01', 'YYYY-MM-DD')", "TO_TIMESTAMP('2022-01-01 00:00:00','YYYY-MM-DD HH24:MI:SS')", "TO_TIMESTAMP('2024-06-30 20:00:00', 'YYYY-MM-DD HH24:MI:SS')") 562 | _, err = db.Exec(insert) 563 | if err != nil { 564 | log.Fatal(err) 565 | } 566 | } 567 | 568 | } 569 | 570 | func prepareDatabend(tableName string, dsn string) { 571 | db, err := sql.Open("databend", dsn) 572 | if err != nil { 573 | log.Fatal(err) 574 | } 575 | defer db.Close() 576 | 577 | // Create table 578 | _, err = db.Exec(fmt.Sprintf( 579 | `CREATE TABLE if not exists default.%s ( 580 | id UINT64, 581 | int_col INT, 582 | varchar_col VARCHAR(255), 583 | float_col FLOAT, 584 | bool_col TINYINT, 585 | de decimal(18,6), 586 | date_col DATE, 587 | datetime_col TIMESTAMP, 588 | timestamp_col TIMESTAMP 589 | ) 590 | `, tableName)) 591 | if err != nil { 592 | log.Fatal(err) 593 | } 594 | } 595 | 596 | func truncateDatabend(tableName string, dsn string) { 597 | db, err := sql.Open("databend", dsn) 598 | if err != nil { 599 | log.Fatal(err) 600 | } 601 | defer db.Close() 602 | 603 | // TRUNCATE TABLE 604 | _, err = db.Exec(fmt.Sprintf( 605 | `TRUNCATE TABLE default.%s 606 | `, tableName)) 607 | if err != nil { 608 | log.Fatal(err) 609 | } 610 | } 611 | 612 | func prepareTestConfig() *cfg.Config { 613 | config := cfg.Config{ 614 | SourceDB: "mydb", 615 | SourceHost: "127.0.0.1", 616 | SourcePort: 3306, 617 | SourceUser: "root", 618 | SourcePass: "123456", 619 | SourceTable: "test_table", 620 | SourceWhereCondition: "id > 0", 621 | SourceQuery: "select * from mydb.test_table", 622 | SourceSplitKey: "id", 623 | SourceSplitTimeKey: "", 624 | DatabendDSN: "http://databend:databend@localhost:8000", 625 | DatabendTable: "default.test_table", 
626 | BatchSize: 5, 627 | BatchMaxInterval: 3, 628 | MaxThread: 2, 629 | CopyForce: false, 630 | CopyPurge: false, 631 | DeleteAfterSync: false, 632 | DisableVariantCheck: false, 633 | UserStage: "~", 634 | } 635 | 636 | return &config 637 | } 638 | 639 | func prepareSQLServer() { 640 | log.Println("===prepareSQLServer===") 641 | encodedPassword := url.QueryEscape("Password1234!") 642 | // sqlserver://username:password@host:port?database=dbname 643 | db, err := sql.Open("mssql", fmt.Sprintf("sqlserver://sa:%s@localhost:1433?encrypt=disable", encodedPassword)) 644 | if err != nil { 645 | log.Fatal(err) 646 | } 647 | defer db.Close() 648 | 649 | // delete database 650 | _, err = db.Exec(` 651 | IF EXISTS (SELECT * FROM sys.databases WHERE name = 'mydb') 652 | BEGIN 653 | ALTER DATABASE mydb SET SINGLE_USER WITH ROLLBACK IMMEDIATE; 654 | DROP DATABASE mydb; 655 | END 656 | `) 657 | if err != nil { 658 | log.Fatal(err) 659 | } 660 | 661 | // new database 662 | _, err = db.Exec("CREATE DATABASE mydb") 663 | if err != nil { 664 | log.Fatal(err) 665 | } 666 | 667 | db, err = sql.Open("mssql", fmt.Sprintf("sqlserver://sa:%s@localhost:1433?database=mydb&encrypt=disable", encodedPassword)) 668 | if err != nil { 669 | log.Fatal(err) 670 | } 671 | defer db.Close() 672 | 673 | // create table 674 | _, err = db.Exec(` 675 | CREATE TABLE test_table ( 676 | id INT PRIMARY KEY, 677 | int_col INT, 678 | varchar_col VARCHAR(255), 679 | float_col FLOAT, 680 | bool_col BIT, 681 | de DECIMAL(18,6), 682 | date_col DATE, 683 | datetime_col DATETIME2, 684 | timestamp_col DATETIME2 685 | ) 686 | `) 687 | if err != nil { 688 | log.Fatal(err) 689 | } 690 | 691 | // insert 10 rows 692 | for i := 1; i <= 10; i++ { 693 | insert := fmt.Sprintf(` 694 | INSERT INTO test_table 695 | (id, int_col, varchar_col, float_col, de, bool_col, date_col, datetime_col, timestamp_col) 696 | VALUES 697 | (%d, %d, '%s', %f, %d, %d, '%s', '%s', '%s')`, 698 | i, i, fmt.Sprintf("varchar %d", i), float64(i), i%2, 1, 699 | "2022-01-01", 700 | "2022-01-01 00:00:00", 701 | "2024-06-30 20:00:00") 702 | 703 | _, err = db.Exec(insert) 704 | if err != nil { 705 | log.Fatal(err) 706 | } 707 | } 708 | log.Println("===prepareSQLServer done.===") 709 | } 710 | 711 | func prepareOracleTestConfig() *cfg.Config { 712 | config := cfg.Config{ 713 | DatabaseType: "oracle", 714 | SourceDB: "MYDB", 715 | SourceHost: "127.0.0.1", 716 | SourcePort: 49161, 717 | SourceUser: "mydb", 718 | SourcePass: "mydb", 719 | SourceTable: "TEST_TABLE", 720 | SourceWhereCondition: "id > 0", 721 | SourceSplitKey: "id", 722 | SourceSplitTimeKey: "", 723 | DatabendDSN: "http://databend:databend@localhost:8000", 724 | DatabendTable: "default.test_table", 725 | BatchSize: 5, 726 | BatchMaxInterval: 3, 727 | MaxThread: 2, 728 | CopyForce: false, 729 | CopyPurge: false, 730 | DeleteAfterSync: false, 731 | DisableVariantCheck: false, 732 | UserStage: "~", 733 | OracleSID: "XE", 734 | } 735 | 736 | return &config 737 | } 738 | 739 | func prepareSqlServerTimeKeyTestConfig() *cfg.Config { 740 | config := cfg.Config{ 741 | DatabaseType: "mssql", 742 | SourceDB: "mydb", 743 | SourceHost: "127.0.0.1", 744 | SourcePort: 1433, 745 | SourceUser: "sa", 746 | SourcePass: "Password1234!", 747 | SourceTable: "test_table", 748 | SourceWhereCondition: "timestamp_col > '2024-06-29 00:00:00' and timestamp_col < '2024-07-10 20:00:00'", 749 | SourceSplitKey: "", 750 | SourceSplitTimeKey: "timestamp_col", 751 | TimeSplitUnit: "day", 752 | DatabendDSN: "http://databend:databend@localhost:8000", 753 | 
DatabendTable: "default.test_table", 754 | BatchSize: 5, 755 | BatchMaxInterval: 3, 756 | MaxThread: 1, 757 | CopyForce: false, 758 | CopyPurge: false, 759 | DeleteAfterSync: false, 760 | DisableVariantCheck: false, 761 | UserStage: "~", 762 | } 763 | 764 | return &config 765 | } 766 | 767 | func prepareSqlServerTestConfig() *cfg.Config { 768 | config := cfg.Config{ 769 | DatabaseType: "mssql", 770 | SourceDB: "mydb", 771 | SourceHost: "127.0.0.1", 772 | SourcePort: 1433, 773 | SourceUser: "sa", 774 | SourcePass: "Password1234!", 775 | SourceTable: "test_table", 776 | SourceWhereCondition: "id > 0", 777 | SourceSplitKey: "id", 778 | SourceSplitTimeKey: "", 779 | DatabendDSN: "http://databend:databend@localhost:8000", 780 | DatabendTable: "default.test_table", 781 | BatchSize: 5, 782 | BatchMaxInterval: 3, 783 | MaxThread: 2, 784 | CopyForce: false, 785 | CopyPurge: false, 786 | DeleteAfterSync: false, 787 | DisableVariantCheck: false, 788 | UserStage: "~", 789 | } 790 | 791 | return &config 792 | } 793 | 794 | func prepareMySQLMultipleConfig() *cfg.Config { 795 | config := cfg.Config{ 796 | SourceDB: "mydb", 797 | SourceHost: "127.0.0.1", 798 | SourcePort: 3306, 799 | SourceUser: "root", 800 | SourcePass: "123456", 801 | SourceDbTables: []string{"db.*@test_table.*"}, 802 | SourceTable: "test_table", 803 | SourceWhereCondition: "id > 0", 804 | SourceQuery: "select * from mydb2.test_table", 805 | SourceSplitKey: "id", 806 | SourceSplitTimeKey: "", 807 | DatabendDSN: "http://databend:databend@localhost:8000", 808 | DatabendTable: "default.test_table2", 809 | BatchSize: 5, 810 | BatchMaxInterval: 3, 811 | MaxThread: 2, 812 | CopyForce: false, 813 | CopyPurge: false, 814 | DeleteAfterSync: false, 815 | DisableVariantCheck: false, 816 | UserStage: "~", 817 | } 818 | 819 | return &config 820 | } 821 | 822 | func prepareOracleMultipleConfig() *cfg.Config { 823 | config := cfg.Config{ 824 | DatabaseType: "oracle", 825 | SourceHost: "127.0.0.1", 826 | SourcePort: 49161, 827 | SourceUser: "a", 828 | SourcePass: "123", 829 | SourceDbTables: []string{"MYDB.*@TEST_TABLE.*"}, 830 | SourceWhereCondition: "id > 0", 831 | SourceQuery: "select * from mydb2.test_table", 832 | SourceSplitKey: "id", 833 | SourceSplitTimeKey: "", 834 | DatabendDSN: "http://databend:databend@localhost:8000", 835 | DatabendTable: "default.test_table2", 836 | BatchSize: 5, 837 | BatchMaxInterval: 3, 838 | MaxThread: 2, 839 | CopyForce: false, 840 | CopyPurge: false, 841 | DeleteAfterSync: false, 842 | DisableVariantCheck: false, 843 | UserStage: "~", 844 | OracleSID: "XE", 845 | } 846 | 847 | return &config 848 | } 849 | 850 | func checkTargetTable(tableName string, target int) error { 851 | db, err := sql.Open("databend", "http://databend:databend@localhost:8000") 852 | if err != nil { 853 | log.Fatal(err) 854 | return err 855 | } 856 | defer db.Close() 857 | 858 | rows, err := db.Query(fmt.Sprintf(`SELECT * FROM default.%s`, tableName)) 859 | if err != nil { 860 | log.Fatal(err) 861 | return err 862 | } 863 | defer rows.Close() 864 | count := 0 865 | 866 | for rows.Next() { 867 | var id int 868 | var int_col interface{} 869 | var varchar_col string 870 | var float_col float64 871 | var bool_col bool 872 | var de float64 873 | var date_col string 874 | var datetime_col string 875 | var timestamp_col string 876 | err = rows.Scan(&id, &int_col, &varchar_col, &float_col, &bool_col, &de, &date_col, &datetime_col, ×tamp_col) 877 | if err != nil { 878 | log.Fatal(err) 879 | } 880 | count += 1 881 | } 882 | 883 | if err := rows.Err(); 
err != nil { 884 | log.Fatal(err) 885 | } 886 | fmt.Println("target table count: ", count) 887 | if count != target { 888 | return fmt.Errorf("target table %s has %d rows, expected %d", tableName, count, target) 889 | } 890 | return nil 891 | } 892 | -------------------------------------------------------------------------------- /cmd/postgres_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "database/sql" 6 | "fmt" 7 | "log" 8 | "testing" 9 | "time" 10 | 11 | "github.com/test-go/testify/assert" 12 | 13 | cfg "github.com/databendcloud/db-archiver/config" 14 | "github.com/databendcloud/db-archiver/ingester" 15 | "github.com/databendcloud/db-archiver/source" 16 | "github.com/databendcloud/db-archiver/utils/testutils" 17 | "github.com/databendcloud/db-archiver/worker" 18 | ) 19 | 20 | const testPostgresPort = 15432 21 | 22 | func TestMultiplePgTable(t *testing.T) { 23 | dsn, tearDown := testutils.PostgresForTest() 24 | defer tearDown() 25 | preparePgDbxTablex(dsn) 26 | prepareDatabend("test_table3", "http://databend:databend@localhost:8000") 27 | 28 | testConfig := preparePGMultipleConfig() 29 | startTime := time.Now() 30 | 31 | src, err := source.NewSource(testConfig) 32 | assert.NoError(t, err) 33 | 34 | dbTables, err := src.GetDbTablesAccordingToSourceDbTables() 35 | assert.NoError(t, err) 36 | for db, tables := range dbTables { 37 | for _, table := range tables { 38 | db := db 39 | table := table 40 | cfgCopy := *testConfig 41 | cfgCopy.SourceDB = db 42 | cfgCopy.SourceTable = table 43 | ig := ingester.NewDatabendIngester(&cfgCopy) 44 | src, err := source.NewSource(&cfgCopy) 45 | assert.NoError(t, err) 46 | w := worker.NewWorker(&cfgCopy, fmt.Sprintf("%s.%s", db, table), ig, src) 47 | w.Run(context.Background()) 48 | } 49 | } 50 | fmt.Printf("end time: %s\n", time.Now().Format("2006-01-02 15:04:05")) 51 | fmt.Printf("total time: %s\n", time.Since(startTime)) 52 | 53 | err = checkTargetTable("test_table3", 15) 54 | assert.NoError(t, err) 55 | } 56 | 57 | func preparePgDbxTablex(dsn string) { 58 | db, err := sql.Open("postgres", dsn) 59 | if err != nil { 60 | log.Fatal(err) 61 | } 62 | defer db.Close() 63 | db.Exec(` 64 | CREATE TABLE test_table1 ( 65 | id BIGINT PRIMARY KEY, 66 | int_col INT, 67 | varchar_col VARCHAR(255), 68 | float_col FLOAT, 69 | bool_col BOOL, 70 | de decimal(18,6), 71 | date_col DATE, 72 | datetime_col TIMESTAMP, 73 | timestamp_col TIMESTAMP 74 | ) 75 | `) 76 | for i := 1; i <= 10; i++ { 77 | deValue := 0 78 | if i%2 == 0 { 79 | deValue = 1 80 | } 81 | _, err = db.Exec(` 82 | INSERT INTO test_table1 83 | (id, int_col, varchar_col, float_col, bool_col, de, date_col, datetime_col, timestamp_col) 84 | VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) 85 | `, i, i, fmt.Sprintf("varchar %d", i), float64(i), deValue, 1.1, "2022-01-01", "2022-01-01 00:00:00", "2024-06-30 20:00:00") 86 | if err != nil { 87 | log.Fatal(err) 88 | } 89 | } 90 | 91 | db.Exec("create database mydb1") 92 | dsn = fmt.Sprintf("postgres://postgres:postgres@localhost:%d/mydb1?sslmode=disable", testPostgresPort) 93 | if db, err = sql.Open("postgres", dsn); err != nil { log.Fatal(err) } // reconnect so that test_table2 is created in mydb1 rather than in mydb 94 | db.Exec(` 95 | CREATE TABLE test_table2 ( 96 | id BIGINT PRIMARY KEY, 97 | int_col INT, 98 | varchar_col VARCHAR(255), 99 | float_col FLOAT, 100 | bool_col BOOL, 101 | de decimal(18,6), 102 | date_col DATE, 103 | datetime_col TIMESTAMP, 104 | timestamp_col TIMESTAMP 105 | ) 106 | `) 107 | 108 | for i := 1; i <= 5; i++ { 109 | deValue := 
0 110 | if i%2 == 0 { 111 | deValue = 1 112 | } 113 | _, err = db.Exec(` 114 | INSERT INTO test_table2 115 | (id, int_col, varchar_col, float_col, bool_col, de, date_col, datetime_col, timestamp_col) 116 | VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) 117 | `, i+1, i, fmt.Sprintf("varchar %d", i), float64(i), deValue, 1.1, "2022-01-01", "2022-01-01 00:00:00", "2024-06-30 20:00:00") 118 | if err != nil { 119 | log.Fatal(err) 120 | } 121 | } 122 | } 123 | 124 | func preparePGMultipleConfig() *cfg.Config { 125 | config := cfg.Config{ 126 | SourceDB: "mydb", 127 | SourceHost: "127.0.0.1", 128 | SourcePort: testPostgresPort, 129 | SourceUser: "postgres", 130 | SourcePass: "postgres", 131 | SourceDbTables: []string{"mydb.*@test_table.*"}, 132 | SourceTable: "test_table", 133 | SourceWhereCondition: "id > 0", 134 | DatabaseType: "pg", 135 | SourceQuery: "select * from mydb2.test_table", 136 | SourceSplitKey: "id", 137 | SourceSplitTimeKey: "", 138 | DatabendDSN: "http://databend:databend@localhost:8000", 139 | DatabendTable: "default.test_table3", 140 | BatchSize: 5, 141 | BatchMaxInterval: 3, 142 | MaxThread: 2, 143 | CopyForce: false, 144 | CopyPurge: false, 145 | DeleteAfterSync: false, 146 | DisableVariantCheck: false, 147 | UserStage: "~", 148 | } 149 | 150 | return &config 151 | } 152 | -------------------------------------------------------------------------------- /config/conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "sourceHost": "127.0.0.1", 3 | "sourcePort": 3306, 4 | "sourceUser": "root", 5 | "sourcePass": "123456", 6 | "sourceDB": "mydb", 7 | "sourceTable": "my_table2", 8 | "sourceQuery": "select * from mydb.my_table2", 9 | "sourceWhereCondition": "t1 >= '2024-06-01' and t1 < '2024-07-01'", 10 | "sourceSplitKey": "", 11 | "sourceSplitTimeKey": "t1", 12 | "timeSplitUnit": "minute", 13 | "databendDSN": "https://cloudapp:password@tn3ftqihs--medium-p8at.gw.aws-us-east-2.default.databend.com:443", 14 | "databendTable": "testSync.my_table2", 15 | "batchSize": 2, 16 | "batchMaxInterval": 30, 17 | "userStage": "~", 18 | "deleteAfterSync": false, 19 | "maxThread": 10 20 | } 21 | -------------------------------------------------------------------------------- /config/conf_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "sourceHost": "127.0.0.1", 3 | "sourcePort": 3306, 4 | "sourceUser": "root", 5 | "sourcePass": "123456", 6 | "sourceDB": "mydb", 7 | "sourceTable": "t1", 8 | "sourceQuery": "select * from mydb.t1", 9 | "sourceWhereCondition": "id > 0", 10 | "sourceSplitKey": "id", 11 | "sourceSplitTimeKey": "", 12 | "timeSplitUnit": "minute", 13 | "databendDSN": "http://databend:databend@localhost:8000", 14 | "databendTable": "testSync.t1", 15 | "batchSize": 2, 16 | "batchMaxInterval": 30, 17 | "userStage": "~", 18 | "deleteAfterSync": false, 19 | "maxThread": 10 20 | } 21 | -------------------------------------------------------------------------------- /config/conf_test_oracle.json: -------------------------------------------------------------------------------- 1 | { 2 | "databaseType": "oracle", 3 | "sourceHost": "127.0.0.1", 4 | "sourcePort": 49161, 5 | "sourceUser": "a", 6 | "sourcePass": "123", 7 | "sourceDbTables": ["MYDB.*@TEST_TABLE.*"], 8 | "sourceQuery": "select * from mydb2.test_table", 9 | "sourceWhereCondition": "id > 0", 10 | "sourceSplitKey": "id", 11 | "sourceSplitTimeKey": "", 12 | "databendDSN": "http://databend:databend@localhost:8000", 13 | "databendTable": 
"default.test_table", 14 | "batchSize": 2, 15 | "batchMaxInterval": 30, 16 | "userStage": "~", 17 | "deleteAfterSync": false, 18 | "maxThread": 5, 19 | "oracleSID": "XE" 20 | } 21 | 22 | 23 | -------------------------------------------------------------------------------- /config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | "regexp" 8 | "time" 9 | 10 | "github.com/pkg/errors" 11 | ) 12 | 13 | type TimeSplitUnit int 14 | 15 | const ( 16 | Minute TimeSplitUnit = iota 17 | Quarter // 15 minutes 18 | Hour 19 | Day 20 | ) 21 | 22 | var TimeSplitUnitToString = map[TimeSplitUnit]string{ 23 | Minute: "minute", 24 | Quarter: "quarter", 25 | Hour: "hour", 26 | Day: "day", 27 | } 28 | 29 | var StringToTimeSplitUnit = map[string]TimeSplitUnit{ 30 | "minute": Minute, 31 | "quarter": Quarter, 32 | "hour": Hour, 33 | "day": Day, 34 | } 35 | 36 | type Config struct { 37 | // Source configuration 38 | DatabaseType string `json:"databaseType" default:"mysql"` 39 | SourceHost string `json:"sourceHost"` 40 | SourcePort int `json:"sourcePort"` 41 | SourceUser string `json:"sourceUser"` 42 | SourcePass string `json:"sourcePass"` 43 | SourceDB string `json:"sourceDB"` 44 | SSLMode string `json:"sslMode"` 45 | SourceTable string `json:"sourceTable"` 46 | SourceDbTables []string `json:"sourceDbTables"` // source db tables format: [db1.table1,db2.table2] or [db.*@table.*,mydb.*.table.*] 47 | SourceQuery string `json:"sourceQuery"` // select * from table where condition 48 | SourceWhereCondition string `json:"sourceWhereCondition"` //example: where id > 100 and id < 200 and time > '2023-01-01' 49 | SourceSplitKey string `json:"sourceSplitKey"` // primary split key for split table, only for int type 50 | // the format of time field must be: 2006-01-02 15:04:05 51 | SourceSplitTimeKey string `json:"SourceSplitTimeKey"` // time field for split table 52 | TimeSplitUnit string `json:"TimeSplitUnit" default:"hour"` // time split unit, default is hour, option is: minute, hour, day 53 | 54 | // Databend configuration 55 | DatabendDSN string `json:"databendDSN" default:"localhost:8000"` 56 | DatabendTable string `json:"databendTable"` 57 | BatchSize int64 `json:"batchSize" default:"1000"` 58 | BatchMaxInterval int `json:"batchMaxInterval" default:"3"` // for rate limit control 59 | 60 | // related docs: https://docs.databend.com/sql/sql-commands/dml/dml-copy-into-table 61 | CopyPurge bool `json:"copyPurge" default:"true"` 62 | CopyForce bool `json:"copyForce" default:"false"` 63 | DisableVariantCheck bool `json:"disableVariantCheck" default:"true"` 64 | UserStage string `json:"userStage" default:"~"` 65 | DeleteAfterSync bool `json:"deleteAfterSync" default:"false"` 66 | MaxThread int `json:"maxThread" default:"1"` 67 | // Oracle 68 | OracleSID string `json:"oracleSID"` 69 | } 70 | 71 | func LoadConfig(configFile string) (*Config, error) { 72 | conf := Config{} 73 | 74 | f, err := os.Open(configFile) 75 | if err != nil { 76 | return nil, err 77 | } 78 | defer f.Close() 79 | decoder := json.NewDecoder(f) 80 | err = decoder.Decode(&conf) 81 | if err != nil { 82 | fmt.Println("Error decoding JSON:", err) 83 | return &conf, err 84 | } 85 | preCheckConfig(&conf) 86 | 87 | return &conf, nil 88 | } 89 | 90 | func preCheckConfig(cfg *Config) { 91 | if cfg.UserStage == "" { 92 | cfg.UserStage = "~" 93 | } 94 | if cfg.SourceSplitKey != "" && cfg.SourceSplitTimeKey != "" { 95 | panic("cannot set both 
sourceSplitKey and sourceSplitTimeKey") 96 | } 97 | if cfg.SourceSplitKey == "" && cfg.SourceSplitTimeKey == "" { 98 | panic("must set one of sourceSplitKey and sourceSplitTimeKey") 99 | } 100 | if cfg.SourceWhereCondition == "" { 101 | panic("must set sourceWhereCondition when sourceSplitKey or sourceSplitTimeKey is set") 102 | } 103 | if cfg.SourceSplitTimeKey != "" { 104 | // when splitting on a time key, the where condition must bound the key on both sides, e.g. t > 'x' and t < 'y' 105 | if err := validateSourceSplitTimeKey(cfg.SourceWhereCondition); err != nil { 106 | panic(err) 107 | } 108 | if err := checkTimeSplitUnit(cfg.TimeSplitUnit); err != nil { 109 | panic(err) 110 | } 111 | } 112 | } 113 | 114 | func validateSourceSplitTimeKey(value string) error { 115 | // accepts: field > 'x' and field < 'y', where either bound may also be inclusive (>= / <=) 116 | pattern := `^\w+\s*(>|>=)\s*'[^']*'\s+and\s+\w+\s*(<|<=)\s*'[^']*'$` 117 | matched, err := regexp.MatchString(pattern, value) 118 | if err != nil { 119 | return err 120 | } 121 | if !matched { 122 | return errors.New("sourceWhereCondition does not match the format required for time-split: <field> > 'start' and <field> < 'end'") 123 | } 124 | return nil 125 | } 126 | 127 | func checkTimeSplitUnit(unit string) error { 128 | _, ok := StringToTimeSplitUnit[unit] 129 | if !ok { 130 | return fmt.Errorf("invalid TimeSplitUnit: %s, it should be 'minute', 'quarter', 'hour', or 'day'", unit) 131 | } 132 | return nil 133 | } 134 | 135 | func (c *Config) GetTimeRangeBySplitUnit() time.Duration { 136 | switch StringToTimeSplitUnit[c.TimeSplitUnit] { 137 | case Minute: 138 | return 10 * time.Minute // note: a "minute" split uses a 10-minute window per slice 139 | case Quarter: 140 | return 15 * time.Minute 141 | case Hour: 142 | return 2 * time.Hour 143 | case Day: 144 | return 24 * time.Hour 145 | default: 146 | return 0 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /config/config_test.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestValidateSourceSplitTimeKey(t *testing.T) { 9 | tests := []struct { 10 | name string 11 | value string 12 | wantErr bool 13 | }{ 14 | { 15 | name: "valid format 1", 16 | value: "t1 > '2024-06-30 2:00:00' and t1 < '2024-06-30 20:00:00'", 17 | wantErr: false, 18 | }, 19 | { 20 | name: "valid format 1 (no spaces)", 21 | value: "t1>'2024-06-30 2:00:00' and t1< '2024-06-30 20:00:00'", 22 | wantErr: false, 23 | }, 24 | 25 | { 26 | name: "valid format 2", 27 | value: "field >= 'x' and field <= 'y'", 28 | wantErr: false, 29 | }, 30 | { 31 | name: "valid format 3", 32 | value: "field >= 'x' and field < 'y'", 33 | wantErr: false, 34 | }, 35 | { 36 | name: "valid format 4", 37 | value: "field > 'x' and field <= 'y'", 38 | wantErr: false, 39 | }, 40 | { 41 | name: "invalid format (missing operator)", 42 | value: "field > 'x' and field 'y'", 43 | wantErr: true, 44 | }, 45 | { 46 | name: "invalid format (only lower bound)", 47 | value: "field > 'x'", 48 | wantErr: true, 49 | }, 50 | { 51 | name: "invalid format (only lower bound, inclusive)", 52 | value: "field >= 'x'", 53 | wantErr: true, 54 | }, 55 | } 56 | 57 | for _, tt := range tests { 58 | t.Run(tt.name, func(t *testing.T) { 59 | err := validateSourceSplitTimeKey(tt.value) 60 | if (err != nil) != tt.wantErr { 61 | t.Errorf("validateSourceSplitTimeKey() error = %v, wantErr %v", err, tt.wantErr) 62 | } 63 | }) 64 | } 65 | } 66 | 67 | func TestGetTimeRangeBySplitUnit(t *testing.T) { 68 | 
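// Each case below pins the window that GetTimeRangeBySplitUnit maps a unit string to; a minimal usage sketch (assuming a Config literal like the ones in the cases):
//   cfg := &Config{TimeSplitUnit: "quarter"}
//   cfg.GetTimeRangeBySplitUnit() // == 15 * time.Minute
// Note that "minute" maps to a 10-minute window, not one minute.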
tests := []struct { 69 | name string 70 | unit string 71 | expected time.Duration 72 | }{ 73 | { 74 | name: "Minute unit returns 10 minutes", 75 | unit: "minute", 76 | expected: 10 * time.Minute, 77 | }, 78 | { 79 | name: "Quarter unit returns 15 minutes", 80 | unit: "quarter", 81 | expected: 15 * time.Minute, 82 | }, 83 | { 84 | name: "Hour unit returns 2 hours", 85 | unit: "hour", 86 | expected: 2 * time.Hour, 87 | }, 88 | { 89 | name: "Day unit returns 24 hours", 90 | unit: "day", 91 | expected: 24 * time.Hour, 92 | }, 93 | } 94 | 95 | for _, tt := range tests { 96 | t.Run(tt.name, func(t *testing.T) { 97 | cfg := &Config{TimeSplitUnit: tt.unit} 98 | got := cfg.GetTimeRangeBySplitUnit() 99 | if got != tt.expected { 100 | t.Errorf("GetTimeRangeBySplitUnit() = %v, want %v", got, tt.expected) 101 | } 102 | }) 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /config/config_test_mssql.json: -------------------------------------------------------------------------------- 1 | { 2 | "databaseType": "mssql", 3 | "sourceHost": "127.0.0.1", 4 | "sourcePort": 1433, 5 | "sourceUser": "sa", 6 | "sourcePass": "Passw@rd", 7 | "sourceDB": "test1", 8 | "sourceTable": "test1", 9 | "sourceQuery": "select * from test1.test1", 10 | "sourceWhereCondition": "id > 0", 11 | "sourceSplitKey": "id", 12 | "sourceSplitTimeKey": "", 13 | "timeSplitUnit": "minute", 14 | "databendDSN": "https://user:pass@host.databend.com:443", 15 | "databendTable": "testSync.test1", 16 | "batchSize": 2, 17 | "batchMaxInterval": 30, 18 | "copyPurge": false, 19 | "copyForce": false, 20 | "disableVariantCheck": false, 21 | "userStage": "~", 22 | "deleteAfterSync": false, 23 | "maxThread": 10 24 | } -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/databendcloud/db-archiver 2 | 3 | go 1.23.0 4 | 5 | toolchain go1.23.9 6 | 7 | require ( 8 | github.com/avast/retry-go v3.0.0+incompatible 9 | github.com/codesuki/go-time-series v0.0.0-20210430055340-c4c8d8fa61d4 10 | github.com/datafuselabs/databend-go v0.7.4 11 | github.com/denisenkom/go-mssqldb v0.12.3 12 | github.com/fergusstrange/embedded-postgres v1.30.0 13 | github.com/go-sql-driver/mysql v1.9.2 14 | github.com/lib/pq v1.10.9 15 | github.com/pkg/errors v0.9.1 16 | github.com/sijms/go-ora/v2 v2.8.24 17 | github.com/sirupsen/logrus v1.9.3 18 | github.com/test-go/testify v1.1.4 19 | ) 20 | 21 | require ( 22 | filippo.io/edwards25519 v1.1.0 // indirect 23 | github.com/BurntSushi/toml v1.5.0 // indirect 24 | github.com/benbjohnson/clock v1.3.5 // indirect 25 | github.com/davecgh/go-spew v1.1.1 // indirect 26 | github.com/felixge/httpsnoop v1.0.4 // indirect 27 | github.com/go-logr/logr v1.4.2 // indirect 28 | github.com/go-logr/stdr v1.2.2 // indirect 29 | github.com/golang-sql/civil v0.0.0-20220223132316-b832511892a9 // indirect 30 | github.com/golang-sql/sqlexp v0.1.0 // indirect 31 | github.com/google/uuid v1.6.0 // indirect 32 | github.com/pmezard/go-difflib v1.0.0 // indirect 33 | github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect 34 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 35 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect 36 | go.opentelemetry.io/otel v1.35.0 // indirect 37 | go.opentelemetry.io/otel/metric v1.35.0 // indirect 38 | go.opentelemetry.io/otel/trace v1.35.0 // indirect 39 | golang.org/x/crypto v0.38.0 // indirect 40 | 
golang.org/x/sys v0.33.0 // indirect 41 | ) 42 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= 2 | filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= 3 | github.com/Azure/azure-sdk-for-go/sdk/azcore v0.19.0/go.mod h1:h6H6c8enJmmocHUbLiiGY6sx7f9i+X3m1CHdd5c6Rdw= 4 | github.com/Azure/azure-sdk-for-go/sdk/azidentity v0.11.0/go.mod h1:HcM1YX14R7CJcghJGOYCgdezslRSVzqwLf/q+4Y2r/0= 5 | github.com/Azure/azure-sdk-for-go/sdk/internal v0.7.0/go.mod h1:yqy467j36fJxcRV2TzfVZ1pCb5vxm4BtZPUdYWe/Xo8= 6 | github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak= 7 | github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= 8 | github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg= 9 | github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= 10 | github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= 11 | github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= 12 | github.com/benbjohnson/clock v1.3.5 h1:VvXlSJBzZpA/zum6Sj74hxwYI2DIxRWuNIoXAzHZz5o= 13 | github.com/benbjohnson/clock v1.3.5/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= 14 | github.com/codesuki/go-time-series v0.0.0-20210430055340-c4c8d8fa61d4 h1:xKzsxCG6QVIh31ZIXuRR/eCvEflCFwpeET6cvTFYhVo= 15 | github.com/codesuki/go-time-series v0.0.0-20210430055340-c4c8d8fa61d4/go.mod h1:Rm6RJZPJg9b/vwne8fiAcfh0X5QFNszEhijK6d6qW9k= 16 | github.com/datafuselabs/databend-go v0.7.4 h1:B+qqK89TuGSDIdNgd3rxpUjYyTFBiyr5VE+7VYZkv0E= 17 | github.com/datafuselabs/databend-go v0.7.4/go.mod h1:h/sGUBZs7EqJgqnZ3XB0KHfyUlpGvfNrw2lWcdDJVIw= 18 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 19 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 20 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 21 | github.com/denisenkom/go-mssqldb v0.12.3 h1:pBSGx9Tq67pBOTLmxNuirNTeB8Vjmf886Kx+8Y+8shw= 22 | github.com/denisenkom/go-mssqldb v0.12.3/go.mod h1:k0mtMFOnU+AihqFxPMiF05rtiDrorD1Vrm1KEz5hxDo= 23 | github.com/dnaeon/go-vcr v1.2.0/go.mod h1:R4UdLID7HZT3taECzJs4YgbbH6PIGXB6W/sc5OLb6RQ= 24 | github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= 25 | github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= 26 | github.com/fergusstrange/embedded-postgres v1.24.0 h1:WqXbmYrBeT5JfNWQ8Qa+yHa5YJO/0sBIgL9k5rn3dFk= 27 | github.com/fergusstrange/embedded-postgres v1.24.0/go.mod h1:wL562t1V+iuFwq0UcgMi2e9rp8CROY9wxWZEfP8Y874= 28 | github.com/fergusstrange/embedded-postgres v1.30.0 h1:ewv1e6bBlqOIYtgGgRcEnNDpfGlmfPxB8T3PO9tV68Q= 29 | github.com/fergusstrange/embedded-postgres v1.30.0/go.mod h1:w0YvnCgf19o6tskInrOOACtnqfVlOvluz3hlNLY7tRk= 30 | github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= 31 | github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= 32 | github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 33 | github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= 34 | github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 35 
| github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= 36 | github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= 37 | github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y= 38 | github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg= 39 | github.com/go-sql-driver/mysql v1.9.2 h1:4cNKDYQ1I84SXslGddlsrMhc8k4LeDVj6Ad6WRjiHuU= 40 | github.com/go-sql-driver/mysql v1.9.2/go.mod h1:qn46aNg1333BRMNU69Lq93t8du/dwxI64Gl8i5p1WMU= 41 | github.com/golang-sql/civil v0.0.0-20190719163853-cb61b32ac6fe h1:lXe2qZdvpiX5WZkZR4hgp4KJVfY3nMkvmwbVkpv1rVY= 42 | github.com/golang-sql/civil v0.0.0-20190719163853-cb61b32ac6fe/go.mod h1:8vg3r2VgvsThLBIFL93Qb5yWzgyZWhEmBwUJWevAkK0= 43 | github.com/golang-sql/civil v0.0.0-20220223132316-b832511892a9 h1:au07oEsX2xN0ktxqI+Sida1w446QrXBRJ0nee3SNZlA= 44 | github.com/golang-sql/civil v0.0.0-20220223132316-b832511892a9/go.mod h1:8vg3r2VgvsThLBIFL93Qb5yWzgyZWhEmBwUJWevAkK0= 45 | github.com/golang-sql/sqlexp v0.1.0 h1:ZCD6MBpcuOVfGVqsEmY5/4FtYiKz6tSyUv9LPEDei6A= 46 | github.com/golang-sql/sqlexp v0.1.0/go.mod h1:J4ad9Vo8ZCWQ2GMrC4UCQy1JpCbwU9m3EOqtpKwwwHI= 47 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 48 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 49 | github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= 50 | github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 51 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 52 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 53 | github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= 54 | github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= 55 | github.com/modocache/gover v0.0.0-20171022184752-b58185e213c5/go.mod h1:caMODM3PzxT8aQXRPkAt8xlV/e7d7w8GM5g0fa5F0D8= 56 | github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4/go.mod h1:4OwLy04Bl9Ef3GJJCoec+30X3LQs/0/m4HFRt/2LUSA= 57 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 58 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 59 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 60 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 61 | github.com/sijms/go-ora/v2 v2.8.22 h1:3ABgRzVKxS439cEgSLjFKutIwOyhnyi4oOSBywEdOlU= 62 | github.com/sijms/go-ora/v2 v2.8.22/go.mod h1:QgFInVi3ZWyqAiJwzBQA+nbKYKH77tdp1PYoCqhR2dU= 63 | github.com/sijms/go-ora/v2 v2.8.24 h1:TODRWjWGwJ1VlBOhbTLat+diTYe8HXq2soJeB+HMjnw= 64 | github.com/sijms/go-ora/v2 v2.8.24/go.mod h1:QgFInVi3ZWyqAiJwzBQA+nbKYKH77tdp1PYoCqhR2dU= 65 | github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= 66 | github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= 67 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 68 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 69 | github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= 70 | github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= 71 | github.com/test-go/testify v1.1.4 h1:Tf9lntrKUMHiXQ07qBScBTSA0dhYQlu83hswqelv1iE= 72 | github.com/test-go/testify v1.1.4/go.mod 
h1:rH7cfJo/47vWGdi4GPj16x3/t1xGOj2YxzmNQzk2ghU= 73 | github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 h1:nIPpBwaJSVYIxUFsDv3M8ofmx9yWTog9BfvIu0q41lo= 74 | github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8/go.mod h1:HUYIGzjTL3rfEspMxjDjgmT5uz5wzYJKVo23qUhYTos= 75 | go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= 76 | go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= 77 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 h1:jq9TW8u3so/bN+JPT166wjOI6/vQPF6Xe7nMNIltagk= 78 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0/go.mod h1:p8pYQP+m5XfbZm9fxtSKAbM6oIllS7s2AfxrChvc7iw= 79 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU= 80 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ= 81 | go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo= 82 | go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo= 83 | go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= 84 | go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= 85 | go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI= 86 | go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco= 87 | go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= 88 | go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= 89 | go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI= 90 | go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU= 91 | go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= 92 | go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= 93 | go.uber.org/goleak v1.1.12 h1:gZAh5/EyT/HQwlpkCy6wTpqfH9H8Lz8zbm3dZh+OyzA= 94 | go.uber.org/goleak v1.1.12/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ= 95 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 96 | golang.org/x/crypto v0.0.0-20201016220609-9e8e0b390897/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 97 | golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= 98 | golang.org/x/crypto v0.38.0 h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8= 99 | golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw= 100 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 101 | golang.org/x/net v0.0.0-20210610132358-84b48f89b13b/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 102 | golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 103 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 104 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 105 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 106 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 107 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 108 | golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 109 | golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= 110 | golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 111 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 112 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 113 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 114 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 115 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 116 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 117 | gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= 118 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 119 | gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 120 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 121 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 122 | -------------------------------------------------------------------------------- /ingester/ingest_databend.go: -------------------------------------------------------------------------------- 1 | package ingester 2 | 3 | import ( 4 | "bufio" 5 | "context" 6 | "database/sql" 7 | "fmt" 8 | "io" 9 | "log" 10 | "net/http" 11 | "os" 12 | "path/filepath" 13 | "time" 14 | 15 | "github.com/avast/retry-go" 16 | "github.com/pkg/errors" 17 | "github.com/sirupsen/logrus" 18 | 19 | godatabend "github.com/datafuselabs/databend-go" 20 | 21 | "github.com/databendcloud/db-archiver/config" 22 | "github.com/databendcloud/db-archiver/source" 23 | ) 24 | 25 | var ( 26 | ErrUploadStageFailed = errors.New("upload stage failed") 27 | ErrCopyIntoFailed = errors.New("copy into failed") 28 | ErrGetPresignUrl = errors.New("failed to get presigned url") 29 | ) 30 | 31 | type databendIngester struct { 32 | databendIngesterCfg *config.Config 33 | statsRecorder *DatabendIngesterStatsRecorder 34 | } 35 | 36 | type DatabendIngester interface { 37 | IngestData(threadNum int, columns []string, batchJsonData [][]interface{}) error 38 | uploadToStage(fileName string) (*godatabend.StageLocation, error) 39 | GetAllSyncedCount() (int, error) 40 | DoRetry(f retry.RetryableFunc) error 41 | } 42 | 43 | func NewDatabendIngester(cfg *config.Config) DatabendIngester { 44 | stats := NewDatabendIntesterStatsRecorder() 45 | return &databendIngester{ 46 | databendIngesterCfg: cfg, 47 | statsRecorder: stats, 48 | } 49 | } 50 | 51 | func (ig *databendIngester) GetAllSyncedCount() (int, error) { 52 | db, err := sql.Open("databend", ig.databendIngesterCfg.DatabendDSN) 53 | defer db.Close() 54 | if err != nil { 55 | return 0, err 56 | } 57 | rows, err := db.Query(fmt.Sprintf("SELECT count(*) FROM %s WHERE %s", 58 | ig.databendIngesterCfg.DatabendTable, ig.databendIngesterCfg.SourceWhereCondition)) 59 | if err != nil { 60 | return 0, err 61 | } 62 | defer rows.Close() 63 | var count int 64 | if rows.Next() { 65 | err = rows.Scan(&count) 66 | if err != nil 
{ 67 | return 0, err 68 | } 69 | return count, nil 70 | } 71 | return 0, nil 72 | } 73 | 74 | func (ig *databendIngester) IngestData(threadNum int, columns []string, batchData [][]interface{}) error { 75 | l := logrus.WithFields(logrus.Fields{"ingest_databend": "IngestData"}) 76 | startTime := time.Now() 77 | 78 | if len(batchData) == 0 { 79 | return nil 80 | } 81 | 82 | fileName, bytesSize, err := source.GenerateJSONFile(columns, batchData) 83 | if err != nil { 84 | l.Errorf("generate NDJson file failed: %v\n", err) 85 | return err 86 | } 87 | 88 | stage, err := ig.uploadToStage(fileName) 89 | if err != nil { 90 | return err 91 | } 92 | 93 | copyIntoStartTime := time.Now() 94 | err = ig.copyInto(stage) 95 | if err != nil { 96 | return err 97 | } 98 | l.Infof("thread-%d: copy into cost: %v ms", threadNum, time.Since(copyIntoStartTime).Milliseconds()) 99 | ig.statsRecorder.RecordMetric(bytesSize, len(batchData)) 100 | stats := ig.statsRecorder.Stats(time.Since(startTime)) 101 | log.Printf("thread-%d: ingest %d rows (%f rows/s), %d bytes (%f bytes/s)", threadNum, 102 | len(batchData), stats.RowsPerSecondd, bytesSize, stats.BytesPerSecond) 103 | return nil 104 | } 105 | 106 | func (ig *databendIngester) uploadToStage(fileName string) (*godatabend.StageLocation, error) { 107 | defer func() { 108 | err := os.RemoveAll(fileName) 109 | if err != nil { 110 | logrus.Errorf("delete batch insert file failed: %v", err) 111 | } 112 | }() 113 | 114 | databendConfig, err := godatabend.ParseDSN(ig.databendIngesterCfg.DatabendDSN) 115 | if err != nil { 116 | return nil, err 117 | } 118 | apiClient := godatabend.NewAPIClientFromConfig(databendConfig) 119 | fi, err := os.Stat(fileName) 120 | if err != nil { 121 | return nil, errors.Wrap(err, "get batch file size failed") 122 | } 123 | size := fi.Size() 124 | 125 | f, err := os.Open(fileName) 126 | if err != nil { 127 | return nil, errors.Wrap(err, "open batch file failed") 128 | } 129 | defer f.Close() 130 | input := bufio.NewReader(f) 131 | stage := &godatabend.StageLocation{ 132 | Name: ig.databendIngesterCfg.UserStage, 133 | Path: fmt.Sprintf("batch/%d-%s", time.Now().Unix(), filepath.Base(fileName)), 134 | } 135 | 136 | presignedStartTime := time.Now() 137 | presigned, err := apiClient.GetPresignedURL(context.Background(), stage) 138 | if err != nil { 139 | return nil, errors.Wrap(ErrGetPresignUrl, err.Error()) 140 | } 141 | logrus.Infof("get presigned url cost: %v ms", time.Since(presignedStartTime).Milliseconds()) 142 | 143 | uploadByPresignedUrl := time.Now() 144 | if err := ig.UploadToStageByPresignURL(presigned, input, size); err != nil { 145 | return nil, errors.Wrap(ErrUploadStageFailed, err.Error()) 146 | } 147 | logrus.Infof("upload by presigned url cost: %v ms", time.Since(uploadByPresignedUrl).Milliseconds()) 148 | 149 | return stage, nil 150 | } 151 | 152 | func (ig *databendIngester) UploadToStageByPresignURL(presignedResp *godatabend.PresignedResponse, input *bufio.Reader, size int64) error { 153 | req, err := http.NewRequest("PUT", presignedResp.URL, input) 154 | if err != nil { 155 | return err 156 | } 157 | for k, v := range presignedResp.Headers { 158 | req.Header.Set(k, v) 159 | } 160 | req.ContentLength = size 161 | // TODO: configurable timeout 162 | httpClient := &http.Client{ 163 | Timeout: time.Second * 120, 164 | } 165 | resp, err := httpClient.Do(req) 166 | if err != nil { 167 | return errors.Wrap(err, "failed to upload to stage by presigned url") 168 | } 169 | defer func() { 170 | _ = resp.Body.Close() 171 | }() 172 | respBody, 
err := io.ReadAll(resp.Body) 173 | if err != nil { 174 | return err 175 | } 176 | if resp.StatusCode >= 400 { 177 | return errors.Errorf("failed to upload to stage by presigned url, status code: %d, body: %s", resp.StatusCode, string(respBody)) 178 | } 179 | return nil 180 | } 181 | 182 | func (ig *databendIngester) copyInto(stage *godatabend.StageLocation) error { 183 | copyIntoSQL := fmt.Sprintf("COPY INTO %s FROM %s FILE_FORMAT = (type = NDJSON missing_field_as = FIELD_DEFAULT COMPRESSION = AUTO) "+ 184 | "PURGE = %v FORCE = %v DISABLE_VARIANT_CHECK = %v", ig.databendIngesterCfg.DatabendTable, stage.String(), 185 | ig.databendIngesterCfg.CopyPurge, ig.databendIngesterCfg.CopyForce, ig.databendIngesterCfg.DisableVariantCheck) 186 | db, err := sql.Open("databend", ig.databendIngesterCfg.DatabendDSN) 187 | if err != nil { 188 | logrus.Errorf("init db error: %v", err) 189 | return err 190 | } 191 | defer db.Close() // close the per-call handle so repeated copyInto calls do not leak connections 192 | if err := execute(db, copyIntoSQL); err != nil { 193 | return errors.Wrap(ErrCopyIntoFailed, err.Error()) 194 | } 195 | return nil 196 | } 197 | func execute(db *sql.DB, sql string) error { 198 | _, err := db.Exec(sql) 199 | if err != nil { 200 | logrus.Errorf("exec '%s' failed, err: %v", sql, err) 201 | return err 202 | } 203 | return nil 204 | } 205 | 206 | func (ig *databendIngester) DoRetry(f retry.RetryableFunc) error { 207 | delay := time.Second 208 | maxDelay := 60 * time.Minute 209 | maxAttempts := 500 210 | attempt := 0 211 | 212 | return retry.Do( 213 | func() error { 214 | err := f() 215 | if err != nil { 216 | logrus.Infof("Attempt %d failed: %v", attempt+1, err) 217 | } 218 | attempt++ 219 | return err 220 | }, 221 | retry.RetryIf(func(err error) bool { 222 | if err == nil { 223 | return false 224 | } 225 | if attempt >= maxAttempts { 226 | logrus.Warnf("Reached maximum retry attempts (%d)", maxAttempts) 227 | return false 228 | } 229 | if errors.Is(err, ErrUploadStageFailed) || 230 | errors.Is(err, ErrCopyIntoFailed) || 231 | errors.Is(err, ErrGetPresignUrl) { 232 | return true 233 | } 234 | return false 235 | }), 236 | retry.Delay(delay), 237 | retry.MaxDelay(maxDelay), 238 | retry.DelayType(retry.BackOffDelay), 239 | retry.Attempts(uint(maxAttempts)), 240 | ) 241 | } 242 | -------------------------------------------------------------------------------- /ingester/stats.go: -------------------------------------------------------------------------------- 1 | package ingester 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | 7 | timeseries "github.com/codesuki/go-time-series" 8 | ) 9 | 10 | type DatabendIngesterStatsRecorder struct { 11 | ingestedBytes *timeseries.TimeSeries 12 | ingestedRows *timeseries.TimeSeries 13 | mu sync.Mutex 14 | } 15 | 16 | type DatabendIngesterStatsData struct { 17 | BytesPerSecond float64 18 | RowsPerSecondd float64 19 | } 20 | 21 | func NewDatabendIntesterStatsRecorder() *DatabendIngesterStatsRecorder { 22 | ingestedBytes, err := timeseries.NewTimeSeries() 23 | if err != nil { 24 | panic(err) 25 | } 26 | ingestedRows, err := timeseries.NewTimeSeries() 27 | if err != nil { 28 | panic(err) 29 | } 30 | return &DatabendIngesterStatsRecorder{ 31 | ingestedBytes: ingestedBytes, 32 | ingestedRows: ingestedRows, 33 | } 34 | } 35 | 36 | func (stats *DatabendIngesterStatsRecorder) RecordMetric(bytes int, rows int) { 37 | stats.mu.Lock() 38 | defer stats.mu.Unlock() 39 | stats.ingestedBytes.Increase(bytes) 40 | stats.ingestedRows.Increase(rows) 41 | } 42 | 43 | func (stats *DatabendIngesterStatsRecorder) Stats(statsWindow time.Duration) DatabendIngesterStatsData { 44 | 
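// Snapshot both counters under the lock and convert the totals observed within statsWindow into per-second rates.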
stats.mu.Lock() 45 | defer stats.mu.Unlock() 46 | 47 | bytesPerSecond := stats.calcPerSecond(stats.ingestedBytes, statsWindow) 48 | rowsPerSecond := stats.calcPerSecond(stats.ingestedRows, statsWindow) 49 | return DatabendIngesterStatsData{ 50 | BytesPerSecond: bytesPerSecond, 51 | RowsPerSecondd: rowsPerSecond, 52 | } 53 | } 54 | 55 | func (stats *DatabendIngesterStatsRecorder) calcPerSecond(ts *timeseries.TimeSeries, duration time.Duration) float64 { 56 | amount, err := ts.Range(time.Now().Add(-duration), time.Now()) 57 | if err != nil { 58 | return -1 59 | } 60 | 61 | return float64(amount) / duration.Seconds() 62 | } 63 | -------------------------------------------------------------------------------- /source/mysql.go: -------------------------------------------------------------------------------- 1 | package source 2 | 3 | import ( 4 | "database/sql" 5 | "fmt" 6 | "log" 7 | "regexp" 8 | "strings" 9 | "time" 10 | 11 | "github.com/sirupsen/logrus" 12 | 13 | "github.com/databendcloud/db-archiver/config" 14 | ) 15 | 16 | type MysqlSource struct { 17 | db *sql.DB 18 | cfg *config.Config 19 | statsRecorder *DatabendSourceStatsRecorder 20 | } 21 | 22 | func NewMysqlSource(cfg *config.Config) (*MysqlSource, error) { 23 | stats := NewDatabendIntesterStatsRecorder() 24 | db, err := sql.Open("mysql", fmt.Sprintf("%s:%s@tcp(%s:%d)/mysql", 25 | cfg.SourceUser, 26 | cfg.SourcePass, 27 | cfg.SourceHost, 28 | cfg.SourcePort)) 29 | if err != nil { 30 | logrus.Errorf("failed to open db: %v", err) 31 | return nil, err 32 | } 33 | 34 | return &MysqlSource{ 35 | db: db, 36 | cfg: cfg, 37 | statsRecorder: stats, 38 | }, nil 39 | } 40 | 41 | // AdjustBatchSizeAccordingToSourceDbTable estimates key density as s = (maxKey - minKey + 1) / sourceTableRowCount: 42 | // s near 1 means ids are dense and contiguous; s much larger than 1 means the key space is sparse, so each batch must cover a wider id range. 43 | func (s *MysqlSource) AdjustBatchSizeAccordingToSourceDbTable() int64 { 44 | minSplitKey, maxSplitKey, err := s.GetMinMaxSplitKey() 45 | if err != nil { 46 | return s.cfg.BatchSize 47 | } 48 | sourceTableRowCount, err := s.GetSourceReadRowsCount() 49 | if err != nil { 50 | return s.cfg.BatchSize 51 | } 52 | rangeSize := maxSplitKey - minSplitKey + 1 53 | switch { 54 | case int64(sourceTableRowCount) <= s.cfg.BatchSize: 55 | return rangeSize 56 | case rangeSize/int64(sourceTableRowCount) >= 100: // check the sparser threshold first; a >= 10 case above it would make this branch unreachable 57 | return s.cfg.BatchSize * 20 58 | case rangeSize/int64(sourceTableRowCount) >= 10: 59 | return s.cfg.BatchSize * 5 60 | default: 61 | return s.cfg.BatchSize 62 | } 63 | } 64 | 65 | func (s *MysqlSource) GetSourceReadRowsCount() (int, error) { 66 | row := s.db.QueryRow(fmt.Sprintf("SELECT count(*) FROM %s.%s WHERE %s", s.cfg.SourceDB, 67 | s.cfg.SourceTable, s.cfg.SourceWhereCondition)) 68 | var rowCount int 69 | err := row.Scan(&rowCount) 70 | if err != nil { 71 | return 0, err 72 | } 73 | 74 | return rowCount, nil 75 | } 76 | 77 | func (s *MysqlSource) GetMinMaxSplitKey() (int64, int64, error) { 78 | rows, err := s.db.Query(fmt.Sprintf("select min(%s), max(%s) from %s.%s WHERE %s", s.cfg.SourceSplitKey, 79 | s.cfg.SourceSplitKey, s.cfg.SourceDB, s.cfg.SourceTable, s.cfg.SourceWhereCondition)) 80 | if err != nil { 81 | return 0, 0, err 82 | } 83 | defer rows.Close() 84 | 85 | var minSplitKey, maxSplitKey sql.NullInt64 86 | for rows.Next() { 87 | err = rows.Scan(&minSplitKey, &maxSplitKey) 88 | if err != nil { 89 | return 0, 0, err 90 | } 91 | } 92 | 93 | // Check if 
minSplitKey and maxSplitKey are valid (not NULL) 94 | if !minSplitKey.Valid || !maxSplitKey.Valid { 95 | return 0, 0, nil 96 | } 97 | 98 | return minSplitKey.Int64, maxSplitKey.Int64, nil 99 | } 100 | 101 | func (s *MysqlSource) GetMinMaxTimeSplitKey() (string, string, error) { 102 | rows, err := s.db.Query(fmt.Sprintf("select min(%s), max(%s) from %s.%s WHERE %s", s.cfg.SourceSplitTimeKey, 103 | s.cfg.SourceSplitTimeKey, s.cfg.SourceDB, s.cfg.SourceTable, s.cfg.SourceWhereCondition)) 104 | if err != nil { 105 | return "", "", err 106 | } 107 | defer rows.Close() 108 | 109 | var minSplitKey, maxSplitKey string 110 | for rows.Next() { 111 | err = rows.Scan(&minSplitKey, &maxSplitKey) 112 | if err != nil { 113 | return "", "", err 114 | } 115 | } 116 | return minSplitKey, maxSplitKey, nil 117 | } 118 | 119 | func (s *MysqlSource) DeleteAfterSync() error { 120 | if !s.cfg.DeleteAfterSync { 121 | return nil 122 | } 123 | 124 | dbTables, err := s.GetDbTablesAccordingToSourceDbTables() 125 | if err != nil { 126 | return err 127 | } 128 | 129 | for db, tables := range dbTables { 130 | for _, table := range tables { 131 | count, err := s.GetSourceReadRowsCount() 132 | if err != nil { 133 | log.Printf("Error getting row count for table %s.%s: %v", db, table, err) 134 | continue 135 | } 136 | 137 | // Delete in batches 138 | for count > 0 { 139 | limit := min(int(s.cfg.BatchSize), count) 140 | query := fmt.Sprintf("DELETE FROM %s.%s WHERE %s LIMIT %d", db, table, s.cfg.SourceWhereCondition, limit) 141 | _, err := s.db.Exec(query) 142 | if err != nil { 143 | log.Printf("Error deleting rows from table %s.%s: %v", db, table, err) 144 | break 145 | } 146 | count -= limit 147 | log.Printf("Deleted %d rows from table %s.%s\n", limit, db, table) 148 | time.Sleep(time.Duration(s.cfg.BatchMaxInterval) * time.Second) 149 | } 150 | } 151 | } 152 | 153 | return nil 154 | } 155 | 156 | // Utility function to get the smaller of two integers 157 | func min(a, b int) int { 158 | if a < b { 159 | return a 160 | } 161 | return b 162 | } 163 | 164 | func (s *MysqlSource) QueryTableData(threadNum int, conditionSql string) ([][]interface{}, []string, error) { 165 | startTime := time.Now() 166 | execSql := fmt.Sprintf("SELECT * FROM %s.%s WHERE %s", s.cfg.SourceDB, 167 | s.cfg.SourceTable, conditionSql) 168 | if s.cfg.SourceWhereCondition != "" && s.cfg.SourceSplitKey != "" { 169 | execSql = fmt.Sprintf("%s AND %s", execSql, s.cfg.SourceWhereCondition) 170 | } 171 | rows, err := s.db.Query(execSql) 172 | if err != nil { 173 | return nil, nil, err 174 | } 175 | defer rows.Close() 176 | columns, err := rows.Columns() 177 | if err != nil { 178 | return nil, nil, err 179 | } 180 | 181 | columnTypes, err := rows.ColumnTypes() 182 | if err != nil { 183 | return nil, nil, err 184 | } 185 | 186 | scanArgs := make([]interface{}, len(columns)) 187 | for i, columnType := range columnTypes { 188 | switch columnType.DatabaseTypeName() { 189 | case "INT", "SMALLINT", "TINYINT", "MEDIUMINT", "BIGINT": 190 | scanArgs[i] = new(sql.NullInt64) 191 | case "UNSIGNED INT", "UNSIGNED TINYINT", "UNSIGNED MEDIUMINT", "UNSIGNED BIGINT": 192 | scanArgs[i] = new(sql.NullInt64) 193 | case "FLOAT", "DOUBLE": 194 | scanArgs[i] = new(sql.NullFloat64) 195 | case "DECIMAL": 196 | scanArgs[i] = new(sql.NullFloat64) 197 | case "CHAR", "VARCHAR", "TEXT", "TINYTEXT", "MEDIUMTEXT", "LONGTEXT": 198 | scanArgs[i] = new(sql.NullString) 199 | case "DATE", "TIME", "DATETIME", "TIMESTAMP": 200 | scanArgs[i] = new(sql.NullString) // or use time.Time 201 | case 
"BOOL", "BOOLEAN": 202 | scanArgs[i] = new(sql.NullBool) 203 | default: 204 | scanArgs[i] = new(sql.RawBytes) 205 | } 206 | } 207 | 208 | var result [][]interface{} 209 | //rowCount, err := s.GetRowsCountByConditionSql(conditionSql) 210 | //if err != nil { 211 | // return nil, nil, err 212 | //} 213 | //result := make([][]interface{}, rowCount) 214 | for rows.Next() { 215 | err = rows.Scan(scanArgs...) 216 | if err != nil { 217 | return nil, nil, err 218 | } 219 | 220 | row := make([]interface{}, len(columns)) 221 | for i, v := range scanArgs { 222 | switch v := v.(type) { 223 | case *int: 224 | row[i] = *v 225 | case *string: 226 | row[i] = *v 227 | case *sql.NullString: 228 | if v.Valid { 229 | row[i] = v.String 230 | } else { 231 | row[i] = nil 232 | } 233 | case *bool: 234 | row[i] = *v 235 | case *sql.NullInt64: 236 | if v.Valid { 237 | row[i] = v.Int64 238 | } else { 239 | row[i] = nil 240 | } 241 | case *sql.NullFloat64: 242 | if v.Valid { 243 | row[i] = v.Float64 244 | } else { 245 | row[i] = nil 246 | } 247 | case *sql.NullBool: 248 | if v.Valid { 249 | row[i] = v.Bool 250 | } else { 251 | row[i] = nil 252 | } 253 | case *float64: 254 | row[i] = *v 255 | case *sql.RawBytes: 256 | row[i] = string(*v) 257 | } 258 | } 259 | result = append(result, row) 260 | } 261 | 262 | if err = rows.Err(); err != nil { 263 | return nil, nil, err 264 | } 265 | s.statsRecorder.RecordMetric(len(result)) 266 | stats := s.statsRecorder.Stats(time.Since(startTime)) 267 | log.Printf("thread-%d: extract %d rows (%f rows/s)", threadNum, len(result), stats.RowsPerSecondd) 268 | 269 | return result, columns, nil 270 | } 271 | 272 | func (s *MysqlSource) GetDatabasesAccordingToSourceDbRegex(sourceDatabasePattern string) ([]string, error) { 273 | rows, err := s.db.Query("SHOW DATABASES") 274 | if err != nil { 275 | return nil, err 276 | } 277 | defer rows.Close() 278 | 279 | var databases []string 280 | for rows.Next() { 281 | var database string 282 | err = rows.Scan(&database) 283 | if err != nil { 284 | return nil, err 285 | } 286 | fmt.Println("sourcedatabase pattern", sourceDatabasePattern) 287 | match, err := regexp.MatchString(sourceDatabasePattern, database) 288 | if err != nil { 289 | return nil, err 290 | } 291 | if match { 292 | fmt.Println("match db: ", database) 293 | databases = append(databases, database) 294 | } else { 295 | fmt.Println("not match db: ", database) 296 | } 297 | } 298 | return databases, nil 299 | } 300 | 301 | func (s *MysqlSource) GetTablesAccordingToSourceTableRegex(sourceTablePattern string, databases []string) (map[string][]string, error) { 302 | dbTables := make(map[string][]string) 303 | for _, database := range databases { 304 | rows, err := s.db.Query(fmt.Sprintf("SHOW TABLES FROM %s", database)) 305 | if err != nil { 306 | return nil, err 307 | } 308 | defer rows.Close() 309 | 310 | var tables []string 311 | for rows.Next() { 312 | var table string 313 | err = rows.Scan(&table) 314 | if err != nil { 315 | return nil, err 316 | } 317 | match, err := regexp.MatchString(sourceTablePattern, table) 318 | if err != nil { 319 | return nil, err 320 | } 321 | if match { 322 | tables = append(tables, table) 323 | } 324 | } 325 | dbTables[database] = tables 326 | } 327 | return dbTables, nil 328 | } 329 | 330 | func (s *MysqlSource) GetAllSourceReadRowsCount() (int, error) { 331 | allCount := 0 332 | 333 | dbTables, err := s.GetDbTablesAccordingToSourceDbTables() 334 | if err != nil { 335 | return 0, err 336 | } 337 | for db, tables := range dbTables { 338 | s.cfg.SourceDB = db 
339 | for _, table := range tables { 340 | s.cfg.SourceTable = table 341 | count, err := s.GetSourceReadRowsCount() 342 | if err != nil { 343 | return 0, err 344 | } 345 | allCount += count 346 | } 347 | } 348 | if allCount != 0 { 349 | return allCount, nil 350 | } 351 | if len(dbTables) == 0 && s.cfg.SourceTable != "" { 352 | count, err := s.GetSourceReadRowsCount() 353 | if err != nil { 354 | return 0, err 355 | } 356 | allCount += count 357 | } 358 | 359 | return allCount, nil 360 | } 361 | 362 | func (s *MysqlSource) GetDbTablesAccordingToSourceDbTables() (map[string][]string, error) { 363 | allDbTables := make(map[string][]string) 364 | for _, sourceDbTable := range s.cfg.SourceDbTables { 365 | dbTable := strings.Split(sourceDbTable, "@") // `.` is a regex metacharacter, so `@` separates the db pattern from the table pattern 366 | if len(dbTable) != 2 { 367 | return nil, fmt.Errorf("invalid sourceDbTable: %s, expected dbRegex@tableRegex format", sourceDbTable) 368 | } 369 | dbs, err := s.GetDatabasesAccordingToSourceDbRegex(dbTable[0]) 370 | if err != nil { 371 | return nil, fmt.Errorf("get databases according to sourceDbRegex failed: %v", err) 372 | } 373 | dbTables, err := s.GetTablesAccordingToSourceTableRegex(dbTable[1], dbs) 374 | if err != nil { 375 | return nil, fmt.Errorf("get tables according to sourceTableRegex failed: %v", err) 376 | } 377 | for db, tables := range dbTables { 378 | allDbTables[db] = append(allDbTables[db], tables...) 379 | } 380 | } 381 | return allDbTables, nil 382 | } 383 | -------------------------------------------------------------------------------- /source/oracle.go: -------------------------------------------------------------------------------- 1 | package source 2 | 3 | import ( 4 | "database/sql" 5 | "fmt" 6 | "log" 7 | "regexp" 8 | "strings" 9 | "time" 10 | 11 | _ "github.com/lib/pq" 12 | go_ora "github.com/sijms/go-ora/v2" // the named import registers the "oracle" database/sql driver 13 | "github.com/sirupsen/logrus" 14 | 15 | 16 | 17 | "github.com/databendcloud/db-archiver/config" 18 | ) 19 | 20 | type OracleSource struct { 21 | db *sql.DB 22 | cfg *config.Config 23 | statsRecorder *DatabendSourceStatsRecorder 24 | } 25 | 26 | func (p *OracleSource) AdjustBatchSizeAccordingToSourceDbTable() int64 { 27 | minSplitKey, maxSplitKey, err := p.GetMinMaxSplitKey() 28 | if err != nil { 29 | return p.cfg.BatchSize 30 | } 31 | sourceTableRowCount, err := p.GetSourceReadRowsCount() 32 | if err != nil { 33 | return p.cfg.BatchSize 34 | } 35 | rangeSize := maxSplitKey - minSplitKey + 1 36 | switch { 37 | case int64(sourceTableRowCount) <= p.cfg.BatchSize: 38 | return rangeSize 39 | case rangeSize/int64(sourceTableRowCount) >= 100: // check the sparser threshold first; a >= 10 case above it would make this branch unreachable 40 | return p.cfg.BatchSize * 20 41 | case rangeSize/int64(sourceTableRowCount) >= 10: 42 | return p.cfg.BatchSize * 5 43 | default: 44 | return p.cfg.BatchSize 45 | } 46 | } 47 | 48 | func NewOracleSource(cfg *config.Config) (*OracleSource, error) { 49 | stats := NewDatabendIntesterStatsRecorder() 50 | // disable - No SSL 51 | //require - Always SSL (skip verification) 52 | //verify-ca - Always SSL (verify that the certificate presented by the server was signed by a trusted CA) 53 | //verify-full - Always SSL (verify that the certificate presented by the server was signed by a trusted CA and the server host name matches the one in the certificate) 54 | if cfg.SSLMode == "" { 55 | cfg.SSLMode = "disable" 56 | } 57 | params := map[string]string{ 58 | "sslmode": cfg.SSLMode, // enable SSL according to the configured mode 59 | } 60 | connStr := go_ora.BuildUrl(cfg.SourceHost, cfg.SourcePort, cfg.SourceDB, cfg.SourceUser, 
cfg.SourcePass, params) 61 | 62 | db, err := sql.Open("oracle", connStr) 63 | if err != nil { 64 | logrus.Errorf("failed to open oracle db: %v", err) 65 | return nil, err 66 | } 67 | err = db.Ping() 68 | if err != nil { 69 | log.Fatal(err) 70 | } 71 | return &OracleSource{ 72 | db: db, 73 | cfg: cfg, 74 | statsRecorder: stats, 75 | }, nil 76 | } 77 | 78 | func (p *OracleSource) SwitchDatabase() error { 79 | // Close the current connection 80 | err := p.db.Close() 81 | if err != nil { 82 | return err 83 | } 84 | params := map[string]string{ 85 | "sslmode": p.cfg.SSLMode, 86 | } 87 | connStr := go_ora.BuildUrl(p.cfg.SourceHost, p.cfg.SourcePort, p.cfg.SourceDB, p.cfg.SourceUser, p.cfg.SourcePass, params) 88 | 89 | // Open a new connection to the new database 90 | db, err := sql.Open("oracle", connStr) 91 | if err != nil { 92 | return err 93 | } 94 | 95 | // Replace the old connection with the new one 96 | p.db = db 97 | return nil 98 | } 99 | func (p *OracleSource) GetSourceReadRowsCount() (int, error) { 100 | err := p.SwitchDatabase() 101 | if err != nil { 102 | return 0, err 103 | } 104 | row := p.db.QueryRow(fmt.Sprintf("SELECT count(*) FROM %s.%s WHERE %s", 105 | p.cfg.SourceDB, p.cfg.SourceTable, p.cfg.SourceWhereCondition)) 106 | var rowCount int 107 | err = row.Scan(&rowCount) 108 | if err != nil { 109 | return 0, err 110 | } 111 | return rowCount, nil 112 | } 113 | 114 | func (p *OracleSource) GetMinMaxSplitKey() (int64, int64, error) { 115 | err := p.SwitchDatabase() 116 | if err != nil { 117 | return 0, 0, err 118 | } 119 | rows, err := p.db.Query(fmt.Sprintf("select COALESCE(min(%s),0), COALESCE(max(%s),0) from %s.%s WHERE %s", 120 | p.cfg.SourceSplitKey, p.cfg.SourceSplitKey, p.cfg.SourceDB, p.cfg.SourceTable, p.cfg.SourceWhereCondition)) 121 | if err != nil { 122 | return 0, 0, err 123 | } 124 | defer rows.Close() 125 | 126 | var minSplitKey, maxSplitKey sql.NullInt64 127 | for rows.Next() { 128 | err = rows.Scan(&minSplitKey, &maxSplitKey) 129 | if err != nil { 130 | return 0, 0, err 131 | } 132 | } 133 | 134 | // Check if minSplitKey and maxSplitKey are valid (not NULL) 135 | if !minSplitKey.Valid || !maxSplitKey.Valid { 136 | return 0, 0, nil 137 | } 138 | 139 | return minSplitKey.Int64, maxSplitKey.Int64, nil 140 | } 141 | 142 | func (p *OracleSource) GetMinMaxTimeSplitKey() (string, string, error) { 143 | err := p.SwitchDatabase() 144 | if err != nil { 145 | return "", "", err 146 | } 147 | rows, err := p.db.Query(fmt.Sprintf("select min(%s), max(%s) from %s.%s WHERE %s", p.cfg.SourceSplitTimeKey, 148 | p.cfg.SourceSplitTimeKey, p.cfg.SourceDB, p.cfg.SourceTable, p.cfg.SourceWhereCondition)) 149 | if err != nil { 150 | return "", "", err 151 | } 152 | defer rows.Close() 153 | 154 | var minSplitKey, maxSplitKey string 155 | for rows.Next() { 156 | err = rows.Scan(&minSplitKey, &maxSplitKey) 157 | if err != nil { 158 | return "", "", err 159 | } 160 | } 161 | return minSplitKey, maxSplitKey, nil 162 | } 163 | 164 | func (p *OracleSource) DeleteAfterSync() error { 165 | err := p.SwitchDatabase() 166 | if err != nil { 167 | return err 168 | } 169 | if p.cfg.DeleteAfterSync { 170 | _, err := p.db.Exec(fmt.Sprintf("delete from %s.%s where %s", 171 | p.cfg.SourceDB, p.cfg.SourceTable, p.cfg.SourceWhereCondition)) 172 | if err != nil { 173 | return err 174 | } 175 | } 176 | return nil 177 | } 178 | 179 | func (p *OracleSource) QueryTableData(threadNum int, conditionSql string) ([][]interface{}, []string, error) { 180 | startTime := time.Now() 181 | err := p.SwitchDatabase() 182 | 
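// Re-open the connection so the query below runs against the database currently selected in cfg.SourceDB.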
if err != nil { 183 | return nil, nil, err 184 | } 185 | execSql := fmt.Sprintf("SELECT * FROM %s.%s WHERE %s", 186 | p.cfg.SourceDB, p.cfg.SourceTable, conditionSql) 187 | if p.cfg.SourceWhereCondition != "" && p.cfg.SourceSplitKey != "" { 188 | execSql = fmt.Sprintf("%s AND %s", execSql, p.cfg.SourceWhereCondition) 189 | } 190 | rows, err := p.db.Query(execSql) 191 | if err != nil { 192 | return nil, nil, err 193 | } 194 | defer rows.Close() 195 | columns, err := rows.Columns() 196 | if err != nil { 197 | return nil, nil, err 198 | } 199 | 200 | columnTypes, err := rows.ColumnTypes() 201 | if err != nil { 202 | return nil, nil, err 203 | } 204 | 205 | scanArgs := make([]interface{}, len(columns)) 206 | for i, columnType := range columnTypes { 207 | switch columnType.DatabaseTypeName() { 208 | case "INT", "SMALLINT", "TINYINT", "MEDIUMINT", "BIGINT", "INT4", "INT8", "NUMBER": 209 | scanArgs[i] = new(sql.NullInt64) 210 | case "UNSIGNED INT", "UNSIGNED TINYINT", "UNSIGNED MEDIUMINT", "UNSIGNED BIGINT": 211 | scanArgs[i] = new(sql.NullInt64) 212 | case "FLOAT", "DOUBLE", "FLOAT8": 213 | scanArgs[i] = new(sql.NullFloat64) 214 | case "DECIMAL", "NUMERIC": 215 | scanArgs[i] = new(sql.NullFloat64) 216 | case "CHAR", "VARCHAR", "VARCHAR2", "TEXT", "TINYTEXT", "MEDIUMTEXT", "LONGTEXT": 217 | scanArgs[i] = new(sql.NullString) 218 | case "DATE", "TIME", "DATETIME", "TIMESTAMP": 219 | scanArgs[i] = new(sql.NullString) // or use time.Time 220 | case "BOOL", "BOOLEAN": 221 | scanArgs[i] = new(sql.NullBool) 222 | default: 223 | scanArgs[i] = new(sql.RawBytes) 224 | } 225 | } 226 | 227 | var result [][]interface{} 228 | //rowCount, err := s.GetRowsCountByConditionSql(conditionSql) 229 | //if err != nil { 230 | // return nil, nil, err 231 | //} 232 | //result := make([][]interface{}, rowCount) 233 | for rows.Next() { 234 | err = rows.Scan(scanArgs...) 
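// rows.Scan filled the NULL-aware holders chosen above; the loop below unwraps them, normalizing Oracle booleans to 0/1.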
		if err != nil {
			return nil, nil, err
		}

		row := make([]interface{}, len(columns))
		for i, v := range scanArgs {
			switch v := v.(type) {
			case *int:
				row[i] = *v
			case *string:
				row[i] = *v
			case *sql.NullString:
				if v.Valid {
					row[i] = v.String
				} else {
					row[i] = nil
				}
			case *bool:
				row[i] = *v
			case *sql.NullInt64:
				if v.Valid {
					row[i] = v.Int64
				} else {
					row[i] = nil
				}
			case *sql.NullFloat64:
				if v.Valid {
					row[i] = v.Float64
				} else {
					row[i] = nil
				}
			case *sql.NullBool:
				if v.Valid {
					if v.Bool {
						row[i] = 1
					} else {
						row[i] = 0
					}
				} else {
					row[i] = nil
				}
			case *float64:
				row[i] = *v
			case *sql.RawBytes:
				row[i] = string(*v)
			}
		}
		result = append(result, row)
	}

	if err = rows.Err(); err != nil {
		return nil, nil, err
	}
	p.statsRecorder.RecordMetric(len(result))
	stats := p.statsRecorder.Stats(time.Since(startTime))
	log.Printf("thread-%d: extract %d rows (%f rows/s)", threadNum, len(result), stats.RowsPerSecondd)

	return result, columns, nil
}

func (p *OracleSource) GetDatabasesAccordingToSourceDbRegex(sourceDatabasePattern string) ([]string, error) {
	rows, err := p.db.Query("SELECT username AS schema_name FROM all_users")
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var databases []string
	for rows.Next() {
		var database string
		err = rows.Scan(&database)
		if err != nil {
			return nil, err
		}
		match, err := regexp.MatchString(sourceDatabasePattern, database)
		if err != nil {
			return nil, err
		}
		if match {
			databases = append(databases, database)
		}
	}
	return databases, nil
}

func (p *OracleSource) GetTablesAccordingToSourceTableRegex(sourceTablePattern string, databases []string) (map[string][]string, error) {
	dbTables := make(map[string][]string)
	for _, database := range databases {
		p.cfg.SourceDB = database
		err := p.SwitchDatabase()
		if err != nil {
			return nil, err
		}
		rows, err := p.db.Query(fmt.Sprintf("SELECT table_name FROM ALL_TABLES WHERE OWNER = '%s'", database))
		if err != nil {
			return nil, err
		}

		var tables []string
		for rows.Next() {
			var table string
			err = rows.Scan(&table)
			if err != nil {
				rows.Close()
				return nil, err
			}
			match, err := regexp.MatchString(sourceTablePattern, table)
			if err != nil {
				rows.Close()
				return nil, err
			}
			if match {
				tables = append(tables, table)
			}
		}
		rows.Close() // close inside the loop; a deferred Close would leak until return
		dbTables[database] = tables
	}
	return dbTables, nil
}

func (p *OracleSource) GetAllSourceReadRowsCount() (int, error) {
	allCount := 0

	dbTables, err := p.GetDbTablesAccordingToSourceDbTables()
	if err != nil {
		return 0, err
	}
	for db, tables := range dbTables {
		p.cfg.SourceDB = db
		for _, table := range tables {
			p.cfg.SourceTable = table
			count, err := p.GetSourceReadRowsCount()
			if err != nil {
				return 0, err
			}
			allCount += count
		}
	}

	return allCount, nil
}

func (p *OracleSource)
GetDbTablesAccordingToSourceDbTables() (map[string][]string, error) { 378 | allDbTables := make(map[string][]string) 379 | for _, sourceDbTable := range p.cfg.SourceDbTables { 380 | dbTable := strings.Split(sourceDbTable, "@") // because `.` in regex is a special character, so use `@` to split 381 | if len(dbTable) != 2 { 382 | return nil, fmt.Errorf("invalid sourceDbTable: %s, should be a.b format", sourceDbTable) 383 | } 384 | dbs, err := p.GetDatabasesAccordingToSourceDbRegex(dbTable[0]) 385 | if err != nil { 386 | return nil, fmt.Errorf("get databases according to sourceDbRegex failed: %v", err) 387 | } 388 | dbTables, err := p.GetTablesAccordingToSourceTableRegex(dbTable[1], dbs) 389 | if err != nil { 390 | return nil, fmt.Errorf("get tables according to sourceTableRegex failed: %v", err) 391 | } 392 | for db, tables := range dbTables { 393 | allDbTables[db] = append(allDbTables[db], tables...) 394 | } 395 | } 396 | return allDbTables, nil 397 | } 398 | -------------------------------------------------------------------------------- /source/postgres.go: -------------------------------------------------------------------------------- 1 | package source 2 | 3 | import ( 4 | "database/sql" 5 | "fmt" 6 | "log" 7 | "regexp" 8 | "strings" 9 | "time" 10 | 11 | _ "github.com/lib/pq" 12 | "github.com/sirupsen/logrus" 13 | 14 | "github.com/databendcloud/db-archiver/config" 15 | ) 16 | 17 | type PostgresSource struct { 18 | db *sql.DB 19 | cfg *config.Config 20 | statsRecorder *DatabendSourceStatsRecorder 21 | } 22 | 23 | func (p *PostgresSource) AdjustBatchSizeAccordingToSourceDbTable() int64 { 24 | minSplitKey, maxSplitKey, err := p.GetMinMaxSplitKey() 25 | if err != nil { 26 | return p.cfg.BatchSize 27 | } 28 | sourceTableRowCount, err := p.GetSourceReadRowsCount() 29 | if err != nil { 30 | return p.cfg.BatchSize 31 | } 32 | rangeSize := maxSplitKey - minSplitKey + 1 33 | switch { 34 | case int64(sourceTableRowCount) <= p.cfg.BatchSize: 35 | return rangeSize 36 | case rangeSize/int64(sourceTableRowCount) >= 10: 37 | return p.cfg.BatchSize * 5 38 | case rangeSize/int64(sourceTableRowCount) >= 100: 39 | return p.cfg.BatchSize * 20 40 | default: 41 | return p.cfg.BatchSize 42 | } 43 | } 44 | 45 | func NewPostgresSource(cfg *config.Config) (*PostgresSource, error) { 46 | stats := NewDatabendIntesterStatsRecorder() 47 | // disable - No SSL 48 | //require - Always SSL (skip verification) 49 | //verify-ca - Always SSL (verify that the certificate presented by the server was signed by a trusted CA) 50 | //verify-full - Always SSL (verify that the certification presented by the server was signed by a trusted CA and the server host name matches the one in the certificate) 51 | if cfg.SSLMode == "" { 52 | cfg.SSLMode = "disable" 53 | } 54 | db, err := sql.Open("postgres", fmt.Sprintf("postgres://%s:%s@%s:%d/postgres?sslmode=%s", 55 | cfg.SourceUser, 56 | cfg.SourcePass, 57 | cfg.SourceHost, 58 | cfg.SourcePort, 59 | cfg.SSLMode)) 60 | if err != nil { 61 | logrus.Errorf("failed to open postgres db: %v", err) 62 | return nil, err 63 | } 64 | return &PostgresSource{ 65 | db: db, 66 | cfg: cfg, 67 | statsRecorder: stats, 68 | }, nil 69 | } 70 | 71 | func (p *PostgresSource) SwitchDatabase() error { 72 | // Close the current connection 73 | err := p.db.Close() 74 | if err != nil { 75 | return err 76 | } 77 | 78 | // Open a new connection to the new database 79 | db, err := sql.Open("postgres", fmt.Sprintf("postgres://%s:%s@%s:%d/%s?sslmode=%s", 80 | p.cfg.SourceUser, 81 | p.cfg.SourcePass, 82 | 
p.cfg.SourceHost,
		p.cfg.SourcePort,
		p.cfg.SourceDB,
		p.cfg.SSLMode))
	if err != nil {
		return err
	}

	// Replace the old connection with the new one
	p.db = db
	return nil
}

func (p *PostgresSource) GetSourceReadRowsCount() (int, error) {
	err := p.SwitchDatabase()
	if err != nil {
		return 0, err
	}
	row := p.db.QueryRow(fmt.Sprintf("SELECT count(*) FROM %s WHERE %s",
		p.cfg.SourceTable, p.cfg.SourceWhereCondition))
	var rowCount int
	err = row.Scan(&rowCount)
	if err != nil {
		return 0, err
	}

	return rowCount, nil
}

func (p *PostgresSource) GetMinMaxSplitKey() (int64, int64, error) {
	err := p.SwitchDatabase()
	if err != nil {
		return 0, 0, err
	}
	rows, err := p.db.Query(fmt.Sprintf("select COALESCE(min(%s),0), COALESCE(max(%s),0) from %s WHERE %s",
		p.cfg.SourceSplitKey, p.cfg.SourceSplitKey, p.cfg.SourceTable, p.cfg.SourceWhereCondition))
	if err != nil {
		return 0, 0, err
	}
	defer rows.Close()

	var minSplitKey, maxSplitKey sql.NullInt64
	for rows.Next() {
		err = rows.Scan(&minSplitKey, &maxSplitKey)
		if err != nil {
			return 0, 0, err
		}
	}

	// Check if minSplitKey and maxSplitKey are valid (not NULL)
	if !minSplitKey.Valid || !maxSplitKey.Valid {
		return 0, 0, nil
	}

	return minSplitKey.Int64, maxSplitKey.Int64, nil
}

func (p *PostgresSource) GetMinMaxTimeSplitKey() (string, string, error) {
	err := p.SwitchDatabase()
	if err != nil {
		return "", "", err
	}
	rows, err := p.db.Query(fmt.Sprintf("select min(%s), max(%s) from %s WHERE %s", p.cfg.SourceSplitTimeKey,
		p.cfg.SourceSplitTimeKey, p.cfg.SourceTable, p.cfg.SourceWhereCondition))
	if err != nil {
		return "", "", err
	}
	defer rows.Close()

	var minSplitKey, maxSplitKey string
	for rows.Next() {
		err = rows.Scan(&minSplitKey, &maxSplitKey)
		if err != nil {
			return "", "", err
		}
	}
	return minSplitKey, maxSplitKey, nil
}

func (p *PostgresSource) DeleteAfterSync() error {
	err := p.SwitchDatabase()
	if err != nil {
		return err
	}
	if p.cfg.DeleteAfterSync {
		_, err := p.db.Exec(fmt.Sprintf("delete from %s where %s",
			p.cfg.SourceTable, p.cfg.SourceWhereCondition))
		if err != nil {
			return err
		}
	}
	return nil
}

func (p *PostgresSource) QueryTableData(threadNum int, conditionSql string) ([][]interface{}, []string, error) {
	startTime := time.Now()
	err := p.SwitchDatabase()
	if err != nil {
		return nil, nil, err
	}
	execSql := fmt.Sprintf("SELECT * FROM %s WHERE %s",
		p.cfg.SourceTable, conditionSql)
	if p.cfg.SourceWhereCondition != "" && p.cfg.SourceSplitKey != "" {
		execSql = fmt.Sprintf("%s AND %s", execSql, p.cfg.SourceWhereCondition)
	}
	rows, err := p.db.Query(execSql)
	if err != nil {
		return nil, nil, err
	}
	defer rows.Close()
	columns, err := rows.Columns()
	if err != nil {
		return nil, nil, err
	}

	columnTypes, err := rows.ColumnTypes()
	if err != nil {
		return nil, nil, err
	}

	scanArgs := make([]interface{}, len(columns))
	for i, columnType := range columnTypes {
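		// Editor's note: the DatabaseTypeName values matched below are driver-reported
		// names. lib/pq typically reports Postgres types as INT4, INT8, FLOAT8, NUMERIC,
		// VARCHAR, TEXT, TIMESTAMP and BOOL, which is why both MySQL- and Postgres-style
		// names appear in the case lists; anything unmatched falls back to sql.RawBytes.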
204 | switch columnType.DatabaseTypeName() { 205 | case "INT", "SMALLINT", "TINYINT", "MEDIUMINT", "BIGINT", "INT4", "INT8": 206 | scanArgs[i] = new(sql.NullInt64) 207 | case "UNSIGNED INT", "UNSIGNED TINYINT", "UNSIGNED MEDIUMINT", "UNSIGNED BIGINT": 208 | scanArgs[i] = new(sql.NullInt64) 209 | case "FLOAT", "DOUBLE", "FLOAT8": 210 | scanArgs[i] = new(sql.NullFloat64) 211 | case "DECIMAL", "NUMERIC": 212 | scanArgs[i] = new(sql.NullFloat64) 213 | case "CHAR", "VARCHAR", "TEXT", "TINYTEXT", "MEDIUMTEXT", "LONGTEXT": 214 | scanArgs[i] = new(sql.NullString) 215 | case "DATE", "TIME", "DATETIME", "TIMESTAMP": 216 | scanArgs[i] = new(sql.NullString) // or use time.Time 217 | case "BOOL", "BOOLEAN": 218 | scanArgs[i] = new(sql.NullBool) 219 | default: 220 | scanArgs[i] = new(sql.RawBytes) 221 | } 222 | } 223 | 224 | var result [][]interface{} 225 | //rowCount, err := s.GetRowsCountByConditionSql(conditionSql) 226 | //if err != nil { 227 | // return nil, nil, err 228 | //} 229 | //result := make([][]interface{}, rowCount) 230 | for rows.Next() { 231 | err = rows.Scan(scanArgs...) 232 | if err != nil { 233 | return nil, nil, err 234 | } 235 | 236 | row := make([]interface{}, len(columns)) 237 | for i, v := range scanArgs { 238 | switch v := v.(type) { 239 | case *int: 240 | row[i] = *v 241 | case *string: 242 | row[i] = *v 243 | case *sql.NullString: 244 | if v.Valid { 245 | row[i] = v.String 246 | } else { 247 | row[i] = nil 248 | } 249 | case *bool: 250 | row[i] = *v 251 | case *sql.NullInt64: 252 | if v.Valid { 253 | row[i] = v.Int64 254 | } else { 255 | row[i] = nil 256 | } 257 | case *sql.NullFloat64: 258 | if v.Valid { 259 | row[i] = v.Float64 260 | } else { 261 | row[i] = nil 262 | } 263 | case *sql.NullBool: 264 | if v.Valid { 265 | //row[i] = v.Bool 266 | if v.Bool { 267 | row[i] = 1 268 | } else { 269 | row[i] = 0 270 | } 271 | } else { 272 | row[i] = nil 273 | } 274 | case *float64: 275 | row[i] = *v 276 | case *sql.RawBytes: 277 | row[i] = string(*v) 278 | } 279 | } 280 | result = append(result, row) 281 | } 282 | 283 | if err = rows.Err(); err != nil { 284 | return nil, nil, err 285 | } 286 | p.statsRecorder.RecordMetric(len(result)) 287 | stats := p.statsRecorder.Stats(time.Since(startTime)) 288 | log.Printf("thread-%d: extract %d rows (%f rows/s)", threadNum, len(result), stats.RowsPerSecondd) 289 | 290 | return result, columns, nil 291 | } 292 | 293 | func (p *PostgresSource) GetDatabasesAccordingToSourceDbRegex(sourceDatabasePattern string) ([]string, error) { 294 | rows, err := p.db.Query("SELECT datname FROM pg_database") 295 | if err != nil { 296 | return nil, err 297 | } 298 | defer rows.Close() 299 | 300 | var databases []string 301 | for rows.Next() { 302 | var database string 303 | err = rows.Scan(&database) 304 | if err != nil { 305 | return nil, err 306 | } 307 | match, err := regexp.MatchString(sourceDatabasePattern, database) 308 | if err != nil { 309 | return nil, err 310 | } 311 | if match { 312 | databases = append(databases, database) 313 | } 314 | } 315 | return databases, nil 316 | } 317 | 318 | func (p *PostgresSource) GetTablesAccordingToSourceTableRegex(sourceTablePattern string, databases []string) (map[string][]string, error) { 319 | dbTables := make(map[string][]string) 320 | for _, database := range databases { 321 | p.cfg.SourceDB = database 322 | err := p.SwitchDatabase() 323 | if err != nil { 324 | return nil, err 325 | } 326 | rows, err := p.db.Query(fmt.Sprintf("SELECT tablename FROM pg_catalog.pg_tables WHERE schemaname != 'pg_catalog' AND schemaname 
!= 'information_schema'"))
		if err != nil {
			return nil, err
		}

		var tables []string
		for rows.Next() {
			var table string
			err = rows.Scan(&table)
			if err != nil {
				rows.Close()
				return nil, err
			}
			match, err := regexp.MatchString(sourceTablePattern, table)
			if err != nil {
				rows.Close()
				return nil, err
			}
			if match {
				tables = append(tables, table)
			}
		}
		rows.Close() // close inside the loop; a deferred Close would leak until return
		dbTables[database] = tables
	}
	return dbTables, nil
}

func (p *PostgresSource) GetAllSourceReadRowsCount() (int, error) {
	allCount := 0

	dbTables, err := p.GetDbTablesAccordingToSourceDbTables()
	if err != nil {
		return 0, err
	}
	for db, tables := range dbTables {
		p.cfg.SourceDB = db
		for _, table := range tables {
			p.cfg.SourceTable = table
			count, err := p.GetSourceReadRowsCount()
			if err != nil {
				return 0, err
			}
			allCount += count
		}
	}

	return allCount, nil
}

func (p *PostgresSource) GetDbTablesAccordingToSourceDbTables() (map[string][]string, error) {
	allDbTables := make(map[string][]string)
	for _, sourceDbTable := range p.cfg.SourceDbTables {
		dbTable := strings.Split(sourceDbTable, "@") // because `.` in regex is a special character, so use `@` to split
		if len(dbTable) != 2 {
			return nil, fmt.Errorf("invalid sourceDbTable: %s, should be a.b format", sourceDbTable)
		}
		dbs, err := p.GetDatabasesAccordingToSourceDbRegex(dbTable[0])
		if err != nil {
			return nil, fmt.Errorf("get databases according to sourceDbRegex failed: %v", err)
		}
		dbTables, err := p.GetTablesAccordingToSourceTableRegex(dbTable[1], dbs)
		if err != nil {
			return nil, fmt.Errorf("get tables according to sourceTableRegex failed: %v", err)
		}
		for db, tables := range dbTables {
			allDbTables[db] = append(allDbTables[db], tables...)
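			// Note: when several sourceDbTables patterns match the same database, the
			// tables are appended here without de-duplication; the SQL Server source
			// further below de-duplicates the same merge with a set.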
		}
	}
	return allDbTables, nil
}
--------------------------------------------------------------------------------
/source/postgres_test.go:
--------------------------------------------------------------------------------
package source

import (
	"database/sql"
	"fmt"
	"testing"

	"github.com/test-go/testify/assert"

	"github.com/databendcloud/db-archiver/config"
	"github.com/databendcloud/db-archiver/utils/testutils"
)

type postgresSourceTest struct {
	postgresSource PostgresSource
}

var postgresPort = 15432

func setupPostgresSourceTest() (*postgresSourceTest, func()) {
	pgDsn, tearDownFunc := testutils.PostgresForTest()
	sourceDbTables := []string{"mydb.*@test_table.*"}
	db, err := sql.Open("postgres", pgDsn)
	if err != nil {
		panic(err)
	}
	_, err = db.Exec(`
DO $$
BEGIN
	IF NOT EXISTS (
		SELECT FROM pg_database WHERE datname = 'mydb'
	) THEN
		CREATE DATABASE mydb;
	END IF;
END
$$;`)
	if err != nil {
		panic(err)
	}

	db, err = sql.Open("postgres", fmt.Sprintf("postgres://postgres:postgres@localhost:%d/mydb?sslmode=disable&client_encoding=UTF8", postgresPort))
	if err != nil {
		panic(err)
	}
	_, err = db.Exec("CREATE TABLE test_table (id SERIAL primary key, name varchar(255), ts timestamp default current_timestamp)")
	if err != nil {
		panic(err)
	}
	_, err = db.Exec("INSERT INTO test_table (name) VALUES ('test')")
	if err != nil {
		panic(err)
	}
	_, err = db.Exec("INSERT INTO test_table (name) VALUES ('test2')")
	if err != nil {
		panic(err)
	}
	cfg := &config.Config{
		DatabendDSN:          "http://databend:databend@localhost:8080",
		DatabendTable:        "default.test_table",
		DatabaseType:         "postgres",
		SourceHost:           "localhost",
		SourcePort:           postgresPort,
		SourceUser:           "postgres",
		SourcePass:           "postgres",
		SourceDbTables:       sourceDbTables,
		SourceDB:             "mydb",
		SourceWhereCondition: "id > 0",
		SSLMode:              "disable",
		SourceTable:          "test_table",
		SourceSplitKey:       "id",
		SourceSplitTimeKey:   "ts",
		BatchSize:            1000,
		BatchMaxInterval:     3,
		MaxThread:            2,
		CopyPurge:            true,
	}
	source, err := NewPostgresSource(cfg)
	if err != nil {
		panic(err)
	}
	return &postgresSourceTest{
		postgresSource: *source,
	}, tearDownFunc
}

func TestPostgresSource_GetDbTablesAccordingToSourceDbTables(t *testing.T) {
	postgresSourceTest, tearDownFunc := setupPostgresSourceTest()
	defer tearDownFunc()
	tables, err := postgresSourceTest.postgresSource.GetDbTablesAccordingToSourceDbTables()
	assert.NoError(t, err)
	assert.Equal(t, 1, len(tables))
	assert.Equal(t, []string{"test_table"}, tables["mydb"])
}

func TestPostgresSource_GetTablesAccordingToSourceTableRegex(t *testing.T) {
	postgresSourceTest, tearDownFunc := setupPostgresSourceTest()
	defer tearDownFunc()
	tables, err := postgresSourceTest.postgresSource.GetTablesAccordingToSourceTableRegex("test_table", []string{"mydb"})
	assert.NoError(t, err)
	assert.Equal(t, 1, len(tables))
	assert.Equal(t, "test_table", tables["mydb"][0])
}

func TestPostgresSource_GetDatabasesAccordingToSourceDbRegex(t *testing.T) {
	postgresSourceTest, tearDownFunc := setupPostgresSourceTest()
	defer tearDownFunc()
	dbs, err := postgresSourceTest.postgresSource.GetDatabasesAccordingToSourceDbRegex("mydb")
	assert.NoError(t, err)
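	// Editor's note: regexp.MatchString is unanchored, so the pattern "mydb" would also
	// match a database named e.g. "mydb_archive"; use "^mydb$" when an exact match is needed.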
104 | assert.Equal(t, 1, len(dbs)) 105 | assert.Equal(t, "mydb", dbs[0]) 106 | } 107 | 108 | func TestPostgresSource_GetSourceReadRowsCount(t *testing.T) { 109 | postgresSourceTest, tearDownFunc := setupPostgresSourceTest() 110 | defer tearDownFunc() 111 | count, err := postgresSourceTest.postgresSource.GetSourceReadRowsCount() 112 | assert.NoError(t, err) 113 | assert.Equal(t, 2, count) 114 | } 115 | 116 | func TestPostgresSource_GetAllSourceReadRowsCount(t *testing.T) { 117 | postgresSourceTest, tearDownFunc := setupPostgresSourceTest() 118 | defer tearDownFunc() 119 | count, err := postgresSourceTest.postgresSource.GetAllSourceReadRowsCount() 120 | assert.NoError(t, err) 121 | assert.Equal(t, 2, count) 122 | } 123 | 124 | func TestPostgresSource_QueryTableData(t *testing.T) { 125 | postgresSourceTest, tearDownFunc := setupPostgresSourceTest() 126 | defer tearDownFunc() 127 | data, columns, err := postgresSourceTest.postgresSource.QueryTableData(1, "id > 0") 128 | assert.NoError(t, err) 129 | assert.Equal(t, 2, len(data)) 130 | assert.Equal(t, 3, len(columns)) 131 | assert.Equal(t, "id", columns[0]) 132 | assert.Equal(t, "name", columns[1]) 133 | t.Log(data) 134 | assert.Equal(t, int64(1), data[0][0]) 135 | } 136 | 137 | func TestPostgresSource_GetMinMaxSplitKey(t *testing.T) { 138 | postgresSourceTest, tearDownFunc := setupPostgresSourceTest() 139 | defer tearDownFunc() 140 | min, max, err := postgresSourceTest.postgresSource.GetMinMaxSplitKey() 141 | assert.NoError(t, err) 142 | assert.Equal(t, int64(1), min) 143 | assert.Equal(t, int64(2), max) 144 | } 145 | 146 | func TestPostgresSource_GetMinMaxTimeSplitKey(t *testing.T) { 147 | postgresSourceTest, tearDownFunc := setupPostgresSourceTest() 148 | defer tearDownFunc() 149 | min, max, err := postgresSourceTest.postgresSource.GetMinMaxTimeSplitKey() 150 | assert.NoError(t, err) 151 | assert.NotEmpty(t, min) 152 | assert.NotEmpty(t, max) 153 | } 154 | 155 | func TestPostgresSource_DeleteAfterSync(t *testing.T) { 156 | postgresSourceTest, tearDownFunc := setupPostgresSourceTest() 157 | defer tearDownFunc() 158 | postgresSourceTest.postgresSource.cfg.DeleteAfterSync = true 159 | err := postgresSourceTest.postgresSource.DeleteAfterSync() 160 | assert.NoError(t, err) 161 | } 162 | 163 | func TestPostgresSource_SwitchDatabase(t *testing.T) { 164 | postgresSourceTest, tearDownFunc := setupPostgresSourceTest() 165 | defer tearDownFunc() 166 | err := postgresSourceTest.postgresSource.SwitchDatabase() 167 | assert.NoError(t, err) 168 | } 169 | 170 | func TestPostgresSource_AdjustBatchSizeAccordingToSourceDbTable(t *testing.T) { 171 | postgresSourceTest, tearDownFunc := setupPostgresSourceTest() 172 | defer tearDownFunc() 173 | batchSize := postgresSourceTest.postgresSource.AdjustBatchSizeAccordingToSourceDbTable() 174 | assert.Equal(t, int64(2), batchSize) 175 | } 176 | -------------------------------------------------------------------------------- /source/source.go: -------------------------------------------------------------------------------- 1 | package source 2 | 3 | import ( 4 | "bufio" 5 | "encoding/json" 6 | "fmt" 7 | "os" 8 | "time" 9 | 10 | _ "github.com/denisenkom/go-mssqldb" 11 | _ "github.com/go-sql-driver/mysql" 12 | "github.com/sirupsen/logrus" 13 | 14 | "github.com/databendcloud/db-archiver/config" 15 | ) 16 | 17 | type Sourcer interface { 18 | AdjustBatchSizeAccordingToSourceDbTable() int64 19 | GetSourceReadRowsCount() (int, error) 20 | GetMinMaxSplitKey() (int64, int64, error) 21 | GetMinMaxTimeSplitKey() (string, string, 
error) 22 | DeleteAfterSync() error 23 | QueryTableData(threadNum int, conditionSql string) ([][]interface{}, []string, error) 24 | GetDatabasesAccordingToSourceDbRegex(sourceDatabasePattern string) ([]string, error) 25 | GetTablesAccordingToSourceTableRegex(sourceTablePattern string, databases []string) (map[string][]string, error) 26 | GetAllSourceReadRowsCount() (int, error) 27 | GetDbTablesAccordingToSourceDbTables() (map[string][]string, error) 28 | } 29 | 30 | func NewSource(cfg *config.Config) (Sourcer, error) { 31 | switch cfg.DatabaseType { 32 | case "mysql": 33 | return NewMysqlSource(cfg) 34 | case "tidb": 35 | return NewMysqlSource(cfg) 36 | case "pg": 37 | return NewPostgresSource(cfg) 38 | case "oracle": 39 | return NewOracleSource(cfg) 40 | case "mssql": 41 | return NewSqlServerSource(cfg) 42 | default: 43 | return NewMysqlSource(cfg) 44 | } 45 | } 46 | 47 | func SlimCondition(maxThread int, minSplitKey, maxSplitKey int64) [][]int64 { 48 | var conditions [][]int64 49 | if minSplitKey > maxSplitKey { 50 | return conditions 51 | } 52 | rangeSize := (maxSplitKey - minSplitKey) / int64(maxThread) 53 | for i := 0; i < maxThread; i++ { 54 | lowerBound := minSplitKey + rangeSize*int64(i) 55 | upperBound := lowerBound + rangeSize 56 | if i == maxThread-1 { 57 | // Ensure the last condition includes maxSplitKey 58 | upperBound = maxSplitKey 59 | } 60 | conditions = append(conditions, []int64{lowerBound, upperBound}) 61 | } 62 | return conditions 63 | } 64 | 65 | func SplitCondition(sourceSplitKey string, batchSize, minSplitKey, maxSplitKey int64) []string { 66 | var conditions []string 67 | for { 68 | if minSplitKey >= maxSplitKey { 69 | conditions = append(conditions, fmt.Sprintf("(%s >= %d and %s <= %d)", sourceSplitKey, minSplitKey, sourceSplitKey, maxSplitKey)) 70 | break 71 | } 72 | conditions = append(conditions, fmt.Sprintf("(%s >= %d and %s < %d)", sourceSplitKey, minSplitKey, sourceSplitKey, minSplitKey+batchSize)) 73 | minSplitKey += batchSize 74 | } 75 | return conditions 76 | } 77 | 78 | func SplitConditionAccordingMaxGoRoutine(sourceSplitKey string, batchSize, minSplitKey, maxSplitKey, allMax int64) <-chan string { 79 | conditions := make(chan string, 100) // make a buffered channel 80 | 81 | go func() { 82 | defer close(conditions) // make sure close channel 83 | 84 | if minSplitKey > maxSplitKey { 85 | return 86 | } 87 | 88 | for { 89 | if (minSplitKey + batchSize - 1) >= maxSplitKey { 90 | if minSplitKey > allMax { 91 | return 92 | } 93 | if maxSplitKey == allMax { 94 | conditions <- fmt.Sprintf("(%s >= %d and %s <= %d)", sourceSplitKey, minSplitKey, sourceSplitKey, maxSplitKey) 95 | } else { 96 | conditions <- fmt.Sprintf("(%s >= %d and %s < %d)", sourceSplitKey, minSplitKey, sourceSplitKey, maxSplitKey) 97 | } 98 | break 99 | } 100 | if (minSplitKey + batchSize - 1) >= allMax { 101 | conditions <- fmt.Sprintf("(%s >= %d and %s <= %d)", sourceSplitKey, minSplitKey, sourceSplitKey, allMax) 102 | return 103 | } 104 | conditions <- fmt.Sprintf("(%s >= %d and %s < %d)", sourceSplitKey, minSplitKey, sourceSplitKey, minSplitKey+batchSize-1) 105 | minSplitKey += batchSize - 1 106 | } 107 | }() 108 | 109 | return conditions 110 | } 111 | 112 | func SplitTimeConditionsByMaxThread(conditions []string, maxThread int) [][]string { 113 | // If maxThread is greater than the length of conditions, return conditions as a single group 114 | if maxThread >= len(conditions) { 115 | return [][]string{conditions} 116 | } 117 | var splitConditions [][]string 118 | chunkSize := 
(len(conditions) + maxThread - 1) / maxThread 119 | for i := 0; i < len(conditions); i += chunkSize { 120 | end := i + chunkSize 121 | if end > len(conditions) { 122 | end = len(conditions) 123 | } 124 | splitConditions = append(splitConditions, conditions[i:end]) 125 | } 126 | return splitConditions 127 | } 128 | 129 | func SplitConditionAccordingToTimeSplitKey(cfg *config.Config, minTimeSplitKey, maxTimeSplitKey string) ([]string, error) { 130 | var conditions []string 131 | 132 | // Parse the time strings 133 | minTime, err := parseTimeDynamic(minTimeSplitKey) 134 | if err != nil { 135 | return nil, err 136 | } 137 | 138 | maxTime, err := parseTimeDynamic(maxTimeSplitKey) 139 | if err != nil { 140 | return nil, err 141 | } 142 | if minTime.After(maxTime) { 143 | return conditions, nil 144 | } 145 | 146 | // Iterate over the time range 147 | for { 148 | if minTime.After(maxTime) { 149 | conditions = append(conditions, fmt.Sprintf("(%s >= '%s' and %s <= '%s')", cfg.SourceSplitTimeKey, minTime.Format("2006-01-02 15:04:05"), cfg.SourceSplitTimeKey, maxTime.Format("2006-01-02 15:04:05"))) 150 | break 151 | } 152 | if minTime.Equal(maxTime) { 153 | conditions = append(conditions, fmt.Sprintf("(%s >= '%s' and %s <= '%s')", cfg.SourceSplitTimeKey, minTime.Format("2006-01-02 15:04:05"), cfg.SourceSplitTimeKey, maxTime.Format("2006-01-02 15:04:05"))) 154 | break 155 | } 156 | conditions = append(conditions, fmt.Sprintf("(%s >= '%s' and %s < '%s')", cfg.SourceSplitTimeKey, minTime.Format("2006-01-02 15:04:05"), cfg.SourceSplitTimeKey, minTime.Add(cfg.GetTimeRangeBySplitUnit()).Format("2006-01-02 15:04:05"))) 157 | minTime = minTime.Add(cfg.GetTimeRangeBySplitUnit()) 158 | } 159 | 160 | return conditions, nil 161 | } 162 | 163 | func GenerateJSONFile(columns []string, data [][]interface{}) (string, int, error) { 164 | l := logrus.WithFields(logrus.Fields{"tardatabend": "IngestData"}) 165 | var batchJsonData []string 166 | 167 | for _, row := range data { 168 | if len(row) == 0 { 169 | continue 170 | } 171 | rowMap := make(map[string]interface{}) 172 | for i, column := range columns { 173 | rowMap[column] = row[i] 174 | } 175 | jsonData, err := json.Marshal(rowMap) 176 | if err != nil { 177 | return "", 0, err 178 | } 179 | batchJsonData = append(batchJsonData, string(jsonData)) 180 | } 181 | 182 | fileName, bytesSize, err := generateNDJsonFile(batchJsonData) 183 | if err != nil { 184 | l.Errorf("generate NDJson file failed: %v\n", err) 185 | return "", 0, err 186 | } 187 | return fileName, bytesSize, nil 188 | } 189 | 190 | func generateNDJsonFile(batchJsonData []string) (string, int, error) { 191 | fileName := fmt.Sprintf("databend-ingest-%d-", time.Now().UnixNano()) 192 | outputFile, err := os.CreateTemp("/tmp", fileName+"*.ndjson") 193 | if err != nil { 194 | return "", 0, err 195 | } 196 | defer outputFile.Close() 197 | 198 | // Create a buffered writer for the Ndjson file 199 | writer := bufio.NewWriter(outputFile) 200 | bytesSum := 0 201 | 202 | for _, data := range batchJsonData { 203 | n, err := writer.WriteString(data + "\n") 204 | if err != nil { 205 | return "", 0, err 206 | } 207 | bytesSum += n 208 | } 209 | // Flush any remaining data to the NDJson file 210 | err = writer.Flush() 211 | if err != nil { 212 | return "", 0, err 213 | } 214 | return outputFile.Name(), bytesSum, err 215 | } 216 | 217 | func parseTimeDynamic(timeStr string) (time.Time, error) { 218 | var layouts = []string{ 219 | "2006-01-02 15:04:05", 220 | "2006-01-02T15:04:05.000", 221 | "2006-01-02T15:04:05", 222 | 
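		// (editor's note) layouts are tried in order; the entry below also covers
		// ISO-8601 values with an explicit zone offset, e.g. "2024-06-30T02:00:00.000+08:00".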
"2006-01-02T15:04:05.000Z07:00", 223 | } 224 | 225 | var err error 226 | var parsedTime time.Time 227 | for _, layout := range layouts { 228 | parsedTime, err = time.Parse(layout, timeStr) 229 | if err == nil { 230 | return parsedTime, nil 231 | } 232 | } 233 | 234 | return time.Time{}, fmt.Errorf("failed to parse time: %v", err) 235 | } 236 | -------------------------------------------------------------------------------- /source/source_test.go: -------------------------------------------------------------------------------- 1 | package source 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "regexp" 7 | "testing" 8 | 9 | "github.com/test-go/testify/assert" 10 | 11 | "github.com/databendcloud/db-archiver/config" 12 | ) 13 | 14 | func TestSlimCondition(t *testing.T) { 15 | 16 | // Test when minSplitKey is less than maxSplitKey 17 | conditions := SlimCondition(5, 0, 100) 18 | if len(conditions) != 5 { 19 | t.Errorf("Expected 5 conditions, got %d", len(conditions)) 20 | } 21 | if conditions[4][1] != 100 { 22 | t.Errorf("Expected last upperBound to be 100, got %d", conditions[4][1]) 23 | } 24 | 25 | // Test when minSplitKey is greater than maxSplitKey 26 | conditions = SlimCondition(5, 200, 100) 27 | if len(conditions) != 0 { 28 | t.Errorf("Expected 0 conditions, got %d", len(conditions)) 29 | } 30 | } 31 | 32 | func TestSlimConditionWithMaxThreadOne(t *testing.T) { 33 | 34 | // Test when minSplitKey is less than maxSplitKey 35 | conditions := SlimCondition(1, 0, 100) 36 | if len(conditions) != 1 { 37 | t.Errorf("Expected 1 condition, got %d", len(conditions)) 38 | } 39 | if conditions[0][1] != 100 { 40 | t.Errorf("Expected last upperBound to be 100, got %d", conditions[0][1]) 41 | } 42 | 43 | // Test when minSplitKey is equal to maxSplitKey 44 | conditions = SlimCondition(1, 100, 100) 45 | if len(conditions) != 1 { 46 | t.Errorf("Expected 1 condition, got %d", len(conditions)) 47 | } 48 | if conditions[0][1] != 100 { 49 | t.Errorf("Expected last upperBound to be 100, got %d", conditions[0][1]) 50 | } 51 | 52 | // Test when minSplitKey is greater than maxSplitKey 53 | conditions = SlimCondition(1, 200, 100) 54 | if len(conditions) != 0 { 55 | t.Errorf("Expected 0 conditions, got %d", len(conditions)) 56 | } 57 | } 58 | 59 | func TestSplitConditionAccordingMaxGoRoutine(t *testing.T) { 60 | conditions := SplitConditionAccordingMaxGoRoutine("id", 10, 0, 100, 100) 61 | var count0 = 0 62 | for condition := range conditions { 63 | fmt.Printf(condition) 64 | count0++ 65 | } 66 | if count0 != 12 { 67 | t.Errorf("Expected 12 conditions, got %d", len(conditions)) 68 | } 69 | 70 | // Test when minSplitKey is less than maxSplitKey and maxSplitKey is less than allMax 71 | conditions = SplitConditionAccordingMaxGoRoutine("id", 10, 0, 50, 100) 72 | var count1 = 0 73 | for condition := range conditions { 74 | fmt.Printf(condition) 75 | count1++ 76 | if count1 == 5 { 77 | assert.Equal(t, condition, fmt.Sprintf("(%s >= %d and %s < %d)", "id", 36, "id", 45)) 78 | } 79 | } 80 | if count1 != 6 { 81 | t.Errorf("Expected 6 conditions, got %d", len(conditions)) 82 | } 83 | 84 | // Test when minSplitKey is less than maxSplitKey and maxSplitKey is equal to allMax 85 | conditions = SplitConditionAccordingMaxGoRoutine("id", 10, 0, 100, 100) 86 | var count2 = 0 87 | for condition := range conditions { 88 | count2++ 89 | if count2 == 10 { 90 | assert.Equal(t, condition, fmt.Sprintf("(%s >= %d and %s < %d)", "id", 81, "id", 90)) 91 | } 92 | } 93 | if count2 != 12 { 94 | t.Errorf("Expected 12 conditions, got %d", 
len(conditions)) 95 | } 96 | 97 | // Test when minSplitKey is greater than maxSplitKey 98 | conditions = SplitConditionAccordingMaxGoRoutine("id", 10, 200, 100, 300) 99 | if len(conditions) != 0 { 100 | t.Errorf("Expected 0 conditions, got %d", len(conditions)) 101 | } 102 | } 103 | 104 | func TestSplitConditionAccordingToTimeSplitKey(t *testing.T) { 105 | cfg := &config.Config{ 106 | SourceSplitTimeKey: "t1", 107 | TimeSplitUnit: "hour", 108 | } 109 | 110 | // Test when minTimeSplitKey is less than maxTimeSplitKey 111 | conditions, err := SplitConditionAccordingToTimeSplitKey(cfg, "2024-06-30 2:00:00", "2024-06-30 20:00:00") 112 | fmt.Println(conditions) 113 | if err != nil { 114 | t.Errorf("SplitConditionAccordingToTimeSplitKey() error = %v", err) 115 | } 116 | if len(conditions) != 10 { 117 | t.Errorf("Expected 10 conditions, got %d", len(conditions)) 118 | } 119 | 120 | // Test when minTimeSplitKey is equal to maxTimeSplitKey 121 | conditions, err = SplitConditionAccordingToTimeSplitKey(cfg, "2024-06-30 2:00:00", "2024-06-30 2:00:00") 122 | if err != nil { 123 | t.Errorf("SplitConditionAccordingToTimeSplitKey() error = %v", err) 124 | } 125 | if len(conditions) != 1 { 126 | t.Errorf("Expected 1 conditions, got %d", len(conditions)) 127 | } 128 | 129 | // Test when minTimeSplitKey is greater than maxTimeSplitKey 130 | conditions, err = SplitConditionAccordingToTimeSplitKey(cfg, "2024-06-30 20:00:00", "2024-06-30 2:00:00") 131 | if err != nil { 132 | t.Errorf("SplitConditionAccordingToTimeSplitKey() error = %v", err) 133 | } 134 | if len(conditions) != 0 { 135 | t.Errorf("Expected 0 conditions, got %d", len(conditions)) 136 | } 137 | } 138 | 139 | func TestSplitConditionsByMaxThread(t *testing.T) { 140 | tests := []struct { 141 | name string 142 | conditions []string 143 | maxThread int 144 | want [][]string 145 | }{ 146 | { 147 | name: "split into 2 groups", 148 | conditions: []string{"a", "b", "c", "d", "e"}, 149 | maxThread: 2, 150 | want: [][]string{{"a", "b", "c"}, {"d", "e"}}, 151 | }, 152 | { 153 | name: "split into 3 groups", 154 | conditions: []string{"a", "b", "c", "d", "e", "f"}, 155 | maxThread: 2, 156 | want: [][]string{{"a", "b", "c"}, {"d", "e", "f"}}, 157 | }, 158 | { 159 | name: "all in one group", 160 | conditions: []string{"a", "b", "c", "d"}, 161 | maxThread: 5, 162 | want: [][]string{{"a", "b", "c", "d"}}, 163 | }, 164 | } 165 | 166 | for _, tt := range tests { 167 | t.Run(tt.name, func(t *testing.T) { 168 | got := SplitTimeConditionsByMaxThread(tt.conditions, tt.maxThread) 169 | if !reflect.DeepEqual(got, tt.want) { 170 | t.Errorf("SplitConditionsByMaxThread() = %v, want %v", got, tt.want) 171 | } 172 | }) 173 | } 174 | } 175 | 176 | func TestMatchDatabase(t *testing.T) { 177 | databasePattern := "db.*" 178 | sourceDbs := []string{"db1", "db2", "default"} 179 | targetDbs := []string{"db1", "db2"} 180 | res := []string{} 181 | for _, sourceDb := range sourceDbs { 182 | match, err := regexp.MatchString(databasePattern, sourceDb) 183 | assert.NoError(t, err) 184 | if match { 185 | res = append(res, sourceDb) 186 | } 187 | } 188 | assert.Equal(t, targetDbs, res) 189 | } 190 | -------------------------------------------------------------------------------- /source/sql_server.go: -------------------------------------------------------------------------------- 1 | package source 2 | 3 | import ( 4 | "context" 5 | "database/sql" 6 | "fmt" 7 | "log" 8 | "net/url" 9 | "regexp" 10 | "strings" 11 | "time" 12 | 13 | "github.com/sirupsen/logrus" 14 | 15 | 
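	// (editor's note) the "mssql" driver name used by sql.Open below is registered by
	// the blank import _ "github.com/denisenkom/go-mssqldb" in source.go, so no driver
	// import is needed here.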
"github.com/databendcloud/db-archiver/config" 16 | ) 17 | 18 | type SQLServerSource struct { 19 | db *sql.DB 20 | cfg *config.Config 21 | statsRecorder *DatabendSourceStatsRecorder 22 | } 23 | 24 | func NewSqlServerSource(cfg *config.Config) (*SQLServerSource, error) { 25 | stats := NewDatabendIntesterStatsRecorder() 26 | encodedPassword := url.QueryEscape(cfg.SourcePass) 27 | db, err := sql.Open("mssql", fmt.Sprintf("sqlserver://%s:%s@%s:%d?database=%s&encrypt=disable", 28 | cfg.SourceUser, 29 | encodedPassword, 30 | cfg.SourceHost, 31 | cfg.SourcePort, cfg.SourceDB)) 32 | if err != nil { 33 | logrus.Errorf("failed to open db: %v", err) 34 | return nil, err 35 | } 36 | return &SQLServerSource{ 37 | db: db, 38 | cfg: cfg, 39 | statsRecorder: stats, 40 | }, nil 41 | } 42 | 43 | func (s *SQLServerSource) GetSourceReadRowsCount() (int, error) { 44 | // SQL Server table name contains schema,格式为 schema.table 45 | tableName := s.cfg.SourceTable 46 | if !strings.Contains(tableName, ".") { 47 | // if no schema,default use dbo schema 48 | tableName = "dbo." + tableName 49 | } 50 | 51 | query := fmt.Sprintf("SELECT COUNT(*) FROM %s", tableName) 52 | if s.cfg.SourceWhereCondition != "" { 53 | query += " WHERE " + s.cfg.SourceWhereCondition 54 | } 55 | 56 | row := s.db.QueryRow(query) 57 | var rowCount int 58 | err := row.Scan(&rowCount) 59 | if err != nil { 60 | return 0, err 61 | } 62 | 63 | return rowCount, nil 64 | } 65 | 66 | func (s *SQLServerSource) GetMinMaxSplitKey() (int64, int64, error) { 67 | tableName := s.cfg.SourceTable 68 | if !strings.Contains(tableName, ".") { 69 | tableName = "dbo." + tableName 70 | } 71 | 72 | query := fmt.Sprintf("SELECT MIN(%s) as min_key, MAX(%s) as max_key FROM %s.%s", 73 | s.cfg.SourceSplitKey, 74 | s.cfg.SourceSplitKey, 75 | s.cfg.SourceDB, 76 | tableName) 77 | 78 | if s.cfg.SourceWhereCondition != "" { 79 | query += " WHERE " + s.cfg.SourceWhereCondition 80 | } 81 | 82 | rows, err := s.db.Query(query) 83 | if err != nil { 84 | return 0, 0, err 85 | } 86 | defer rows.Close() 87 | 88 | var minSplitKey, maxSplitKey sql.NullInt64 89 | for rows.Next() { 90 | err = rows.Scan(&minSplitKey, &maxSplitKey) 91 | if err != nil { 92 | return 0, 0, err 93 | } 94 | } 95 | 96 | if err = rows.Err(); err != nil { 97 | return 0, 0, err 98 | } 99 | 100 | if !minSplitKey.Valid || !maxSplitKey.Valid { 101 | return 0, 0, nil 102 | } 103 | 104 | return minSplitKey.Int64, maxSplitKey.Int64, nil 105 | } 106 | 107 | func (s *SQLServerSource) AdjustBatchSizeAccordingToSourceDbTable() int64 { 108 | minSplitKey, maxSplitKey, err := s.GetMinMaxSplitKey() 109 | if err != nil { 110 | return s.cfg.BatchSize 111 | } 112 | sourceTableRowCount, err := s.GetSourceReadRowsCount() 113 | if err != nil { 114 | return s.cfg.BatchSize 115 | } 116 | rangeSize := maxSplitKey - minSplitKey + 1 117 | switch { 118 | case int64(sourceTableRowCount) <= s.cfg.BatchSize: 119 | return rangeSize 120 | case rangeSize/int64(sourceTableRowCount) >= 10: 121 | return s.cfg.BatchSize * 5 122 | case rangeSize/int64(sourceTableRowCount) >= 100: 123 | return s.cfg.BatchSize * 20 124 | default: 125 | return s.cfg.BatchSize 126 | } 127 | } 128 | 129 | func (s *SQLServerSource) GetMinMaxTimeSplitKey() (string, string, error) { 130 | parts := strings.Split(s.cfg.SourceTable, ".") 131 | var tableName string 132 | if len(parts) == 2 { 133 | tableName = fmt.Sprintf("[%s].[%s]", parts[0], parts[1]) 134 | } else { 135 | tableName = fmt.Sprintf("[dbo].[%s]", s.cfg.SourceTable) 136 | } 137 | 138 | query := fmt.Sprintf(` 139 | 
SELECT 140 | CONVERT(VARCHAR(23), MIN([%s]), 126) as min_key, 141 | CONVERT(VARCHAR(23), MAX([%s]), 126) as max_key 142 | FROM [%s].%s`, 143 | s.cfg.SourceSplitTimeKey, 144 | s.cfg.SourceSplitTimeKey, 145 | s.cfg.SourceDB, 146 | tableName) 147 | 148 | if s.cfg.SourceWhereCondition != "" { 149 | query += " WHERE " + s.cfg.SourceWhereCondition 150 | } 151 | 152 | rows, err := s.db.Query(query) 153 | if err != nil { 154 | return "", "", fmt.Errorf("executing query: %w", err) 155 | } 156 | defer rows.Close() 157 | 158 | var minSplitKey, maxSplitKey sql.NullString 159 | if rows.Next() { 160 | err = rows.Scan(&minSplitKey, &maxSplitKey) 161 | if err != nil { 162 | return "", "", fmt.Errorf("scanning results: %w", err) 163 | } 164 | } else { 165 | return "", "", fmt.Errorf("no results returned") 166 | } 167 | 168 | if err = rows.Err(); err != nil { 169 | return "", "", fmt.Errorf("reading rows: %w", err) 170 | } 171 | 172 | if !minSplitKey.Valid || !maxSplitKey.Valid { 173 | return "", "", nil 174 | } 175 | 176 | return minSplitKey.String, maxSplitKey.String, nil 177 | } 178 | 179 | func (s *SQLServerSource) DeleteAfterSync() error { 180 | if !s.cfg.DeleteAfterSync { 181 | return nil 182 | } 183 | 184 | parts := strings.Split(s.cfg.SourceTable, ".") 185 | var tableName string 186 | if len(parts) == 2 { 187 | tableName = fmt.Sprintf("[%s].[%s]", parts[0], parts[1]) 188 | } else { 189 | tableName = fmt.Sprintf("[dbo].[%s]", s.cfg.SourceTable) 190 | } 191 | 192 | query := fmt.Sprintf("DELETE FROM [%s].%s", 193 | s.cfg.SourceDB, 194 | tableName) 195 | 196 | if s.cfg.SourceWhereCondition != "" { 197 | query += " WHERE " + s.cfg.SourceWhereCondition 198 | } 199 | 200 | _, err := s.db.Exec(query) 201 | if err != nil { 202 | return fmt.Errorf("executing delete query: %w", err) 203 | } 204 | 205 | return nil 206 | } 207 | 208 | func (s *SQLServerSource) QueryTableData(threadNum int, conditionSql string) ([][]interface{}, []string, error) { 209 | startTime := time.Now() 210 | 211 | parts := strings.Split(s.cfg.SourceTable, ".") 212 | var tableName string 213 | if len(parts) == 2 { 214 | tableName = fmt.Sprintf("[%s].[%s]", parts[0], parts[1]) 215 | } else { 216 | tableName = fmt.Sprintf("[dbo].[%s]", s.cfg.SourceTable) 217 | } 218 | 219 | baseQuery := fmt.Sprintf(` 220 | SELECT TOP 1 * 221 | FROM [%s].%s WITH (NOLOCK) 222 | WHERE %s`, 223 | s.cfg.SourceDB, 224 | tableName, 225 | strings.Split(conditionSql, "OFFSET")[0]) 226 | 227 | if s.cfg.SourceWhereCondition != "" && s.cfg.SourceSplitKey != "" { 228 | baseQuery = fmt.Sprintf("%s AND %s", baseQuery, s.cfg.SourceWhereCondition) 229 | } 230 | 231 | rows, err := s.db.Query(baseQuery) 232 | if err != nil { 233 | return nil, nil, fmt.Errorf("executing base query: %w", err) 234 | } 235 | 236 | columns, err := rows.Columns() 237 | if err != nil { 238 | rows.Close() 239 | return nil, nil, fmt.Errorf("getting columns: %w", err) 240 | } 241 | 242 | columnTypes, err := rows.ColumnTypes() 243 | if err != nil { 244 | rows.Close() 245 | return nil, nil, fmt.Errorf("getting column types: %w", err) 246 | } 247 | rows.Close() 248 | 249 | // scan value 250 | scanArgs := make([]interface{}, len(columns)) 251 | for i, columnType := range columnTypes { 252 | switch columnType.DatabaseTypeName() { 253 | case "TINYINT", "SMALLINT", "INT", "BIGINT": 254 | scanArgs[i] = new(sql.NullInt64) 255 | case "REAL", "FLOAT": 256 | scanArgs[i] = new(sql.NullFloat64) 257 | case "DECIMAL", "NUMERIC", "MONEY", "SMALLMONEY": 258 | scanArgs[i] = new(sql.NullFloat64) 259 | case "CHAR", 
"VARCHAR", "TEXT", "NCHAR", "NVARCHAR", "NTEXT": 260 | scanArgs[i] = new(sql.NullString) 261 | case "DATE", "TIME", "DATETIME", "DATETIME2", "SMALLDATETIME", "DATETIMEOFFSET": 262 | scanArgs[i] = new(sql.NullString) 263 | case "BIT": 264 | scanArgs[i] = new(sql.NullBool) 265 | case "BINARY", "VARBINARY", "IMAGE": 266 | scanArgs[i] = new(sql.RawBytes) 267 | case "UNIQUEIDENTIFIER": 268 | scanArgs[i] = new(sql.NullString) 269 | default: 270 | scanArgs[i] = new(sql.RawBytes) 271 | } 272 | } 273 | 274 | const batchSize = 10000 275 | var result [][]interface{} 276 | offset := 0 277 | 278 | for { 279 | query := fmt.Sprintf(` 280 | SELECT * 281 | FROM [%s].%s WITH (NOLOCK) 282 | WHERE %s`, 283 | s.cfg.SourceDB, 284 | tableName, 285 | conditionSql) 286 | 287 | if s.cfg.SourceWhereCondition != "" && s.cfg.SourceSplitKey != "" { 288 | query = fmt.Sprintf("%s AND %s", query, s.cfg.SourceWhereCondition) 289 | } 290 | 291 | // page 292 | query = fmt.Sprintf(` 293 | SELECT * 294 | FROM ( 295 | %s 296 | ) AS t 297 | ORDER BY (SELECT NULL) 298 | OFFSET %d ROWS 299 | FETCH NEXT %d ROWS ONLY`, 300 | query, 301 | offset, 302 | batchSize) 303 | 304 | ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) 305 | defer cancel() 306 | rows, err := s.db.QueryContext(ctx, query) 307 | 308 | if err != nil { 309 | return nil, nil, fmt.Errorf("executing batch query at offset %d: %w", offset, err) 310 | } 311 | 312 | rowCount := 0 313 | for rows.Next() { 314 | err = rows.Scan(scanArgs...) 315 | if err != nil { 316 | rows.Close() 317 | return nil, nil, fmt.Errorf("scanning row at offset %d: %w", offset, err) 318 | } 319 | 320 | row := make([]interface{}, len(columns)) 321 | for i, v := range scanArgs { 322 | switch v := v.(type) { 323 | case *sql.NullString: 324 | if v.Valid { 325 | row[i] = v.String 326 | } else { 327 | row[i] = nil 328 | } 329 | case *sql.NullInt64: 330 | if v.Valid { 331 | row[i] = v.Int64 332 | } else { 333 | row[i] = nil 334 | } 335 | case *sql.NullFloat64: 336 | if v.Valid { 337 | row[i] = v.Float64 338 | } else { 339 | row[i] = nil 340 | } 341 | case *sql.NullBool: 342 | if v.Valid { 343 | if v.Bool { 344 | row[i] = 1 // target databend bool is int8 345 | } else { 346 | row[i] = 0 347 | } 348 | } else { 349 | row[i] = nil 350 | } 351 | case *sql.RawBytes: 352 | if v != nil { 353 | row[i] = string(*v) 354 | } else { 355 | row[i] = nil 356 | } 357 | default: 358 | row[i] = v 359 | } 360 | } 361 | result = append(result, row) 362 | rowCount++ 363 | } 364 | 365 | rows.Close() 366 | 367 | if err = rows.Err(); err != nil { 368 | return nil, nil, fmt.Errorf("reading rows at offset %d: %w", offset, err) 369 | } 370 | 371 | // if the number of rows returned is less than the batch size, we can assume that we've reached the end of the data 372 | if rowCount < batchSize { 373 | break 374 | } 375 | 376 | offset += batchSize 377 | 378 | log.Printf("thread-%d: processed %d rows so far", threadNum, len(result)) 379 | } 380 | 381 | s.statsRecorder.RecordMetric(len(result)) 382 | stats := s.statsRecorder.Stats(time.Since(startTime)) 383 | log.Printf("thread-%d: extract total %d rows (%f rows/s)", threadNum, len(result), stats.RowsPerSecondd) 384 | 385 | return result, columns, nil 386 | } 387 | 388 | func (s *SQLServerSource) GetDatabasesAccordingToSourceDbRegex(sourceDatabasePattern string) ([]string, error) { 389 | // SQL Server use system view to get databases 390 | query := ` 391 | SELECT name 392 | FROM sys.databases 393 | WHERE name NOT IN ('master', 'tempdb', 'model', 'msdb') 394 | AND 
state_desc = 'ONLINE' 395 | AND HAS_DBACCESS(name) = 1 396 | ORDER BY name` 397 | 398 | rows, err := s.db.Query(query) 399 | if err != nil { 400 | return nil, fmt.Errorf("querying databases: %w", err) 401 | } 402 | defer rows.Close() 403 | 404 | var databases []string 405 | for rows.Next() { 406 | var database string 407 | err = rows.Scan(&database) 408 | if err != nil { 409 | return nil, fmt.Errorf("scanning database name: %w", err) 410 | } 411 | 412 | fmt.Println("sourcedatabase pattern:", sourceDatabasePattern) 413 | match, err := regexp.MatchString(sourceDatabasePattern, database) 414 | if err != nil { 415 | return nil, fmt.Errorf("matching pattern: %w", err) 416 | } 417 | 418 | if match { 419 | fmt.Println("match db:", database) 420 | databases = append(databases, database) 421 | } else { 422 | fmt.Println("not match db:", database) 423 | } 424 | } 425 | 426 | if err = rows.Err(); err != nil { 427 | return nil, fmt.Errorf("reading rows: %w", err) 428 | } 429 | 430 | return databases, nil 431 | } 432 | 433 | func (s *SQLServerSource) GetTablesAccordingToSourceTableRegex(sourceTablePattern string, databases []string) (map[string][]string, error) { 434 | dbTables := make(map[string][]string) 435 | 436 | baseQuery := ` 437 | SELECT 438 | SCHEMA_NAME(schema_id) as schema_name, 439 | name as table_name 440 | FROM sys.tables 441 | WHERE type = 'U' 442 | AND is_ms_shipped = 0 443 | ORDER BY schema_name, name` 444 | 445 | for _, database := range databases { 446 | // switch db 447 | _, err := s.db.Exec(fmt.Sprintf("USE [%s]", database)) 448 | if err != nil { 449 | return nil, fmt.Errorf("switching to database %s: %w", database, err) 450 | } 451 | 452 | rows, err := s.db.Query(baseQuery) 453 | if err != nil { 454 | return nil, fmt.Errorf("querying tables in database %s: %w", database, err) 455 | } 456 | 457 | var tables []string 458 | for rows.Next() { 459 | var schemaName, tableName string 460 | err = rows.Scan(&schemaName, &tableName) 461 | if err != nil { 462 | rows.Close() 463 | return nil, fmt.Errorf("scanning table info in database %s: %w", database, err) 464 | } 465 | 466 | // schema.tablename, example: dbo.table1 467 | fullTableName := fmt.Sprintf("%s.%s", schemaName, tableName) 468 | fmt.Println("full name table:", fullTableName) 469 | 470 | match, err := regexp.MatchString(sourceTablePattern, fullTableName) 471 | if err != nil { 472 | rows.Close() 473 | return nil, fmt.Errorf("matching pattern for table %s: %w", fullTableName, err) 474 | } 475 | 476 | if match { 477 | fmt.Println("match table:", fullTableName) 478 | tables = append(tables, fullTableName) 479 | } 480 | } 481 | 482 | rows.Close() 483 | if err = rows.Err(); err != nil { 484 | return nil, fmt.Errorf("reading rows for database %s: %w", database, err) 485 | } 486 | 487 | dbTables[database] = tables 488 | } 489 | 490 | fmt.Println("dbTables:", dbTables) 491 | 492 | return dbTables, nil 493 | } 494 | 495 | func (s *SQLServerSource) GetAllSourceReadRowsCount() (int, error) { 496 | allCount := 0 497 | 498 | dbTables, err := s.GetDbTablesAccordingToSourceDbTables() 499 | if err != nil { 500 | return 0, fmt.Errorf("getting database tables: %w", err) 501 | } 502 | 503 | for db, tables := range dbTables { 504 | _, err := s.db.Exec(fmt.Sprintf("USE [%s]", db)) 505 | if err != nil { 506 | return 0, fmt.Errorf("switching to database %s: %w", db, err) 507 | } 508 | 509 | s.cfg.SourceDB = db 510 | for _, table := range tables { 511 | // 解析 schema 和表名 512 | parts := strings.Split(table, ".") 513 | if len(parts) != 2 { 514 | return 0, 
fmt.Errorf("invalid table name format for %s, expected schema.table", table) 515 | } 516 | s.cfg.SourceTable = table 517 | 518 | count, err := s.GetSourceReadRowsCount() 519 | if err != nil { 520 | return 0, fmt.Errorf("getting row count for %s.%s: %w", db, table, err) 521 | } 522 | allCount += count 523 | } 524 | } 525 | 526 | if allCount == 0 && len(dbTables) == 0 && s.cfg.SourceTable != "" { 527 | count, err := s.GetSourceReadRowsCount() 528 | if err != nil { 529 | return 0, fmt.Errorf("getting row count for single table %s: %w", s.cfg.SourceTable, err) 530 | } 531 | allCount += count 532 | } 533 | 534 | return allCount, nil 535 | } 536 | 537 | func (s *SQLServerSource) GetDbTablesAccordingToSourceDbTables() (map[string][]string, error) { 538 | allDbTables := make(map[string][]string) 539 | 540 | for _, sourceDbTable := range s.cfg.SourceDbTables { 541 | dbTable := strings.Split(sourceDbTable, "@") 542 | if len(dbTable) != 2 { 543 | return nil, fmt.Errorf("invalid sourceDbTable: %s, should be database@schema.table format", sourceDbTable) 544 | } 545 | 546 | dbs, err := s.GetDatabasesAccordingToSourceDbRegex(dbTable[0]) 547 | if err != nil { 548 | return nil, fmt.Errorf("get databases according to sourceDbRegex failed: %w", err) 549 | } 550 | 551 | if len(dbs) == 0 { 552 | log.Printf("Warning: No databases match pattern %s", dbTable[0]) 553 | continue 554 | } 555 | 556 | // match table 557 | dbTables, err := s.GetTablesAccordingToSourceTableRegex(dbTable[1], dbs) 558 | if err != nil { 559 | return nil, fmt.Errorf("get tables according to sourceTableRegex failed: %w", err) 560 | } 561 | 562 | for db, tables := range dbTables { 563 | if existingTables, ok := allDbTables[db]; ok { 564 | tableSet := make(map[string]struct{}) 565 | for _, t := range existingTables { 566 | tableSet[t] = struct{}{} 567 | } 568 | 569 | for _, t := range tables { 570 | if _, exists := tableSet[t]; !exists { 571 | allDbTables[db] = append(allDbTables[db], t) 572 | } 573 | } 574 | } else { 575 | allDbTables[db] = tables 576 | } 577 | } 578 | } 579 | 580 | return allDbTables, nil 581 | } 582 | -------------------------------------------------------------------------------- /source/stats.go: -------------------------------------------------------------------------------- 1 | package source 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | 7 | timeseries "github.com/codesuki/go-time-series" 8 | ) 9 | 10 | type DatabendSourceStatsRecorder struct { 11 | extractRows *timeseries.TimeSeries 12 | mu sync.Mutex 13 | } 14 | 15 | type DatabendIngesterStatsData struct { 16 | BytesPerSecond float64 17 | RowsPerSecondd float64 18 | } 19 | 20 | func NewDatabendIntesterStatsRecorder() *DatabendSourceStatsRecorder { 21 | ingestedRows, err := timeseries.NewTimeSeries() 22 | if err != nil { 23 | panic(err) 24 | } 25 | return &DatabendSourceStatsRecorder{ 26 | extractRows: ingestedRows, 27 | } 28 | } 29 | 30 | func (stats *DatabendSourceStatsRecorder) RecordMetric(rows int) { 31 | stats.mu.Lock() 32 | defer stats.mu.Unlock() 33 | stats.extractRows.Increase(rows) 34 | } 35 | 36 | func (stats *DatabendSourceStatsRecorder) Stats(statsWindow time.Duration) DatabendIngesterStatsData { 37 | stats.mu.Lock() 38 | defer stats.mu.Unlock() 39 | 40 | rowsPerSecond := stats.calcPerSecond(stats.extractRows, statsWindow) 41 | return DatabendIngesterStatsData{ 42 | RowsPerSecondd: rowsPerSecond, 43 | } 44 | } 45 | 46 | func (stats *DatabendSourceStatsRecorder) calcPerSecond(ts *timeseries.TimeSeries, duration time.Duration) float64 { 47 | amount, err := 
ts.Range(time.Now().Add(-duration), time.Now())
	if err != nil {
		return -1
	}

	return float64(amount) / duration.Seconds()
}
--------------------------------------------------------------------------------
/tools/confgenerotor/README.md:
--------------------------------------------------------------------------------
# Config File Generator

A command-line tool for generating data-sync configuration files. Starting from an existing config template, it updates specific fields according to the input parameters and writes a new configuration file.

## Features

- Generates config files from a template
- Supports custom source database and table names
- Supports time ranges of a day, a week, or a month
- Automatically generates the time-based WHERE condition
- Leaves the other template settings unchanged

## Usage

### Command-line arguments

```bash
./config-generator --sourceDb <database> --sourceTable <table> --targetDbTable <target db.table> --timeunit <time unit> [-template <template config path>]
```

# Example

```bash
./config-generator --template config/conf.json --sourceDb db22 --sourceTable test22 --targetDbTable dd.tt --timeunit week
```
--------------------------------------------------------------------------------
/tools/confgenerotor/conf_generate.go:
--------------------------------------------------------------------------------
package main

import (
	"bytes"
	"encoding/json"
	"flag"
	"fmt"
	"os"
	"time"
)

type Config struct {
	DatabaseType         string   `json:"databaseType"`
	SourceHost           string   `json:"sourceHost"`
	SourcePort           int      `json:"sourcePort"`
	SourceUser           string   `json:"sourceUser"`
	SourcePass           string   `json:"sourcePass"`
	SourceDB             string   `json:"sourceDB"`
	SSLMode              string   `json:"sslMode"`
	SourceTable          string   `json:"sourceTable"`
	SourceDbTables       []string `json:"sourceDbTables"`
	SourceQuery          string   `json:"sourceQuery"`
	SourceWhereCondition string   `json:"sourceWhereCondition"`
	SourceSplitKey       string   `json:"sourceSplitKey"`
	SourceSplitTimeKey   string   `json:"sourceSplitTimeKey"`
	TimeSplitUnit        string   `json:"timeSplitUnit"`
	DatabendDSN          string   `json:"databendDSN"`
	DatabendTable        string   `json:"databendTable"`
	BatchSize            int64    `json:"batchSize"`
	BatchMaxInterval     int      `json:"batchMaxInterval"`
	CopyPurge            bool     `json:"copyPurge"`
	CopyForce            bool     `json:"copyForce"`
	DisableVariantCheck  bool     `json:"disableVariantCheck"`
	UserStage            string   `json:"userStage"`
	DeleteAfterSync      bool     `json:"deleteAfterSync"`
	MaxThread            int      `json:"maxThread"`
}

func main() {
	templatePath := flag.String("template", "conf.json", "Path to template conf.json")
	sourceDB := flag.String("sourceDb", "", "Source database name")
	sourceTable := flag.String("sourceTable", "", "Source table name")
	targetDBTable := flag.String("targetDbTable", "", "Target database and table name")
	timeUnit := flag.String("timeunit", "day", "Time unit (day/week/month)")
	flag.Parse()

	if *sourceDB == "" || *sourceTable == "" {
		fmt.Println("Please provide source database and table names")
		flag.Usage()
		os.Exit(1)
	}

	templateData, err := os.ReadFile(*templatePath)
	if err != nil {
		fmt.Printf("Error reading template file: %v\n", err)
		os.Exit(1)
	}

	var config Config
	if err := json.Unmarshal(templateData, &config); err != nil {
		fmt.Printf("Error parsing template JSON: %v\n", err)
		os.Exit(1)
	}

	// calculate the time range
	now := time.Now()
	var startTime time.Time

	switch *timeUnit {
	case "day":
		startTime = now.AddDate(0, 0, -1)
	case "week":
		startTime = now.AddDate(0, 0, -7)
	case "month":
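		// (editor's note) time.AddDate normalizes overflow: subtracting one month from
		// March 31 yields March 2 or 3 (there is no February 31), which is standard
		// Go behavior rather than something this tool handles specially.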
--------------------------------------------------------------------------------
/tools/confgenerotor/README.md:
--------------------------------------------------------------------------------
# Config File Generator

This is a command-line tool for generating data-sync configuration files. Starting from an existing configuration template, it updates specific fields according to the input parameters and writes a new configuration file.

## Features

- Generates configuration files from a template
- Supports custom database and table names
- Supports time ranges by day, week, or month
- Automatically generates the time-based WHERE condition
- Leaves all other template settings unchanged

## Usage

### Command-line arguments

```bash
./config-generator --sourceDb <database> --sourceTable <table> --targetDbTable <target database.table> --timeunit <time unit> [--template <template config path>]
```

### Example

```bash
./config-generator --template config/conf.json --sourceDb db22 --sourceTable test22 --targetDbTable dd.tt --timeunit week
```
--------------------------------------------------------------------------------
/tools/confgenerotor/conf_generate.go:
--------------------------------------------------------------------------------
package main

import (
	"bytes"
	"encoding/json"
	"flag"
	"fmt"
	"os"
	"time"
)

type Config struct {
	DatabaseType         string   `json:"databaseType"`
	SourceHost           string   `json:"sourceHost"`
	SourcePort           int      `json:"sourcePort"`
	SourceUser           string   `json:"sourceUser"`
	SourcePass           string   `json:"sourcePass"`
	SourceDB             string   `json:"sourceDB"`
	SSLMode              string   `json:"sslMode"`
	SourceTable          string   `json:"sourceTable"`
	SourceDbTables       []string `json:"sourceDbTables"`
	SourceQuery          string   `json:"sourceQuery"`
	SourceWhereCondition string   `json:"sourceWhereCondition"`
	SourceSplitKey       string   `json:"sourceSplitKey"`
	SourceSplitTimeKey   string   `json:"sourceSplitTimeKey"`
	TimeSplitUnit        string   `json:"timeSplitUnit"`
	DatabendDSN          string   `json:"databendDSN"`
	DatabendTable        string   `json:"databendTable"`
	BatchSize            int64    `json:"batchSize"`
	BatchMaxInterval     int      `json:"batchMaxInterval"`
	CopyPurge            bool     `json:"copyPurge"`
	CopyForce            bool     `json:"copyForce"`
	DisableVariantCheck  bool     `json:"disableVariantCheck"`
	UserStage            string   `json:"userStage"`
	DeleteAfterSync      bool     `json:"deleteAfterSync"`
	MaxThread            int      `json:"maxThread"`
}

func main() {
	templatePath := flag.String("template", "conf.json", "Path to template conf.json")
	sourceDB := flag.String("sourceDb", "", "Source database name")
	sourceTable := flag.String("sourceTable", "", "Source table name")
	targetDBTable := flag.String("targetDbTable", "", "Target database and table name")
	timeUnit := flag.String("timeunit", "day", "Time unit (day/week/month)")
	flag.Parse()

	if *sourceDB == "" || *sourceTable == "" || *targetDBTable == "" {
		fmt.Println("Please provide source database, source table, and target database.table names")
		flag.Usage()
		os.Exit(1)
	}

	templateData, err := os.ReadFile(*templatePath)
	if err != nil {
		fmt.Printf("Error reading template file: %v\n", err)
		os.Exit(1)
	}

	var config Config
	if err := json.Unmarshal(templateData, &config); err != nil {
		fmt.Printf("Error parsing template JSON: %v\n", err)
		os.Exit(1)
	}

	// calculate the time range
	now := time.Now()
	var startTime time.Time

	switch *timeUnit {
	case "day":
		startTime = now.AddDate(0, 0, -1)
	case "week":
		startTime = now.AddDate(0, 0, -7)
	case "month":
		startTime = now.AddDate(0, -1, 0)
	default:
		fmt.Println("Invalid time unit. Must be day, week, or month")
		os.Exit(1)
	}

	// update the config fields; note the time column name t1 is hard-coded here
	config.SourceDB = *sourceDB
	config.SourceTable = *sourceTable
	config.SourceQuery = fmt.Sprintf("select * from %s.%s", *sourceDB, *sourceTable)
	config.SourceWhereCondition = fmt.Sprintf("t1 >= '%s' AND t1 < '%s'",
		startTime.Format("2006-01-02 15:04:05"),
		now.Format("2006-01-02 15:04:05"))
	config.DatabendTable = *targetDBTable

	// use a custom encoder so HTML escaping is disabled
	buf := new(bytes.Buffer)
	encoder := json.NewEncoder(buf)
	encoder.SetEscapeHTML(false)
	encoder.SetIndent("", " ")

	if err := encoder.Encode(config); err != nil {
		fmt.Printf("Error marshaling JSON: %v\n", err)
		os.Exit(1)
	}

	outputPath := "conf.json"
	if err := os.WriteFile(outputPath, buf.Bytes(), 0644); err != nil {
		fmt.Printf("Error writing file: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("Configuration file generated successfully: %s\n", outputPath)
}
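To make the generated condition concrete, this sketch reproduces the window the tool computes for the README's `--timeunit week` example; the `t1` column name is fixed by the tool, and the printed timestamps depend on the current time:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	now := time.Now()
	startTime := now.AddDate(0, 0, -7) // --timeunit week: [now-7d, now)

	// Same format string as conf_generate.go, producing e.g.
	// t1 >= '2024-05-01 12:00:00' AND t1 < '2024-05-08 12:00:00'
	cond := fmt.Sprintf("t1 >= '%s' AND t1 < '%s'",
		startTime.Format("2006-01-02 15:04:05"),
		now.Format("2006-01-02 15:04:05"))
	fmt.Println(cond)
}
```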
--------------------------------------------------------------------------------
/utils/testutils/postgres.go:
--------------------------------------------------------------------------------
package testutils

import (
	"fmt"
	"os"
	"sync"

	embeddedpostgres "github.com/fergusstrange/embedded-postgres"
)

var (
	testPostgres     *embeddedpostgres.EmbeddedPostgres
	testPostgresOnce sync.Once
)

const testPostgresPort = 15432

// PostgresForTest returns a postgres DSN for tests plus a cleanup function.
func PostgresForTest() (string, func()) {
	// use an external postgres instance if TEST_POSTGRES_DSN is set
	if dsn := os.Getenv("TEST_POSTGRES_DSN"); dsn != "" {
		return dsn, func() {}
	}

	// use a long-lived, shared embedded instance if TEST_POSTGRES_SINGLETON is set
	dsn := fmt.Sprintf("postgres://postgres:postgres@localhost:%d/mydb?sslmode=disable&client_encoding=UTF8", testPostgresPort)
	if os.Getenv("TEST_POSTGRES_SINGLETON") != "" {
		testPostgresOnce.Do(func() {
			testPostgres = embeddedpostgres.NewDatabase(embeddedpostgres.DefaultConfig().Database("mydb").
				Port(testPostgresPort))
			if err := testPostgres.Start(); err != nil {
				panic(err)
			}
		})
		return dsn, func() {}
	}

	// otherwise start a temporary embedded instance, torn down by the returned cleanup
	fakePostgresDB := embeddedpostgres.NewDatabase(embeddedpostgres.DefaultConfig().Database("mydb").Port(testPostgresPort))
	if err := fakePostgresDB.Start(); err != nil {
		panic(err)
	}
	return dsn, func() {
		if err := fakePostgresDB.Stop(); err != nil {
			panic(err)
		}
	}
}
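A minimal sketch of how a test could consume this helper; the `lib/pq` driver import is an assumption for illustration only, as the repo's tests may register a different postgres driver:

```go
package mypkg_test

import (
	"database/sql"
	"testing"

	_ "github.com/lib/pq" // assumed driver, for illustration only

	"github.com/databendcloud/db-archiver/utils/testutils"
)

func TestWithPostgres(t *testing.T) {
	dsn, cleanup := testutils.PostgresForTest()
	defer cleanup()

	db, err := sql.Open("postgres", dsn)
	if err != nil {
		t.Fatal(err)
	}
	defer db.Close()

	if err := db.Ping(); err != nil {
		t.Fatal(err)
	}
}
```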
--------------------------------------------------------------------------------
/worker/stats.go:
--------------------------------------------------------------------------------
package worker

import (
	"sync"
	"time"

	timeseries "github.com/codesuki/go-time-series"
)

type DatabendWorkerStatsRecorder struct {
	ingestedBytes *timeseries.TimeSeries
	ingestedRows  *timeseries.TimeSeries
	mu            sync.Mutex
}

type DatabendWorkerStatsData struct {
	BytesPerSecond float64
	RowsPerSecond  float64
}

func NewDatabendWorkerStatsRecorder() *DatabendWorkerStatsRecorder {
	ingestedBytes, err := timeseries.NewTimeSeries()
	if err != nil {
		panic(err)
	}
	ingestedRows, err := timeseries.NewTimeSeries()
	if err != nil {
		panic(err)
	}
	return &DatabendWorkerStatsRecorder{
		ingestedBytes: ingestedBytes,
		ingestedRows:  ingestedRows,
	}
}

func (stats *DatabendWorkerStatsRecorder) RecordMetric(bytes int, rows int) {
	stats.mu.Lock()
	defer stats.mu.Unlock()
	stats.ingestedBytes.Increase(bytes)
	stats.ingestedRows.Increase(rows)
}

func (stats *DatabendWorkerStatsRecorder) Stats(statsWindow time.Duration) DatabendWorkerStatsData {
	stats.mu.Lock()
	defer stats.mu.Unlock()

	bytesPerSecond := stats.calcPerSecond(stats.ingestedBytes, statsWindow)
	rowsPerSecond := stats.calcPerSecond(stats.ingestedRows, statsWindow)
	return DatabendWorkerStatsData{
		BytesPerSecond: bytesPerSecond,
		RowsPerSecond:  rowsPerSecond,
	}
}

func (stats *DatabendWorkerStatsRecorder) calcPerSecond(ts *timeseries.TimeSeries, duration time.Duration) float64 {
	amount, err := ts.Range(time.Now().Add(-duration), time.Now())
	if err != nil {
		return -1
	}

	return float64(amount) / duration.Seconds()
}
--------------------------------------------------------------------------------
/worker/worker.go:
--------------------------------------------------------------------------------
package worker

import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/sirupsen/logrus"

	"github.com/databendcloud/db-archiver/config"
	"github.com/databendcloud/db-archiver/ingester"
	"github.com/databendcloud/db-archiver/source"
)

type Worker struct {
	Name          string
	Cfg           *config.Config
	Ig            ingester.DatabendIngester
	Src           source.Sourcer
	statsRecorder *DatabendWorkerStatsRecorder
}

// running totals of rows and bytes ingested across all worker goroutines
var (
	AlreadyIngestRows  = 0
	AlreadyIngestBytes = 0
)

func NewWorker(cfg *config.Config, name string, ig ingester.DatabendIngester, src source.Sourcer) *Worker {
	stats := NewDatabendWorkerStatsRecorder()
	cfg.SourceQuery = fmt.Sprintf("select * from %s.%s", cfg.SourceDB, cfg.SourceTable)

	return &Worker{
		Name:          name,
		Cfg:           cfg,
		Ig:            ig,
		Src:           src,
		statsRecorder: stats,
	}
}

func (w *Worker) stepBatchWithCondition(threadNum int, conditionSql string) error {
	data, columns, err := w.Src.QueryTableData(threadNum, conditionSql)
	if err != nil {
		return err
	}
	if len(data) == 0 {
		return nil
	}
	startTime := time.Now()
	err = w.Ig.DoRetry(
		func() error {
			return w.Ig.IngestData(threadNum, columns, data)
		})
	if err != nil {
		logrus.Errorf("Failed to ingest data between %s into Databend: %v", conditionSql, err)
		return err
	}

	// record only this batch's delta; the globals keep the running totals
	batchBytes := calculateBytesSize(data)
	AlreadyIngestRows += len(data)
	AlreadyIngestBytes += batchBytes
	w.statsRecorder.RecordMetric(batchBytes, len(data))
	stats := w.statsRecorder.Stats(time.Since(startTime))
	log.Printf("Global speed: total ingested %d rows (%f rows/s), %d bytes (%f bytes/s)",
		AlreadyIngestRows, stats.RowsPerSecond, AlreadyIngestBytes, stats.BytesPerSecond)

	return nil
}

func calculateBytesSize(batch [][]interface{}) int {
	bytes, err := json.Marshal(batch)
	if err != nil {
		log.Fatal(err)
	}
	return len(bytes)
}

// IsSplitAccordingMaxGoRoutine reports whether the split-key range spans more
// batches than MaxThread, i.e. whether the work should fan out across all goroutines.
func (w *Worker) IsSplitAccordingMaxGoRoutine(minSplitKey, maxSplitKey, batchSize int64) bool {
	return (maxSplitKey-minSplitKey)/batchSize > int64(w.Cfg.MaxThread)
}

func (w *Worker) stepBatch() error {
	wg := &sync.WaitGroup{}
	minSplitKey, maxSplitKey, err := w.Src.GetMinMaxSplitKey()
	if err != nil {
		return err
	}
	logrus.Infof("db.table is %s.%s, minSplitKey: %d, maxSplitKey: %d", w.Cfg.SourceDB, w.Cfg.SourceTable, minSplitKey, maxSplitKey)
	if minSplitKey == 0 && maxSplitKey == 0 {
		return nil
	}

	if w.IsSplitAccordingMaxGoRoutine(minSplitKey, maxSplitKey, w.Cfg.BatchSize) {
		logrus.Infof("split according to maxThread: %d", w.Cfg.MaxThread)
		slimedRange := source.SlimCondition(w.Cfg.MaxThread, minSplitKey, maxSplitKey)
		logrus.Infof("slimedRange: %v", slimedRange)
		wg.Add(w.Cfg.MaxThread)
		for i := 0; i < w.Cfg.MaxThread; i++ {
			go func(idx int) {
				defer wg.Done()
				conditions := source.SplitConditionAccordingMaxGoRoutine(w.Cfg.SourceSplitKey, w.Cfg.BatchSize, slimedRange[idx][0], slimedRange[idx][1], maxSplitKey)
				logrus.Infof("conditions in one routine: %v", len(conditions))
				for condition := range conditions {
					logrus.Infof("condition: %s", condition)
					if err := w.stepBatchWithCondition(idx, condition); err != nil {
						logrus.Errorf("Thread %d, stepBatchWithCondition failed: %v", idx, err)
					}
				}
			}(i)
		}
		wg.Wait()
		return nil
	}
	conditions := source.SplitCondition(w.Cfg.SourceSplitKey, w.Cfg.BatchSize, minSplitKey, maxSplitKey)
	for _, condition := range conditions {
		wg.Add(1)
		go func(condition string) {
			defer wg.Done()
			if err := w.stepBatchWithCondition(1, condition); err != nil {
				logrus.Errorf("stepBatchWithCondition failed: %v", err)
			}
		}(condition)
	}
	wg.Wait()
	return nil
}
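// Worked instance of the split decision above, with illustrative numbers:
// for keys spanning 1..1_000_000, batchSize 10_000 and MaxThread 10,
// (1_000_000-1)/10_000 = 99 > 10, so stepBatch slices the key range into
// MaxThread sub-ranges and runs one goroutine per sub-range; otherwise it
// starts one goroutine per batch condition instead.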
:= &sync.WaitGroup{} 140 | minSplitKey, maxSplitKey, err := w.Src.GetMinMaxTimeSplitKey() 141 | if err != nil { 142 | return err 143 | } 144 | fmt.Println("minSplitKey", minSplitKey, "maxSplitKey", maxSplitKey) 145 | 146 | fmt.Println("split according time split key", w.Cfg.MaxThread) 147 | allConditions, err := source.SplitConditionAccordingToTimeSplitKey(w.Cfg, minSplitKey, maxSplitKey) 148 | if err != nil { 149 | return err 150 | } 151 | fmt.Println("allConditions: ", len(allConditions)) 152 | fmt.Println("all split conditions", allConditions) 153 | slimedRange := source.SplitTimeConditionsByMaxThread(allConditions, w.Cfg.MaxThread) 154 | fmt.Println(len(slimedRange)) 155 | fmt.Println("slimedRange", slimedRange) 156 | wg.Add(w.Cfg.MaxThread) 157 | for i := 0; i < 1; i++ { 158 | go func(idx int) { 159 | defer wg.Done() 160 | conditions := slimedRange[idx] 161 | logrus.Infof("conditions in one routine: %d", len(conditions)) 162 | if err != nil { 163 | logrus.Errorf("stepBatchWithCondition failed: %v", err) 164 | } 165 | for _, condition := range conditions { 166 | logrus.Infof("condition: %s", condition) 167 | switch w.Cfg.DatabaseType { 168 | case "mysql": 169 | err = w.stepBatchWithTimeCondition(condition, w.Cfg.BatchSize) 170 | case "mssql": 171 | err = w.stepBatchWithTimeConditionMssql(condition, w.Cfg.BatchSize) 172 | default: 173 | err = w.stepBatchWithTimeCondition(condition, w.Cfg.BatchSize) 174 | } 175 | if err != nil { 176 | logrus.Errorf("stepBatchWithCondition failed: %v", err) 177 | } 178 | } 179 | }(i) 180 | } 181 | wg.Wait() 182 | 183 | return nil 184 | } 185 | 186 | func (w *Worker) stepBatchWithTimeCondition(conditionSql string, batchSize int64) error { 187 | var offset int64 = 0 188 | for { 189 | batchSql := fmt.Sprintf("%s LIMIT %d OFFSET %d", conditionSql, batchSize, offset) 190 | data, columns, err := w.Src.QueryTableData(1, batchSql) 191 | if err != nil { 192 | return err 193 | } 194 | if len(data) == 0 { 195 | break 196 | } 197 | err = w.Ig.DoRetry( 198 | func() error { 199 | return w.Ig.IngestData(1, columns, data) 200 | }) 201 | if err != nil { 202 | logrus.Errorf("Failed to ingest data between %s into Databend: %v", conditionSql, err) 203 | return err 204 | } 205 | offset += batchSize 206 | } 207 | return nil 208 | } 209 | 210 | func (w *Worker) stepBatchWithTimeConditionMssql(conditionSql string, batchSize int64) error { 211 | var offset int64 = 0 212 | conditionSql = ensureOrderBy(conditionSql) 213 | fmt.Println("conditionSql", conditionSql) 214 | for { 215 | batchSql := fmt.Sprintf("%s OFFSET %d ROWS FETCH NEXT %d ROWS ONLY", conditionSql, offset, batchSize) 216 | 217 | data, columns, err := w.Src.QueryTableData(1, batchSql) 218 | if err != nil { 219 | return err 220 | } 221 | 222 | if len(data) == 0 { 223 | break 224 | } 225 | 226 | err = w.Ig.DoRetry( 227 | func() error { 228 | return w.Ig.IngestData(1, columns, data) 229 | }) 230 | if err != nil { 231 | logrus.Errorf("Failed to ingest data between %s into Databend: %v", conditionSql, err) 232 | return err 233 | } 234 | 235 | offset += batchSize 236 | } 237 | return nil 238 | } 239 | 240 | func (w *Worker) IsWorkerCorrect() (int, int, bool) { 241 | syncedCount, err := w.Ig.GetAllSyncedCount() 242 | if err != nil { 243 | logrus.Errorf("GetAllSyncedCount failed: %v", err) 244 | return 0, 0, false 245 | } 246 | sourceCount, err := w.Src.GetAllSourceReadRowsCount() 247 | if err != nil { 248 | logrus.Errorf("GetAllSourceReadRowsCount failed: %v", err) 249 | return 0, 0, false 250 | } 251 | return syncedCount, 
// IsWorkerCorrect compares the synced row count in Databend with the source row count.
func (w *Worker) IsWorkerCorrect() (int, int, bool) {
	syncedCount, err := w.Ig.GetAllSyncedCount()
	if err != nil {
		logrus.Errorf("GetAllSyncedCount failed: %v", err)
		return 0, 0, false
	}
	sourceCount, err := w.Src.GetAllSourceReadRowsCount()
	if err != nil {
		logrus.Errorf("GetAllSourceReadRowsCount failed: %v", err)
		return 0, 0, false
	}
	return syncedCount, sourceCount, syncedCount == sourceCount
}

func (w *Worker) Run(ctx context.Context) {
	logrus.Printf("Starting worker %s", w.Name)
	if w.Cfg.SourceSplitTimeKey != "" {
		if err := w.StepBatchByTimeSplitKey(); err != nil {
			logrus.Errorf("StepBatchByTimeSplitKey failed: %v", err)
		}
	} else {
		if err := w.stepBatch(); err != nil {
			logrus.Errorf("stepBatch failed: %v", err)
		}
	}
}

// ensureOrderBy appends a default ORDER BY, which SQL Server requires before
// an OFFSET ... FETCH clause.
func ensureOrderBy(conditionSql string) string {
	if !strings.Contains(strings.ToLower(conditionSql), "order by") {
		conditionSql += " ORDER BY id"
	}
	return conditionSql
}
--------------------------------------------------------------------------------