├── .gitignore ├── LICENSE ├── README.md ├── build.gradle ├── cloudbuild.yaml ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat └── src └── main ├── java └── org │ └── polleyg │ ├── BQTableCopyPipeline.java │ └── GCPHelpers.java └── resources ├── config.yaml └── log4j.properties /.gitignore: -------------------------------------------------------------------------------- 1 | *.lock 2 | *.iml 3 | purge.sh 4 | trigger_pipeline.sh 5 | /.gradle/ 6 | /gradle/ 7 | /.idea/ 8 | /build/ 9 | /classes/ 10 | /out/ 11 | key.json 12 | .DS_Store 13 | cloud-function/.vscode/ 14 | .classpath 15 | .project 16 | .vscode 17 | .settings 18 | /bin -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # An application that uses Cloud Dataflow and Cloud Build to copy/transfer BigQuery tables between locations/regions. 2 | https://medium.com/weareservian/how-to-transfer-bigquery-tables-between-locations-with-cloud-dataflow-9582acc6ae1d 3 | 4 | This is an Apache 2.0 license. Feel free to fork, change, or basically do whatever you want with this 5 | repo. PRs more than welcome. 
6 | 7 | ## What's the tech? 8 | - Java (Cloud Dataflow & BigQuery/GCS APIs) 9 | - Gradle (build) 10 | - Cloud Build (CI/CD) 11 | 12 | ## Why did you build it? 13 | Unfortunately, there's no easy/native way to copy tables between locations/regions directly in BigQuery. For example, 14 | you can't just copy a table from the US to the EU without jumping through a few hoops. 15 | 16 | The process is convoluted. For example, to copy a table from the EU to Sydney: 17 | 18 | 1. Export the BigQuery table to a bucket located in the EU 19 | 2. Copy/sync the exported table from the EU bucket to another bucket located in Sydney 20 | 3. Load into BigQuery from the Sydney bucket 21 | 22 | You can roll your own solution for this (e.g. bash + gcloud), or there are currently two patterns available that do it 23 | for you: 24 | 25 | 1. Use Cloud Composer (Airflow) 26 | 2. Use Cloud Dataflow (Beam) 27 | 28 | This is the repo for the Cloud Dataflow option. 29 | 30 | ## How does it work? 31 | You configure some YAML with the name(s) of the table(s) that you want to copy between regions, then invoke Cloud Build 32 | to build, deploy and run it. The application handles everything else for you. See `config.yaml` for details on how to 33 | configure the application (a minimal example is shown below). It will create the necessary GCS buckets for you, in the correct locations. It can 34 | also create the BigQuery target dataset if you can't be bothered creating it manually beforehand. 35 | 36 | You can configure the job to copy a list of individual tables from one region to another, or to copy an entire dataset 37 | from one region to another. Dataset copying can be performed in one large Dataflow job, or split into multiple jobs. 38 |
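To make the configuration concrete, here is a minimal sketch of a `config.yaml` for copying a single table from the US to the EU. The project, dataset and table names below are placeholders, not real resources; see `src/main/resources/config.yaml` for the full list of options and their defaults.

```yaml
# Hypothetical example -- names are placeholders
project: my-gcp-project          # your GCP project id (not the project number)
runner: dataflow                 # 'dataflow' (runs on GCP) or 'local'

copies:
  - source: my-gcp-project:us_dataset.my_table   # [PROJECT]:[DATASET].[TABLE]
    target: my-gcp-project:eu_dataset.my_table
    targetDatasetLocation: EU    # only needed if the target dataset doesn't exist yet
    writeDisposition: truncate   # or 'append'
    detectSchema: true           # set to false for complex/nested schemas
```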
39 | ## How do I run it? 40 | 1. Make sure all the relevant APIs are enabled on your GCP project. These include Cloud Dataflow, 41 | BigQuery, GCS, and Cloud Build. 42 | 2. Elevate permissions on the Cloud Build service account that was created for you by Google. It will look something 43 | like `@cloudbuild.gserviceaccount.com`. You can give it only the required permissions for each 44 | service, or simply give it the `Project Editor` role if you're comfortable with that. 45 | 3. Clone the GitHub repo and make the necessary changes to `config.yaml`. 46 | 4. Finally, run `gcloud builds submit --config=cloudbuild.yaml` 47 | 48 | ## Any known limitations? 49 | Complex schemas (e.g. nested records) are not supported. If you have a complex schema, create an empty table 50 | in the target dataset with the schema and set the flag `detectSchema` to `false` in the YAML config for the 51 | appropriate copy, and the application will skip trying to detect the schema. 52 | 53 | ## Can I contact you if I need some help? 54 | Sure. Email me at `polleyg@gmail.com` 55 | 56 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: "application" 2 | apply plugin: "java" 3 | apply plugin: "idea" 4 | 5 | ext { 6 | version="1.0" 7 | beamVersion="2.8.0" 8 | jacksonDataformatVersion="2.9.7" 9 | jacksonDatabindVersion="2.9.7" 10 | apacheCommsLang3Version="3.8.1" 11 | bigqueryVersion="1.52.0" 12 | gcsVersion="1.52.0" 13 | } 14 | 15 | task wrapper(type: Wrapper) { 16 | gradleVersion = "4.10" 17 | } 18 | 19 | repositories { 20 | mavenCentral() 21 | } 22 | 23 | dependencies { 24 | compile "org.slf4j:slf4j-log4j12:1.7.12" 25 | compile "org.apache.beam:beam-sdks-java-core:${beamVersion}" 26 | compile "org.apache.beam:beam-runners-google-cloud-dataflow-java:${beamVersion}" 27 | compile "org.apache.beam:beam-runners-direct-java:${beamVersion}" 28 | compile "org.apache.beam:beam-sdks-java-io-google-cloud-platform:${beamVersion}" 29 | compile "org.apache.beam:beam-sdks-java-extensions-google-cloud-platform-core:${beamVersion}" 30 | compile "com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:${jacksonDataformatVersion}" 31 | compile "com.fasterxml.jackson.core:jackson-databind:${jacksonDatabindVersion}" 32 | compile "org.apache.commons:commons-lang3:${apacheCommsLang3Version}" 33 | compile "com.google.cloud:google-cloud-bigquery:${bigqueryVersion}" 34 | compile "com.google.cloud:google-cloud-storage:${gcsVersion}" 35 | } 36 | 37 | mainClassName="org.polleyg.BQTableCopyPipeline" 38 | -------------------------------------------------------------------------------- /cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: gcr.io/cloud-builders/git 3 | args: ['clone', 'https://github.com/polleyg/gcp-dataflow-copy-bigquery.git'] 4 | 5 | - name: gcr.io/cloud-builders/gradle 6 | args: ['build', 'run'] -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polleyg/gcp-dataflow-copy-bigquery/f51d19a891e5620090683a7e7a228426a7f0fe69/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.10-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Attempt to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks.
13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS="" 32 | 33 | # Use the maximum available, or set MAX_FD != -1 to use that value. 34 | MAX_FD="maximum" 35 | 36 | warn () { 37 | echo "$*" 38 | } 39 | 40 | die () { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? 
-ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Escape application args 158 | save () { 159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 160 | echo " " 161 | } 162 | APP_ARGS=$(save "$@") 163 | 164 | # Collect all arguments for the java command, following the shell quoting and substitution rules 165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 166 | 167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 169 | cd "$(dirname "$0")" 170 | fi 171 | 172 | exec "$JAVACMD" "$@" 173 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | 
if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /src/main/java/org/polleyg/BQTableCopyPipeline.java: -------------------------------------------------------------------------------- 1 | package org.polleyg; 2 | 3 | import com.fasterxml.jackson.annotation.JsonProperty; 4 | import com.fasterxml.jackson.core.type.TypeReference; 5 | import com.fasterxml.jackson.databind.ObjectMapper; 6 | import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; 7 | import com.google.api.services.bigquery.model.TableRow; 8 | import com.google.api.services.bigquery.model.TableSchema; 9 | import com.google.cloud.bigquery.BigQueryException; 10 | import com.google.cloud.bigquery.TableId; 11 | import com.google.cloud.storage.StorageException; 12 | import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; 13 | import org.apache.beam.sdk.Pipeline; 14 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 15 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 16 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 17 | import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; 18 | import org.apache.beam.sdk.values.PCollection; 19 | import org.apache.http.HttpStatus; 20 | import org.slf4j.Logger; 21 | import org.slf4j.LoggerFactory; 22 | 23 | import java.io.File; 24 | import java.util.*; 25 | 26 | import static com.google.common.base.Preconditions.checkNotNull; 27 | import static java.lang.String.format; 28 | import static java.lang.System.currentTimeMillis; 29 | import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED; 30 | import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition.CREATE_NEVER; 31 | 32 | /** 33 | * This application is designed to be used when you need to copy/transfer a BigQuery table(s) between location/region 34 | * e.g. copying a table from the US to the EU. If you are just copying a table(s) between the same location/region, then 35 | * you don't need this. Instead just use the `gcloud` CLI tool, WebUI, or the BigQuery API. Refer to the README.md for 36 | * details/instructions on how to use this application. 37 | * 38 | * @author Graham Polley, Matthew Grey 39 | */ 40 | public class BQTableCopyPipeline { 41 | private static final Logger LOG = LoggerFactory.getLogger(BQTableCopyPipeline.class); 42 | private static final String DEFAULT_NUM_WORKERS = "1"; 43 | private static final String DEFAULT_MAX_WORKERS = "3"; 44 | private static final String DEFAULT_TYPE_WORKERS = "n1-standard-1"; 45 | private static final String DEFAULT_TARGET_LOCATION = null; 46 | private static final String DEFAULT_ZONE = "australia-southeast1-a"; 47 | private static final String DEFAULT_WRITE_DISPOSITION = "truncate"; 48 | private static final String DEFAULT_DETECT_SCHEMA = "true"; 49 | 50 | /** 51 | * @param args 52 | * @throws Exception 53 | */ 54 | public static void main(String[] args) throws Exception { 55 | new BQTableCopyPipeline().copy(args); 56 | } 57 | 58 | /** 59 | * Kicks off the copying process by reading the YAML config and creating the initial DataflowPipelineOptions 60 | * that will be used by all subsequent Dataflow pipelines. Each pipeline will share the same project and runner, 61 | * but each pipeline can be configured differently, depending on user requirements. 
62 | * 63 | * @param args 64 | * @throws Exception 65 | */ 66 | private void copy(final String[] args) throws Exception { 67 | ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); 68 | Config config = mapper.readValue( 69 | new File(getClass().getClassLoader().getResource("config.yaml").getFile()), 70 | new TypeReference<Config>() { 71 | }); 72 | PipelineOptionsFactory.register(DataflowPipelineOptions.class); 73 | DataflowPipelineOptions options = PipelineOptionsFactory 74 | .fromArgs(args) 75 | .as(DataflowPipelineOptions.class); 76 | if (config.copies == null || config.copies.size() == 0) { 77 | throw new IllegalStateException("No tables or datasets were defined for copying in the config file"); 78 | } 79 | options.setProject(config.project); 80 | options.setRunner(GCPHelpers.getRunnerClass(config.runner)); 81 | 82 | LOG.info("BigQuery table copy: {}", config); 83 | 84 | List<Map<String, String>> copyTables = new ArrayList<>(); 85 | for (Map<String, String> copy : config.copies) { 86 | if (GCPHelpers.isDatasetTableSpec(copy.get("source"))) { 87 | List<TableId> tableIds = GCPHelpers.getTableIds(copy.get("source")); 88 | tableIds.forEach(id -> copyTables.add(createTableCopyParams(GCPHelpers.getTableIdAsString(id), copy))); 89 | } else { 90 | copyTables.add(copy); 91 | } 92 | copyTables.forEach(tableCopyParams -> setupAndRunPipeline(options, Arrays.asList(tableCopyParams), config)); 93 | } 94 | } 95 | 96 | /** 97 | * Bootstraps the Dataflow pipeline with the provided configuration. For example, sets the number of Dataflow 98 | * workers, zone, machine type etc. This can be configured per pipeline that does the copy. Finally, it makes 99 | * a call to run the actual Dataflow pipeline. 100 | * 101 | * @param options the options used for creating the actual Dataflow pipeline 102 | * @param tableCopyParams a list of table copy Maps that encapsulates the copy configuration, which is defined in the YAML config 103 | * @param config the YAML config, used for globally-set copy params 104 | */ 105 | private void setupAndRunPipeline(final DataflowPipelineOptions options, 106 | final List<Map<String, String>> tableCopyParams, 107 | final Config config) { 108 | 109 | Map<String, String> pipeParams = getFullTableCopyParams(tableCopyParams.get(0), config); //use first copy command as base params for pipeline 110 | 111 | String exportBucket = format("%s_df_bqcopy_export_%s", options.getProject(), pipeParams.get("sourceLocation")).toLowerCase(); 112 | String importBucket = format("%s_df_bqcopy_import_%s", options.getProject(), pipeParams.get("targetLocation")).toLowerCase(); 113 | 114 | handleBucketCreation(exportBucket, pipeParams.get("sourceLocation")); 115 | handleBucketCreation(importBucket, pipeParams.get("targetLocation")); 116 | 117 | Pipeline pipeline = setupPipeline(options, pipeParams, exportBucket); 118 | 119 | tableCopyParams.forEach(tableCopy -> { 120 | 121 | tableCopy = getFullTableCopyParams(tableCopy, config); 122 | 123 | handleTargetDatasetCreation(tableCopy.get("target"), tableCopy.get("targetDatasetLocation")); 124 | 125 | TableSchema schema = null; //no schema is permitted 126 | if (Boolean.valueOf(tableCopy.get("detectSchema"))) { 127 | schema = GCPHelpers.getTableSchema(tableCopy.get("source")); 128 | } 129 | WriteDisposition writeDisposition = GCPHelpers.getWriteDisposition(tableCopy.get("writeDisposition")); 130 | addCopyToPipeline(pipeline, tableCopy.get("source"), tableCopy.get("target"), importBucket, schema, writeDisposition); 131 | }); 132 | pipeline.run(); 133 | } 134 | 135 | /** 136 | * Creates a Dataflow pipeline based on the configuration parameters 137 |
* 138 | * @param options the options used for creating the actual Dataflow pipeline 139 | * @param pipelineParams a Map that encapsulates the copy configuration, which is defined in the YAML config 140 | * @param exportBucket the GCS bucket name that is to be used in the exporting process 141 | * @return the created Dataflow Pipeline 142 | */ 143 | private Pipeline setupPipeline(final DataflowPipelineOptions options, 144 | final Map<String, String> pipelineParams, 145 | final String exportBucket) { 146 | 147 | LOG.info("Running a copy for '{}'", pipelineParams); 148 | int numWorkers = Integer.valueOf(pipelineParams.get("numWorkers")); 149 | int maxNumWorkers = Integer.valueOf(pipelineParams.get("maxNumWorkers")); 150 | String zone = pipelineParams.get("zone"); 151 | String worker = pipelineParams.get("workerMachineType"); 152 | 153 | options.setNumWorkers(numWorkers); 154 | options.setMaxNumWorkers(maxNumWorkers); 155 | options.setZone(zone); 156 | options.setWorkerMachineType(worker); 157 | options.setTempLocation(format("gs://%s/tmp", exportBucket)); 158 | options.setStagingLocation(format("gs://%s/jars", exportBucket)); 159 | options.setJobName(format("bq-copy-%s-to-%s-%d", pipelineParams.get("sourceLocation"), pipelineParams.get("targetLocation"), currentTimeMillis())); 160 | 161 | LOG.info("Running Dataflow pipeline with options '{}'", options); 162 | 163 | return Pipeline.create(options); 164 | } 165 | 166 | /** 167 | * Adds a table-copy job to a Dataflow pipeline 168 | * 169 | * @param pipeline the Dataflow pipeline to add the copy to 170 | * @param sourceTable the source BigQuery table to copy from 171 | * @param targetTable the target BigQuery table to copy to 172 | * @param importBucket the GCS bucket name that is to be used in the importing process 173 | * @param schema the schema of the table to be copied (null acceptable) 174 | * @param writeDisposition the write disposition if the table already exists 175 | */ 176 | private void addCopyToPipeline(final Pipeline pipeline, 177 | final String sourceTable, 178 | final String targetTable, 179 | final String importBucket, 180 | final TableSchema schema, 181 | final WriteDisposition writeDisposition) { 182 | 183 | checkNotNull(sourceTable, "Source table cannot be null"); 184 | checkNotNull(targetTable, "Target table cannot be null"); 185 | 186 | PCollection<TableRow> rows = pipeline.apply(format("Read: %s", sourceTable), BigQueryIO.readTableRows().from(sourceTable)); 187 | if (schema != null) { 188 | rows.apply(format("Write: %s", targetTable), BigQueryIO.writeTableRows() 189 | .to(targetTable) 190 | .withCreateDisposition(CREATE_IF_NEEDED) 191 | .withWriteDisposition(writeDisposition) 192 | .withSchema(schema) 193 | .withCustomGcsTempLocation(StaticValueProvider.of((format("gs://%s", importBucket))))); 194 | } else { 195 | rows.apply(format("Write: %s", targetTable), BigQueryIO.writeTableRows() 196 | .to(targetTable) 197 | .withCreateDisposition(CREATE_NEVER) 198 | .withWriteDisposition(writeDisposition) 199 | .withCustomGcsTempLocation(StaticValueProvider.of((format("gs://%s", importBucket))))); 200 | } 201 | } 202 | 203 | /** 204 | * Wraps the creation of the GCS bucket in a try/catch block. If the bucket already exists it will swallow up the 205 | * exception because that's ok. Otherwise, it will rethrow it. 206 | * 207 | * @param name the name of the GCS bucket 208 | * @param location the location of the GCS bucket e.g. "US" 209 | */ 210 | private void handleBucketCreation(final String name, 211 | final String location) { 212 | try { 213 | GCPHelpers.createGCSBucket(name, location); 214 | } catch (StorageException e) { 215 | if (e.getCode() != HttpStatus.SC_CONFLICT) { // 409 == bucket already exists. That's ok. 216 | throw new IllegalStateException(e); 217 | } 218 | } 219 | } 220 | 221 | /** 222 | * Determines the location of the target dataset. If it has not been configured in the YAML config, then it is assumed 223 | * the target dataset exists. If it doesn't, then it will bail out and throw an exception. If it has been 224 | * configured in the YAML config, then it will use this as the target dataset location. handleTargetDatasetCreation 225 | * performs an existence check for the dataset in the target location. 226 | * 227 | * @param targetTable the full table spec of the target table in format [PROJECT]:[DATASET].[TABLE] 228 | * @param targetDatasetLocation the target dataset location and can be null 229 | * @return the location of the dataset 230 | */ 231 | private String getTargetDatasetLocation(final String targetTable, 232 | final String targetDatasetLocation) { 233 | String location; 234 | if (targetDatasetLocation == null) { 235 | //target dataset/table should already exist in this case 236 | try { 237 | location = GCPHelpers.getDatasetLocation(targetTable); 238 | } catch (RuntimeException e) { 239 | throw new IllegalStateException("'targetDatasetLocation' wasn't specified in config, but it looks" + 240 | " like the target dataset doesn't exist."); 241 | } 242 | } else { 243 | //otherwise, return target dataset location 244 | location = targetDatasetLocation; 245 | } 246 | assert location != null; 247 | return location; 248 | } 249 | 250 | /** 251 | * Handles the creation of the target dataset in a given location. If the target dataset location is specified 252 | * in the YAML configuration, then it will attempt to verify that the dataset does not exist. If it does not 253 | * exist, it will attempt to create the dataset in BigQuery using that region. If the dataset already exists and 254 | * the target dataset location is specified in the YAML, then it will bail out and throw an exception. If the 255 | * target dataset location is not set in the YAML, it will verify that the dataset exists within the project.
256 | * If it does not exist, it will bail out and throw an exception 257 | * 258 | * @param targetTable the full table spec of the target table in format [PROJECT]:[DATASET].[TABLE] 259 | * @param targetDatasetLocation the target dataset location and can be null 260 | */ 261 | private void handleTargetDatasetCreation(final String targetTable, 262 | final String targetDatasetLocation) { 263 | if (targetDatasetLocation == null) { 264 | //target dataset/table should already exist in this case 265 | try { 266 | GCPHelpers.getDatasetLocation(targetTable); 267 | } catch (RuntimeException e) { 268 | throw new IllegalStateException("'targetDatasetLocation' wasn't specified in config, but it looks" + 269 | " like the target dataset doesn't exist."); 270 | } 271 | } else { 272 | //otherwise, return target dataset location 273 | try { 274 | GCPHelpers.createBQDataset(targetTable, targetDatasetLocation); 275 | } catch (BigQueryException e) { 276 | if (e.getCode() == HttpStatus.SC_CONFLICT) { // 409 == dataset already exists 277 | throw new IllegalStateException( 278 | format("'targetDatasetLocation' specified in config, but the dataset '%s' already exists", 279 | targetTable)); 280 | } else { 281 | throw new IllegalStateException(e); 282 | } 283 | } 284 | } 285 | } 286 | 287 | /** 288 | * Creates a map of params for a single table copy pipeline, data is combined from dataset copy config & a table that was found inside the dataset 289 | * 290 | * @param tableId the full table spec of the target table in format [PROJECT]:[DATASET].[TABLE] 291 | * @param datasetCopyParams a configuration map 292 | * @return a map of copy parameters for one table copy 293 | */ 294 | private Map createTableCopyParams(String tableId, Map datasetCopyParams) { 295 | Map params = new HashMap<>(datasetCopyParams); 296 | params.put("source", tableId); 297 | String tableName = tableId.split("\\.")[1]; 298 | params.put("target", format("%s.%s", datasetCopyParams.get("target"), tableName)); 299 | return params; 300 | } 301 | 302 | /** 303 | * Combines values in the config with values specified in a map, favouring the map values over the config values where they exist 304 | * 305 | * @param copyParams a map of copy parameters that is to override values in the config 306 | * @param config the config specified in the YAML file 307 | * @return the combined map of parameters 308 | */ 309 | private Map getFullTableCopyParams(Map copyParams, Config config) { 310 | copyParams.put("detectSchema", copyParams.getOrDefault("detectSchema", config.detectSchema)); 311 | copyParams.put("numWorkers", copyParams.getOrDefault("numWorkers", config.numWorkers)); 312 | copyParams.put("maxNumWorkers", copyParams.getOrDefault("maxNumWorkers", config.maxNumWorkers)); 313 | copyParams.put("zone", copyParams.getOrDefault("zone", config.zone)); 314 | copyParams.put("workerMachineType", copyParams.getOrDefault("workerMachineType", config.workerMachineType)); 315 | copyParams.put("writeDisposition", copyParams.getOrDefault("writeDisposition", config.writeDisposition)); 316 | copyParams.put("targetDatasetLocation", copyParams.getOrDefault("targetDatasetLocation", config.targetDatasetLocation)); 317 | copyParams.put("targetLocation", getTargetDatasetLocation(copyParams.get("target"), copyParams.get("targetDatasetLocation"))); 318 | copyParams.put("sourceLocation", GCPHelpers.getDatasetLocation(copyParams.get("source"))); 319 | return copyParams; 320 | } 321 | 322 | /** 323 | * POJO for YAML config 324 | */ 325 | private static class Config { 326 | @JsonProperty 
327 | public List<Map<String, String>> copies; 328 | @JsonProperty 329 | public String project; 330 | @JsonProperty 331 | public String runner; 332 | 333 | @JsonProperty 334 | public String workerMachineType = DEFAULT_TYPE_WORKERS; 335 | @JsonProperty 336 | public String numWorkers = DEFAULT_NUM_WORKERS; 337 | @JsonProperty 338 | public String maxNumWorkers = DEFAULT_MAX_WORKERS; 339 | @JsonProperty 340 | public String targetDatasetLocation = DEFAULT_TARGET_LOCATION; 341 | @JsonProperty 342 | public String zone = DEFAULT_ZONE; 343 | @JsonProperty 344 | public String writeDisposition = DEFAULT_WRITE_DISPOSITION; 345 | @JsonProperty 346 | public String detectSchema = DEFAULT_DETECT_SCHEMA; 347 | 348 | @Override 349 | public String toString() { 350 | return "Config{" + 351 | "copies=" + copies + 352 | ", project='" + project + '\'' + 353 | ", runner='" + runner + '\'' + 354 | ", workerMachineType='" + workerMachineType + '\'' + 355 | ", numWorkers='" + numWorkers + '\'' + 356 | ", maxNumWorkers='" + maxNumWorkers + '\'' + 357 | ", targetDatasetLocation='" + targetDatasetLocation + '\'' + 358 | ", zone='" + zone + '\'' + 359 | ", writeDisposition='" + writeDisposition + '\'' + 360 | ", detectSchema='" + detectSchema + '\'' + 361 | '}'; 362 | } 363 | } 364 | } 365 | -------------------------------------------------------------------------------- /src/main/java/org/polleyg/GCPHelpers.java: -------------------------------------------------------------------------------- 1 | package org.polleyg; 2 | 3 | import com.google.api.services.bigquery.model.TableFieldSchema; 4 | import com.google.api.services.bigquery.model.TableReference; 5 | import com.google.api.services.bigquery.model.TableSchema; 6 | import com.google.cloud.bigquery.*; 7 | import com.google.cloud.storage.Storage; 8 | import com.google.cloud.storage.StorageClass; 9 | import com.google.cloud.storage.StorageException; 10 | import com.google.cloud.storage.StorageOptions; 11 | import com.google.common.collect.ImmutableMap; 12 | import org.apache.beam.runners.dataflow.DataflowRunner; 13 | import org.apache.beam.runners.direct.DirectRunner; 14 | import org.apache.beam.sdk.PipelineRunner; 15 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers; 16 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; 17 | import org.slf4j.Logger; 18 | import org.slf4j.LoggerFactory; 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | import java.util.Map; 23 | import java.util.stream.Collectors; 24 | import java.util.stream.StreamSupport; 25 | 26 | import static com.google.cloud.storage.BucketInfo.newBuilder; 27 | import static java.lang.String.format; 28 | 29 | /** 30 | * Some package-private helper/convenience methods for talking with GCP services. Nothing to see here, 31 | * move along please.
32 | * 33 | * @author Graham Polley 34 | */ 35 | class GCPHelpers { 36 | private static final Logger LOG = LoggerFactory.getLogger(GCPHelpers.class); 37 | private static final BigQuery BIGQUERY = BigQueryOptions.getDefaultInstance().getService(); 38 | private static final Storage STORAGE = StorageOptions.getDefaultInstance().getService(); 39 | private static final Map BQ_LOCATION_TO_GCS_STORAGE_CLASS = ImmutableMap.of( 40 | "us", StorageClass.MULTI_REGIONAL, 41 | "eu", StorageClass.MULTI_REGIONAL, 42 | "asia-northeast1", StorageClass.REGIONAL, 43 | "australia-southeast1", StorageClass.REGIONAL, 44 | "europe-west2", StorageClass.REGIONAL); 45 | 46 | /** 47 | * Retrieves the table ids of all tables within a given dataset 48 | * 49 | * @param datasetSpec the dataset spec in the format [PROJECT]:[DATASET] 50 | * @return a list of table ids for the given dataset 51 | */ 52 | static List getTableIds(final String datasetSpec) { 53 | TableReference tableReference = BigQueryHelpers.parseTableSpec(datasetSpec); 54 | if (tableReference.getDatasetId() == null) { 55 | throw new IllegalStateException(String.format("No dataset could be found for %s", datasetSpec)); 56 | } 57 | LOG.debug("Discovering tables in the dataset: {}", datasetSpec); 58 | DatasetId datasetId = DatasetId.of(tableReference.getProjectId(), tableReference.getDatasetId()); 59 | return StreamSupport 60 | .stream(BIGQUERY.listTables(datasetId).iterateAll().spliterator(), false) 61 | .map(table -> table.getTableId()) 62 | .collect(Collectors.toList()); 63 | 64 | } 65 | 66 | /** 67 | * Returns the TableId as Strings in the format [PROJECT]:[DATASET].[TABLE] 68 | * 69 | * @param tableId 70 | * @return 71 | */ 72 | static String getTableIdAsString(final TableId tableId) { 73 | return format("%s:%s.%s", tableId.getProject(), tableId.getDataset(), tableId.getTable()); 74 | } 75 | 76 | /** 77 | * Works out the table schema of the provided BigQuery table spec. Note, it currently does not support 78 | * complex schemas with nested structures. 79 | * 80 | * @param tableSpec the full table spec in the format [PROJECT].[DATASET].[TABLE] 81 | * @return the TableSchema. 
82 | */ 83 | static TableSchema getTableSchema(final String tableSpec) { 84 | LOG.debug("Fetching schema for '{}'", tableSpec); 85 | TableReference ref = BigQueryHelpers.parseTableSpec(tableSpec); 86 | TableId tableId = TableId.of(ref.getProjectId(), ref.getDatasetId(), ref.getTableId()); 87 | List<TableFieldSchema> fields = new ArrayList<>(); 88 | Schema realSchema = BIGQUERY.getTable(tableId).getDefinition().getSchema(); 89 | realSchema.getFields().forEach(f -> fields.add( 90 | new TableFieldSchema().setName(f.getName()).setType(f.getType().name())) 91 | ); 92 | return new TableSchema().setFields(fields); 93 | } 94 | 95 | /** 96 | * Creates a GCS bucket 97 | * 98 | * @param bucketName the name of the bucket to create 99 | * @param location the location where the bucket should be created 100 | * @throws StorageException 101 | */ 102 | static void createGCSBucket(final String bucketName, 103 | final String location) throws StorageException { 104 | LOG.debug("Requested to create bucket '{}' in location '{}'..", bucketName, location); 105 | StorageClass storageClass = BQ_LOCATION_TO_GCS_STORAGE_CLASS.get(location); 106 | STORAGE.create(newBuilder(bucketName) 107 | .setStorageClass(storageClass) 108 | .setLocation(location) 109 | .build()); 110 | LOG.info("Successfully created bucket '{}' with storage class '{}' and in location '{}'", 111 | bucketName, storageClass, location); 112 | } 113 | 114 | /** 115 | * Creates a BigQuery dataset 116 | * 117 | * @param tableSpec the full table spec in the format [PROJECT].[DATASET].[TABLE] 118 | * @param location the location where the dataset should be created 119 | * @throws BigQueryException 120 | */ 121 | static void createBQDataset(final String tableSpec, 122 | final String location) throws BigQueryException { 123 | TableReference ref = BigQueryHelpers.parseTableSpec(tableSpec); 124 | LOG.debug("Requested to create dataset '{}' in location '{}'..", ref.getDatasetId(), location); 125 | DatasetInfo datasetInfo = DatasetInfo.newBuilder(ref.getProjectId(), ref.getDatasetId()) 126 | .setLocation(location) 127 | .build(); 128 | BIGQUERY.create(datasetInfo); 129 | LOG.info("Successfully created dataset '{}' in location '{}'", ref.getDatasetId(), location); 130 | } 131 | 132 | /** 133 | * Gets the location of a BigQuery dataset 134 | * 135 | * @param tableSpec the full table spec in the format [PROJECT].[DATASET].[TABLE] 136 | * @return the location of the dataset 137 | */ 138 | static String getDatasetLocation(final String tableSpec) { 139 | LOG.debug("Fetching BigQuery dataset location for '{}'", tableSpec); 140 | TableReference tableReference = BigQueryHelpers.parseTableSpec(tableSpec); 141 | DatasetId datasetId = DatasetId.of(tableReference.getProjectId(), tableReference.getDatasetId()); 142 | return BIGQUERY.getDataset(datasetId).getLocation().toLowerCase(); 143 | } 144 | 145 | /** 146 | * Takes a String and returns the corresponding Dataflow runner class object. Expects one of either 'dataflow' or 147 | * 'local'. 148 | * 149 | * @param clazz the type of Dataflow runner 150 | * @return the class object for the Dataflow runner. 151 | */ 152 | static Class<? extends PipelineRunner<?>> getRunnerClass(final String clazz) { 153 | Class<? extends PipelineRunner<?>> result; 154 | switch (clazz) { 155 | case "dataflow": { 156 | result = DataflowRunner.class; 157 | break; 158 | } 159 | case "local": { 160 | result = DirectRunner.class; 161 | break; 162 | } 163 | default: 164 | throw new IllegalArgumentException(format("I don't know this runner: '%s'."
+ 165 | " Use one of 'dataflow' or 'local'", clazz)); 166 | } 167 | return result; 168 | } 169 | 170 | /** 171 | * Takes a String and returns the corresponding Write Disposition. Expects one of either 'truncate' or 'append'. 172 | * 173 | * @param writeDisposition 174 | * @return the Write Disposition for the pipeline 175 | */ 176 | static WriteDisposition getWriteDisposition(final String writeDisposition) { 177 | WriteDisposition result; 178 | switch (writeDisposition) { 179 | case "truncate": { 180 | result = WriteDisposition.WRITE_TRUNCATE; 181 | break; 182 | } 183 | case "append": { 184 | result = WriteDisposition.WRITE_APPEND; 185 | break; 186 | } 187 | default: 188 | throw new IllegalArgumentException(format("I don't know this write disposition: '%s'." + 189 | " Use one of 'truncate' or 'append'", writeDisposition)); 190 | } 191 | return result; 192 | } 193 | 194 | /** 195 | * Determines if the table spec ( [PROJECT]:[DATASET].[TABLE] ) is that of an entire dataset or a single table 196 | * 197 | * @param spec the table spec to test 198 | * @return boolean true if spec is for a dataset 199 | */ 200 | static Boolean isDatasetTableSpec(String spec) { 201 | return BigQueryHelpers.parseTableSpec(spec).getTableId() == null; 202 | } 203 | } 204 | -------------------------------------------------------------------------------- /src/main/resources/config.yaml: -------------------------------------------------------------------------------- 1 | # [required] The GCP project id (not the number). You can find this in the GCP console. 2 | project: 3 | 4 | # [required] The type of runner. One of: 5 | # - dataflow (runs on GCP) 6 | # - local (runs on local machine) 7 | runner: dataflow 8 | 9 | # Copy parameters. These can be overwritten on a per-copy basis below. Options: 10 | # 11 | # [optional] workerMachineType: The type of the workers. Default is n1-standard-1. 12 | # [optional] numWorkers: The initial number of workers in the Dataflow cluster. Default is 1. 13 | # [optional] maxNumWorkers: The max number of workers in the Dataflow cluster. Default is 3. 14 | # [optional] targetDatasetLocation: If the target dataset in BigQuery does not exist, use this to specify the location. 15 | # [optional] zone: The zone where the Dataflow cluster will spin up. Default is australia-southeast1-a. 16 | # [optional] writeDisposition: Either truncate or append. Default is truncate. 17 | # [optional] detectSchema: Either true or false. Use this as a workaround for complex schemas. Default is true. 18 | # [optional] composite: Either true or false. If true, Dataflow runs all copies as 1 job/pipeline. If false, each copy 19 | # is run as an independent Dataflow job/pipeline. Default is true.
20 | # [required] source: The source in format [PROJECT]:[DATASET] for dataset or [PROJECT]:[DATASET].[TABLE] for table 21 | # [required] target: The target in format [PROJECT]:[DATASET] for dataset or [PROJECT]:[DATASET].[TABLE] for table 22 | copies: 23 | # Dataset copy 24 | - source: bigquery-public-data:world_bank_wdi 25 | target: :world_bank_wdi 26 | numWorkers: 2 27 | maxNumWorkers: 2 28 | 29 | # Table copy to EU 30 | - source: bigquery-public-data:world_bank_wdi.country_series_definitions 31 | target: :world_bank_wdi_US.country_series_definitions 32 | maxNumWorkers: 1 33 | workerMachineType: n1-standard-2 34 | writeDisposition: append 35 | targetDatasetLocation: US 36 | 37 | # Table copy to US 38 | - source: bigquery-public-data:world_bank_wdi.country_series_definitions 39 | target: :world_bank_wdi_EU.country_series_definitions 40 | maxNumWorkers: 1 41 | workerMachineType: n1-standard-2 42 | writeDisposition: append 43 | targetDatasetLocation: EU 44 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, A1 2 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 3 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 4 | log4j.appender.A1.layout.ConversionPattern=%d{HH:mm:ss,SSS} %-4r [%t] %-5p %c %x- %m%n 5 | log4j.logger.org.polleyg=INFO --------------------------------------------------------------------------------