├── .gitignore ├── README.md ├── build.gradle.kts ├── gradle.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── settings.gradle.kts └── src ├── main ├── kotlin │ ├── BigQueryUtils.kt │ ├── CSVConverters.kt │ ├── CoroutinesUtils.kt │ ├── GCPUtils.kt │ ├── Main.kt │ ├── ProtoBufConverters.kt │ ├── ProtoWriter.kt │ ├── StatisticCollector.kt │ └── StorageUtils.kt └── resources │ └── logback.xml └── test └── kotlin ├── DataType └── DataType.kt ├── MainCreateTable.kt ├── TestCoroutinesUtils.kt ├── TestInsertRow.kt ├── bigquery └── TestInsertProtobuf.kt └── csv └── CSVFileGenerator.kt /.gitignore: -------------------------------------------------------------------------------- 1 | .gradle 2 | build/ 3 | !gradle/wrapper/gradle-wrapper.jar 4 | !**/src/main/**/build/ 5 | !**/src/test/**/build/ 6 | 7 | ### IntelliJ IDEA ### 8 | .idea/modules.xml 9 | .idea/jarRepositories.xml 10 | .idea/compiler.xml 11 | .idea/libraries/ 12 | *.iws 13 | *.iml 14 | *.ipr 15 | out/ 16 | !**/src/main/**/out/ 17 | !**/src/test/**/out/ 18 | 19 | ### Eclipse ### 20 | .apt_generated 21 | .classpath 22 | .factorypath 23 | .project 24 | .settings 25 | .springBeans 26 | .sts4-cache 27 | bin/ 28 | !**/src/main/**/bin/ 29 | !**/src/test/**/bin/ 30 | 31 | ### NetBeans ### 32 | /nbproject/private/ 33 | /nbbuild/ 34 | /dist/ 35 | /nbdist/ 36 | /.nb-gradle/ 37 | 38 | ### VS Code ### 39 | .vscode/ 40 | 41 | ### Mac OS ### 42 | .DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # data ingest 2 | This project is the support of the talk about ingesting data with Kotlin and coroutines. 
3 | 4 | 5 | ### Scripts 6 | 7 | #### application default login 8 | ```bash 9 | gcloud auth login --update-adc 10 | ``` 11 | 12 | ### upload file using gsutil 13 | ```bash 14 | gsutil cp generated_200MiB.csv gs://d2v-ingest-demo 15 | ``` 16 | 17 | ### download file using gsutil 18 | ```bash 19 | time gsutil cp gs://d2v-ingest-demo/generated_200MiB.csv ./temp/generated_200MiB.csv 20 | ``` 21 | 22 | -------------------------------------------------------------------------------- /build.gradle.kts: -------------------------------------------------------------------------------- 1 | plugins { 2 | kotlin("jvm") version "1.9.21" 3 | id("com.google.cloud.tools.jib") version "3.3.1" 4 | } 5 | 6 | group = "org.example" 7 | version = "1.0-SNAPSHOT" 8 | 9 | val googleApiVersion: String by project 10 | val commonsCsvVersion: String by project 11 | val kotlinxSerializationVersion: String by project 12 | val kotlinxCoroutinesVersion: String by project 13 | val logbackVersion: String by project 14 | val kotlinxDateTimeVersion: String by project 15 | 16 | repositories { 17 | mavenCentral() 18 | } 19 | 20 | dependencies { 21 | implementation(platform("com.google.cloud:libraries-bom:26.11.0")) 22 | api("org.apache.commons:commons-csv:$commonsCsvVersion") 23 | api("org.jetbrains.kotlinx:kotlinx-coroutines-core:$kotlinxCoroutinesVersion") 24 | api("org.jetbrains.kotlinx:kotlinx-serialization-json:$kotlinxSerializationVersion") 25 | api("com.google.cloud:google-cloud-storage:") 26 | api( "ch.qos.logback:logback-core:$logbackVersion") 27 | api( "ch.qos.logback:logback-classic:$logbackVersion") 28 | api( "com.google.cloud:google-cloud-logging-logback:0.131.0-alpha") 29 | api("org.jetbrains.kotlinx:kotlinx-datetime:$kotlinxDateTimeVersion") 30 | api("com.google.cloud:google-cloud-bigquery:") 31 | 32 | testImplementation("org.jetbrains.kotlin:kotlin-test") 33 | } 34 | 35 | tasks.test { 36 | useJUnitPlatform() 37 | } 38 | kotlin { 39 | jvmToolchain(17) 40 | } 41 | 42 | jib { 43 | to { 44 | 
image="europe-west3-docker.pkg.dev/data-ingest-421014/d2v-docker/data-ingest:$version" 45 | } 46 | } -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | kotlin.code.style=official 2 | 3 | commonsCsvVersion=1.10.0 4 | googleApiVersion=26.11.0 5 | kamlVersion=0.53.0 6 | kotlinxDateTimeVersion=0.4.0 7 | kotlinxCoroutinesVersion=1.8.0 8 | kotlinxSerializationVersion=1.5.0 9 | ktorVersion=2.3.5 10 | logbackVersion=1.4.12 11 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gzoritchak/gcp-data-ingest/1b2e8a522374fb16ab312f39f5c58cda0a842f94/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Mon Mar 25 20:17:26 CET 2024 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.5-bin.zip 5 | zipStoreBase=GRADLE_USER_HOME 6 | zipStorePath=wrapper/dists 7 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Copyright © 2015-2021 the original authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | # 21 | # Gradle start up script for POSIX generated by Gradle. 22 | # 23 | # Important for running: 24 | # 25 | # (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is 26 | # noncompliant, but you have some other compliant shell such as ksh or 27 | # bash, then to run this script, type that shell name before the whole 28 | # command line, like: 29 | # 30 | # ksh Gradle 31 | # 32 | # Busybox and similar reduced shells will NOT work, because this script 33 | # requires all of these POSIX shell features: 34 | # * functions; 35 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}», 36 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»; 37 | # * compound commands having a testable exit status, especially «case»; 38 | # * various built-in commands including «command», «set», and «ulimit». 39 | # 40 | # Important for patching: 41 | # 42 | # (2) This script targets any POSIX shell, so it avoids extensions provided 43 | # by Bash, Ksh, etc; in particular arrays are avoided. 44 | # 45 | # The "traditional" practice of packing multiple parameters into a 46 | # space-separated string is a well documented source of bugs and security 47 | # problems, so this is (mostly) avoided, by progressively accumulating 48 | # options in "$@", and eventually passing that to Java. 
49 | # 50 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, 51 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; 52 | # see the in-line comments for details. 53 | # 54 | # There are tweaks for specific operating systems such as AIX, CygWin, 55 | # Darwin, MinGW, and NonStop. 56 | # 57 | # (3) This script is generated from the Groovy template 58 | # https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt 59 | # within the Gradle project. 60 | # 61 | # You can find Gradle at https://github.com/gradle/gradle/. 62 | # 63 | ############################################################################## 64 | 65 | # Attempt to set APP_HOME 66 | 67 | # Resolve links: $0 may be a link 68 | app_path=$0 69 | 70 | # Need this for daisy-chained symlinks. 71 | while 72 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path 73 | [ -h "$app_path" ] 74 | do 75 | ls=$( ls -ld "$app_path" ) 76 | link=${ls#*' -> '} 77 | case $link in #( 78 | /*) app_path=$link ;; #( 79 | *) app_path=$APP_HOME$link ;; 80 | esac 81 | done 82 | 83 | APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit 84 | 85 | APP_NAME="Gradle" 86 | APP_BASE_NAME=${0##*/} 87 | 88 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 89 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 90 | 91 | # Use the maximum available, or set MAX_FD != -1 to use that value. 92 | MAX_FD=maximum 93 | 94 | warn () { 95 | echo "$*" 96 | } >&2 97 | 98 | die () { 99 | echo 100 | echo "$*" 101 | echo 102 | exit 1 103 | } >&2 104 | 105 | # OS specific support (must be 'true' or 'false'). 
106 | cygwin=false 107 | msys=false 108 | darwin=false 109 | nonstop=false 110 | case "$( uname )" in #( 111 | CYGWIN* ) cygwin=true ;; #( 112 | Darwin* ) darwin=true ;; #( 113 | MSYS* | MINGW* ) msys=true ;; #( 114 | NONSTOP* ) nonstop=true ;; 115 | esac 116 | 117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 118 | 119 | 120 | # Determine the Java command to use to start the JVM. 121 | if [ -n "$JAVA_HOME" ] ; then 122 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 123 | # IBM's JDK on AIX uses strange locations for the executables 124 | JAVACMD=$JAVA_HOME/jre/sh/java 125 | else 126 | JAVACMD=$JAVA_HOME/bin/java 127 | fi 128 | if [ ! -x "$JAVACMD" ] ; then 129 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 130 | 131 | Please set the JAVA_HOME variable in your environment to match the 132 | location of your Java installation." 133 | fi 134 | else 135 | JAVACMD=java 136 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 137 | 138 | Please set the JAVA_HOME variable in your environment to match the 139 | location of your Java installation." 140 | fi 141 | 142 | # Increase the maximum file descriptors if we can. 143 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then 144 | case $MAX_FD in #( 145 | max*) 146 | MAX_FD=$( ulimit -H -n ) || 147 | warn "Could not query maximum file descriptor limit" 148 | esac 149 | case $MAX_FD in #( 150 | '' | soft) :;; #( 151 | *) 152 | ulimit -n "$MAX_FD" || 153 | warn "Could not set maximum file descriptor limit to $MAX_FD" 154 | esac 155 | fi 156 | 157 | # Collect all arguments for the java command, stacking in reverse order: 158 | # * args from the command line 159 | # * the main class name 160 | # * -classpath 161 | # * -D...appname settings 162 | # * --module-path (only if needed) 163 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
164 | 165 | # For Cygwin or MSYS, switch paths to Windows format before running java 166 | if "$cygwin" || "$msys" ; then 167 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) 168 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) 169 | 170 | JAVACMD=$( cygpath --unix "$JAVACMD" ) 171 | 172 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 173 | for arg do 174 | if 175 | case $arg in #( 176 | -*) false ;; # don't mess with options #( 177 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath 178 | [ -e "$t" ] ;; #( 179 | *) false ;; 180 | esac 181 | then 182 | arg=$( cygpath --path --ignore --mixed "$arg" ) 183 | fi 184 | # Roll the args list around exactly as many times as the number of 185 | # args, so each arg winds up back in the position where it started, but 186 | # possibly modified. 187 | # 188 | # NB: a `for` loop captures its iteration list before it begins, so 189 | # changing the positional parameters here affects neither the number of 190 | # iterations, nor the values presented in `arg`. 191 | shift # remove old arg 192 | set -- "$@" "$arg" # push replacement arg 193 | done 194 | fi 195 | 196 | # Collect all arguments for the java command; 197 | # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of 198 | # shell script including quotes and variable substitutions, so put them in 199 | # double quotes to make sure that they get re-expanded; and 200 | # * put everything else in single quotes, so that it's not re-expanded. 201 | 202 | set -- \ 203 | "-Dorg.gradle.appname=$APP_BASE_NAME" \ 204 | -classpath "$CLASSPATH" \ 205 | org.gradle.wrapper.GradleWrapperMain \ 206 | "$@" 207 | 208 | # Use "xargs" to parse quoted args. 209 | # 210 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed. 
211 | # 212 | # In Bash we could simply go: 213 | # 214 | # readarray ARGS < <( xargs -n1 <<<"$var" ) && 215 | # set -- "${ARGS[@]}" "$@" 216 | # 217 | # but POSIX shell has neither arrays nor command substitution, so instead we 218 | # post-process each arg (as a line of input to sed) to backslash-escape any 219 | # character that might be a shell metacharacter, then use eval to reverse 220 | # that process (while maintaining the separation between arguments), and wrap 221 | # the whole thing up as a single "set" statement. 222 | # 223 | # This will of course break if any of these variables contains a newline or 224 | # an unmatched quote. 225 | # 226 | 227 | eval "set -- $( 228 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | 229 | xargs -n1 | 230 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | 231 | tr '\n' ' ' 232 | )" '"$@"' 233 | 234 | exec "$JAVACMD" "$@" 235 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 
15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 
64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /settings.gradle.kts: -------------------------------------------------------------------------------- 1 | plugins { 2 | id("org.gradle.toolchains.foojay-resolver-convention") version "0.5.0" 3 | } 4 | rootProject.name = "data-ingest" 5 | 6 | -------------------------------------------------------------------------------- /src/main/kotlin/BigQueryUtils.kt: -------------------------------------------------------------------------------- 1 | package org.example 2 | 3 | import com.google.cloud.bigquery.BigQuery 4 | import com.google.cloud.bigquery.BigQueryOptions 5 | import com.google.cloud.bigquery.DatasetInfo 6 | import com.google.cloud.bigquery.storage.v1.BigQueryWriteClient 7 | import com.google.cloud.bigquery.storage.v1.BigQueryWriteSettings 8 | 9 | 10 | /** 11 | * Get a reference on the BigQuery API for the given project. 12 | * Uses Google default credentials. 
13 | */ 14 | fun bigQuery(projectId: String): BigQuery = 15 | BigQueryOptions.newBuilder() 16 | .setCredentials(defaultCredentials) 17 | .setProjectId(projectId) 18 | .build() 19 | .service 20 | 21 | 22 | /** 23 | * Build the client to use the BigQuery Storage API, using default Credentials 24 | */ 25 | fun BigQueryWriteClient(): BigQueryWriteClient = BigQueryWriteClient.create( 26 | BigQueryWriteSettings.newBuilder() 27 | .setCredentialsProvider(com.google.api.gax.core.FixedCredentialsProvider.create(defaultCredentials)) 28 | .build() 29 | ) 30 | 31 | 32 | /** 33 | * Creates a dataset if it doesn't exist. 34 | * Use 'EU' as default location. 35 | */ 36 | fun BigQuery.createDatasetIfNeeded(datasetId: String, location: String = "EU") { 37 | val ds = getDataset(datasetId) 38 | if (ds == null) { 39 | val datasetInfo = DatasetInfo 40 | .newBuilder(datasetId) 41 | .setLocation(location) 42 | .build() 43 | create(datasetInfo, BigQuery.DatasetOption.fields()) 44 | } 45 | } 46 | 47 | -------------------------------------------------------------------------------- /src/main/kotlin/CSVConverters.kt: -------------------------------------------------------------------------------- 1 | package org.example 2 | 3 | import kotlinx.datetime.Instant 4 | import kotlinx.datetime.LocalDate 5 | import kotlinx.datetime.LocalDateTime 6 | import org.threeten.bp.ZoneOffset 7 | import org.threeten.bp.format.DateTimeFormatter 8 | import org.threeten.bp.format.DateTimeFormatterBuilder 9 | import org.threeten.bp.format.TextStyle 10 | import org.threeten.bp.temporal.ChronoField 11 | import org.threeten.bp.temporal.TemporalAccessor 12 | import java.math.BigDecimal 13 | import java.math.RoundingMode 14 | 15 | 16 | /** 17 | * Converts a string to a BigDecimal, with a scale of 9 (biggest scale accepted by BigQuery) 18 | */ 19 | fun String.toBigDecimal(): BigDecimal = BigDecimal(this) 20 | .setScale(9, RoundingMode.HALF_EVEN) 21 | 22 | /** 23 | * Converts a CSV string to a Boolean value, accepting 
true, false, 1, 0 24 | * 25 | * @return the corresponding Boolean value of the CSV string 26 | * @throws IllegalArgumentException if the CSV string cannot be parsed to Boolean 27 | */ 28 | fun String.csvToBoolean(): Boolean = when (this.lowercase()) { 29 | "0", "false" -> false 30 | "1", "true" -> true 31 | else -> error("$this can't be parsed to Boolean") 32 | } 33 | 34 | /** 35 | * Converts this string representing a timestamp to an Instant 36 | */ 37 | fun String.timestampToInstant(): Instant = timestampAccessor().toInstant() 38 | 39 | /** 40 | * 41 | */ 42 | fun String.timestampAccessor(): TemporalAccessor = timestampFormatter.parse(this) 43 | 44 | /** 45 | * @return The `Instant` equivalent of the `TemporalAccessor` instance. 46 | */ 47 | fun TemporalAccessor.toInstant(): Instant = 48 | Instant.fromEpochMilliseconds( 49 | getLong(ChronoField.INSTANT_SECONDS) * 1_000 + getLong(ChronoField.MILLI_OF_SECOND) 50 | ) 51 | 52 | 53 | 54 | /** 55 | * Convert a String to a LocalDateTime. 56 | */ 57 | fun String.toLocalDate(): LocalDate = localDateFormatter.parse(this).toLocalDate() 58 | 59 | /** 60 | * Convert a String to a LocalDateTime. 61 | */ 62 | fun String.toLocalDateTime(): LocalDateTime = timestampFormatter.parse(this).toLocalDateTime() 63 | 64 | /** 65 | * @return The `Instant` equivalent of the `TemporalAccessor` instance. Only YEAR, MONTH and DAY are taken. 66 | */ 67 | fun TemporalAccessor.toLocalDate(): LocalDate = 68 | LocalDate( 69 | get(ChronoField.YEAR), 70 | get(ChronoField.MONTH_OF_YEAR), 71 | get(ChronoField.DAY_OF_MONTH), 72 | ) 73 | 74 | /** 75 | * @return The `Instant` equivalent of the `TemporalAccessor` instance. 
76 | */ 77 | fun TemporalAccessor.toLocalDateTime(): LocalDateTime = 78 | LocalDateTime( 79 | get(ChronoField.YEAR), 80 | get(ChronoField.MONTH_OF_YEAR), 81 | get(ChronoField.DAY_OF_MONTH), 82 | get(ChronoField.HOUR_OF_DAY), 83 | get(ChronoField.MINUTE_OF_HOUR), 84 | get(ChronoField.SECOND_OF_MINUTE), 85 | // get(ChronoField.NANO_OF_SECOND), 86 | // todo decimals of seconds removed to stick with legacy dataflow implementation, should we put it back? 87 | ) 88 | 89 | /** 90 | * A permissive date formatter used for parsing and formatting date and time strings in a specific format. 91 | * The format follows the pattern "yyyy-MM-dd['T'][ ][[HH:mm:ss.SSSSSS]]", 92 | * where each component is optional. 93 | */ 94 | val localDateFormatter = 95 | DateTimeFormatterBuilder() 96 | .parseLenient() 97 | .append(DateTimeFormatter.ofPattern("yyyy[/][-]MM[/][-]dd")) 98 | .optionalStart() 99 | .appendLiteral('T') 100 | .optionalEnd() 101 | .optionalStart() 102 | .appendLiteral(' ') 103 | .optionalEnd() 104 | .optionalStart() 105 | .appendValue(ChronoField.HOUR_OF_DAY, 2) 106 | .appendLiteral(':') 107 | .appendValue(ChronoField.MINUTE_OF_HOUR, 2) 108 | .optionalStart() 109 | .appendLiteral(':') 110 | .appendValue(ChronoField.SECOND_OF_MINUTE, 2) 111 | .optionalEnd() 112 | .optionalStart() 113 | .appendValue(ChronoField.MILLI_OF_SECOND, 3) 114 | .optionalEnd() 115 | .optionalStart() 116 | .appendFraction(ChronoField.MICRO_OF_SECOND, 3, 6, true) 117 | .optionalEnd() 118 | .optionalStart() 119 | .appendFraction(ChronoField.NANO_OF_SECOND, 6, 9, true) 120 | .optionalEnd() 121 | .optionalEnd() 122 | .toFormatter() 123 | 124 | /** 125 | * DateTimeFormatter for formatting and parsing LocalDateTime objects. 
126 | * 127 | * The formatter is configured to handle the following patterns: 128 | * - "yyyy-MM-dd" 129 | * - "yyyy/MM/dd" 130 | * - "yyyy-MM-dd'T'HH:mm" 131 | * - "yyyy/MM/dd HH:mm" 132 | * - "yyyy-MM-dd'T'HH:mm:ss" 133 | * - "yyyy/MM/dd HH:mm:ss" 134 | * - "yyyy-MM-dd'T'HH:mm:ss.SSS" 135 | * - "yyyy/MM/dd HH:mm:ss.SSS" 136 | * - "yyyy-MM-dd'T'HH:mm:ss.SSSSSS" 137 | * - "yyyy/MM/dd HH:mm:ss.SSSSSS" 138 | * - "yyyy-MM-dd'T'HH:mm:ss.nnnnnnnnn" 139 | * - "yyyy/MM/dd HH:mm:ss.nnnnnnnnn" 140 | * 141 | * Example usage: 142 | * ``` 143 | * val formattedDateTime = localDateTimeFormatter.format(LocalDateTime.now()) 144 | * val parsedDateTime = localDateTimeFormatter.parse("2022-06-28T14:30:00.123456789") 145 | * ``` 146 | */ 147 | val localDateTimeFormatter: DateTimeFormatter = 148 | DateTimeFormatterBuilder() 149 | .parseLenient() 150 | .append(DateTimeFormatter.ofPattern("yyyy[/][-]MM[/][-]dd")) 151 | .optionalStart() 152 | .appendLiteral('T') 153 | .optionalEnd() 154 | .optionalStart() 155 | .appendLiteral(' ') 156 | .optionalEnd() 157 | .appendValue(ChronoField.HOUR_OF_DAY, 2) 158 | .appendLiteral(':') 159 | .appendValue(ChronoField.MINUTE_OF_HOUR, 2) 160 | .optionalStart() 161 | .appendLiteral(':') 162 | .appendValue(ChronoField.SECOND_OF_MINUTE, 2) 163 | .optionalEnd() 164 | .optionalStart() 165 | .appendValue(ChronoField.MILLI_OF_SECOND, 3) 166 | .optionalEnd() 167 | .optionalStart() 168 | .appendFraction(ChronoField.MICRO_OF_SECOND, 3, 6, true) 169 | .optionalEnd() 170 | .optionalStart() 171 | .appendFraction(ChronoField.NANO_OF_SECOND, 6, 9, true) 172 | .optionalEnd() 173 | .toFormatter() 174 | 175 | /** 176 | * Formatter to parse and format timestamps in different formats. 
177 | * The formatter is built using DateTimeFormatterBuilder and supports the following patterns: 178 | */ 179 | internal val timestampFormatter: DateTimeFormatter = 180 | DateTimeFormatterBuilder() 181 | .parseLenient() 182 | .append(DateTimeFormatter.ofPattern("yyyy[/][-]MM[/][-]dd")) 183 | .optionalStart() 184 | .appendLiteral('T') 185 | .optionalEnd() 186 | .optionalStart() 187 | .appendLiteral(' ') 188 | .optionalEnd() 189 | .appendValue(ChronoField.HOUR_OF_DAY, 2) 190 | .appendLiteral(':') 191 | .appendValue(ChronoField.MINUTE_OF_HOUR, 2) 192 | .optionalStart() 193 | .appendLiteral(':') 194 | .appendValue(ChronoField.SECOND_OF_MINUTE, 2) 195 | .optionalEnd() 196 | .optionalStart() 197 | .appendValue(ChronoField.MILLI_OF_SECOND, 3) 198 | .optionalEnd() 199 | .optionalStart() 200 | .appendFraction(ChronoField.MICRO_OF_SECOND, 3, 6, true) 201 | .optionalEnd() 202 | .optionalStart() 203 | .appendFraction(ChronoField.NANO_OF_SECOND, 6, 9, true) 204 | .optionalEnd() 205 | .optionalStart() 206 | .appendLiteral(' ') 207 | .optionalEnd() 208 | .optionalStart() 209 | .appendOffset("+HH:MM", "+00:00") 210 | .optionalEnd() 211 | .optionalStart() 212 | .appendZoneText(TextStyle.SHORT) 213 | .optionalEnd() 214 | .optionalStart() 215 | .appendLiteral('Z') 216 | .optionalEnd() 217 | .toFormatter() 218 | .withZone(ZoneOffset.UTC) 219 | -------------------------------------------------------------------------------- /src/main/kotlin/CoroutinesUtils.kt: -------------------------------------------------------------------------------- 1 | package org.example 2 | 3 | import com.google.api.core.ApiFuture 4 | import com.google.api.core.ApiFutureCallback 5 | import com.google.api.core.ApiFutures 6 | import kotlinx.coroutines.* 7 | import kotlinx.coroutines.channels.Channel 8 | import kotlinx.coroutines.flow.flow 9 | import java.util.ArrayList 10 | import kotlin.time.Duration 11 | import kotlin.time.Duration.Companion.nanoseconds 12 | import kotlinx.coroutines.flow.Flow 13 | import 
kotlinx.coroutines.flow.buffer 14 | import kotlinx.coroutines.flow.channelFlow 15 | import kotlinx.coroutines.sync.Mutex 16 | import kotlinx.coroutines.sync.withLock 17 | import kotlin.coroutines.resume 18 | import kotlin.coroutines.resumeWithException 19 | import kotlin.time.TimeSource 20 | 21 | /** 22 | * Collect all elements of the flow and send them to a Channel for the parallel 23 | * execution of the given block. 24 | * 25 | * The size of the Channel is given by the parallelism parameter. 26 | */ 27 | suspend fun Flow.parallelProcessing( 28 | parallelism: Int = 20, 29 | processElement: suspend (T) -> Unit 30 | ) { 31 | // Max RAM overhead = memory_size_of(T) × parallelism × 2 32 | withContext(Dispatchers.Default) { 33 | val inputChannel = Channel(parallelism) 34 | launch { 35 | collect { 36 | inputChannel.send(it) 37 | } 38 | inputChannel.close() 39 | } 40 | for (i in 0... 50 | * Naturally, the last element of the flow can have a size smaller than the 51 | * requested chunk size. 52 | * @param size the size of the chunks 53 | */ 54 | fun Flow.chunked(size: Int): Flow> = flow { 55 | val elements = ArrayList(size) 56 | collect { 57 | elements.add(it) 58 | if (elements.size == size) { 59 | emit(elements.toList()) 60 | elements.clear() 61 | } 62 | } 63 | if (elements.isNotEmpty()) 64 | emit(elements) 65 | } 66 | 67 | /** 68 | * Adapter to convert an ApiFuture API in a suspendable function 69 | */ 70 | suspend fun ApiFuture.await(dispatcher: CoroutineDispatcher = Dispatchers.Default ): T = 71 | suspendCancellableCoroutine { cancellableContinuation -> 72 | val callback = object : ApiFutureCallback { 73 | override fun onFailure(t: Throwable) = cancellableContinuation.resumeWithException(t) 74 | override fun onSuccess(result: T) = cancellableContinuation.resume(result) 75 | } 76 | ApiFutures.addCallback(this, callback, dispatcher.asExecutor()) 77 | } 78 | -------------------------------------------------------------------------------- /src/main/kotlin/GCPUtils.kt: 
-------------------------------------------------------------------------------- 1 | package org.example 2 | 3 | 4 | import com.google.auth.oauth2.GoogleCredentials 5 | import java.util.logging.Level 6 | 7 | typealias UtilLogger = java.util.logging.Logger 8 | 9 | /** 10 | * The Google credentials using application default. It works the same way 11 | * on the developer machine and on the cloud. 12 | */ 13 | val defaultCredentials: GoogleCredentials by lazy { GoogleCredentials.getApplicationDefault() } 14 | 15 | 16 | /** 17 | * Google libraries are a little chatty. Let's keep only warnings and error messages 18 | */ 19 | fun attenuateGoogleLogs() { 20 | UtilLogger.getLogger("com.google").level = Level.WARNING 21 | } 22 | -------------------------------------------------------------------------------- /src/main/kotlin/Main.kt: -------------------------------------------------------------------------------- 1 | package org.example 2 | 3 | import com.google.cloud.bigquery.TableId 4 | import com.google.cloud.bigquery.storage.v1.BigDecimalByteStringEncoder 5 | import com.google.cloud.bigquery.storage.v1.ProtoRows 6 | import com.google.protobuf.ByteString 7 | import com.google.protobuf.DynamicMessage 8 | import kotlinx.coroutines.* 9 | import kotlinx.coroutines.flow.asFlow 10 | import kotlinx.coroutines.flow.count 11 | import kotlinx.datetime.Clock 12 | import kotlinx.datetime.LocalDate 13 | import org.apache.commons.csv.CSVFormat 14 | import org.apache.commons.csv.CSVRecord 15 | import org.slf4j.LoggerFactory 16 | import java.math.BigDecimal 17 | 18 | private val logger = LoggerFactory.getLogger("Loader")!! 19 | 20 | const val bucketName = "d2v-ingest-demo" 21 | const val fileName = "generated_200MiB.csv" 22 | 23 | const val projectId = "you-project-id" 24 | const val datasetName = "ingest_demo" 25 | const val tableName = "insert_test" 26 | 27 | val tableId = TableId.of(projectId, datasetName, tableName)!! 
28 | 29 | fun main() = runBlocking { 30 | attenuateGoogleLogs() 31 | val reader = Storage(projectId) 32 | .getBlob(bucketName, fileName) 33 | .also { logger.info("${it.size / 1024 / 1024} MiB") } 34 | .bufferedReader() 35 | 36 | val stats = StatisticCollector(logger) 37 | val writer = tableId.protoWriter() 38 | val now = Clock.System.now().toProtoField() 39 | 40 | csvFormat 41 | .parse(reader) 42 | .asFlow() 43 | .chunked(2000) 44 | .parallelProcessing { 45 | stats.addIngestedRows(it.size) 46 | val beforeProcess = System.nanoTime() 47 | val rowsBuilder = ProtoRows.newBuilder() 48 | for (record in it) { 49 | rowsBuilder.addSerializedRows(record.toProtoMessage(writer, fileName, now)) 50 | } 51 | val rows = rowsBuilder.build() 52 | val afterProcess = System.nanoTime() 53 | writer.appendRows(rows) 54 | val afterRequest = System.nanoTime() 55 | stats.addProcessDuration(afterProcess-beforeProcess) 56 | stats.addRequestDuration(afterRequest - afterProcess) 57 | } 58 | stats.logStats() 59 | writer.close() 60 | return@runBlocking 61 | } 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | val csvFormat: CSVFormat = CSVFormat.Builder.create(CSVFormat.DEFAULT) 74 | .setIgnoreSurroundingSpaces(true) 75 | .setDelimiter(';') 76 | .build() 77 | 78 | 79 | /** 80 | * Converts the CSVRecord to a ProtoBuf message 81 | */ 82 | fun CSVRecord.toProtoMessage( 83 | writer: ProtoWriter, 84 | fileName: String, 85 | now: Long, 86 | ): ByteString = writer.insertTestRow( 87 | get(0), 88 | get(1).toLong(), 89 | get(2).toLocalDate(), 90 | get(3).toBigDecimal(), 91 | fileName, 92 | now) 93 | 94 | /** 95 | * Builds the protobuf message for row insertion 96 | */ 97 | fun ProtoWriter.insertTestRow ( 98 | a: String, b: Long, c: LocalDate, d: BigDecimal, fileName: String, now: Long): ByteString = 99 | DynamicMessage.newBuilder(descriptor) 100 | .apply { 101 | setField(descriptor.fields[0], a) 102 | setField(descriptor.fields[1], b) 103 | setField(descriptor.fields[2], c.toProtoField()) 104 | 
setField(descriptor.fields[3], BigDecimalByteStringEncoder.encodeToNumericByteString(d)) 105 | setField(descriptor.fields[4], fileName) 106 | setField(descriptor.fields[5], now) 107 | 108 | }.build().toByteString() 109 | -------------------------------------------------------------------------------- /src/main/kotlin/ProtoBufConverters.kt: -------------------------------------------------------------------------------- 1 | package org.example 2 | 3 | import com.google.cloud.bigquery.storage.v1.BigDecimalByteStringEncoder 4 | import com.google.cloud.bigquery.storage.v1.CivilTimeEncoder 5 | import com.google.protobuf.ByteString 6 | import kotlinx.datetime.Instant 7 | import kotlinx.datetime.LocalDate 8 | import kotlinx.datetime.LocalDateTime 9 | import kotlinx.datetime.LocalTime 10 | import java.math.BigDecimal 11 | 12 | typealias TTLocalDateTime = org.threeten.bp.LocalDateTime 13 | typealias TTLocalTime = org.threeten.bp.LocalTime 14 | 15 | /** 16 | * Converts a string representation of a BigDecimal to a ByteString representation of a numeric value. 17 | */ 18 | fun String.bigDecimalToNumericByteString(): ByteString = 19 | BigDecimalByteStringEncoder.encodeToNumericByteString(BigDecimal(this)) 20 | 21 | /** 22 | * Converts a string representation of a BigDecimal to a ByteString representation of a big numeric value. 23 | */ 24 | fun String.bigDecimalToBigNumericByteString(): ByteString = 25 | BigDecimalByteStringEncoder.encodeToBigNumericByteString(BigDecimal(this)) 26 | 27 | /** 28 | * Converts the Instant object to a protocol buffer field value represented as a Long (millis). 29 | */ 30 | fun Instant.toProtoField(): Long = toEpochMilliseconds() * 1_000 31 | 32 | /** 33 | * The local date is converted into an Int (epochDays) 34 | */ 35 | fun LocalDate.toProtoField(): Int = toEpochDays() 36 | 37 | /** 38 | * Converts a LocalTime object to a protocol buffer field value. 
39 | */ 40 | fun LocalTime.toProtoField()= CivilTimeEncoder.encodePacked64TimeMicros(this.toTTLocalTime()) 41 | 42 | /** 43 | * Converts a LocalDateTime object to a protocol buffer field value. 44 | */ 45 | fun LocalDateTime.toProtoField() = CivilTimeEncoder.encodePacked64DatetimeMicros(this.toTTLocalDateTime()) 46 | 47 | private fun LocalDateTime.toTTLocalDateTime(): TTLocalDateTime = TTLocalDateTime.of(year, monthNumber, dayOfMonth, hour, minute, second, nanosecond) 48 | private fun LocalTime.toTTLocalTime(): TTLocalTime = TTLocalTime.of(hour, minute, second, nanosecond) 49 | -------------------------------------------------------------------------------- /src/main/kotlin/ProtoWriter.kt: -------------------------------------------------------------------------------- 1 | package org.example 2 | 3 | import com.google.cloud.bigquery.TableId 4 | import com.google.cloud.bigquery.storage.v1.* 5 | import com.google.protobuf.Descriptors 6 | import java.io.Closeable 7 | import java.util.concurrent.TimeUnit 8 | 9 | /** 10 | * Util extension function to simplify the creation of a ProtoWriter on 11 | * the table. 12 | */ 13 | fun TableId.protoWriter() = ProtoWriter(this) 14 | 15 | /** 16 | * This class in only responsible for sending protobuf message through 17 | * Writer Storage API using _default stream. 
18 | */
19 | class ProtoWriter(
20 |     tableId: TableId,
21 | ): Closeable {
22 |
23 |     // Proto descriptor derived from the target table's BigQuery schema; used to build DynamicMessages.
24 |     val descriptor: Descriptors.Descriptor
25 |
26 |     private val streamWriter: StreamWriter
27 |     // BigQueryWriteClient has no public constructor; the documented entry point is the create() factory.
28 |     private val client: BigQueryWriteClient = BigQueryWriteClient.create()
29 |
30 |     init {
31 |         val parentTable: TableName = TableName.of(
32 |             tableId.project,
33 |             tableId.dataset,
34 |             tableId.table
35 |         )
36 |
37 |         // A COMMITTED stream is created only to obtain the table schema; rows are
38 |         // actually appended through the table's _default stream below.
39 |         val newWriteStream = WriteStream.newBuilder()
40 |             .setType(WriteStream.Type.COMMITTED)
41 |             .build()
42 |
43 |         val createWriteStreamRequest = CreateWriteStreamRequest.newBuilder()
44 |             .setParent(parentTable.toString())
45 |             .setWriteStream(newWriteStream)
46 |             .build()
47 |
48 |         val writeStream = client.createWriteStream(createWriteStreamRequest)
49 |
50 |         descriptor = BQTableSchemaToProtoDescriptor.convertBQTableSchemaToProtoDescriptor(writeStream.tableSchema)
51 |         streamWriter = StreamWriter.newBuilder(
52 |             "projects/${tableId.project}/datasets/${tableId.dataset}/tables/${tableId.table}/streams/_default",
53 |             client
54 |         )
55 |             // Reuse the descriptor computed above instead of converting the schema a second time.
56 |             .setWriterSchema(ProtoSchemaConverter.convert(descriptor))
57 |             .build()
58 |     }
59 |
60 |     /**
61 |      * Performs the suspend call on bigquery write api
62 |      */
63 |     suspend fun appendRows(rows: ProtoRows): AppendRowsResponse =
64 |         streamWriter.append(rows).await()
65 |
66 |     override fun close() {
67 |         streamWriter.close()
68 |         // close() initiates an orderly shutdown of the client (an explicit shutdown() call is redundant);
69 |         // then wait briefly for background threads to terminate.
70 |         client.close()
71 |         client.awaitTermination(5, TimeUnit.SECONDS)
72 |     }
73 |
74 | }
75 |
-------------------------------------------------------------------------------- /src/main/kotlin/StatisticCollector.kt:
--------------------------------------------------------------------------------
1 | package org.example
2 |
3 | import org.slf4j.Logger
4 | import java.util.concurrent.atomic.AtomicLong
5 | import kotlin.time.Duration.Companion.nanoseconds
6 |
7 | class StatisticCollector(private val
logger: Logger) { 8 | private val start = System.nanoTime() 9 | private val processDuration = AtomicLong() 10 | private val requestDuration = AtomicLong() 11 | private val ingestedRows = AtomicLong() 12 | 13 | fun addIngestedRows(size: Int) { 14 | ingestedRows.addAndGet(size.toLong()) 15 | } 16 | 17 | fun logStats() { 18 | val total = System.nanoTime() - start 19 | logger.info("Ingested ${ingestedRows.get()} rows in ${total.nanoseconds}") 20 | logger.info("Process duration: ${processDuration.get().nanoseconds} parallelization ${(processDuration.get().toFloat() / total)}") 21 | logger.info("Request duration: ${requestDuration.get().nanoseconds} parallelization ${(requestDuration.get().toFloat() / total)}") 22 | 23 | } 24 | 25 | fun addProcessDuration(duration: Long) { 26 | processDuration.addAndGet(duration) 27 | 28 | } 29 | 30 | fun addRequestDuration(duration: Long) { 31 | requestDuration.addAndGet(duration) 32 | } 33 | 34 | } -------------------------------------------------------------------------------- /src/main/kotlin/StorageUtils.kt: -------------------------------------------------------------------------------- 1 | package org.example 2 | 3 | import com.google.cloud.storage.Blob 4 | import com.google.cloud.storage.BlobId 5 | import com.google.cloud.storage.Storage 6 | import com.google.cloud.storage.StorageOptions 7 | import java.io.BufferedReader 8 | import java.io.InputStreamReader 9 | import java.nio.channels.Channels 10 | import java.nio.channels.ReadableByteChannel 11 | import java.util.zip.ZipInputStream 12 | 13 | /** 14 | * Creates a new storage instance with the specified project ID. 
15 | */ 16 | fun Storage(projetId: String): Storage = StorageOptions.newBuilder() 17 | .setCredentials(defaultCredentials) 18 | .setProjectId(projetId) 19 | .build() 20 | .service 21 | 22 | /** 23 | * @return a reference to the blob 24 | */ 25 | fun Storage.getBlob(bucket: String, name: String): Blob = get(BlobId.of(bucket, name)) ?: error( 26 | "No blob for $bucket $name" + 27 | if (name.startsWith("/")) "\n The name shouldn't start with a '/'. Try removing it." else "" 28 | ) 29 | 30 | /** 31 | * Returns a buffered reader from a blob. 32 | * If the name ends with .zip, unzip the content. 33 | */ 34 | fun Blob.bufferedReader() = 35 | if (name.endsWith(".zip")) //use Content-type? 36 | reader().zippedChannelToReader() 37 | else 38 | BufferedReader(Channels.newReader(reader(), "UTF-8")) 39 | 40 | /** 41 | * Converts a zipped channel to a Reader 42 | */ 43 | fun ReadableByteChannel.zippedChannelToReader(): BufferedReader { 44 | val zipInputStream = ZipInputStream(Channels.newInputStream(this)) 45 | .also { it.nextEntry } 46 | return BufferedReader(InputStreamReader(zipInputStream)) 47 | } 48 | -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 8 | true 9 | 10 | 11 | OFF 12 | 13 | 14 | 15 | 16 | %-5level %d{HH:mm:ss.SSS} [%thread] %msg%n 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/test/kotlin/DataType/DataType.kt: -------------------------------------------------------------------------------- 1 | package DataType 2 | 3 | enum class DataType { 4 | STRING, INTEGER, DATE, NUMERIC; 5 | } 6 | -------------------------------------------------------------------------------- /src/test/kotlin/MainCreateTable.kt: -------------------------------------------------------------------------------- 1 | import com.google.cloud.bigquery.* 
2 | import org.example.datasetName
3 | import org.example.projectId
4 | import org.example.bigQuery
5 | import org.example.tableName
6 |
7 |
8 | fun main() {
9 |     bigQuery(projectId)
10 |         .createTable(datasetName, tableName) {
11 |             colums {
12 |                 col("A", LegacySQLTypeName.STRING)
13 |                 col("B", LegacySQLTypeName.INTEGER)
14 |                 col("C", LegacySQLTypeName.DATE)
15 |                 col("D", LegacySQLTypeName.NUMERIC)
16 |                 col("_filename", LegacySQLTypeName.STRING)
17 |                 col("_inserted_at", LegacySQLTypeName.TIMESTAMP)
18 |             }
19 |         }
20 |
21 | }
22 |
23 | fun BigQuery.createTable(dataset: String, tableName:String, build: TableBuilder.() -> Unit) {
24 |     val builder = TableBuilder().apply(build)
25 |     val tableDefinition = StandardTableDefinition
26 |         .newBuilder()
27 |         .setSchema(
28 |             Schema.of(
29 |                 builder.columns.map {
30 |                     Field.of(it.first, it.second)
31 |                 })
32 |         ).build()
33 |
34 |     create(
35 |         TableInfo.of(
36 |             TableId.of(dataset, tableName),
37 |             tableDefinition
38 |         )
39 |     )
40 |
41 |     println("Table created $dataset:$tableName")
42 | }
43 |
44 |
45 | /**
46 |  * Small DSL for table creation
47 |  */
48 | class TableBuilder {
49 |     // The generic argument was lost in extraction; restored from `add(name to type)` below.
50 |     val columns = mutableListOf<Pair<String, LegacySQLTypeName>>()
51 |     class ColBuilder(val tableBuilder: TableBuilder) {
52 |         fun col(name: String, type: LegacySQLTypeName) {
53 |             tableBuilder.columns.add(name to type)
54 |         }
55 |     }
56 |     val colBuilder = ColBuilder(this)
57 |     // NOTE(review): "colums" is a typo for "columns"; kept as-is because callers (main above) use it.
58 |     fun colums(init: ColBuilder.() -> Unit) {
59 |         colBuilder.init()
60 |     }
61 | }
-------------------------------------------------------------------------------- /src/test/kotlin/TestCoroutinesUtils.kt:
--------------------------------------------------------------------------------
1 | import kotlinx.coroutines.delay
2 | import kotlinx.coroutines.flow.asFlow
3 | import kotlinx.coroutines.flow.toList
4 | import kotlinx.coroutines.runBlocking
5 | import org.example.chunked
6 | import org.example.parallelProcessing
7 | import org.junit.jupiter.api.Assertions.assertEquals
8 | import org.slf4j.LoggerFactory
9 | import 
java.util.concurrent.atomic.AtomicInteger
10 | import kotlin.test.Test
11 | import kotlin.test.fail
12 |
13 |
14 | private val testLogger = LoggerFactory.getLogger("TestCoroutinesUtils")
15 |
16 | class TestCoroutinesUtils {
17 |
18 |     @Test
19 |     fun `chunked without remaining items`() {
20 |         runBlocking {
21 |             val chunkedFlow = listOf(1, 2, 3, 4)
22 |                 .asFlow()
23 |                 .chunked(2)
24 |
25 |             assertEquals(
26 |                 listOf(listOf(1, 2), listOf(3, 4)),
27 |                 chunkedFlow.toList()
28 |             )
29 |         }
30 |     }
31 |
32 |     @Test
33 |     fun `chunked with remaining items`() {
34 |         runBlocking {
35 |             val chunkedFlow = listOf(1, 2, 3, 4, 5)
36 |                 .asFlow()
37 |                 .chunked(2)
38 |
39 |             assertEquals(
40 |                 listOf(listOf(1, 2), listOf(3, 4), listOf(5)),
41 |                 chunkedFlow.toList()
42 |             )
43 |         }
44 |     }
45 |
46 |
47 |     @Test
48 |     fun `parallel processing`() {
49 |         runBlocking {
50 |             val sum = AtomicInteger()
51 |             (1..1000)
52 |                 .asFlow()
53 |                 .parallelProcessing {
54 |                     testLogger.info("Processing $it")
55 |                     sum.addAndGet(it)
56 |                     delay(10)
57 |                 }
58 |             assertEquals(500500, sum.get())
59 |         }
60 |     }
61 |
62 |     @Test
63 |     fun `parallel processing with error`() {
64 |         // Result<Unit>: the generic argument was lost in extraction (raw `Result` does not compile);
65 |         // runCatching wraps the Unit-returning runBlocking block.
66 |         val ret: Result<Unit> = runCatching {
67 |             runBlocking {
68 |                 (1..101)
69 |                     .asFlow()
70 |                     .parallelProcessing {
71 |                         if (it == 100) error("Error 100")
72 |                     }
73 |             }
74 |         }
75 |         if (ret.isFailure) {
76 |             assertEquals("Error 100", ret.exceptionOrNull()?.message)
77 |         } else {
78 |             fail("Should have failed")
79 |         }
80 |     }
81 | }
-------------------------------------------------------------------------------- /src/test/kotlin/TestInsertRow.kt:
--------------------------------------------------------------------------------
1 | import com.google.cloud.bigquery.storage.v1.ProtoRows
2 | import kotlinx.coroutines.runBlocking
3 | import kotlinx.datetime.Clock
4 | import kotlinx.datetime.LocalDate
5 | import org.example.*
6 | import kotlin.test.Test
7 |
8 |
9 | class TestInsertRow {
10 |
11 |     @Test
12 |     fun testInsertRow() {
13 |         val writer = tableId.protoWriter()
14 | val row = writer.insertTestRow( 15 | "test", 16 | 1234L, 17 | LocalDate.parse("2024-04-30"), 18 | "12345.6789".toBigDecimal(), 19 | "no_file.txt", 20 | Clock.System.now().toProtoField() 21 | ) 22 | val rows = ProtoRows.newBuilder() 23 | rows.addSerializedRows(row) 24 | runBlocking { 25 | writer.appendRows(rows.build()) 26 | } 27 | writer.close() 28 | } 29 | } -------------------------------------------------------------------------------- /src/test/kotlin/bigquery/TestInsertProtobuf.kt: -------------------------------------------------------------------------------- 1 | package bigquery 2 | 3 | import com.google.cloud.bigquery.* 4 | import com.google.cloud.bigquery.storage.v1.ProtoRows 5 | import com.google.protobuf.DynamicMessage 6 | import kotlinx.coroutines.runBlocking 7 | import kotlinx.datetime.Instant 8 | import kotlinx.datetime.TimeZone 9 | import kotlinx.datetime.toLocalDateTime 10 | import org.example.* 11 | import org.junit.jupiter.api.Test 12 | 13 | 14 | class TestInsertProtobuf { 15 | 16 | @Test 17 | fun testInsertProtobuf() { 18 | attenuateGoogleLogs() 19 | val bigQuery = bigQuery(projectId) 20 | bigQuery.createDatasetIfNeeded(datasetName) 21 | 22 | val schema: Schema = Schema.of( 23 | listOf( 24 | Field.of("Boolean", LegacySQLTypeName.valueOf("Boolean")), 25 | Field.of("Date", LegacySQLTypeName.valueOf("Date")), 26 | Field.of("Datetime", LegacySQLTypeName.valueOf("Datetime")), 27 | Field.of("Int64", LegacySQLTypeName.valueOf("Int64")), 28 | Field.of("Numeric", LegacySQLTypeName.valueOf("Numeric")), 29 | Field.of("BigNumeric", LegacySQLTypeName.valueOf("BigNumeric")), 30 | Field.of("Float64", LegacySQLTypeName.valueOf("Float64")), 31 | Field.of("String", LegacySQLTypeName.valueOf("String")), 32 | Field.of("Time", LegacySQLTypeName.valueOf("Time")), 33 | Field.of("Timestamp", LegacySQLTypeName.valueOf("Timestamp")), 34 | )) 35 | 36 | val builder = StandardTableDefinition 37 | .newBuilder() 38 | .setSchema(schema) 39 | 40 | val tableInfo: TableInfo 
= TableInfo.of( 41 | TableId.of(datasetName, "insert_protobuf"), 42 | builder.build() 43 | ) 44 | try { 45 | bigQuery.create(tableInfo) 46 | } catch (e: Exception) { 47 | println(e.message) 48 | } 49 | 50 | val instant = Instant.parse("2024-02-28T23:12:59.123Z") 51 | val localDateTime = instant.toLocalDateTime(TimeZone.UTC) 52 | 53 | val tableId = TableId.of(projectId, datasetName, "insert_protobuf") 54 | ProtoWriter(tableId).use { protoWriter -> 55 | val message: DynamicMessage.Builder = DynamicMessage.newBuilder(protoWriter.descriptor) 56 | message.setField(protoWriter.descriptor.fields[0], true) 57 | message.setField(protoWriter.descriptor.fields[1], localDateTime.date.toProtoField()) 58 | message.setField(protoWriter.descriptor.fields[2], localDateTime.toProtoField()) 59 | message.setField(protoWriter.descriptor.fields[3], Long.MAX_VALUE) 60 | message.setField(protoWriter.descriptor.fields[4], "12345678901234567890.123456789".bigDecimalToNumericByteString()) 61 | message.setField(protoWriter.descriptor.fields[5], "123456789012345678901234567890.012345678".bigDecimalToBigNumericByteString()) 62 | message.setField(protoWriter.descriptor.fields[6], 1234567890.0123456) 63 | message.setField(protoWriter.descriptor.fields[7], "une chaîne de caractères avec un \n retour à la ligne") 64 | message.setField(protoWriter.descriptor.fields[8], localDateTime.time.toProtoField()) 65 | message.setField(protoWriter.descriptor.fields[9], instant.toProtoField()) 66 | val protoRows = ProtoRows.newBuilder().addSerializedRows(message.build().toByteString()) 67 | runBlocking { 68 | try { 69 | val response = protoWriter.appendRows(protoRows.build()) 70 | response 71 | } catch (e: Exception) { 72 | e.printStackTrace() 73 | } 74 | } 75 | } 76 | } 77 | } 78 | 79 | -------------------------------------------------------------------------------- /src/test/kotlin/csv/CSVFileGenerator.kt: -------------------------------------------------------------------------------- 1 | package csv 2 | 3 | 
import DataType.DataType
4 | import DataType.DataType.STRING
5 | import java.io.File
6 | import java.io.OutputStreamWriter
7 | import java.io.Writer
8 | import kotlin.random.Random
9 |
10 | /**
11 |  * Generate a CSV file for testing.
12 |  */
13 | fun main() {
14 |     generate(1000)
15 | }
16 |
17 |
18 | val defaultSchema = Schema(
19 |     listOf(
20 |         Field("A", STRING),
21 |         Field("B", DataType.INTEGER),
22 |         Field("C", DataType.DATE),
23 |         Field("D", DataType.NUMERIC)
24 |     )
25 | )
26 |
27 | // List<Field>: the generic argument was lost in extraction; restored from usage in randomLine().
28 | class Schema(val fields: List<Field>) {
29 |
30 | }
31 |
32 | class Field(
33 |     val name: String,
34 |     // NOTE(review): declared Any (which is why randomContent needs an `else`); presumably always a DataType — confirm.
35 |     val type: Any) {
36 |
37 | }
38 |
39 | fun generate(sizeInMiB: Int, schema: Schema = defaultSchema) {
40 | //    val schemaJson = Json.encodeToString(schema)
41 | //    File("generate.json").writeText(schemaJson)
42 | //
43 |
44 |     fun addLine() = schema.randomLine()
45 |
46 |     // "MiB" suffix (was "MB"): the loop below sizes the file in MiB and the rest of the
47 |     // project refers to files named like generated_200MiB.csv.
48 |     val output = File("generated_${sizeInMiB}MiB.csv")
49 |     val writer = CountWriter(output.writer())
50 |     writer.use {
51 |         while (writer.size < sizeInMiB * 1024 * 1024) {
52 |             it.appendLine(addLine())
53 |         }
54 |     }
55 |
56 | }
57 |
58 | class CountWriter(private val delegate: OutputStreamWriter): Writer() {
59 |
60 |     // NOTE(review): counts chars written, not encoded bytes — a rough size for test data;
61 |     // multi-byte UTF-8 output would be undercounted. Confirm if exact byte size matters.
62 |     var size = 0
63 |
64 |     override fun write(cbuf: CharArray, off: Int, len: Int) {
65 |         delegate.write(cbuf, off, len)
66 |         size += len
67 |     }
68 |
69 |     override fun close() = delegate.close()
70 |     override fun flush() = delegate.flush()
71 |
72 | }
73 |
74 |
75 |
76 | fun Schema.randomLine() =
77 |     fields.joinToString(";")
78 |     { it.randomContent() }
79 |
80 | val random = Random(42)
81 |
82 | fun Field.randomContent() = when (this.type) {
83 |
84 |     STRING -> "abcdefghijklmnopqrstuvxyz"
85 |
86 | //        """"abc
87 | //            |defghijklmnopqrstuvwxyz
88 | //            """".trimMargin()
89 |
90 |     DataType.INTEGER -> random.nextInt().toString()
91 |     DataType.NUMERIC -> random.nextDouble().toString()
92 |     DataType.DATE -> "2023-01-12"
93 |
94 |     else -> error("No generation for $type")
95 | }
--------------------------------------------------------------------------------