├── .github ├── FUNDING.yml └── workflows │ ├── test.yml │ └── release.yml ├── settings.gradle ├── gradle ├── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties └── dependency-locks │ └── embulkPluginRuntime.lockfile ├── run_s3_local.sh ├── example ├── prepare_s3_bucket.sh ├── data.tsv ├── config.yml ├── with_logicaltypes.yml └── with_catalog.yml ├── .gitignore ├── .scalafmt.conf ├── src ├── main │ └── scala │ │ └── org │ │ └── embulk │ │ └── output │ │ └── s3_parquet │ │ ├── ContextClassLoaderSwapper.scala │ │ ├── aws │ │ ├── AwsClientConfiguration.scala │ │ ├── HttpProxy.scala │ │ ├── AwsEndpointConfiguration.scala │ │ ├── Aws.scala │ │ ├── AwsS3Configuration.scala │ │ └── AwsCredentials.scala │ │ ├── implicits.scala │ │ ├── S3ParquetPageOutput.scala │ │ ├── parquet │ │ ├── DateLogicalType.scala │ │ ├── JsonLogicalType.scala │ │ ├── DefaultColumnType.scala │ │ ├── LogicalTypeProxy.scala │ │ ├── TimestampLogicalType.scala │ │ ├── DecimalLogicalType.scala │ │ ├── TimeLogicalType.scala │ │ ├── IntLogicalType.scala │ │ ├── ParquetFileWriteSupport.scala │ │ └── ParquetColumnType.scala │ │ ├── catalog │ │ ├── GlueDataType.scala │ │ └── CatalogRegistrator.scala │ │ ├── PluginTask.scala │ │ └── S3ParquetOutputPlugin.scala └── test │ └── scala │ └── org │ └── embulk │ └── output │ └── s3_parquet │ ├── parquet │ ├── ParquetColumnTypeTestHelper.scala │ ├── MockParquetRecordConsumer.scala │ ├── TestJsonLogicalType.scala │ ├── TestDefaultColumnType.scala │ ├── TestDateLogicalType.scala │ ├── TestTimestampLogicalType.scala │ ├── TestDecimalLogicalType.scala │ ├── TestTimeLogicalType.scala │ └── TestIntLogicalType.scala │ ├── TestS3ParquetOutputPluginConfigException.scala │ ├── TestS3ParquetOutputPlugin.scala │ └── EmbulkPluginTestHelper.scala ├── LICENSE.txt ├── gradlew.bat ├── CHANGELOG.md ├── gradlew └── README.md /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: civitaspo 2 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'embulk-output-s3_parquet' 2 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/civitaspo/embulk-output-s3_parquet/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /run_s3_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | docker run -it -d --rm \ 4 | -p 4566:4566 \ 5 | -e SERVICES=s3 \ 6 | localstack/localstack 7 | 8 | -------------------------------------------------------------------------------- /example/prepare_s3_bucket.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | aws s3 mb s3://example \ 4 | --endpoint-url http://localhost:4566 \ 5 | --region us-east-1 6 | 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | /pkg/ 3 | /tmp/ 4 | *.gemspec 5 | .gradle/ 6 | /classpath/ 7 | build/ 8 | .idea 9 | /.settings/ 10 | /.metadata/ 11 | .classpath 12 | .project 13 | 
-------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | # https://scalameta.org/scalafmt/#Configuration 2 | 3 | version = "2.4.2" 4 | newlines.alwaysBeforeElseAfterCurlyIf = true 5 | assumeStandardLibraryStripMargin = true 6 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.3-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /example/data.tsv: -------------------------------------------------------------------------------- 1 | 0 c20ef94602 c212c89f91 2017-10-24 03:54:35 +0900 {"a":0,"b":"99"} 2 | 1 330a9fc33a e25b33b616 2017-10-22 19:53:31 +0900 {"a":1,"b":"a9"} 3 | 2 707b3b7588 90823c6a1f 2017-10-23 23:42:43 +0900 {"a":2,"b":"96"} 4 | 3 8d8288e66f 2017-10-22 06:12:13 +0900 {"a":3,"b":"86"} 5 | 4 c54d8b6481 e56a40571c 2017-10-23 04:59:16 +0900 {"a":4,"b":"d2"} 6 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test CI 2 | 3 | on: 4 | - push 5 | 6 | jobs: 7 | test: 8 | 9 | runs-on: ubuntu-latest 10 | services: 11 | localstack: 12 | image: localstack/localstack 13 | ports: 14 | - 4566:4566 15 | env: 16 | SERVICES: s3 17 | 18 | steps: 19 | - uses: actions/checkout@v1 20 | - name: Set up JDK 1.8 21 | uses: actions/setup-java@v1 22 | with: 23 | java-version: 1.8 24 | - name: scalafmt 25 | run: ./gradlew spotlessCheck 26 | - name: scalatest 27 | run: ./gradlew scalatest 28 | 29 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | // WARNING: This object should be used for limited purposes only. 
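// Swaps the current thread's context class loader for the class loader that loaded the
// given class, runs the block, and always restores the original afterwards.
// `usingPluginClass` wraps the Parquet writer and S3 transfer calls elsewhere in this plugin
// so that libraries which resolve classes through the context class loader (for example,
// Hadoop's reflection-based class lookups) can find the classes bundled with this plugin.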
4 | object ContextClassLoaderSwapper { 5 | 6 | def using[A](klass: Class[_])(f: => A): A = { 7 | val currentTread = Thread.currentThread() 8 | val original = currentTread.getContextClassLoader 9 | val target = klass.getClassLoader 10 | currentTread.setContextClassLoader(target) 11 | try f 12 | finally currentTread.setContextClassLoader(original) 13 | } 14 | 15 | def usingPluginClass[A](f: => A): A = { 16 | using(classOf[S3ParquetOutputPlugin])(f) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /example/config.yml: -------------------------------------------------------------------------------- 1 | 2 | in: 3 | type: file 4 | path_prefix: ./example/data.tsv 5 | parser: 6 | type: csv 7 | delimiter: "\t" 8 | skip_header_lines: 0 9 | null_string: "" 10 | columns: 11 | - { name: id, type: long } 12 | - { name: description, type: string } 13 | - { name: name, type: string } 14 | - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z"} 15 | - { name: payload, type: json} 16 | stop_on_invalid_record: true 17 | 18 | out: 19 | type: s3_parquet 20 | bucket: example 21 | region: us-east-1 22 | endpoint: http://127.0.0.1:4566 23 | path_prefix: path/to/my-obj. 24 | file_ext: snappy.parquet 25 | compression_codec: snappy 26 | default_timezone: Asia/Tokyo 27 | canned_acl: bucket-owner-full-control 28 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/ParquetColumnTypeTestHelper.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.embulk.spi.Column 4 | import org.embulk.spi.`type`.Types 5 | 6 | trait ParquetColumnTypeTestHelper { 7 | 8 | val SAMPLE_BOOLEAN_COLUMN: Column = new Column(0, "a", Types.BOOLEAN) 9 | val SAMPLE_LONG_COLUMN: Column = new Column(0, "a", Types.LONG) 10 | val SAMPLE_DOUBLE_COLUMN: Column = new Column(0, "a", Types.DOUBLE) 11 | val SAMPLE_STRING_COLUMN: Column = new Column(0, "a", Types.STRING) 12 | val SAMPLE_TIMESTAMP_COLUMN: Column = new Column(0, "a", Types.TIMESTAMP) 13 | val SAMPLE_JSON_COLUMN: Column = new Column(0, "a", Types.JSON) 14 | 15 | def newMockRecordConsumer(): MockParquetRecordConsumer = 16 | MockParquetRecordConsumer() 17 | } 18 | -------------------------------------------------------------------------------- /example/with_logicaltypes.yml: -------------------------------------------------------------------------------- 1 | 2 | in: 3 | type: file 4 | path_prefix: ./example/data.tsv 5 | parser: 6 | type: csv 7 | delimiter: "\t" 8 | skip_header_lines: 0 9 | null_string: "" 10 | columns: 11 | - { name: id, type: long } 12 | - { name: description, type: string } 13 | - { name: name, type: string } 14 | - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z"} 15 | - { name: payload, type: json} 16 | stop_on_invalid_record: true 17 | 18 | out: 19 | type: s3_parquet 20 | bucket: example 21 | region: us-east-1 22 | endpoint: http://127.0.0.1:4566 23 | path_prefix: path/to/my-obj-2. 
24 | file_ext: snappy.parquet 25 | compression_codec: snappy 26 | default_timezone: Asia/Tokyo 27 | canned_acl: bucket-owner-full-control 28 | column_options: 29 | id: 30 | logical_type: "uint64" 31 | type_options: 32 | timestamp: 33 | logical_type: "timestamp-millis" 34 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release CI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | release: 10 | 11 | runs-on: ubuntu-latest 12 | services: 13 | localstack: 14 | image: localstack/localstack 15 | ports: 16 | - 4566:4566 17 | env: 18 | SERVICES: s3 19 | 20 | steps: 21 | - uses: actions/checkout@v1 22 | - name: Set up JDK 1.8 23 | uses: actions/setup-java@v1 24 | with: 25 | java-version: 1.8 26 | - name: scalafmt 27 | run: ./gradlew spotlessCheck 28 | - name: scalatest 29 | run: ./gradlew scalatest 30 | - name: Release the new gem 31 | run: | 32 | mkdir -p $HOME/.gem 33 | touch $HOME/.gem/credentials 34 | chmod 0600 $HOME/.gem/credentials 35 | printf -- "---\n:rubygems_api_key: ${RUBYGEMS_API_KEY}\n" > $HOME/.gem/credentials 36 | ./gradlew gemPush 37 | env: 38 | RUBYGEMS_API_KEY: ${{secrets.RUBYGEMS_API_KEY}} 39 | -------------------------------------------------------------------------------- /example/with_catalog.yml: -------------------------------------------------------------------------------- 1 | 2 | in: 3 | type: file 4 | path_prefix: ./example/data.tsv 5 | parser: 6 | type: csv 7 | delimiter: "\t" 8 | skip_header_lines: 0 9 | null_string: "" 10 | columns: 11 | - { name: id, type: long } 12 | - { name: description, type: string } 13 | - { name: name, type: string } 14 | - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z"} 15 | - { name: payload, type: json} 16 | stop_on_invalid_record: true 17 | 18 | out: 19 | type: s3_parquet 20 | bucket: example 21 | region: us-east-1 22 | endpoint: http://127.0.0.1:4566 23 | path_prefix: path/to/my-obj-2. 
24 | file_ext: snappy.parquet 25 | compression_codec: snappy 26 | default_timezone: Asia/Tokyo 27 | canned_acl: bucket-owner-full-control 28 | column_options: 29 | id: 30 | logical_type: "int64" 31 | payload: 32 | logical_type: "json" 33 | type_options: 34 | timestamp: 35 | logical_type: "timestamp-millis" 36 | catalog: 37 | database: example_db 38 | table: example_tbl 39 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.aws 2 | 3 | import java.util.Optional 4 | 5 | import com.amazonaws.ClientConfiguration 6 | import com.amazonaws.client.builder.AwsClientBuilder 7 | import org.embulk.config.{Config, ConfigDefault} 8 | import org.embulk.output.s3_parquet.aws.AwsClientConfiguration.Task 9 | 10 | object AwsClientConfiguration { 11 | 12 | trait Task { 13 | 14 | @Config("http_proxy") 15 | @ConfigDefault("null") 16 | def getHttpProxy: Optional[HttpProxy.Task] 17 | 18 | } 19 | 20 | def apply(task: Task): AwsClientConfiguration = { 21 | new AwsClientConfiguration(task) 22 | } 23 | } 24 | 25 | class AwsClientConfiguration(task: Task) { 26 | 27 | def configureAwsClientBuilder[S <: AwsClientBuilder[S, T], T]( 28 | builder: AwsClientBuilder[S, T] 29 | ): Unit = { 30 | task.getHttpProxy.ifPresent { v => 31 | val cc = new ClientConfiguration 32 | HttpProxy(v).configureClientConfiguration(cc) 33 | builder.setClientConfiguration(cc) 34 | } 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 Takahiro Nakayama 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.aws 2 | 3 | import java.util.Optional 4 | 5 | import com.amazonaws.{ClientConfiguration, Protocol} 6 | import org.embulk.config.{Config, ConfigDefault, ConfigException} 7 | import org.embulk.output.s3_parquet.aws.HttpProxy.Task 8 | 9 | object HttpProxy { 10 | 11 | trait Task { 12 | 13 | @Config("host") 14 | @ConfigDefault("null") 15 | def getHost: Optional[String] 16 | 17 | @Config("port") 18 | @ConfigDefault("null") 19 | def getPort: Optional[Int] 20 | 21 | @Config("protocol") 22 | @ConfigDefault("\"https\"") 23 | def getProtocol: String 24 | 25 | @Config("user") 26 | @ConfigDefault("null") 27 | def getUser: Optional[String] 28 | 29 | @Config("password") 30 | @ConfigDefault("null") 31 | def getPassword: Optional[String] 32 | 33 | } 34 | 35 | def apply(task: Task): HttpProxy = { 36 | new HttpProxy(task) 37 | } 38 | 39 | } 40 | 41 | class HttpProxy(task: Task) { 42 | 43 | def configureClientConfiguration(cc: ClientConfiguration): Unit = { 44 | task.getHost.ifPresent(v => cc.setProxyHost(v)) 45 | task.getPort.ifPresent(v => cc.setProxyPort(v)) 46 | 47 | Protocol.values.find(p => p.name().equals(task.getProtocol)) match { 48 | case Some(v) => 49 | cc.setProtocol(v) 50 | case None => 51 | throw new ConfigException( 52 | s"'${task.getProtocol}' is unsupported: `protocol` must be one of [${Protocol.values 53 | .map(v => s"'$v'") 54 | .mkString(", ")}]." 55 | ) 56 | } 57 | 58 | task.getUser.ifPresent(v => cc.setProxyUsername(v)) 59 | task.getPassword.ifPresent(v => cc.setProxyPassword(v)) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.aws 2 | 3 | import java.util.Optional 4 | 5 | import com.amazonaws.client.builder.AwsClientBuilder 6 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration 7 | import com.amazonaws.regions.{DefaultAwsRegionProviderChain, Regions} 8 | import org.embulk.config.{Config, ConfigDefault} 9 | import org.embulk.output.s3_parquet.aws.AwsEndpointConfiguration.Task 10 | 11 | import scala.util.Try 12 | 13 | object AwsEndpointConfiguration { 14 | 15 | trait Task { 16 | 17 | @Config("endpoint") 18 | @ConfigDefault("null") 19 | def getEndpoint: Optional[String] 20 | 21 | @Config("region") 22 | @ConfigDefault("null") 23 | def getRegion: Optional[String] 24 | 25 | } 26 | 27 | def apply(task: Task): AwsEndpointConfiguration = { 28 | new AwsEndpointConfiguration(task) 29 | } 30 | } 31 | 32 | class AwsEndpointConfiguration(task: Task) { 33 | 34 | def configureAwsClientBuilder[S <: AwsClientBuilder[S, T], T]( 35 | builder: AwsClientBuilder[S, T] 36 | ): Unit = { 37 | if (task.getRegion.isPresent && task.getEndpoint.isPresent) { 38 | val ec = 39 | new EndpointConfiguration(task.getEndpoint.get, task.getRegion.get) 40 | builder.setEndpointConfiguration(ec) 41 | } 42 | else if (task.getRegion.isPresent && !task.getEndpoint.isPresent) { 43 | builder.setRegion(task.getRegion.get) 44 | } 45 | else if (!task.getRegion.isPresent && task.getEndpoint.isPresent) { 46 | val r: String = Try(new DefaultAwsRegionProviderChain().getRegion) 47 | 
.getOrElse(Regions.DEFAULT_REGION.getName) 48 | val e: String = task.getEndpoint.get 49 | val ec = new EndpointConfiguration(e, r) 50 | builder.setEndpointConfiguration(ec) 51 | } 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.aws 2 | 3 | import com.amazonaws.client.builder.AwsClientBuilder 4 | import com.amazonaws.services.glue.{AWSGlue, AWSGlueClientBuilder} 5 | import com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder} 6 | import com.amazonaws.services.s3.transfer.{ 7 | TransferManager, 8 | TransferManagerBuilder 9 | } 10 | 11 | object Aws { 12 | 13 | trait Task 14 | extends AwsCredentials.Task 15 | with AwsEndpointConfiguration.Task 16 | with AwsClientConfiguration.Task 17 | with AwsS3Configuration.Task 18 | 19 | def apply(task: Task): Aws = { 20 | new Aws(task) 21 | } 22 | 23 | } 24 | 25 | class Aws(task: Aws.Task) { 26 | 27 | def withS3[A](f: AmazonS3 => A): A = { 28 | val builder: AmazonS3ClientBuilder = AmazonS3ClientBuilder.standard() 29 | AwsS3Configuration(task).configureAmazonS3ClientBuilder(builder) 30 | val svc = createService(builder) 31 | try f(svc) 32 | finally svc.shutdown() 33 | } 34 | 35 | def withTransferManager[A](f: TransferManager => A): A = { 36 | withS3 { s3 => 37 | val svc = TransferManagerBuilder.standard().withS3Client(s3).build() 38 | try f(svc) 39 | finally svc.shutdownNow(false) 40 | } 41 | } 42 | 43 | def withGlue[A](f: AWSGlue => A): A = { 44 | val builder: AWSGlueClientBuilder = AWSGlueClientBuilder.standard() 45 | val svc = createService(builder) 46 | try f(svc) 47 | finally svc.shutdown() 48 | } 49 | 50 | def createService[S <: AwsClientBuilder[S, T], T]( 51 | builder: AwsClientBuilder[S, T] 52 | ): T = { 53 | AwsEndpointConfiguration(task).configureAwsClientBuilder(builder) 54 | AwsClientConfiguration(task).configureAwsClientBuilder(builder) 55 | builder.setCredentials(AwsCredentials(task).createAwsCredentialsProvider) 56 | 57 | builder.build() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/implicits.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import java.util.{Optional, Iterator => JIterator, List => JList, Map => JMap} 4 | 5 | import com.google.common.base.{Optional => GoogleOptional} 6 | 7 | import scala.jdk.CollectionConverters._ 8 | import scala.language.implicitConversions 9 | 10 | case object implicits { 11 | implicit def JList2Seq[A](a: JList[A]): Seq[A] = a.asScala.toSeq 12 | implicit def Seq2JList[A](a: Seq[A]): JList[A] = a.asJava 13 | implicit def JIte2Ite[A](a: JIterator[A]): Iterator[A] = a.asScala 14 | implicit def Ite2JIte[A](a: Iterator[A]): JIterator[A] = a.asJava 15 | 16 | implicit def OptionalJList2OptionSeq[A]( 17 | a: Optional[JList[A]] 18 | ): Option[Seq[A]] = a.map(JList2Seq(_)) 19 | 20 | implicit def OptionSeq2OptionalJList[A]( 21 | a: Option[Seq[A]] 22 | ): Optional[JList[A]] = a.map(Seq2JList) 23 | implicit def JMap2Map[K, V](a: JMap[K, V]): Map[K, V] = a.asScala.toMap 24 | implicit def Map2JMap[K, V](a: Map[K, V]): JMap[K, V] = a.asJava 25 | 26 | implicit def OptionalJMap2OptionMap[K, V]( 27 | a: Optional[JMap[K, V]] 28 | ): Option[Map[K, V]] = a.map(JMap2Map(_)) 29 | 30 | implicit def 
OptionMap2Optional2JMap[K, V]( 31 | a: Option[Map[K, V]] 32 | ): Optional[JMap[K, V]] = a.map(Map2JMap) 33 | 34 | implicit def Optional2Option[A](a: Optional[A]): Option[A] = 35 | if (a.isPresent) Some(a.get()) else None 36 | 37 | implicit def Option2Optional[A](a: Option[A]): Optional[A] = a match { 38 | case Some(v) => Optional.of(v) 39 | case None => Optional.empty() 40 | } 41 | 42 | implicit def GoogleOptional2Option[A](a: GoogleOptional[A]): Option[A] = 43 | Option(a.orNull()) 44 | 45 | implicit def Option2GoogleOptional[A](a: Option[A]): GoogleOptional[A] = 46 | a match { 47 | case Some(v) => GoogleOptional.of(v) 48 | case None => GoogleOptional.absent() 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPluginConfigException.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import org.embulk.config.ConfigException 4 | import org.embulk.spi.Schema 5 | import org.embulk.spi.`type`.Types 6 | 7 | class TestS3ParquetOutputPluginConfigException extends EmbulkPluginTestHelper { 8 | 9 | test( 10 | "Throw ConfigException when un-convertible types are defined in type_options" 11 | ) { 12 | val schema = Schema.builder().add("c0", Types.STRING).build() 13 | val data: Seq[Seq[String]] = Seq( 14 | Seq("a") 15 | ) 16 | val cfg = newDefaultConfig.merge( 17 | loadConfigSourceFromYamlString(""" 18 | |type_options: 19 | | string: 20 | | logical_type: "timestamp-millis" 21 | |""".stripMargin) 22 | ) 23 | val caught = intercept[ConfigException](runOutput(cfg, schema, data)) 24 | assert(caught.isInstanceOf[ConfigException]) 25 | assert(caught.getMessage.startsWith("Unsupported column type: ")) 26 | } 27 | 28 | test( 29 | "Throw ConfigException when un-convertible types are defined in column_options" 30 | ) { 31 | val schema = Schema.builder().add("c0", Types.STRING).build() 32 | val data: Seq[Seq[String]] = Seq( 33 | Seq("a") 34 | ) 35 | val cfg = newDefaultConfig.merge( 36 | loadConfigSourceFromYamlString(""" 37 | |column_options: 38 | | c0: 39 | | logical_type: "timestamp-millis" 40 | |""".stripMargin) 41 | ) 42 | val caught = intercept[ConfigException](runOutput(cfg, schema, data)) 43 | assert(caught.isInstanceOf[ConfigException]) 44 | assert(caught.getMessage.startsWith("Unsupported column type: ")) 45 | 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import java.io.File 4 | import java.nio.file.{Files, Paths} 5 | 6 | import com.amazonaws.services.s3.transfer.{TransferManager, Upload} 7 | import com.amazonaws.services.s3.transfer.model.UploadResult 8 | import org.apache.parquet.hadoop.ParquetWriter 9 | import org.embulk.config.TaskReport 10 | import org.embulk.output.s3_parquet.aws.Aws 11 | import org.embulk.spi.{Exec, Page, PageReader, TransactionalPageOutput} 12 | 13 | case class S3ParquetPageOutput( 14 | outputLocalFile: String, 15 | reader: PageReader, 16 | writer: ParquetWriter[PageReader], 17 | aws: Aws, 18 | destBucket: String, 19 | destKey: String 20 | ) extends TransactionalPageOutput { 21 | 22 | private var isClosed: Boolean = false 23 | 24 | override def add(page: Page): Unit = { 25 | reader.setPage(page) 26 | while 
(reader.nextRecord()) { 27 | ContextClassLoaderSwapper.usingPluginClass { 28 | writer.write(reader) 29 | } 30 | } 31 | } 32 | 33 | override def finish(): Unit = {} 34 | 35 | override def close(): Unit = { 36 | synchronized { 37 | if (!isClosed) { 38 | ContextClassLoaderSwapper.usingPluginClass { 39 | writer.close() 40 | } 41 | isClosed = true 42 | } 43 | } 44 | } 45 | 46 | override def abort(): Unit = { 47 | close() 48 | cleanup() 49 | } 50 | 51 | override def commit(): TaskReport = { 52 | close() 53 | val result: UploadResult = ContextClassLoaderSwapper.usingPluginClass { 54 | aws.withTransferManager { xfer: TransferManager => 55 | val upload: Upload = 56 | xfer.upload(destBucket, destKey, new File(outputLocalFile)) 57 | upload.waitForUploadResult() 58 | } 59 | } 60 | cleanup() 61 | Exec 62 | .newTaskReport() 63 | .set("bucket", result.getBucketName) 64 | .set("key", result.getKey) 65 | .set("etag", result.getETag) 66 | .set("version_id", result.getVersionId) 67 | } 68 | 69 | private def cleanup(): Unit = { 70 | Files.delete(Paths.get(outputLocalFile)) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.aws 2 | 3 | import java.util.Optional 4 | 5 | import com.amazonaws.services.s3.AmazonS3ClientBuilder 6 | import org.embulk.config.{Config, ConfigDefault} 7 | import org.embulk.output.s3_parquet.aws.AwsS3Configuration.Task 8 | 9 | /* 10 | * These are advanced settings, so write no documentation. 11 | */ 12 | object AwsS3Configuration { 13 | 14 | trait Task { 15 | 16 | @Config("accelerate_mode_enabled") 17 | @ConfigDefault("null") 18 | def getAccelerateModeEnabled: Optional[Boolean] 19 | 20 | @Config("chunked_encoding_disabled") 21 | @ConfigDefault("null") 22 | def getChunkedEncodingDisabled: Optional[Boolean] 23 | 24 | @Config("dualstack_enabled") 25 | @ConfigDefault("null") 26 | def getDualstackEnabled: Optional[Boolean] 27 | 28 | @Config("force_global_bucket_access_enabled") 29 | @ConfigDefault("null") 30 | def getForceGlobalBucketAccessEnabled: Optional[Boolean] 31 | 32 | @Config("path_style_access_enabled") 33 | @ConfigDefault("null") 34 | def getPathStyleAccessEnabled: Optional[Boolean] 35 | 36 | @Config("payload_signing_enabled") 37 | @ConfigDefault("null") 38 | def getPayloadSigningEnabled: Optional[Boolean] 39 | 40 | } 41 | 42 | def apply(task: Task): AwsS3Configuration = { 43 | new AwsS3Configuration(task) 44 | } 45 | } 46 | 47 | class AwsS3Configuration(task: Task) { 48 | 49 | def configureAmazonS3ClientBuilder(builder: AmazonS3ClientBuilder): Unit = { 50 | task.getAccelerateModeEnabled.ifPresent(v => 51 | builder.setAccelerateModeEnabled(v) 52 | ) 53 | task.getChunkedEncodingDisabled.ifPresent(v => 54 | builder.setChunkedEncodingDisabled(v) 55 | ) 56 | task.getDualstackEnabled.ifPresent(v => builder.setDualstackEnabled(v)) 57 | task.getForceGlobalBucketAccessEnabled.ifPresent(v => 58 | builder.setForceGlobalBucketAccessEnabled(v) 59 | ) 60 | task.getPathStyleAccessEnabled.ifPresent(v => 61 | builder.setPathStyleAccessEnabled(v) 62 | ) 63 | task.getPayloadSigningEnabled.ifPresent(v => 64 | builder.setPayloadSigningEnabled(v) 65 | ) 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/MockParquetRecordConsumer.scala: 
-------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.io.api.{Binary, RecordConsumer} 4 | 5 | case class MockParquetRecordConsumer() extends RecordConsumer { 6 | case class Data private (messages: Seq[Message] = Seq()) { 7 | def toData: Seq[Seq[Any]] = messages.map(_.toData) 8 | } 9 | case class Message private (fields: Seq[Field] = Seq()) { 10 | def toData: Seq[Any] = { 11 | val maxIndex: Int = fields.maxBy(_.index).index 12 | val raw: Map[Int, Any] = fields.map(f => f.index -> f.value).toMap 13 | 0.to(maxIndex).map(idx => raw.get(idx).orNull) 14 | } 15 | } 16 | case class Field private (index: Int = 0, value: Any = null) 17 | 18 | private var _data: Data = Data() 19 | private var _message: Message = Message() 20 | private var _field: Field = Field() 21 | 22 | override def startMessage(): Unit = _message = Message() 23 | override def endMessage(): Unit = 24 | _data = _data.copy(messages = _data.messages :+ _message) 25 | override def startField(field: String, index: Int): Unit = 26 | _field = Field(index = index) 27 | override def endField(field: String, index: Int): Unit = 28 | _message = _message.copy(fields = _message.fields :+ _field) 29 | override def startGroup(): Unit = throw new UnsupportedOperationException 30 | override def endGroup(): Unit = throw new UnsupportedOperationException 31 | override def addInteger(value: Int): Unit = 32 | _field = _field.copy(value = value) 33 | override def addLong(value: Long): Unit = _field = _field.copy(value = value) 34 | override def addBoolean(value: Boolean): Unit = 35 | _field = _field.copy(value = value) 36 | override def addBinary(value: Binary): Unit = 37 | _field = _field.copy(value = value) 38 | override def addFloat(value: Float): Unit = 39 | _field = _field.copy(value = value) 40 | override def addDouble(value: Double): Unit = 41 | _field = _field.copy(value = value) 42 | 43 | def writingMessage(f: => Unit): Unit = { 44 | startMessage() 45 | f 46 | endMessage() 47 | } 48 | def writingField(field: String, index: Int)(f: => Unit): Unit = { 49 | startField(field, index) 50 | f 51 | endField(field, index) 52 | } 53 | def writingSampleField(f: => Unit): Unit = { 54 | writingMessage { 55 | writingField("a", 0)(f) 56 | } 57 | } 58 | def data: Seq[Seq[Any]] = _data.toData 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/DateLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.{Duration, Instant} 4 | 5 | import org.apache.parquet.io.api.RecordConsumer 6 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 7 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 8 | import org.embulk.config.ConfigException 9 | import org.embulk.output.s3_parquet.catalog.GlueDataType 10 | import org.embulk.spi.`type`.{ 11 | BooleanType, 12 | DoubleType, 13 | JsonType, 14 | LongType, 15 | StringType, 16 | TimestampType 17 | } 18 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 19 | import org.embulk.spi.Column 20 | import org.msgpack.value.Value 21 | 22 | object DateLogicalType extends ParquetColumnType { 23 | override def primitiveType(column: Column): PrimitiveType = { 24 | column.getType match { 25 | case _: LongType | _: TimestampType => 26 | Types 27 | 
.optional(PrimitiveTypeName.INT32) 28 | .as(LogicalTypeAnnotation.dateType()) 29 | .named(column.getName) 30 | case _: BooleanType | _: DoubleType | _: StringType | _: JsonType | _ => 31 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 32 | } 33 | } 34 | 35 | override def glueDataType(column: Column): GlueDataType = 36 | column.getType match { 37 | case _: LongType | _: TimestampType => GlueDataType.DATE 38 | case _: BooleanType | _: DoubleType | _: StringType | _: JsonType | _ => 39 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 40 | } 41 | 42 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 43 | throw newUnsupportedMethodException("consumeBoolean") 44 | 45 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 46 | throw newUnsupportedMethodException("consumeString") 47 | 48 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 49 | consumeLongAsInteger(consumer, v) 50 | 51 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 52 | throw newUnsupportedMethodException("consumeDouble") 53 | 54 | override def consumeTimestamp( 55 | consumer: RecordConsumer, 56 | v: Timestamp, 57 | formatter: TimestampFormatter 58 | ): Unit = 59 | consumeLongAsInteger( 60 | consumer, 61 | Duration.between(Instant.EPOCH, v.getInstant).toDays 62 | ) 63 | 64 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 65 | throw newUnsupportedMethodException("consumeJson") 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/JsonLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | import org.apache.parquet.io.api.{Binary, RecordConsumer} 3 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 4 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 5 | import org.embulk.config.ConfigException 6 | import org.embulk.output.s3_parquet.catalog.GlueDataType 7 | import org.embulk.spi.Column 8 | import org.embulk.spi.`type`.{ 9 | BooleanType, 10 | DoubleType, 11 | JsonType, 12 | LongType, 13 | StringType, 14 | TimestampType 15 | } 16 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 17 | import org.msgpack.value.{Value, ValueFactory} 18 | import org.slf4j.{Logger, LoggerFactory} 19 | 20 | object JsonLogicalType extends ParquetColumnType { 21 | private val logger: Logger = LoggerFactory.getLogger(JsonLogicalType.getClass) 22 | override def primitiveType(column: Column): PrimitiveType = 23 | column.getType match { 24 | case _: BooleanType | _: LongType | _: DoubleType | _: StringType | 25 | _: JsonType => 26 | Types 27 | .optional(PrimitiveTypeName.BINARY) 28 | .as(LogicalTypeAnnotation.jsonType()) 29 | .named(column.getName) 30 | case _: TimestampType | _ => 31 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 32 | } 33 | 34 | override def glueDataType(column: Column): GlueDataType = 35 | column.getType match { 36 | case _: BooleanType | _: LongType | _: DoubleType | _: StringType | 37 | _: JsonType => 38 | warningWhenConvertingJsonToGlueType(GlueDataType.STRING) 39 | GlueDataType.STRING 40 | case _: TimestampType | _ => 41 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 42 | } 43 | 44 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 45 | consumeJson(consumer, 
ValueFactory.newBoolean(v)) 46 | 47 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 48 | consumeJson(consumer, ValueFactory.newString(v)) 49 | 50 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 51 | consumeJson(consumer, ValueFactory.newInteger(v)) 52 | 53 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 54 | consumeJson(consumer, ValueFactory.newFloat(v)) 55 | 56 | override def consumeTimestamp( 57 | consumer: RecordConsumer, 58 | v: Timestamp, 59 | formatter: TimestampFormatter 60 | ): Unit = throw newUnsupportedMethodException("consumeTimestamp") 61 | 62 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 63 | consumer.addBinary(Binary.fromString(v.toJson)) 64 | 65 | private def warningWhenConvertingJsonToGlueType( 66 | glueType: GlueDataType 67 | ): Unit = { 68 | logger.warn( 69 | s"json is converted" + 70 | s" to Glue ${glueType.name} but this is not represented correctly, because Glue" + 71 | s" does not support json type. Please use `catalog.column_options` to define the type." 72 | ) 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/DefaultColumnType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.io.api.{Binary, RecordConsumer} 4 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 5 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 6 | import org.embulk.config.ConfigException 7 | import org.embulk.output.s3_parquet.catalog.GlueDataType 8 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 9 | import org.embulk.spi.Column 10 | import org.embulk.spi.`type`.{ 11 | BooleanType, 12 | DoubleType, 13 | JsonType, 14 | LongType, 15 | StringType, 16 | TimestampType 17 | } 18 | import org.msgpack.value.Value 19 | 20 | object DefaultColumnType extends ParquetColumnType { 21 | override def primitiveType(column: Column): PrimitiveType = 22 | column.getType match { 23 | case _: BooleanType => 24 | Types.optional(PrimitiveTypeName.BOOLEAN).named(column.getName) 25 | case _: LongType => 26 | Types.optional(PrimitiveTypeName.INT64).named(column.getName) 27 | case _: DoubleType => 28 | Types.optional(PrimitiveTypeName.DOUBLE).named(column.getName) 29 | case _: StringType => 30 | Types 31 | .optional(PrimitiveTypeName.BINARY) 32 | .as(LogicalTypeAnnotation.stringType()) 33 | .named(column.getName) 34 | case _: TimestampType => 35 | Types 36 | .optional(PrimitiveTypeName.BINARY) 37 | .as(LogicalTypeAnnotation.stringType()) 38 | .named(column.getName) 39 | case _: JsonType => 40 | Types 41 | .optional(PrimitiveTypeName.BINARY) 42 | .as(LogicalTypeAnnotation.stringType()) 43 | .named(column.getName) 44 | case _ => 45 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 46 | } 47 | 48 | override def glueDataType(column: Column): GlueDataType = 49 | column.getType match { 50 | case _: BooleanType => 51 | GlueDataType.BOOLEAN 52 | case _: LongType => 53 | GlueDataType.BIGINT 54 | case _: DoubleType => 55 | GlueDataType.DOUBLE 56 | case _: StringType | _: TimestampType | _: JsonType => 57 | GlueDataType.STRING 58 | case _ => 59 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 60 | } 61 | 62 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 63 | 
consumer.addBoolean(v) 64 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 65 | consumer.addBinary(Binary.fromString(v)) 66 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 67 | consumer.addLong(v) 68 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 69 | consumer.addDouble(v) 70 | override def consumeTimestamp( 71 | consumer: RecordConsumer, 72 | v: Timestamp, 73 | formatter: TimestampFormatter 74 | ): Unit = consumer.addBinary(Binary.fromString(formatter.format(v))) 75 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 76 | consumer.addBinary(Binary.fromString(v.toJson)) 77 | } 78 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto init 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto init 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :init 68 | @rem Get command-line arguments, handling Windows variants 69 | 70 | if not "%OS%" == "Windows_NT" goto win9xME_args 71 | 72 | :win9xME_args 73 | @rem Slurp the command line arguments. 
74 | set CMD_LINE_ARGS= 75 | set _SKIP=2 76 | 77 | :win9xME_args_slurp 78 | if "x%~1" == "x" goto execute 79 | 80 | set CMD_LINE_ARGS=%* 81 | 82 | :execute 83 | @rem Setup the command line 84 | 85 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 86 | 87 | @rem Execute Gradle 88 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 89 | 90 | :end 91 | @rem End local scope for the variables with windows NT shell 92 | if "%ERRORLEVEL%"=="0" goto mainEnd 93 | 94 | :fail 95 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 96 | rem the _cmd.exe /c_ return code! 97 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 98 | exit /b 1 99 | 100 | :mainEnd 101 | if "%OS%"=="Windows_NT" endlocal 102 | 103 | :omega 104 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeProxy.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.ZoneId 4 | import java.util.Locale 5 | 6 | import org.apache.parquet.io.api.RecordConsumer 7 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit 8 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS 9 | import org.apache.parquet.schema.PrimitiveType 10 | import org.embulk.config.ConfigException 11 | import org.embulk.output.s3_parquet.catalog.GlueDataType 12 | import org.embulk.spi.Column 13 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 14 | import org.msgpack.value.Value 15 | 16 | object LogicalTypeProxy { 17 | private val DEFAULT_SCALE: Int = 0 18 | private val DEFAULT_BID_WIDTH: Int = 64 19 | private val DEFAULT_IS_SIGNED: Boolean = true 20 | private val DEFAULT_IS_ADJUSTED_TO_UTC: Boolean = true 21 | private val DEFAULT_TIME_UNIT: TimeUnit = MILLIS 22 | private val DEFAULT_TIME_ZONE: ZoneId = ZoneId.of("UTC") 23 | } 24 | 25 | case class LogicalTypeProxy( 26 | name: String, 27 | scale: Option[Int] = None, 28 | precision: Option[Int] = None, 29 | bitWidth: Option[Int] = None, 30 | isSigned: Option[Boolean] = None, 31 | isAdjustedToUtc: Option[Boolean] = None, 32 | timeUnit: Option[TimeUnit] = None, 33 | timeZone: Option[ZoneId] = None 34 | ) extends ParquetColumnType { 35 | private def getScale: Int = scale.getOrElse(LogicalTypeProxy.DEFAULT_SCALE) 36 | private def getPrecision: Int = precision.getOrElse { 37 | throw new ConfigException("\"precision\" must be set.") 38 | } 39 | private def getBidWith: Int = 40 | bitWidth.getOrElse(LogicalTypeProxy.DEFAULT_BID_WIDTH) 41 | private def getIsSigned: Boolean = 42 | isSigned.getOrElse(LogicalTypeProxy.DEFAULT_IS_SIGNED) 43 | private def getIsAdjustedToUtc: Boolean = 44 | isAdjustedToUtc.getOrElse(LogicalTypeProxy.DEFAULT_IS_ADJUSTED_TO_UTC) 45 | private def getTimeUnit: TimeUnit = 46 | timeUnit.getOrElse(LogicalTypeProxy.DEFAULT_TIME_UNIT) 47 | private def getTimeZone: ZoneId = 48 | timeZone.getOrElse(LogicalTypeProxy.DEFAULT_TIME_ZONE) 49 | 50 | lazy val logicalType: ParquetColumnType = { 51 | name.toUpperCase(Locale.ENGLISH) match { 52 | case "INT" => IntLogicalType(getBidWith, getIsSigned) 53 | case "TIMESTAMP" => 54 | TimestampLogicalType(getIsAdjustedToUtc, getTimeUnit, getTimeZone) 55 | case "TIME" => 56 | TimeLogicalType(getIsAdjustedToUtc, getTimeUnit, getTimeZone) 57 | case "DECIMAL" => 
DecimalLogicalType(getScale, getPrecision) 58 | case "DATE" => DateLogicalType 59 | case "JSON" => JsonLogicalType 60 | case _ => 61 | throw new ConfigException(s"Unsupported logical_type.name: $name.") 62 | } 63 | } 64 | 65 | override def primitiveType(column: Column): PrimitiveType = 66 | logicalType.primitiveType(column) 67 | override def glueDataType(column: Column): GlueDataType = 68 | logicalType.glueDataType(column) 69 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 70 | logicalType.consumeBoolean(consumer, v) 71 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 72 | logicalType.consumeString(consumer, v) 73 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 74 | logicalType.consumeLong(consumer, v) 75 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 76 | logicalType.consumeDouble(consumer, v) 77 | override def consumeTimestamp( 78 | consumer: RecordConsumer, 79 | v: Timestamp, 80 | formatter: TimestampFormatter 81 | ): Unit = logicalType.consumeTimestamp(consumer, v, formatter) 82 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 83 | logicalType.consumeJson(consumer, v) 84 | } 85 | -------------------------------------------------------------------------------- /gradle/dependency-locks/embulkPluginRuntime.lockfile: -------------------------------------------------------------------------------- 1 | # This is a Gradle generated file for dependency locking. 2 | # Manual edits can break the build and are not advised. 3 | # This file is expected to be part of source control. 4 | asm:asm:3.1 5 | ch.qos.reload4j:reload4j:1.2.19 6 | com.amazonaws:aws-java-sdk-core:1.11.769 7 | com.amazonaws:aws-java-sdk-glue:1.11.769 8 | com.amazonaws:aws-java-sdk-kms:1.11.769 9 | com.amazonaws:aws-java-sdk-s3:1.11.769 10 | com.amazonaws:aws-java-sdk-sts:1.11.769 11 | com.amazonaws:jmespath-java:1.11.769 12 | com.fasterxml.jackson.dataformat:jackson-dataformat-cbor:2.6.7 13 | com.fasterxml.woodstox:woodstox-core:5.3.0 14 | com.github.spotbugs:spotbugs-annotations:3.1.9 15 | com.github.stephenc.jcip:jcip-annotations:1.0-1 16 | com.google.code.findbugs:jsr305:3.0.2 17 | com.google.code.gson:gson:2.2.4 18 | com.google.protobuf:protobuf-java:2.5.0 19 | com.jamesmurty.utils:java-xmlbuilder:0.4 20 | com.jcraft:jsch:0.1.55 21 | com.nimbusds:nimbus-jose-jwt:7.9 22 | com.sun.jersey:jersey-core:1.9 23 | com.sun.jersey:jersey-json:1.9 24 | com.sun.jersey:jersey-server:1.9 25 | com.sun.xml.bind:jaxb-impl:2.2.3-1 26 | com.thoughtworks.paranamer:paranamer:2.3 27 | commons-beanutils:commons-beanutils:1.9.4 28 | commons-cli:commons-cli:1.2 29 | commons-codec:commons-codec:1.11 30 | commons-collections:commons-collections:3.2.2 31 | commons-configuration:commons-configuration:1.6 32 | commons-digester:commons-digester:1.8 33 | commons-io:commons-io:2.5 34 | commons-lang:commons-lang:2.6 35 | commons-logging:commons-logging:1.2 36 | commons-net:commons-net:3.1 37 | commons-pool:commons-pool:1.6 38 | io.netty:netty:3.10.6.Final 39 | javax.activation:activation:1.1 40 | javax.annotation:javax.annotation-api:1.3.2 41 | javax.servlet.jsp:jsp-api:2.1 42 | javax.servlet:servlet-api:2.5 43 | javax.xml.bind:jaxb-api:2.2.2 44 | javax.xml.stream:stax-api:1.0-2 45 | jline:jline:0.9.94 46 | log4j:log4j:1.2.17 47 | net.java.dev.jets3t:jets3t:0.9.0 48 | net.minidev:accessors-smart:1.2 49 | net.minidev:json-smart:2.3 50 | org.apache.avro:avro:1.7.7 51 | org.apache.commons:commons-compress:1.21 52 | 
org.apache.commons:commons-math3:3.1.1 53 | org.apache.curator:curator-client:2.13.0 54 | org.apache.curator:curator-framework:2.13.0 55 | org.apache.curator:curator-recipes:2.13.0 56 | org.apache.directory.api:api-asn1-api:1.0.0-M20 57 | org.apache.directory.api:api-util:1.0.0-M20 58 | org.apache.directory.server:apacheds-i18n:2.0.0-M15 59 | org.apache.directory.server:apacheds-kerberos-codec:2.0.0-M15 60 | org.apache.hadoop:hadoop-annotations:2.10.2 61 | org.apache.hadoop:hadoop-auth:2.10.2 62 | org.apache.hadoop:hadoop-common:2.10.2 63 | org.apache.htrace:htrace-core4:4.1.0-incubating 64 | org.apache.httpcomponents:httpclient:4.5.13 65 | org.apache.httpcomponents:httpcore:4.4.13 66 | org.apache.parquet:parquet-column:1.11.0 67 | org.apache.parquet:parquet-common:1.11.0 68 | org.apache.parquet:parquet-encoding:1.11.0 69 | org.apache.parquet:parquet-format-structures:1.11.0 70 | org.apache.parquet:parquet-format:2.7.0 71 | org.apache.parquet:parquet-hadoop:1.11.0 72 | org.apache.parquet:parquet-jackson:1.11.0 73 | org.apache.yetus:audience-annotations:0.11.0 74 | org.apache.zookeeper:zookeeper:3.4.14 75 | org.codehaus.jackson:jackson-core-asl:1.9.13 76 | org.codehaus.jackson:jackson-jaxrs:1.8.3 77 | org.codehaus.jackson:jackson-mapper-asl:1.9.13 78 | org.codehaus.jackson:jackson-xc:1.8.3 79 | org.codehaus.jettison:jettison:1.1 80 | org.codehaus.woodstox:stax2-api:4.2.1 81 | org.mortbay.jetty:jetty-sslengine:6.1.26 82 | org.mortbay.jetty:jetty-util:6.1.26 83 | org.mortbay.jetty:jetty:6.1.26 84 | org.mortbay.jetty:servlet-api:2.5-20081211 85 | org.ow2.asm:asm:5.0.4 86 | org.scala-lang:scala-library:2.13.1 87 | org.slf4j:slf4j-log4j12:1.7.25 88 | org.slf4j:slf4j-reload4j:1.7.36 89 | org.xerial.snappy:snappy-java:1.1.7.3 90 | software.amazon.ion:ion-java:1.0.2 91 | xmlenc:xmlenc:0.52 92 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/TimestampLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.ZoneId 4 | 5 | import org.apache.parquet.io.api.RecordConsumer 6 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 7 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit 8 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.{ 9 | MICROS, 10 | MILLIS, 11 | NANOS 12 | } 13 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 14 | import org.embulk.config.ConfigException 15 | import org.embulk.output.s3_parquet.catalog.GlueDataType 16 | import org.embulk.spi.`type`.{ 17 | BooleanType, 18 | DoubleType, 19 | JsonType, 20 | LongType, 21 | StringType, 22 | TimestampType 23 | } 24 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 25 | import org.embulk.spi.Column 26 | import org.msgpack.value.Value 27 | import org.slf4j.{Logger, LoggerFactory} 28 | 29 | case class TimestampLogicalType( 30 | isAdjustedToUtc: Boolean, 31 | timeUnit: TimeUnit, 32 | timeZone: ZoneId 33 | ) extends ParquetColumnType { 34 | private val logger: Logger = 35 | LoggerFactory.getLogger(classOf[TimestampLogicalType]) 36 | 37 | override def primitiveType(column: Column): PrimitiveType = 38 | column.getType match { 39 | case _: LongType | _: TimestampType => 40 | Types 41 | .optional(PrimitiveTypeName.INT64) 42 | .as(LogicalTypeAnnotation.timestampType(isAdjustedToUtc, timeUnit)) 43 | .named(column.getName) 44 | case _: BooleanType | _: DoubleType | _: 
StringType | _: JsonType | _ => 45 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 46 | } 47 | 48 | override def glueDataType(column: Column): GlueDataType = 49 | column.getType match { 50 | case _: LongType | _: TimestampType => 51 | timeUnit match { 52 | case MILLIS => GlueDataType.TIMESTAMP 53 | case MICROS | NANOS => 54 | warningWhenConvertingTimestampToGlueType(GlueDataType.BIGINT) 55 | GlueDataType.BIGINT 56 | } 57 | case _: BooleanType | _: DoubleType | _: StringType | _: JsonType | _ => 58 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 59 | } 60 | 61 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 62 | throw newUnsupportedMethodException("consumeBoolean") 63 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 64 | throw newUnsupportedMethodException("consumeString") 65 | 66 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 67 | consumer.addLong(v) 68 | 69 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 70 | throw newUnsupportedMethodException("consumeDouble") 71 | 72 | override def consumeTimestamp( 73 | consumer: RecordConsumer, 74 | v: Timestamp, 75 | formatter: TimestampFormatter 76 | ): Unit = timeUnit match { 77 | case MILLIS => consumer.addLong(v.toEpochMilli) 78 | case MICROS => 79 | consumer.addLong(v.getEpochSecond * 1_000_000L + (v.getNano / 1_000L)) 80 | case NANOS => 81 | consumer.addLong(v.getEpochSecond * 1_000_000_000L + v.getNano) 82 | } 83 | 84 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 85 | throw newUnsupportedMethodException("consumeJson") 86 | 87 | private def warningWhenConvertingTimestampToGlueType( 88 | glueType: GlueDataType 89 | ): Unit = 90 | logger.warn( 91 | s"timestamp(isAdjustedToUtc = $isAdjustedToUtc, timeUnit = $timeUnit) is converted" + 92 | s" to Glue ${glueType.name} but this is not represented correctly, because Glue" + 93 | s" does not support time type. Please use `catalog.column_options` to define the type." 94 | ) 95 | } 96 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/catalog/GlueDataType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.catalog 2 | 3 | // https://docs.aws.amazon.com/athena/latest/ug/data-types.html 4 | 5 | sealed abstract class GlueDataType(val name: String) 6 | object GlueDataType { 7 | sealed abstract class AbstractIntGlueDataType(name: String, val bitWidth: Int) 8 | extends GlueDataType(name) 9 | 10 | // BOOLEAN – Values are true and false. 11 | case object BOOLEAN extends GlueDataType("boolean") 12 | // TINYINT – A 8-bit signed INTEGER in two’s complement format, with a minimum value of -27 and a maximum value of 27-1. 13 | case object TINYINT extends AbstractIntGlueDataType("tinyint", bitWidth = 8) 14 | // SMALLINT – A 16-bit signed INTEGER in two’s complement format, with a minimum value of -215 and a maximum value of 215-1. 15 | case object SMALLINT 16 | extends AbstractIntGlueDataType("smallint", bitWidth = 16) 17 | // INT and INTEGER – Athena combines two different implementations of the integer data type, as follows: 18 | // * INT – In Data Definition Language (DDL) queries, Athena uses the INT data type. 19 | // * INTEGER – In DML queries, Athena uses the INTEGER data type. 
INTEGER is represented as a 32-bit signed value in two's complement format, with a minimum value of -2^31 and a maximum value of 2^31-1. 20 | case object INT extends AbstractIntGlueDataType("int", bitWidth = 32) 21 | // BIGINT – A 64-bit signed INTEGER in two’s complement format, with a minimum value of -2^63 and a maximum value of 2^63-1. 22 | case object BIGINT extends AbstractIntGlueDataType("bigint", bitWidth = 64) 23 | // DOUBLE – A 64-bit double-precision floating point number. 24 | case object DOUBLE extends GlueDataType("double") 25 | // FLOAT – A 32-bit single-precision floating point number. Equivalent to the REAL in Presto. 26 | case object FLOAT extends GlueDataType("float") 27 | // DECIMAL(precision, scale) – precision is the total number of digits. scale (optional) is the number of digits in fractional part with a default of 0. For example, use these type definitions: DECIMAL(11,5), DECIMAL(15). 28 | case class DECIMAL(precision: Int, scale: Int) 29 | extends GlueDataType(s"decimal($precision,$scale)") 30 | // STRING – A string literal enclosed in single or double quotes. For more information, see STRING Hive Data Type. 31 | case object STRING extends GlueDataType("string") 32 | // CHAR – Fixed length character data, with a specified length between 1 and 255, such as char(10). For more information, see CHAR Hive Data Type. 33 | case class CHAR(length: Int) extends GlueDataType(s"char($length)") 34 | // VARCHAR – Variable length character data, with a specified length between 1 and 65535, such as varchar(10). For more information, see VARCHAR Hive Data Type. 35 | case class VARCHAR(length: Int) extends GlueDataType(s"varchar($length)") 36 | // BINARY – Used for data in Parquet. 37 | case object BINARY extends GlueDataType("binary") 38 | // DATE – A date in UNIX format, such as YYYY-MM-DD. 39 | case object DATE extends GlueDataType("date") 40 | // TIMESTAMP – Date and time instant in the UNIX format, such as yyyy-mm-dd hh:mm:ss[.f...]. For example, TIMESTAMP '2008-09-15 03:04:05.324'. This format uses the session time zone.
41 | case object TIMESTAMP extends GlueDataType("timestamp") 42 | // ARRAY 43 | case class ARRAY(dataType: GlueDataType) 44 | extends GlueDataType(s"array<${dataType.name}>") 45 | // MAP 46 | case class MAP(keyDataType: GlueDataType, valueDataType: GlueDataType) 47 | extends GlueDataType(s"map<${keyDataType.name},${valueDataType.name}>") 48 | // STRUCT 49 | case class STRUCT(struct: Map[String, GlueDataType]) 50 | extends GlueDataType({ 51 | val columns = struct 52 | .map { 53 | case (columnName, glueType) => s"$columnName : ${glueType.name}" 54 | } 55 | s"struct<${columns.mkString(",")}>" 56 | }) 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/PluginTask.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import java.util.{Locale, MissingFormatArgumentException, Optional} 4 | 5 | import com.amazonaws.services.s3.model.CannedAccessControlList 6 | import org.apache.parquet.hadoop.metadata.CompressionCodecName 7 | import org.embulk.config.{ 8 | Config, 9 | ConfigDefault, 10 | ConfigException, 11 | ConfigSource, 12 | Task, 13 | TaskSource 14 | } 15 | import org.embulk.output.s3_parquet.aws.Aws 16 | import org.embulk.output.s3_parquet.catalog.CatalogRegistrator 17 | import org.embulk.output.s3_parquet.parquet.ParquetFileWriteSupport 18 | 19 | trait PluginTask extends Task with ParquetFileWriteSupport.Task with Aws.Task { 20 | 21 | @Config("bucket") 22 | def getBucket: String 23 | 24 | @Config("path_prefix") 25 | @ConfigDefault("\"\"") 26 | def getPathPrefix: String 27 | 28 | @Config("sequence_format") 29 | @ConfigDefault("\"%03d.%02d.\"") 30 | def getSequenceFormat: String 31 | 32 | @Config("file_ext") 33 | @ConfigDefault("\"parquet\"") 34 | def getFileExt: String 35 | 36 | @Config("compression_codec") 37 | @ConfigDefault("\"uncompressed\"") 38 | def getCompressionCodecString: String 39 | 40 | def getCompressionCodec: CompressionCodecName 41 | def setCompressionCodec(v: CompressionCodecName): Unit 42 | 43 | @Config("canned_acl") 44 | @ConfigDefault("\"private\"") 45 | def getCannedAclString: String 46 | 47 | def getCannedAcl: CannedAccessControlList 48 | def setCannedAcl(v: CannedAccessControlList): Unit 49 | 50 | @Config("block_size") 51 | @ConfigDefault("null") 52 | def getBlockSize: Optional[Int] 53 | 54 | @Config("page_size") 55 | @ConfigDefault("null") 56 | def getPageSize: Optional[Int] 57 | 58 | @Config("max_padding_size") 59 | @ConfigDefault("null") 60 | def getMaxPaddingSize: Optional[Int] 61 | 62 | @Config("enable_dictionary_encoding") 63 | @ConfigDefault("null") 64 | def getEnableDictionaryEncoding: Optional[Boolean] 65 | 66 | @Config("buffer_dir") 67 | @ConfigDefault("null") 68 | def getBufferDir: Optional[String] 69 | 70 | @Config("catalog") 71 | @ConfigDefault("null") 72 | def getCatalog: Optional[CatalogRegistrator.Task] 73 | } 74 | 75 | object PluginTask { 76 | 77 | def loadConfig(config: ConfigSource): PluginTask = { 78 | val task = config.loadConfig(classOf[PluginTask]) 79 | // sequence_format 80 | try task.getSequenceFormat.format(0, 0) 81 | catch { 82 | case e: MissingFormatArgumentException => 83 | throw new ConfigException( 84 | s"Invalid sequence_format: ${task.getSequenceFormat}", 85 | e 86 | ) 87 | } 88 | 89 | // compression_codec 90 | CompressionCodecName 91 | .values() 92 | .find( 93 | _.name() 94 | .toLowerCase(Locale.ENGLISH) 95 | .equals(task.getCompressionCodecString) 96 | ) match { 97 | 
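      // A hedged illustration of the two validations in this method, using
      // hypothetical configuration values (not taken from this repository's examples):
      //
      //   sequence_format: "%03d.%02d."      -> format(1, 0) yields "001.00.", so a
      //     task's object key becomes "<path_prefix>001.00.<file_ext>" in open().
      //   sequence_format: "%03d.%02d.%02d." -> format(0, 0) raises
      //     MissingFormatArgumentException, re-thrown above as a ConfigException.
      //   compression_codec: "gzip"          -> matches CompressionCodecName.GZIP here;
      //     an unknown value such as "zip" falls through to the ConfigException below,
      //     whose message lists every supported codec name in lower case.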
case Some(v) => task.setCompressionCodec(v) 98 | case None => 99 | val unsupported: String = task.getCompressionCodecString 100 | val supported: String = CompressionCodecName 101 | .values() 102 | .map(v => s"'${v.name().toLowerCase}'") 103 | .mkString(", ") 104 | throw new ConfigException( 105 | s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported]." 106 | ) 107 | } 108 | 109 | // canned_acl 110 | CannedAccessControlList 111 | .values() 112 | .find(_.toString.equals(task.getCannedAclString)) match { 113 | case Some(v) => task.setCannedAcl(v) 114 | case None => 115 | val unsupported: String = task.getCannedAclString 116 | val supported: String = CannedAccessControlList 117 | .values() 118 | .map(v => s"'${v.toString}'") 119 | .mkString(", ") 120 | throw new ConfigException( 121 | s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported]." 122 | ) 123 | } 124 | 125 | ParquetFileWriteSupport.configure(task) 126 | task 127 | } 128 | 129 | def loadTask(taskSource: TaskSource): PluginTask = 130 | taskSource.loadTask(classOf[PluginTask]) 131 | 132 | } 133 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/DecimalLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.math.{MathContext, RoundingMode => JRoundingMode} 4 | 5 | import org.apache.parquet.io.api.{Binary, RecordConsumer} 6 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 7 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 8 | import org.embulk.config.ConfigException 9 | import org.embulk.output.s3_parquet.catalog.GlueDataType 10 | import org.embulk.spi.{Column, DataException} 11 | import org.embulk.spi.`type`.{ 12 | BooleanType, 13 | DoubleType, 14 | JsonType, 15 | LongType, 16 | StringType, 17 | TimestampType 18 | } 19 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 20 | import org.msgpack.value.Value 21 | 22 | import scala.math.BigDecimal.RoundingMode 23 | 24 | case class DecimalLogicalType(scale: Int, precision: Int) 25 | extends ParquetColumnType { 26 | // ref. https://github.com/apache/parquet-format/blob/apache-parquet-format-2.8.0/LogicalTypes.md#decimal 27 | require(scale >= 0, "Scale must be zero or a positive integer.") 28 | require( 29 | scale < precision, 30 | "Scale must be a positive integer less than the precision." 31 | ) 32 | require( 33 | precision > 0, 34 | "Precision is required and must be a non-zero positive integer." 
35 | ) 36 | 37 | override def primitiveType(column: Column): PrimitiveType = 38 | column.getType match { 39 | case _: LongType if 1 <= precision && precision <= 9 => 40 | Types 41 | .optional(PrimitiveTypeName.INT32) 42 | .as(LogicalTypeAnnotation.decimalType(scale, precision)) 43 | .named(column.getName) 44 | case _: LongType if 10 <= precision && precision <= 18 => 45 | Types 46 | .optional(PrimitiveTypeName.INT64) 47 | .as(LogicalTypeAnnotation.decimalType(scale, precision)) 48 | .named(column.getName) 49 | case _: StringType | _: DoubleType => 50 | Types 51 | .optional(PrimitiveTypeName.BINARY) 52 | .as(LogicalTypeAnnotation.decimalType(scale, precision)) 53 | .named(column.getName) 54 | case _: BooleanType | _: TimestampType | _: JsonType | _ => 55 | throw new ConfigException( 56 | s"Unsupported column type: ${column.getName} (scale: $scale, precision: $precision)" 57 | ) 58 | } 59 | 60 | override def glueDataType(column: Column): GlueDataType = 61 | column.getType match { 62 | case _: StringType | _: LongType | _: DoubleType => 63 | GlueDataType.DECIMAL(scale = scale, precision = precision) 64 | case _: BooleanType | _: TimestampType | _: JsonType | _ => 65 | throw new ConfigException( 66 | s"Unsupported column type: ${column.getName} (scale: $scale, precision: $precision)" 67 | ) 68 | } 69 | 70 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 71 | throw newUnsupportedMethodException("consumeBoolean") 72 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 73 | try consumeBigDecimal(consumer, BigDecimal.exact(v)) 74 | catch { 75 | case ex: NumberFormatException => 76 | throw new DataException(s"Failed to cast String: $v to BigDecimal.", ex) 77 | } 78 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 79 | if (1 <= precision && precision <= 9) consumeLongAsInteger(consumer, v) 80 | else if (10 <= precision && precision <= 18) consumer.addLong(v) 81 | else 82 | throw new ConfigException( 83 | s"precision must be 1 <= precision <= 18 when consuming long values but precision is $precision." 84 | ) 85 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 86 | consumeBigDecimal(consumer, BigDecimal.exact(v)) 87 | override def consumeTimestamp( 88 | consumer: RecordConsumer, 89 | v: Timestamp, 90 | formatter: TimestampFormatter 91 | ): Unit = throw newUnsupportedMethodException("consumeTimestamp") 92 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 93 | throw newUnsupportedMethodException("consumeJson") 94 | 95 | private def consumeBigDecimal(consumer: RecordConsumer, v: BigDecimal): Unit = 96 | // TODO: Make RoundingMode configurable? 
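    // A hedged worked example of the HALF_UP conversion below, assuming the
    // hypothetical parameters scale = 2 and precision = 4:
    //   BigDecimal("12.345").setScale(2, RoundingMode.HALF_UP)   // 12.35
    //     .round(new MathContext(4, JRoundingMode.HALF_UP))      // 12.35 (already 4 significant digits)
    //     .toString()                                            // "12.35", stored as a BINARY-backed decimal
    // Note that this binary path is reached only for string/double inputs; long
    // inputs go through consumeLong above and are stored as INT32 (precision 1..9)
    // or INT64 (precision 10..18).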
97 | consumer.addBinary( 98 | Binary.fromString( 99 | v.setScale(scale, RoundingMode.HALF_UP) 100 | .round(new MathContext(precision, JRoundingMode.HALF_UP)) 101 | .toString() 102 | ) 103 | ) 104 | } 105 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/TimeLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.{OffsetTime, ZoneId} 4 | import java.time.temporal.ChronoField.{MICRO_OF_DAY, MILLI_OF_DAY, NANO_OF_DAY} 5 | 6 | import org.apache.parquet.io.api.RecordConsumer 7 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 8 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit 9 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.{ 10 | MICROS, 11 | MILLIS, 12 | NANOS 13 | } 14 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 15 | import org.embulk.config.ConfigException 16 | import org.embulk.output.s3_parquet.catalog.GlueDataType 17 | import org.embulk.spi.Column 18 | import org.embulk.spi.`type`.{ 19 | BooleanType, 20 | DoubleType, 21 | JsonType, 22 | LongType, 23 | StringType, 24 | TimestampType 25 | } 26 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 27 | import org.msgpack.value.Value 28 | import org.slf4j.{Logger, LoggerFactory} 29 | 30 | case class TimeLogicalType( 31 | isAdjustedToUtc: Boolean, 32 | timeUnit: TimeUnit, 33 | timeZone: ZoneId 34 | ) extends ParquetColumnType { 35 | private val logger: Logger = LoggerFactory.getLogger(classOf[TimeLogicalType]) 36 | private val UTC: ZoneId = ZoneId.of("UTC") 37 | 38 | override def primitiveType(column: Column): PrimitiveType = 39 | column.getType match { 40 | case _: LongType | _: TimestampType => 41 | Types 42 | .optional(timeUnit match { 43 | case MILLIS => PrimitiveTypeName.INT32 44 | case MICROS | NANOS => PrimitiveTypeName.INT64 45 | }) 46 | .as(LogicalTypeAnnotation.timeType(isAdjustedToUtc, timeUnit)) 47 | .named(column.getName) 48 | case _: BooleanType | _: DoubleType | _: StringType | _: JsonType | _ => 49 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 50 | } 51 | 52 | override def glueDataType(column: Column): GlueDataType = 53 | column.getType match { 54 | case _: LongType | _: TimestampType => 55 | timeUnit match { 56 | case MILLIS => 57 | warningWhenConvertingTimeToGlueType(GlueDataType.INT) 58 | GlueDataType.INT 59 | case MICROS | NANOS => 60 | warningWhenConvertingTimeToGlueType(GlueDataType.BIGINT) 61 | GlueDataType.BIGINT 62 | } 63 | case _: BooleanType | _: DoubleType | _: StringType | _: JsonType | _ => 64 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 65 | } 66 | 67 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 68 | throw newUnsupportedMethodException("consumeBoolean") 69 | 70 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 71 | throw newUnsupportedMethodException("consumeString") 72 | 73 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 74 | timeUnit match { 75 | case MILLIS => consumeLongAsInteger(consumer, v) 76 | case MICROS | NANOS => consumer.addLong(v) 77 | } 78 | 79 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 80 | throw newUnsupportedMethodException("consumeDouble") 81 | 82 | override def consumeTimestamp( 83 | consumer: RecordConsumer, 84 | v: Timestamp, 85 | 
formatter: TimestampFormatter 86 | ): Unit = { 87 | // * `TIME` with precision `MILLIS` is used for millisecond precision. 88 | // It must annotate an `int32` that stores the number of milliseconds after midnight. 89 | // * `TIME` with precision `MICROS` is used for microsecond precision. 90 | // It must annotate an `int64` that stores the number of microseconds after midnight. 91 | // * `TIME` with precision `NANOS` is used for nanosecond precision. 92 | // It must annotate an `int64` that stores the number of nanoseconds after midnight. 93 | // 94 | // ref. https://github.com/apache/parquet-format/blob/apache-parquet-format-2.7.0/LogicalTypes.md#time 95 | val zoneId = if (isAdjustedToUtc) UTC else timeZone 96 | val offsetTime: OffsetTime = OffsetTime.ofInstant(v.getInstant, zoneId) 97 | timeUnit match { 98 | case MILLIS => 99 | consumeLongAsInteger(consumer, offsetTime.get(MILLI_OF_DAY)) 100 | case MICROS => 101 | consumer.addLong(offsetTime.getLong(MICRO_OF_DAY)) 102 | case NANOS => 103 | consumer.addLong(offsetTime.getLong(NANO_OF_DAY)) 104 | } 105 | } 106 | 107 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 108 | throw newUnsupportedMethodException("consumeJson") 109 | 110 | private def warningWhenConvertingTimeToGlueType( 111 | glueType: GlueDataType 112 | ): Unit = 113 | logger.warn( 114 | s"time(isAdjustedToUtc = $isAdjustedToUtc, timeUnit = $timeUnit) is converted to Glue" + 115 | s" ${glueType.name} but this is not represented correctly, because Glue does not" + 116 | s" support time type. Please use `catalog.column_options` to define the type." 117 | ) 118 | } 119 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import java.nio.file.{Files, Paths} 4 | import java.util.{List => JList} 5 | 6 | import org.apache.parquet.column.ParquetProperties 7 | import org.apache.parquet.hadoop.ParquetWriter 8 | import org.embulk.config.{ConfigDiff, ConfigSource, TaskReport, TaskSource} 9 | import org.embulk.output.s3_parquet.aws.Aws 10 | import org.embulk.output.s3_parquet.catalog.CatalogRegistrator 11 | import org.embulk.output.s3_parquet.parquet.ParquetFileWriteSupport 12 | import org.embulk.spi.{ 13 | Exec, 14 | OutputPlugin, 15 | PageReader, 16 | Schema, 17 | TransactionalPageOutput 18 | } 19 | import org.slf4j.{Logger, LoggerFactory} 20 | 21 | class S3ParquetOutputPlugin extends OutputPlugin { 22 | 23 | import implicits._ 24 | 25 | val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin]) 26 | 27 | override def transaction( 28 | config: ConfigSource, 29 | schema: Schema, 30 | taskCount: Int, 31 | control: OutputPlugin.Control 32 | ): ConfigDiff = { 33 | val task: PluginTask = PluginTask.loadConfig(config) 34 | val support: ParquetFileWriteSupport = ParquetFileWriteSupport(task, schema) 35 | support.showOutputSchema(logger) 36 | control.run(task.dump) 37 | 38 | task.getCatalog.ifPresent { catalog => 39 | val location = 40 | s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}" 41 | val cr = CatalogRegistrator.fromTask( 42 | task = catalog, 43 | aws = Aws(task), 44 | schema = schema, 45 | location = location, 46 | compressionCodec = task.getCompressionCodec, 47 | defaultGlueTypes = 48 | support.parquetSchema.transform((k, v) => v.glueDataType(k)) 49 | ) 50 | 
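      // A hedged example of the `location` computed above, with hypothetical values:
      // bucket = "example" and path_prefix = "logs/out/data_" give
      //   location = "s3://example/logs/out/"
      // because replaceFirst("(.*/)[^/]+$", "$1") keeps everything up to the last "/"
      // and drops the trailing file-name prefix. The registration below is wrapped in
      // ContextClassLoaderSwapper.usingPluginClass so the catalog (Glue) calls run with
      // the plugin's class loader (see the 0.5.2 changelog entry, #51).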
ContextClassLoaderSwapper.usingPluginClass { 51 | cr.run() 52 | } 53 | } 54 | 55 | Exec.newConfigDiff 56 | } 57 | 58 | override def resume( 59 | taskSource: TaskSource, 60 | schema: Schema, 61 | taskCount: Int, 62 | control: OutputPlugin.Control 63 | ): ConfigDiff = { 64 | throw new UnsupportedOperationException( 65 | "s3_parquet output plugin does not support resuming" 66 | ) 67 | } 68 | 69 | override def cleanup( 70 | taskSource: TaskSource, 71 | schema: Schema, 72 | taskCount: Int, 73 | successTaskReports: JList[TaskReport] 74 | ): Unit = { 75 | successTaskReports.foreach { tr => 76 | logger.info( 77 | s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, " 78 | + s"version_id: ${tr.get(classOf[String], "version_id", null)}, " 79 | + s"etag: ${tr.get(classOf[String], "etag", null)}" 80 | ) 81 | } 82 | } 83 | 84 | override def open( 85 | taskSource: TaskSource, 86 | schema: Schema, 87 | taskIndex: Int 88 | ): TransactionalPageOutput = { 89 | val task = PluginTask.loadTask(taskSource) 90 | val bufferDir: String = task.getBufferDir.getOrElse( 91 | Files.createTempDirectory("embulk-output-s3_parquet-").toString 92 | ) 93 | val bufferFile: String = Paths 94 | .get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet") 95 | .toString 96 | val destS3bucket: String = task.getBucket 97 | val destS3Key: String = 98 | s"${task.getPathPrefix}${task.getSequenceFormat.format(taskIndex, 0)}${task.getFileExt}" 99 | 100 | val pageReader: PageReader = new PageReader(schema) 101 | val aws: Aws = Aws(task) 102 | val parquetWriter: ParquetWriter[PageReader] = 103 | ContextClassLoaderSwapper.usingPluginClass { 104 | ParquetFileWriteSupport(task, schema) 105 | .newWriterBuilder(bufferFile) 106 | .withCompressionCodec(task.getCompressionCodec) 107 | .withDictionaryEncoding( 108 | task.getEnableDictionaryEncoding.orElse( 109 | ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED 110 | ) 111 | ) 112 | .withDictionaryPageSize( 113 | task.getPageSize.orElse( 114 | ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE 115 | ) 116 | ) 117 | .withMaxPaddingSize( 118 | task.getMaxPaddingSize.orElse( 119 | ParquetWriter.MAX_PADDING_SIZE_DEFAULT 120 | ) 121 | ) 122 | .withPageSize( 123 | task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE) 124 | ) 125 | .withRowGroupSize( 126 | task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE) 127 | ) 128 | .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED) 129 | .withWriteMode( 130 | org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE 131 | ) 132 | .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION) 133 | .build() 134 | } 135 | 136 | logger.info( 137 | s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key" 138 | ) 139 | 140 | S3ParquetPageOutput( 141 | bufferFile, 142 | pageReader, 143 | parquetWriter, 144 | aws, 145 | destS3bucket, 146 | destS3Key 147 | ) 148 | } 149 | 150 | } 151 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestJsonLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.io.api.Binary 4 | import org.apache.parquet.schema.LogicalTypeAnnotation 5 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 6 | import org.embulk.config.ConfigException 7 | import org.embulk.output.s3_parquet.catalog.GlueDataType 8 | import org.embulk.spi.`type`.{ 9 | 
BooleanType, 10 | DoubleType, 11 | JsonType, 12 | LongType, 13 | StringType 14 | } 15 | import org.embulk.spi.json.JsonParser 16 | import org.embulk.spi.time.TimestampFormatter 17 | import org.msgpack.value.ValueFactory 18 | import org.scalatest.diagrams.Diagrams 19 | import org.scalatest.funsuite.AnyFunSuite 20 | import org.scalatest.prop.TableDrivenPropertyChecks 21 | 22 | import scala.util.chaining._ 23 | 24 | class TestJsonLogicalType 25 | extends AnyFunSuite 26 | with ParquetColumnTypeTestHelper 27 | with TableDrivenPropertyChecks 28 | with Diagrams { 29 | 30 | private val conditions = Table( 31 | "column", 32 | Seq( 33 | SAMPLE_BOOLEAN_COLUMN, 34 | SAMPLE_LONG_COLUMN, 35 | SAMPLE_DOUBLE_COLUMN, 36 | SAMPLE_STRING_COLUMN, 37 | SAMPLE_TIMESTAMP_COLUMN, 38 | SAMPLE_JSON_COLUMN 39 | ): _* 40 | ) 41 | 42 | test( 43 | "#primitiveType(column) returns PrimitiveTypeName.{BOOLEAN,INT64,DOUBLE,BINARY} with LogicalType" 44 | ) { 45 | forAll(conditions) { column => 46 | // format: off 47 | column.getType match { 48 | case _: BooleanType | _: LongType | _: DoubleType | _: StringType | 49 | _: JsonType => 50 | assert(PrimitiveTypeName.BINARY == JsonLogicalType.primitiveType(column).getPrimitiveTypeName) 51 | assert(LogicalTypeAnnotation.jsonType() == JsonLogicalType.primitiveType(column).getLogicalTypeAnnotation) 52 | case _ => 53 | assert(intercept[ConfigException](JsonLogicalType.primitiveType(column)).getMessage.startsWith("Unsupported column type: ")) 54 | } 55 | // format: on 56 | } 57 | } 58 | 59 | test("#glueDataType(column) returns GlueDataType") { 60 | forAll(conditions) { column => 61 | // format: off 62 | column.getType match { 63 | case _: BooleanType | _: LongType | _: DoubleType | _: StringType | 64 | _: JsonType => 65 | assert(GlueDataType.STRING == JsonLogicalType.glueDataType(column)) 66 | case _ => 67 | assert(intercept[ConfigException](JsonLogicalType.glueDataType(column)).getMessage.startsWith("Unsupported column type: ")) 68 | } 69 | // format: on 70 | } 71 | } 72 | 73 | test("#consumeBoolean") { 74 | newMockRecordConsumer().tap { consumer => 75 | consumer.writingSampleField { 76 | JsonLogicalType.consumeBoolean(consumer, true) 77 | } 78 | // format: off 79 | assert(consumer.data.head.head.isInstanceOf[Binary]) 80 | assert(consumer.data.head.head == Binary.fromString(ValueFactory.newBoolean(true).toJson)) 81 | // format: on 82 | } 83 | } 84 | 85 | test("#consumeString") { 86 | newMockRecordConsumer().tap { consumer => 87 | consumer.writingSampleField { 88 | JsonLogicalType.consumeString(consumer, "string") 89 | } 90 | // format: off 91 | assert(consumer.data.head.head.isInstanceOf[Binary]) 92 | assert(consumer.data.head.head == Binary.fromString(ValueFactory.newString("string").toJson)) 93 | // format: on 94 | } 95 | } 96 | 97 | test("#consumeLong") { 98 | newMockRecordConsumer().tap { consumer => 99 | consumer.writingSampleField { 100 | JsonLogicalType.consumeLong(consumer, Long.MaxValue) 101 | } 102 | // format: off 103 | assert(consumer.data.head.head.isInstanceOf[Binary]) 104 | assert(consumer.data.head.head == Binary.fromString(ValueFactory.newInteger(Long.MaxValue).toJson)) 105 | // format: on 106 | } 107 | } 108 | 109 | test("#consumeDouble") { 110 | newMockRecordConsumer().tap { consumer => 111 | consumer.writingSampleField { 112 | JsonLogicalType.consumeDouble(consumer, Double.MaxValue) 113 | } 114 | // format: off 115 | assert(consumer.data.head.head.isInstanceOf[Binary]) 116 | assert(consumer.data.head.head == 
Binary.fromString(ValueFactory.newFloat(Double.MaxValue).toJson)) 117 | // format: on 118 | } 119 | } 120 | 121 | test("#consumeTimestamp") { 122 | val formatter = TimestampFormatter 123 | .of("%Y-%m-%d %H:%M:%S.%6N %z", "UTC") 124 | newMockRecordConsumer().tap { consumer => 125 | consumer.writingSampleField { 126 | // format: off 127 | assert(intercept[ConfigException](JsonLogicalType.consumeTimestamp(consumer, null, null)).getMessage.endsWith("is unsupported.")) 128 | // format: on 129 | } 130 | } 131 | } 132 | 133 | test("#consumeJson") { 134 | newMockRecordConsumer().tap { consumer => 135 | consumer.writingSampleField { 136 | JsonLogicalType.consumeJson( 137 | consumer, 138 | new JsonParser().parse("""{"a":1,"b":"c","d":5.5,"e":true}""") 139 | ) 140 | } 141 | // format: off 142 | assert(consumer.data.head.head.isInstanceOf[Binary]) 143 | assert(consumer.data.head.head == Binary.fromString("""{"a":1,"b":"c","d":5.5,"e":true}""")) 144 | // format: on 145 | } 146 | } 147 | 148 | } 149 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestDefaultColumnType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.io.api.Binary 4 | import org.apache.parquet.schema.LogicalTypeAnnotation 5 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 6 | import org.embulk.output.s3_parquet.catalog.GlueDataType 7 | import org.embulk.spi.`type`.{ 8 | BooleanType, 9 | DoubleType, 10 | JsonType, 11 | LongType, 12 | StringType, 13 | TimestampType 14 | } 15 | import org.embulk.spi.json.JsonParser 16 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 17 | import org.scalatest.diagrams.Diagrams 18 | import org.scalatest.funsuite.AnyFunSuite 19 | import org.scalatest.prop.TableDrivenPropertyChecks 20 | 21 | import scala.util.chaining._ 22 | 23 | class TestDefaultColumnType 24 | extends AnyFunSuite 25 | with ParquetColumnTypeTestHelper 26 | with TableDrivenPropertyChecks 27 | with Diagrams { 28 | 29 | private val conditions = Table( 30 | "column", 31 | Seq( 32 | SAMPLE_BOOLEAN_COLUMN, 33 | SAMPLE_LONG_COLUMN, 34 | SAMPLE_DOUBLE_COLUMN, 35 | SAMPLE_STRING_COLUMN, 36 | SAMPLE_TIMESTAMP_COLUMN, 37 | SAMPLE_JSON_COLUMN 38 | ): _* 39 | ) 40 | 41 | test( 42 | "#primitiveType(column) returns PrimitiveTypeName.{BOOLEAN,INT64,DOUBLE,BINARY}" 43 | ) { 44 | forAll(conditions) { column => 45 | // format: off 46 | column.getType match { 47 | case _: BooleanType => 48 | assert(PrimitiveTypeName.BOOLEAN == DefaultColumnType.primitiveType(column).getPrimitiveTypeName) 49 | assert(null == DefaultColumnType.primitiveType(column).getLogicalTypeAnnotation) 50 | case _: LongType => 51 | assert(PrimitiveTypeName.INT64 == DefaultColumnType.primitiveType(column).getPrimitiveTypeName) 52 | assert(null == DefaultColumnType.primitiveType(column).getLogicalTypeAnnotation) 53 | case _: DoubleType => 54 | assert(PrimitiveTypeName.DOUBLE == DefaultColumnType.primitiveType(column).getPrimitiveTypeName) 55 | assert(null == DefaultColumnType.primitiveType(column).getLogicalTypeAnnotation) 56 | case _: StringType | _: TimestampType | _: JsonType => 57 | assert(PrimitiveTypeName.BINARY == DefaultColumnType.primitiveType(column).getPrimitiveTypeName) 58 | assert(LogicalTypeAnnotation.stringType() == DefaultColumnType.primitiveType(column).getLogicalTypeAnnotation) 59 | case _ => 60 | fail() 61 | } 62 | // format: on 63 | } 64 | 
} 65 | 66 | test("#glueDataType(column) returns GlueDataType") { 67 | forAll(conditions) { column => 68 | // format: off 69 | column.getType match { 70 | case _: BooleanType => 71 | assert(GlueDataType.BOOLEAN == DefaultColumnType.glueDataType(column)) 72 | case _: LongType => 73 | assert(GlueDataType.BIGINT == DefaultColumnType.glueDataType(column)) 74 | case _: DoubleType => 75 | assert(GlueDataType.DOUBLE == DefaultColumnType.glueDataType(column)) 76 | case _: StringType | _: TimestampType | _: JsonType => 77 | assert(GlueDataType.STRING == DefaultColumnType.glueDataType(column)) 78 | case _ => 79 | fail() 80 | } 81 | // format: on 82 | } 83 | } 84 | 85 | test("#consumeBoolean") { 86 | newMockRecordConsumer().tap { consumer => 87 | consumer.writingSampleField { 88 | DefaultColumnType.consumeBoolean(consumer, true) 89 | } 90 | assert(consumer.data.head.head.isInstanceOf[Boolean]) 91 | assert(consumer.data.head.head == true) 92 | } 93 | } 94 | 95 | test("#consumeString") { 96 | newMockRecordConsumer().tap { consumer => 97 | consumer.writingSampleField { 98 | DefaultColumnType.consumeString(consumer, "string") 99 | } 100 | assert(consumer.data.head.head.isInstanceOf[Binary]) 101 | assert(consumer.data.head.head == Binary.fromString("string")) 102 | } 103 | } 104 | 105 | test("#consumeLong") { 106 | newMockRecordConsumer().tap { consumer => 107 | consumer.writingSampleField { 108 | DefaultColumnType.consumeLong(consumer, Long.MaxValue) 109 | } 110 | assert(consumer.data.head.head.isInstanceOf[Long]) 111 | assert(consumer.data.head.head == Long.MaxValue) 112 | } 113 | } 114 | 115 | test("#consumeDouble") { 116 | newMockRecordConsumer().tap { consumer => 117 | consumer.writingSampleField { 118 | DefaultColumnType.consumeDouble(consumer, Double.MaxValue) 119 | } 120 | assert(consumer.data.head.head.isInstanceOf[Double]) 121 | assert(consumer.data.head.head == Double.MaxValue) 122 | } 123 | } 124 | 125 | test("#consumeTimestamp") { 126 | val formatter = TimestampFormatter 127 | .of("%Y-%m-%d %H:%M:%S.%6N %z", "UTC") 128 | newMockRecordConsumer().tap { consumer => 129 | consumer.writingSampleField { 130 | DefaultColumnType.consumeTimestamp( 131 | consumer, 132 | Timestamp.ofEpochMilli(Int.MaxValue), 133 | formatter 134 | ) 135 | } 136 | // format: off 137 | assert(consumer.data.head.head.isInstanceOf[Binary]) 138 | assert(consumer.data.head.head == Binary.fromString("1970-01-25 20:31:23.647000 +0000")) 139 | // format: on 140 | } 141 | } 142 | 143 | test("#consumeJson") { 144 | newMockRecordConsumer().tap { consumer => 145 | consumer.writingSampleField { 146 | DefaultColumnType.consumeJson( 147 | consumer, 148 | new JsonParser().parse("""{"a":1,"b":"c","d":5.5,"e":true}""") 149 | ) 150 | } 151 | // format: off 152 | assert(consumer.data.head.head.isInstanceOf[Binary]) 153 | assert(consumer.data.head.head == Binary.fromString("""{"a":1,"b":"c","d":5.5,"e":true}""")) 154 | // format: on 155 | } 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.aws 2 | 3 | import java.util.Optional 4 | 5 | import com.amazonaws.auth.{ 6 | AnonymousAWSCredentials, 7 | AWSCredentialsProvider, 8 | AWSStaticCredentialsProvider, 9 | BasicAWSCredentials, 10 | BasicSessionCredentials, 11 | DefaultAWSCredentialsProviderChain, 12 | EC2ContainerCredentialsProviderWrapper, 13 | 
EnvironmentVariableCredentialsProvider, 14 | STSAssumeRoleSessionCredentialsProvider, 15 | SystemPropertiesCredentialsProvider, 16 | WebIdentityTokenCredentialsProvider 17 | } 18 | import com.amazonaws.auth.profile.{ 19 | ProfileCredentialsProvider, 20 | ProfilesConfigFile 21 | } 22 | import org.embulk.config.{Config, ConfigDefault, ConfigException} 23 | import org.embulk.output.s3_parquet.aws.AwsCredentials.Task 24 | import org.embulk.spi.unit.LocalFile 25 | 26 | object AwsCredentials { 27 | 28 | trait Task { 29 | 30 | @Config("auth_method") 31 | @ConfigDefault("\"default\"") 32 | def getAuthMethod: String 33 | 34 | @Config("access_key_id") 35 | @ConfigDefault("null") 36 | def getAccessKeyId: Optional[String] 37 | 38 | @Config("secret_access_key") 39 | @ConfigDefault("null") 40 | def getSecretAccessKey: Optional[String] 41 | 42 | @Config("session_token") 43 | @ConfigDefault("null") 44 | def getSessionToken: Optional[String] 45 | 46 | @Config("profile_file") 47 | @ConfigDefault("null") 48 | def getProfileFile: Optional[LocalFile] 49 | 50 | @Config("profile_name") 51 | @ConfigDefault("\"default\"") 52 | def getProfileName: String 53 | 54 | @Config("role_arn") 55 | @ConfigDefault("null") 56 | def getRoleArn: Optional[String] 57 | 58 | @Config("role_session_name") 59 | @ConfigDefault("null") 60 | def getRoleSessionName: Optional[String] 61 | 62 | @Config("role_external_id") 63 | @ConfigDefault("null") 64 | def getRoleExternalId: Optional[String] 65 | 66 | @Config("role_session_duration_seconds") 67 | @ConfigDefault("null") 68 | def getRoleSessionDurationSeconds: Optional[Int] 69 | 70 | @Config("scope_down_policy") 71 | @ConfigDefault("null") 72 | def getScopeDownPolicy: Optional[String] 73 | 74 | @Config("web_identity_token_file") 75 | @ConfigDefault("null") 76 | def getWebIdentityTokenFile: Optional[String] 77 | } 78 | 79 | def apply(task: Task): AwsCredentials = { 80 | new AwsCredentials(task) 81 | } 82 | } 83 | 84 | class AwsCredentials(task: Task) { 85 | 86 | def createAwsCredentialsProvider: AWSCredentialsProvider = { 87 | task.getAuthMethod match { 88 | case "basic" => 89 | new AWSStaticCredentialsProvider( 90 | new BasicAWSCredentials( 91 | getRequiredOption(task.getAccessKeyId, "access_key_id"), 92 | getRequiredOption(task.getSecretAccessKey, "secret_access_key") 93 | ) 94 | ) 95 | 96 | case "env" => 97 | new EnvironmentVariableCredentialsProvider 98 | 99 | case "instance" => 100 | // NOTE: combination of InstanceProfileCredentialsProvider and ContainerCredentialsProvider 101 | new EC2ContainerCredentialsProviderWrapper 102 | 103 | case "profile" => 104 | if (task.getProfileFile.isPresent) { 105 | val pf: ProfilesConfigFile = new ProfilesConfigFile( 106 | task.getProfileFile.get().getFile 107 | ) 108 | new ProfileCredentialsProvider(pf, task.getProfileName) 109 | } 110 | else new ProfileCredentialsProvider(task.getProfileName) 111 | 112 | case "properties" => 113 | new SystemPropertiesCredentialsProvider 114 | 115 | case "anonymous" => 116 | new AWSStaticCredentialsProvider(new AnonymousAWSCredentials) 117 | 118 | case "session" => 119 | new AWSStaticCredentialsProvider( 120 | new BasicSessionCredentials( 121 | getRequiredOption(task.getAccessKeyId, "access_key_id"), 122 | getRequiredOption(task.getSecretAccessKey, "secret_access_key"), 123 | getRequiredOption(task.getSessionToken, "session_token") 124 | ) 125 | ) 126 | 127 | case "assume_role" => 128 | // NOTE: Are http_proxy, endpoint, region required when assuming role? 
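        // A hedged sketch of an `assume_role` configuration this branch expects;
        // the ARN and names below are hypothetical, not taken from this repository:
        //
        //   auth_method: assume_role
        //   role_arn: arn:aws:iam::123456789012:role/embulk-example
        //   role_session_name: embulk-output-s3_parquet
        //   # optional: role_external_id, role_session_duration_seconds, scope_down_policy
        //
        // `role_arn` and `role_session_name` are mandatory here (getRequiredOption);
        // the three optional settings are applied to the builder only when present.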
129 | val builder = new STSAssumeRoleSessionCredentialsProvider.Builder( 130 | getRequiredOption(task.getRoleArn, "role_arn"), 131 | getRequiredOption(task.getRoleSessionName, "role_session_name") 132 | ) 133 | task.getRoleExternalId.ifPresent(v => builder.withExternalId(v)) 134 | task.getRoleSessionDurationSeconds.ifPresent(v => 135 | builder.withRoleSessionDurationSeconds(v) 136 | ) 137 | task.getScopeDownPolicy.ifPresent(v => builder.withScopeDownPolicy(v)) 138 | 139 | builder.build() 140 | 141 | case "web_identity_token" => 142 | WebIdentityTokenCredentialsProvider 143 | .builder() 144 | .roleArn(getRequiredOption(task.getRoleArn, "role_arn")) 145 | .roleSessionName( 146 | getRequiredOption(task.getRoleSessionName, "role_session_name") 147 | ) 148 | .webIdentityTokenFile( 149 | getRequiredOption( 150 | task.getWebIdentityTokenFile, 151 | "web_identity_token_file" 152 | ) 153 | ) 154 | .build() 155 | 156 | case "default" => 157 | new DefaultAWSCredentialsProviderChain 158 | 159 | case am => 160 | throw new ConfigException( 161 | s"'$am' is unsupported: `auth_method` must be one of ['basic', 'env', 'instance', 'profile', 'properties', 'anonymous', 'session', 'assume_role', 'default']." 162 | ) 163 | } 164 | } 165 | 166 | private def getRequiredOption[A](o: Optional[A], name: String): A = { 167 | o.orElseThrow(() => 168 | new ConfigException( 169 | s"`$name` must be set when `auth_method` is ${task.getAuthMethod}." 170 | ) 171 | ) 172 | } 173 | 174 | } 175 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 0.5.3 (2024-06-28) 2 | ================== 3 | 4 | * [Enhancement] [#55](https://github.com/civitaspo/embulk-output-s3_parquet/pull/55) Replace parquet-tools with parquet-avro 5 | * [Enhancement] [#57](https://github.com/civitaspo/embulk-output-s3_parquet/pull/57) Upgrade hadoop-common library to resolve CVE-2021-37404 6 | 7 | 8 | 0.5.2 (2020-10-12) 9 | ================== 10 | 11 | * [Fix] [#51](https://github.com/civitaspo/embulk-output-s3_parquet/pull/51) Use PluginClassLoader when oparating catalog. 12 | 13 | 0.5.1 (2020-06-24) 14 | ================== 15 | 16 | * [Fix] [#47](https://github.com/civitaspo/embulk-output-s3_parquet/pull/47) Use lower case without any space for Glue data type. 17 | 18 | 0.5.0 (2020-05-25) 19 | ================== 20 | 21 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Introduce the new usage of **column_options.logical_type**, **type_options.logical_type** to configure more detailed logical types. 22 | * [Deprecated] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) The old usage of **column_options.logical_type**, **type_options.logical_type** is deprecated. Use **column_options.converted_type**, **type_options.converted_type** instead. 23 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Support casting boolean, double, string, timestamp, json to the int logical type. 24 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Support casting long to the timestamp logical type. 25 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Support the decimal logical type. (close [#44](https://github.com/civitaspo/embulk-output-s3_parquet/issues/44)) 26 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Support the time logical type. 
27 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Support the date logical type. 28 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Support is_adjusted_to_utc = false for the timestamp logical type. 29 | * [Fix] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Fix the issue 'Logical type int{8,16,32} don't work' (close [#43](https://github.com/civitaspo/embulk-output-s3_parquet/issues/43)) 30 | * [Enhancement] Add lots of tests. 31 | 32 | 0.4.2 (2020-04-30) 33 | ================== 34 | 35 | * [Enhancement] [#40](https://github.com/civitaspo/embulk-output-s3_parquet/pull/40) Check combinations with embulk-type and logical-type strictly. 36 | 37 | 0.4.1 (2020-04-30) 38 | ================== 39 | 40 | * [Enhancement] [#37](https://github.com/civitaspo/embulk-output-s3_parquet/pull/37) Rewrite the integration tests to make writing and reading tests easier & Use Diagrams for all test cases. 41 | * [Enhancement] [#38](https://github.com/civitaspo/embulk-output-s3_parquet/pull/38) Make all column types enable to use LogicalTypeHandler. 42 | * [Enhancement] [#38](https://github.com/civitaspo/embulk-output-s3_parquet/pull/38) Make parquet schema testable. 43 | * [New Feature] [#38](https://github.com/civitaspo/embulk-output-s3_parquet/pull/38) Support timestamp-nanos. 44 | 45 | 0.4.0 (2020-04-28) 46 | ================== 47 | 48 | * [Enhancement] [#35](https://github.com/civitaspo/embulk-output-s3_parquet/pull/35) Fix deprecation warnings. 49 | 50 | 51 | 0.3.0 (2020-04-26) 52 | ================== 53 | 54 | * [Enhancement] [#27](https://github.com/civitaspo/embulk-output-s3_parquet/pull/27) Github Actions releases automatically when a new release tag pushed instead of releasing from local. 55 | * [HotFix] [#29](https://github.com/civitaspo/embulk-output-s3_parquet/pull/29) Do not skip the CI when a tag is pushed. 56 | * [Enhancement] [#28](https://github.com/civitaspo/embulk-output-s3_parquet/pull/28) Apply the "org.embulk.embulk-plugins" Gradle plugin. 57 | 58 | 0.2.0 (2020-03-10) 59 | ================== 60 | 61 | * [Enhancement] [#23](https://github.com/civitaspo/embulk-output-s3_parquet/pull/23) Limit the usage of swapping ContextClassLoader 62 | * [BugFix] [#24](https://github.com/civitaspo/embulk-output-s3_parquet/pull/24) Use basic credentials correctly 63 | * [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update gradle 4.1 -> 6.1 64 | * [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update parquet-{column,common,encoding,hadoop,jackson,tools} 1.10.1 -> 1.11.0 with the latest parquet-format 2.4.0 -> 2.7.0 65 | * [parquet-format CHANGELOG](https://github.com/apache/parquet-format/blob/master/CHANGES.md) 66 | * [parquet-mr CHANGELOG](https://github.com/apache/parquet-mr/blob/apache-parquet-1.11.0/CHANGES.md#version-1110) 67 | * [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update aws-java-sdk 1.11.676 -> 1.11.739 68 | * [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update embulk 0.9.20 -> 0.9.23 with embulk-deps-{config,buffer} 69 | * [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Use scalafmt instead of the Intellij formatter. 70 | * [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Use scalafmt in CI. 
71 | * [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Enable to run examples locally with some prepared scripts. 72 | 73 | 0.1.0 (2019-11-17) 74 | ================== 75 | 76 | * [New Feature] Support Logical Types older representations(OriginalTypes) #12 77 | * [Enhancement] Add Github Actions CI settings #13 78 | * [Enhancement] Support LogicalTypes for Glue Data Catalog #14 79 | * [Enhancement] Update dependencies #15 80 | * [New Feature] Support `auth_method: web_identity_token` #15 81 | 82 | 0.0.3 (2019-07-17) 83 | ================== 84 | 85 | * [New Feature] Add `catalog` option to register a new table that has data created by `s3_parquet` plugin. 86 | * [Enhancement] Update dependencies. 87 | 88 | 0.0.2 (2019-01-21) 89 | ================== 90 | 91 | * [Fix] Close local buffer files before uploading even if lots of pages exist. 92 | 93 | 0.0.1 (2019-01-18) 94 | ================== 95 | 96 | * First Release 97 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/catalog/CatalogRegistrator.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.catalog 2 | 3 | import java.util.{Optional, Map => JMap} 4 | 5 | import com.amazonaws.services.glue.model.{ 6 | Column, 7 | CreateTableRequest, 8 | DeleteTableRequest, 9 | GetTableRequest, 10 | SerDeInfo, 11 | StorageDescriptor, 12 | TableInput 13 | } 14 | import org.apache.parquet.hadoop.metadata.CompressionCodecName 15 | import org.embulk.config.{Config, ConfigDefault, ConfigException} 16 | import org.embulk.output.s3_parquet.aws.Aws 17 | import org.embulk.output.s3_parquet.implicits 18 | import org.embulk.spi.{Schema, Column => EmbulkColumn} 19 | import org.slf4j.{Logger, LoggerFactory} 20 | 21 | import scala.util.Try 22 | 23 | object CatalogRegistrator { 24 | 25 | trait Task extends org.embulk.config.Task { 26 | @Config("catalog_id") 27 | @ConfigDefault("null") 28 | def getCatalogId: Optional[String] 29 | 30 | @Config("database") 31 | def getDatabase: String 32 | 33 | @Config("table") 34 | def getTable: String 35 | 36 | @Config("column_options") 37 | @ConfigDefault("{}") 38 | def getColumnOptions: JMap[String, ColumnOption] 39 | 40 | @Config("operation_if_exists") 41 | @ConfigDefault("\"delete\"") 42 | def getOperationIfExists: String 43 | } 44 | 45 | trait ColumnOption { 46 | @Config("type") 47 | def getType: String 48 | } 49 | 50 | import implicits._ 51 | 52 | def fromTask( 53 | task: CatalogRegistrator.Task, 54 | aws: Aws, 55 | schema: Schema, 56 | location: String, 57 | compressionCodec: CompressionCodecName, 58 | defaultGlueTypes: Map[EmbulkColumn, GlueDataType] = Map.empty 59 | ): CatalogRegistrator = 60 | CatalogRegistrator( 61 | aws = aws, 62 | catalogId = task.getCatalogId, 63 | database = task.getDatabase, 64 | table = task.getTable, 65 | operationIfExists = task.getOperationIfExists, 66 | location = location, 67 | compressionCodec = compressionCodec, 68 | schema = schema, 69 | columnOptions = task.getColumnOptions, 70 | defaultGlueTypes = defaultGlueTypes 71 | ) 72 | } 73 | 74 | case class CatalogRegistrator( 75 | aws: Aws, 76 | catalogId: Option[String] = None, 77 | database: String, 78 | table: String, 79 | operationIfExists: String, 80 | location: String, 81 | compressionCodec: CompressionCodecName, 82 | schema: Schema, 83 | columnOptions: Map[String, CatalogRegistrator.ColumnOption], 84 | defaultGlueTypes: Map[EmbulkColumn, GlueDataType] = 
Map.empty 85 | ) { 86 | 87 | import implicits._ 88 | 89 | private val logger: Logger = 90 | LoggerFactory.getLogger(classOf[CatalogRegistrator]) 91 | 92 | def run(): Unit = { 93 | if (doesTableExists()) { 94 | operationIfExists match { 95 | case "skip" => 96 | logger.info( 97 | s"Skip to register the table: ${database}.${table}" 98 | ) 99 | return 100 | 101 | case "delete" => 102 | logger.info(s"Delete the table: ${database}.${table}") 103 | deleteTable() 104 | 105 | case unknown => 106 | throw new ConfigException(s"Unsupported operation: $unknown") 107 | } 108 | } 109 | registerNewParquetTable() 110 | showNewTableInfo() 111 | } 112 | 113 | def showNewTableInfo(): Unit = { 114 | val req = new GetTableRequest() 115 | catalogId.foreach(req.setCatalogId) 116 | req.setDatabaseName(database) 117 | req.setName(table) 118 | 119 | val t = aws.withGlue(_.getTable(req)).getTable 120 | logger.info(s"Created a table: ${t.toString}") 121 | } 122 | 123 | def doesTableExists(): Boolean = { 124 | val req = new GetTableRequest() 125 | catalogId.foreach(req.setCatalogId) 126 | req.setDatabaseName(database) 127 | req.setName(table) 128 | 129 | Try(aws.withGlue(_.getTable(req))).isSuccess 130 | } 131 | 132 | def deleteTable(): Unit = { 133 | val req = new DeleteTableRequest() 134 | catalogId.foreach(req.setCatalogId) 135 | req.setDatabaseName(database) 136 | req.setName(table) 137 | aws.withGlue(_.deleteTable(req)) 138 | } 139 | 140 | def registerNewParquetTable(): Unit = { 141 | logger.info(s"Create a new table: ${database}.${table}") 142 | val req = new CreateTableRequest() 143 | catalogId.foreach(req.setCatalogId) 144 | req.setDatabaseName(database) 145 | req.setTableInput( 146 | new TableInput() 147 | .withName(table) 148 | .withDescription("Created by embulk-output-s3_parquet") 149 | .withTableType("EXTERNAL_TABLE") 150 | .withParameters( 151 | Map( 152 | "EXTERNAL" -> "TRUE", 153 | "classification" -> "parquet", 154 | "parquet.compression" -> compressionCodec.name() 155 | ) 156 | ) 157 | .withStorageDescriptor( 158 | new StorageDescriptor() 159 | .withColumns(getGlueSchema: _*) 160 | .withLocation(location) 161 | .withCompressed(isCompressed) 162 | .withInputFormat( 163 | "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat" 164 | ) 165 | .withOutputFormat( 166 | "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat" 167 | ) 168 | .withSerdeInfo( 169 | new SerDeInfo() 170 | .withSerializationLibrary( 171 | "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe" 172 | ) 173 | .withParameters(Map("serialization.format" -> "1")) 174 | ) 175 | ) 176 | ) 177 | aws.withGlue(_.createTable(req)) 178 | } 179 | 180 | private def getGlueSchema: Seq[Column] = { 181 | schema.getColumns.map { c: EmbulkColumn => 182 | new Column() 183 | .withName(c.getName) 184 | .withType( 185 | columnOptions 186 | .get(c.getName) 187 | .map(_.getType) 188 | .getOrElse(defaultGlueTypes(c).name) 189 | ) 190 | } 191 | } 192 | 193 | private def isCompressed: Boolean = { 194 | !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED) 195 | } 196 | 197 | } 198 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestDateLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.schema.LogicalTypeAnnotation 4 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 5 | import 
org.embulk.config.ConfigException 6 | import org.embulk.output.s3_parquet.catalog.GlueDataType 7 | import org.embulk.spi.DataException 8 | import org.embulk.spi.time.Timestamp 9 | import org.scalatest.diagrams.Diagrams 10 | import org.scalatest.funsuite.AnyFunSuite 11 | import org.scalatest.prop.TableDrivenPropertyChecks 12 | 13 | import scala.util.chaining._ 14 | 15 | class TestDateLogicalType 16 | extends AnyFunSuite 17 | with ParquetColumnTypeTestHelper 18 | with TableDrivenPropertyChecks 19 | with Diagrams { 20 | 21 | private val conditions = Table( 22 | "column", 23 | Seq( 24 | SAMPLE_BOOLEAN_COLUMN, 25 | SAMPLE_LONG_COLUMN, 26 | SAMPLE_DOUBLE_COLUMN, 27 | SAMPLE_STRING_COLUMN, 28 | SAMPLE_TIMESTAMP_COLUMN, 29 | SAMPLE_JSON_COLUMN 30 | ): _* 31 | ) 32 | 33 | private val unsupportedEmbulkColumns = Seq( 34 | SAMPLE_BOOLEAN_COLUMN, 35 | SAMPLE_DOUBLE_COLUMN, 36 | SAMPLE_STRING_COLUMN, 37 | SAMPLE_JSON_COLUMN 38 | ) 39 | 40 | test( 41 | "#primitiveType(column) returns PrimitiveTypeName.INT32 with LogicalType" 42 | ) { 43 | forAll(conditions) { column => 44 | whenever(!unsupportedEmbulkColumns.contains(column)) { 45 | // format: off 46 | assert(PrimitiveTypeName.INT32 == DateLogicalType.primitiveType(column).getPrimitiveTypeName) 47 | assert(LogicalTypeAnnotation.dateType() == DateLogicalType.primitiveType(column).getLogicalTypeAnnotation) 48 | // format: on 49 | } 50 | } 51 | } 52 | 53 | test( 54 | s"#primitiveType(column) cannot return any PrimitiveType when embulk column type is one of (${unsupportedEmbulkColumns 55 | .map(_.getType.getName) 56 | .mkString(",")})" 57 | ) { 58 | forAll(conditions) { column => 59 | whenever(unsupportedEmbulkColumns.contains(column)) { 60 | // format: off 61 | assert(intercept[ConfigException](DateLogicalType.primitiveType(column)).getMessage.startsWith("Unsupported column type: ")) 62 | // format: on 63 | } 64 | } 65 | } 66 | 67 | test("#glueDataType(column) returns GlueDataType") { 68 | forAll(conditions) { column => 69 | whenever(!unsupportedEmbulkColumns.contains(column)) { 70 | assert(GlueDataType.DATE == DateLogicalType.glueDataType(column)) 71 | } 72 | } 73 | } 74 | 75 | test( 76 | s"#glueDataType(column) cannot return any GlueDataType when embulk column type is one of (${unsupportedEmbulkColumns 77 | .map(_.getType.getName) 78 | .mkString(",")})" 79 | ) { 80 | forAll(conditions) { column => 81 | whenever(unsupportedEmbulkColumns.contains(column)) { 82 | // format: off 83 | assert(intercept[ConfigException](DateLogicalType.glueDataType(column)).getMessage.startsWith("Unsupported column type: ")) 84 | // format: on 85 | } 86 | } 87 | } 88 | 89 | test("#consumeBoolean") { 90 | newMockRecordConsumer().tap { consumer => 91 | consumer.writingSampleField { 92 | // format: off 93 | assert(intercept[ConfigException](DateLogicalType.consumeBoolean(consumer, true)).getMessage.endsWith("is unsupported.")) 94 | // format: on 95 | } 96 | } 97 | } 98 | 99 | test("#consumeString") { 100 | newMockRecordConsumer().tap { consumer => 101 | consumer.writingSampleField { 102 | // format: off 103 | assert(intercept[ConfigException](DateLogicalType.consumeString(consumer, "")).getMessage.endsWith("is unsupported.")) 104 | // format: on 105 | } 106 | } 107 | } 108 | 109 | test("#consumeLong") { 110 | newMockRecordConsumer().tap { consumer => 111 | consumer.writingSampleField { 112 | DateLogicalType.consumeLong(consumer, 1L) 113 | } 114 | assert(consumer.data.head.head.isInstanceOf[Int]) 115 | assert(consumer.data.head.head == 1) 116 | } 117 | newMockRecordConsumer().tap { 
consumer => 118 | consumer.writingSampleField { 119 | // format: off 120 | assert(intercept[DataException](DateLogicalType.consumeLong(consumer, Long.MaxValue)).getMessage.startsWith("Failed to cast Long: ")) 121 | // format: on 122 | } 123 | } 124 | } 125 | 126 | test("#consumeDouble") { 127 | newMockRecordConsumer().tap { consumer => 128 | consumer.writingSampleField { 129 | // format: off 130 | assert(intercept[ConfigException](DateLogicalType.consumeDouble(consumer, 0.0d)).getMessage.endsWith("is unsupported.")) 131 | // format: on 132 | } 133 | 134 | } 135 | } 136 | 137 | test("#consumeTimestamp") { 138 | newMockRecordConsumer().tap { consumer => 139 | consumer.writingSampleField { 140 | DateLogicalType.consumeTimestamp( 141 | consumer, 142 | Timestamp.ofEpochSecond(24 * 60 * 60), // 1day 143 | null 144 | ) 145 | } 146 | assert(consumer.data.head.head.isInstanceOf[Int]) 147 | assert(consumer.data.head.head == 1) 148 | } 149 | newMockRecordConsumer().tap { consumer => 150 | consumer.writingSampleField { 151 | // NOTE: See. java.time.Instant#MAX_SECOND 152 | val instantMaxEpochSeconds = 31556889864403199L 153 | // format: off 154 | assert(intercept[DataException](DateLogicalType.consumeTimestamp(consumer, Timestamp.ofEpochSecond(instantMaxEpochSeconds), null)).getMessage.startsWith("Failed to cast Long: ")) 155 | // format: on 156 | } 157 | } 158 | newMockRecordConsumer().tap { consumer => 159 | consumer.writingSampleField { 160 | // NOTE: See. java.time.Instant#MIN_SECOND 161 | val instantMinEpochSeconds = -31557014167219200L 162 | // format: off 163 | assert(intercept[DataException](DateLogicalType.consumeTimestamp(consumer, Timestamp.ofEpochSecond(instantMinEpochSeconds), null)).getMessage.startsWith("Failed to cast Long: ")) 164 | // format: on 165 | } 166 | } 167 | } 168 | 169 | test("#consumeJson") { 170 | newMockRecordConsumer().tap { consumer => 171 | consumer.writingSampleField { 172 | // format: off 173 | assert(intercept[ConfigException](DateLogicalType.consumeJson(consumer, null)).getMessage.endsWith("is unsupported.")) 174 | // format: on 175 | } 176 | } 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # 4 | # Copyright 2015 the original author or authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | ## 21 | ## Gradle start up script for UN*X 22 | ## 23 | ############################################################################## 24 | 25 | # Attempt to set APP_HOME 26 | # Resolve links: $0 may be a link 27 | PRG="$0" 28 | # Need this for relative symlinks. 
29 | while [ -h "$PRG" ] ; do 30 | ls=`ls -ld "$PRG"` 31 | link=`expr "$ls" : '.*-> \(.*\)$'` 32 | if expr "$link" : '/.*' > /dev/null; then 33 | PRG="$link" 34 | else 35 | PRG=`dirname "$PRG"`"/$link" 36 | fi 37 | done 38 | SAVED="`pwd`" 39 | cd "`dirname \"$PRG\"`/" >/dev/null 40 | APP_HOME="`pwd -P`" 41 | cd "$SAVED" >/dev/null 42 | 43 | APP_NAME="Gradle" 44 | APP_BASE_NAME=`basename "$0"` 45 | 46 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 48 | 49 | # Use the maximum available, or set MAX_FD != -1 to use that value. 50 | MAX_FD="maximum" 51 | 52 | warn () { 53 | echo "$*" 54 | } 55 | 56 | die () { 57 | echo 58 | echo "$*" 59 | echo 60 | exit 1 61 | } 62 | 63 | # OS specific support (must be 'true' or 'false'). 64 | cygwin=false 65 | msys=false 66 | darwin=false 67 | nonstop=false 68 | case "`uname`" in 69 | CYGWIN* ) 70 | cygwin=true 71 | ;; 72 | Darwin* ) 73 | darwin=true 74 | ;; 75 | MINGW* ) 76 | msys=true 77 | ;; 78 | NONSTOP* ) 79 | nonstop=true 80 | ;; 81 | esac 82 | 83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 84 | 85 | # Determine the Java command to use to start the JVM. 86 | if [ -n "$JAVA_HOME" ] ; then 87 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 88 | # IBM's JDK on AIX uses strange locations for the executables 89 | JAVACMD="$JAVA_HOME/jre/sh/java" 90 | else 91 | JAVACMD="$JAVA_HOME/bin/java" 92 | fi 93 | if [ ! -x "$JAVACMD" ] ; then 94 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 95 | 96 | Please set the JAVA_HOME variable in your environment to match the 97 | location of your Java installation." 98 | fi 99 | else 100 | JAVACMD="java" 101 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 102 | 103 | Please set the JAVA_HOME variable in your environment to match the 104 | location of your Java installation." 105 | fi 106 | 107 | # Increase the maximum file descriptors if we can. 108 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 109 | MAX_FD_LIMIT=`ulimit -H -n` 110 | if [ $? -eq 0 ] ; then 111 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 112 | MAX_FD="$MAX_FD_LIMIT" 113 | fi 114 | ulimit -n $MAX_FD 115 | if [ $? 
-ne 0 ] ; then 116 | warn "Could not set maximum file descriptor limit: $MAX_FD" 117 | fi 118 | else 119 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 120 | fi 121 | fi 122 | 123 | # For Darwin, add options to specify how the application appears in the dock 124 | if $darwin; then 125 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 126 | fi 127 | 128 | # For Cygwin or MSYS, switch paths to Windows format before running java 129 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then 130 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 131 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 132 | JAVACMD=`cygpath --unix "$JAVACMD"` 133 | 134 | # We build the pattern for arguments to be converted via cygpath 135 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 136 | SEP="" 137 | for dir in $ROOTDIRSRAW ; do 138 | ROOTDIRS="$ROOTDIRS$SEP$dir" 139 | SEP="|" 140 | done 141 | OURCYGPATTERN="(^($ROOTDIRS))" 142 | # Add a user-defined pattern to the cygpath arguments 143 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 144 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 145 | fi 146 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 147 | i=0 148 | for arg in "$@" ; do 149 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 150 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 151 | 152 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 153 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 154 | else 155 | eval `echo args$i`="\"$arg\"" 156 | fi 157 | i=`expr $i + 1` 158 | done 159 | case $i in 160 | 0) set -- ;; 161 | 1) set -- "$args0" ;; 162 | 2) set -- "$args0" "$args1" ;; 163 | 3) set -- "$args0" "$args1" "$args2" ;; 164 | 4) set -- "$args0" "$args1" "$args2" "$args3" ;; 165 | 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 166 | 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 167 | 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 168 | 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 169 | 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 170 | esac 171 | fi 172 | 173 | # Escape application args 174 | save () { 175 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 176 | echo " " 177 | } 178 | APP_ARGS=`save "$@"` 179 | 180 | # Collect all arguments for the java command, following the shell quoting and substitution rules 181 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 182 | 183 | exec "$JAVACMD" "$@" 184 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestTimestampLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.ZoneId 4 | 5 | import org.apache.parquet.io.api.RecordConsumer 6 | import org.apache.parquet.schema.LogicalTypeAnnotation 7 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.{ 8 | MICROS, 9 | MILLIS, 10 | NANOS 11 | } 12 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 13 | import org.embulk.config.ConfigException 14 | import org.embulk.output.s3_parquet.catalog.GlueDataType 15 | import 
org.embulk.spi.time.Timestamp 16 | import org.scalatest.diagrams.Diagrams 17 | import org.scalatest.funsuite.AnyFunSuite 18 | import org.scalatest.prop.TableDrivenPropertyChecks 19 | 20 | import scala.util.chaining._ 21 | 22 | class TestTimestampLogicalType 23 | extends AnyFunSuite 24 | with ParquetColumnTypeTestHelper 25 | with TableDrivenPropertyChecks 26 | with Diagrams { 27 | 28 | private val conditions = Table( 29 | ("isAdjustedToUtc", "timeUnit", "timeZone", "column"), { 30 | for { 31 | isAdjustedToUtc <- Seq(true, false) 32 | timeUnit <- Seq(MILLIS, MICROS, NANOS) 33 | timeZone <- Seq(ZoneId.of("UTC"), ZoneId.of("Asia/Tokyo")) 34 | column <- Seq( 35 | SAMPLE_BOOLEAN_COLUMN, 36 | SAMPLE_LONG_COLUMN, 37 | SAMPLE_DOUBLE_COLUMN, 38 | SAMPLE_STRING_COLUMN, 39 | SAMPLE_TIMESTAMP_COLUMN, 40 | SAMPLE_JSON_COLUMN 41 | ) 42 | } yield (isAdjustedToUtc, timeUnit, timeZone, column) 43 | }: _* 44 | ) 45 | 46 | private val unsupportedEmbulkColumns = Seq( 47 | SAMPLE_BOOLEAN_COLUMN, 48 | SAMPLE_DOUBLE_COLUMN, 49 | SAMPLE_STRING_COLUMN, 50 | SAMPLE_JSON_COLUMN 51 | ) 52 | 53 | test( 54 | "#primitiveType(column) returns PrimitiveTypeName.INT64 with LogicalType" 55 | ) { 56 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, column) => 57 | whenever(unsupportedEmbulkColumns.contains(column)) { 58 | // format: off 59 | assert(intercept[ConfigException](TimestampLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).primitiveType(column)).getMessage.startsWith("Unsupported column type: ")) 60 | // format: on 61 | } 62 | 63 | whenever(!unsupportedEmbulkColumns.contains(column)) { 64 | // format: off 65 | assert(PrimitiveTypeName.INT64 == TimestampLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).primitiveType(column).getPrimitiveTypeName) 66 | assert(LogicalTypeAnnotation.timestampType(isAdjustedToUtc, timeUnit) == TimestampLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).primitiveType(column).getLogicalTypeAnnotation) 67 | // format: on 68 | } 69 | } 70 | } 71 | 72 | test("#glueDataType(column) returns GlueDataType") { 73 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, column) => 74 | whenever(unsupportedEmbulkColumns.contains(column)) { 75 | // format: off 76 | assert(intercept[ConfigException](TimestampLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).glueDataType(column)).getMessage.startsWith("Unsupported column type: ")) 77 | // format: on 78 | } 79 | whenever(!unsupportedEmbulkColumns.contains(column)) { 80 | val expectedGlueDataType = 81 | if (timeUnit === MILLIS) GlueDataType.TIMESTAMP 82 | else GlueDataType.BIGINT 83 | // format: off 84 | assert(expectedGlueDataType == TimestampLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).glueDataType(column)) 85 | // format: on 86 | } 87 | } 88 | } 89 | 90 | test("#consumeLong") { 91 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, _) => 92 | newMockRecordConsumer().tap { consumer => 93 | consumer.writingSampleField { 94 | TimestampLogicalType( 95 | isAdjustedToUtc = isAdjustedToUtc, 96 | timeUnit = timeUnit, 97 | timeZone = timeZone 98 | ).consumeLong(consumer, 5) 99 | } 100 | assert(consumer.data.head.head.isInstanceOf[Long]) 101 | assert(consumer.data.head.head == 5L) 102 | } 103 | newMockRecordConsumer().tap { consumer => 104 | consumer.writingSampleField { 105 | TimestampLogicalType( 106 | isAdjustedToUtc = isAdjustedToUtc,
107 | timeUnit = timeUnit, 108 | timeZone = timeZone 109 | ).consumeLong(consumer, Long.MaxValue) 110 | } 111 | assert(consumer.data.head.head.isInstanceOf[Long]) 112 | assert(consumer.data.head.head == Long.MaxValue) 113 | } 114 | } 115 | } 116 | 117 | test("#consumeTimestamp") { 118 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, _) => 119 | timeUnit match { 120 | case MILLIS => 121 | val v = Timestamp.ofEpochMilli(Int.MaxValue) 122 | newMockRecordConsumer().tap { consumer => 123 | consumer.writingSampleField { 124 | TimestampLogicalType( 125 | isAdjustedToUtc = isAdjustedToUtc, 126 | timeUnit = timeUnit, 127 | timeZone = timeZone 128 | ).consumeTimestamp(consumer, v, null) 129 | } 130 | assert(consumer.data.head.head.isInstanceOf[Long]) 131 | assert(consumer.data.head.head == Int.MaxValue) 132 | } 133 | case MICROS => 134 | val v = Timestamp.ofEpochMilli(Int.MaxValue) 135 | newMockRecordConsumer().tap { consumer => 136 | consumer.writingSampleField { 137 | TimestampLogicalType( 138 | isAdjustedToUtc = isAdjustedToUtc, 139 | timeUnit = timeUnit, 140 | timeZone = timeZone 141 | ).consumeTimestamp(consumer, v, null) 142 | } 143 | assert(consumer.data.head.head.isInstanceOf[Long]) 144 | 145 | assert(consumer.data.head.head == Int.MaxValue * 1_000L) 146 | } 147 | case NANOS => 148 | val v = Timestamp.ofEpochMilli(Int.MaxValue) 149 | newMockRecordConsumer().tap { consumer => 150 | consumer.writingSampleField { 151 | TimestampLogicalType( 152 | isAdjustedToUtc = isAdjustedToUtc, 153 | timeUnit = timeUnit, 154 | timeZone = timeZone 155 | ).consumeTimestamp(consumer, v, null) 156 | } 157 | assert(consumer.data.head.head.isInstanceOf[Long]) 158 | assert(consumer.data.head.head == Int.MaxValue * 1_000_000L) 159 | } 160 | } 161 | 162 | } 163 | } 164 | 165 | test("#consume{Boolean,Double,String,Json} are unsupported.") { 166 | def assertUnsupportedConsume(f: RecordConsumer => Unit) = 167 | newMockRecordConsumer().tap { consumer => 168 | consumer.writingSampleField { 169 | // format: off 170 | assert(intercept[ConfigException](f(consumer)).getMessage.endsWith("is unsupported.")) 171 | // format: on 172 | } 173 | } 174 | 175 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, _) => 176 | val t = 177 | TimestampLogicalType( 178 | isAdjustedToUtc = isAdjustedToUtc, 179 | timeUnit = timeUnit, 180 | timeZone = timeZone 181 | ) 182 | assertUnsupportedConsume(t.consumeBoolean(_, true)) 183 | assertUnsupportedConsume(t.consumeDouble(_, 0.0d)) 184 | assertUnsupportedConsume(t.consumeString(_, null)) 185 | assertUnsupportedConsume(t.consumeJson(_, null)) 186 | } 187 | } 188 | 189 | } 190 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/IntLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.io.api.RecordConsumer 4 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 5 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 6 | import org.embulk.config.ConfigException 7 | import org.embulk.output.s3_parquet.catalog.GlueDataType 8 | import org.embulk.output.s3_parquet.catalog.GlueDataType.AbstractIntGlueDataType 9 | import org.embulk.spi.{Column, DataException} 10 | import org.embulk.spi.`type`.{ 11 | BooleanType, 12 | DoubleType, 13 | JsonType, 14 | LongType, 15 | StringType, 16 | TimestampType 17 | } 18 | import 
org.embulk.spi.time.{Timestamp, TimestampFormatter} 19 | import org.msgpack.value.Value 20 | import org.slf4j.{Logger, LoggerFactory} 21 | 22 | import scala.math.BigDecimal.RoundingMode 23 | 24 | case class IntLogicalType(bitWidth: Int, isSigned: Boolean) 25 | extends ParquetColumnType { 26 | require( 27 | Seq(8, 16, 32, 64).contains(bitWidth), 28 | s"bitWidth value must be one of (8, 16, 32, 64)." 29 | ) 30 | 31 | private val logger: Logger = LoggerFactory.getLogger(classOf[IntLogicalType]) 32 | 33 | private val SIGNED_64BIT_INT_MAX_VALUE = BigInt("9223372036854775807") 34 | private val SIGNED_64BIT_INT_MIN_VALUE = BigInt("-9223372036854775808") 35 | private val SIGNED_32BIT_INT_MAX_VALUE = BigInt("2147483647") 36 | private val SIGNED_32BIT_INT_MIN_VALUE = BigInt("-2147483648") 37 | private val SIGNED_16BIT_INT_MAX_VALUE = BigInt("32767") 38 | private val SIGNED_16BIT_INT_MIN_VALUE = BigInt("-32768") 39 | private val SIGNED_8BIT_INT_MAX_VALUE = BigInt("127") 40 | private val SIGNED_8BIT_INT_MIN_VALUE = BigInt("-128") 41 | private val UNSIGNED_64BIT_INT_MAX_VALUE = BigInt("18446744073709551615") 42 | private val UNSIGNED_64BIT_INT_MIN_VALUE = BigInt("0") 43 | private val UNSIGNED_32BIT_INT_MAX_VALUE = BigInt("4294967295") 44 | private val UNSIGNED_32BIT_INT_MIN_VALUE = BigInt("0") 45 | private val UNSIGNED_16BIT_INT_MAX_VALUE = BigInt("65535") 46 | private val UNSIGNED_16BIT_INT_MIN_VALUE = BigInt("0") 47 | private val UNSIGNED_8BIT_INT_MAX_VALUE = BigInt("255") 48 | private val UNSIGNED_8BIT_INT_MIN_VALUE = BigInt("0") 49 | 50 | private def isINT32: Boolean = bitWidth < 64 51 | 52 | override def primitiveType(column: Column): PrimitiveType = 53 | column.getType match { 54 | case _: BooleanType | _: LongType | _: DoubleType | _: StringType => 55 | Types 56 | .optional( 57 | if (isINT32) PrimitiveTypeName.INT32 58 | else PrimitiveTypeName.INT64 59 | ) 60 | .as(LogicalTypeAnnotation.intType(bitWidth, isSigned)) 61 | .named(column.getName) 62 | case _: TimestampType | _: JsonType | _ => 63 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 64 | } 65 | 66 | override def glueDataType(column: Column): GlueDataType = 67 | column.getType match { 68 | case _: BooleanType | _: LongType | _: DoubleType | _: StringType => 69 | (bitWidth, isSigned) match { 70 | case (8, true) => GlueDataType.TINYINT 71 | case (16, true) => GlueDataType.SMALLINT 72 | case (32, true) => GlueDataType.INT 73 | case (64, true) => GlueDataType.BIGINT 74 | case (8, false) => 75 | warningWhenConvertingUnsignedIntegerToGlueType( 76 | GlueDataType.SMALLINT 77 | ) 78 | GlueDataType.SMALLINT 79 | case (16, false) => 80 | warningWhenConvertingUnsignedIntegerToGlueType(GlueDataType.INT) 81 | GlueDataType.INT 82 | case (32, false) => 83 | warningWhenConvertingUnsignedIntegerToGlueType(GlueDataType.BIGINT) 84 | GlueDataType.BIGINT 85 | case (64, false) => 86 | warningWhenConvertingUnsignedIntegerToGlueType(GlueDataType.BIGINT) 87 | GlueDataType.BIGINT 88 | case (_, _) => 89 | throw new ConfigException( 90 | s"Unsupported column type: ${column.getName} (bitWidth: $bitWidth, isSigned: $isSigned)" 91 | ) 92 | } 93 | case _: TimestampType | _: JsonType | _ => 94 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 95 | } 96 | 97 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 98 | if (isINT32) 99 | consumer.addInteger( 100 | if (v) 1 101 | else 0 102 | ) 103 | else 104 | consumer.addLong( 105 | if (v) 1 106 | else 0 107 | ) 108 | 109 | override def 
consumeString(consumer: RecordConsumer, v: String): Unit = 110 | try consumeBigDecimal(consumer, BigDecimal.exact(v)) 111 | catch { 112 | case ex: NumberFormatException => 113 | throw new DataException(s"Failed to cast String: $v to BigDecimal.", ex) 114 | } 115 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 116 | consumeBigInt(consumer, BigInt(v)) 117 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 118 | consumeBigDecimal(consumer, BigDecimal.exact(v)) 119 | override def consumeTimestamp( 120 | consumer: RecordConsumer, 121 | v: Timestamp, 122 | formatter: TimestampFormatter 123 | ): Unit = throw newUnsupportedMethodException("consumeTimestamp") 124 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 125 | throw newUnsupportedMethodException("consumeJson") 126 | 127 | private def warningWhenConvertingUnsignedIntegerToGlueType( 128 | glueType: AbstractIntGlueDataType 129 | ): Unit = { 130 | logger.warn { 131 | s"int(bit_width = $bitWidth, is_signed $isSigned) is converted to Glue ${glueType.name}" + 132 | s" but this is not represented correctly, because the Glue ${glueType.name} represents" + 133 | s" a ${glueType.bitWidth}-bit signed integer. Please use `catalog.column_options` to define the type." 134 | } 135 | } 136 | 137 | private def consumeBigDecimal(consumer: RecordConsumer, v: BigDecimal): Unit = 138 | // TODO: Make RoundingMode configurable? 139 | consumeBigInt(consumer, v.setScale(0, RoundingMode.HALF_UP).toBigInt) 140 | 141 | private def consumeBigInt(consumer: RecordConsumer, v: BigInt): Unit = { 142 | def consume(min: BigInt, max: BigInt): Unit = 143 | if (min <= v && v <= max) 144 | if (isINT32) consumer.addInteger(v.toInt) 145 | else consumer.addLong(v.toLong) 146 | else 147 | throw new DataException( 148 | s"The value is out of the range: that is '$min <= value <= $max'" + 149 | s" in the case of int(bit_width = $bitWidth, is_signed $isSigned)" + 150 | s", but the value is $v." 151 | ) 152 | (bitWidth, isSigned) match { 153 | case (8, true) => 154 | consume(SIGNED_8BIT_INT_MIN_VALUE, SIGNED_8BIT_INT_MAX_VALUE) 155 | case (16, true) => 156 | consume(SIGNED_16BIT_INT_MIN_VALUE, SIGNED_16BIT_INT_MAX_VALUE) 157 | case (32, true) => 158 | consume(SIGNED_32BIT_INT_MIN_VALUE, SIGNED_32BIT_INT_MAX_VALUE) 159 | case (64, true) => 160 | consume(SIGNED_64BIT_INT_MIN_VALUE, SIGNED_64BIT_INT_MAX_VALUE) 161 | case (8, false) => 162 | consume(UNSIGNED_8BIT_INT_MIN_VALUE, UNSIGNED_8BIT_INT_MAX_VALUE) 163 | case (16, false) => 164 | consume(UNSIGNED_16BIT_INT_MIN_VALUE, UNSIGNED_16BIT_INT_MAX_VALUE) 165 | case (32, false) => 166 | consume(UNSIGNED_32BIT_INT_MIN_VALUE, UNSIGNED_32BIT_INT_MAX_VALUE) 167 | case (64, false) => 168 | consume(UNSIGNED_64BIT_INT_MIN_VALUE, UNSIGNED_64BIT_INT_MAX_VALUE) 169 | case _ => 170 | throw new ConfigException( 171 | s"int(bit_width = $bitWidth, is_signed $isSigned) is unsupported." 
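// NOTE: This fallback should be unreachable in practice: the constructor's require()
// already restricts bitWidth to 8/16/32/64 and isSigned is a Boolean, so every
// combination is matched by the cases above.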
172 | ) 173 | } 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.lang.{StringBuilder => JStringBuilder} 4 | import java.util.{Map => JMap} 5 | 6 | import org.apache.hadoop.conf.Configuration 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.parquet.hadoop.api.WriteSupport 9 | import org.apache.parquet.hadoop.api.WriteSupport.WriteContext 10 | import org.apache.parquet.hadoop.ParquetWriter 11 | import org.apache.parquet.io.api.RecordConsumer 12 | import org.apache.parquet.schema.MessageType 13 | import org.embulk.config.{ 14 | Config, 15 | ConfigDefault, 16 | ConfigException, 17 | ConfigSource, 18 | Task => EmbulkTask 19 | } 20 | import org.embulk.output.s3_parquet.implicits 21 | import org.embulk.output.s3_parquet.parquet.ParquetFileWriteSupport.WriterBuilder 22 | import org.embulk.spi.{Column, ColumnVisitor, PageReader, Schema} 23 | import org.embulk.spi.`type`.{TimestampType, Type, Types} 24 | import org.embulk.spi.time.TimestampFormatter 25 | import org.embulk.spi.util.Timestamps 26 | import org.slf4j.Logger 27 | 28 | object ParquetFileWriteSupport { 29 | 30 | import implicits._ 31 | 32 | trait Task extends TimestampFormatter.Task with EmbulkTask { 33 | @Config("column_options") 34 | @ConfigDefault("{}") 35 | def getRawColumnOptions: JMap[String, ConfigSource] 36 | 37 | def getColumnOptions: JMap[String, ParquetColumnType.Task] 38 | def setColumnOptions( 39 | columnOptions: JMap[String, ParquetColumnType.Task] 40 | ): Unit 41 | 42 | @Config("type_options") 43 | @ConfigDefault("{}") 44 | def getRawTypeOptions: JMap[String, ConfigSource] 45 | 46 | def getTypeOptions: JMap[String, ParquetColumnType.Task] 47 | def setTypeOptions(typeOptions: JMap[String, ParquetColumnType.Task]): Unit 48 | } 49 | 50 | case class WriterBuilder(path: Path, writeSupport: ParquetFileWriteSupport) 51 | extends ParquetWriter.Builder[PageReader, WriterBuilder](path) { 52 | override def self(): WriterBuilder = this 53 | override def getWriteSupport( 54 | conf: Configuration 55 | ): WriteSupport[PageReader] = writeSupport 56 | } 57 | 58 | def configure(task: Task): Unit = { 59 | task.setColumnOptions(task.getRawColumnOptions.map { 60 | case (columnName, config) => 61 | columnName -> ParquetColumnType.loadConfig(config) 62 | }) 63 | task.setTypeOptions(task.getRawTypeOptions.map { 64 | case (columnType, config) => 65 | columnType -> ParquetColumnType.loadConfig(config) 66 | }) 67 | } 68 | 69 | private def validateTask(task: Task, schema: Schema): Unit = { 70 | if (task.getColumnOptions == null || task.getTypeOptions == null) 71 | assert(false) 72 | 73 | task.getTypeOptions.keys.foreach( 74 | embulkType 75 | ) // throw ConfigException if unknown type name is found. 76 | 77 | task.getColumnOptions.foreach { 78 | case (c: String, t: ParquetColumnType.Task) => 79 | val column: Column = schema.lookupColumn(c) // throw ConfigException if columnName does not exist. 80 | 81 | if (t.getFormat.isDefined || t.getTimeZoneId.isDefined) { 82 | if (!column.getType.isInstanceOf[TimestampType]) { 83 | // NOTE: Warning is better instead of throwing. 
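// Rejects "format"/"timezone" settings on non-timestamp columns, since those options only affect timestamp formatting.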
84 | throw new ConfigException( 85 | s"The type of column{name:${column.getName},type:${column.getType.getName}} is not 'timestamp'," + 86 | " but timestamp options (\"format\" or \"timezone\") are set." 87 | ) 88 | } 89 | } 90 | } 91 | } 92 | 93 | private def embulkType(typeName: String): Type = { 94 | Seq( 95 | Types.BOOLEAN, 96 | Types.STRING, 97 | Types.LONG, 98 | Types.DOUBLE, 99 | Types.TIMESTAMP, 100 | Types.JSON 101 | ).foreach { embulkType => 102 | if (embulkType.getName.equals(typeName)) return embulkType 103 | } 104 | throw new ConfigException(s"Unknown embulk type: $typeName.") 105 | } 106 | 107 | def apply(task: Task, schema: Schema): ParquetFileWriteSupport = { 108 | validateTask(task, schema) 109 | 110 | val parquetSchema: Map[Column, ParquetColumnType] = schema.getColumns.map { 111 | c: Column => 112 | c -> task.getColumnOptions.toMap 113 | .get(c.getName) 114 | .orElse(task.getTypeOptions.toMap.get(c.getType.getName)) 115 | .flatMap(ParquetColumnType.fromTask) 116 | .getOrElse(DefaultColumnType) 117 | }.toMap 118 | val timestampFormatters: Seq[TimestampFormatter] = Timestamps 119 | .newTimestampColumnFormatters(task, schema, task.getColumnOptions) 120 | new ParquetFileWriteSupport(schema, parquetSchema, timestampFormatters) 121 | } 122 | } 123 | 124 | case class ParquetFileWriteSupport private ( 125 | schema: Schema, 126 | parquetSchema: Map[Column, ParquetColumnType], 127 | timestampFormatters: Seq[TimestampFormatter] 128 | ) extends WriteSupport[PageReader] { 129 | 130 | import implicits._ 131 | 132 | private val messageType: MessageType = 133 | new MessageType("embulk", schema.getColumns.map { c => 134 | parquetSchema(c).primitiveType(c) 135 | }) 136 | 137 | private var current: RecordConsumer = _ 138 | 139 | def showOutputSchema(logger: Logger): Unit = { 140 | val sb = new JStringBuilder() 141 | sb.append("=== Output Parquet Schema ===\n") 142 | messageType.writeToStringBuilder(sb, null) // NOTE: indent is not used. 143 | sb.append("=============================\n") 144 | sb.toString.split("\n").foreach(logger.info) 145 | } 146 | 147 | override def init(configuration: Configuration): WriteContext = { 148 | val metadata: Map[String, String] = Map.empty // NOTE: When is this used? 
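// Presumably this map would end up as extra key/value metadata in the Parquet file footer; nothing extra is recorded here.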
149 | new WriteContext(messageType, metadata) 150 | } 151 | 152 | override def prepareForWrite(recordConsumer: RecordConsumer): Unit = 153 | current = recordConsumer 154 | 155 | override def write(record: PageReader): Unit = { 156 | writingRecord { 157 | schema.visitColumns(new ColumnVisitor { 158 | override def booleanColumn(column: Column): Unit = nullOr(column) { 159 | parquetSchema(column) 160 | .consumeBoolean(current, record.getBoolean(column)) 161 | } 162 | override def longColumn(column: Column): Unit = nullOr(column) { 163 | parquetSchema(column).consumeLong(current, record.getLong(column)) 164 | } 165 | override def doubleColumn(column: Column): Unit = nullOr(column) { 166 | parquetSchema(column).consumeDouble(current, record.getDouble(column)) 167 | } 168 | override def stringColumn(column: Column): Unit = nullOr(column) { 169 | parquetSchema(column).consumeString(current, record.getString(column)) 170 | } 171 | override def timestampColumn(column: Column): Unit = nullOr(column) { 172 | parquetSchema(column).consumeTimestamp( 173 | current, 174 | record.getTimestamp(column), 175 | timestampFormatters(column.getIndex) 176 | ) 177 | } 178 | override def jsonColumn(column: Column): Unit = nullOr(column) { 179 | parquetSchema(column).consumeJson(current, record.getJson(column)) 180 | } 181 | private def nullOr(column: Column)(f: => Unit): Unit = 182 | if (!record.isNull(column)) writingColumn(column)(f) 183 | }) 184 | } 185 | } 186 | 187 | private def writingRecord(f: => Unit): Unit = { 188 | current.startMessage() 189 | f 190 | current.endMessage() 191 | } 192 | 193 | private def writingColumn(column: Column)(f: => Unit): Unit = { 194 | current.startField(column.getName, column.getIndex) 195 | f 196 | current.endField(column.getName, column.getIndex) 197 | } 198 | 199 | def newWriterBuilder(pathString: String): WriterBuilder = 200 | WriterBuilder(new Path(pathString), this) 201 | } 202 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestDecimalLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.io.api.{Binary, RecordConsumer} 4 | import org.apache.parquet.schema.LogicalTypeAnnotation 5 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 6 | import org.embulk.config.ConfigException 7 | import org.embulk.output.s3_parquet.catalog.GlueDataType 8 | import org.embulk.spi.`type`.{DoubleType, LongType, StringType} 9 | import org.embulk.spi.DataException 10 | import org.scalatest.diagrams.Diagrams 11 | import org.scalatest.funsuite.AnyFunSuite 12 | import org.scalatest.prop.TableDrivenPropertyChecks 13 | 14 | import scala.util.chaining._ 15 | 16 | class TestDecimalLogicalType 17 | extends AnyFunSuite 18 | with ParquetColumnTypeTestHelper 19 | with TableDrivenPropertyChecks 20 | with Diagrams { 21 | 22 | private val conditions = Table( 23 | ("precision", "scale", "column"), { 24 | for { 25 | precision <- Seq(1, 9, 10, 18, 19) 26 | scale <- Seq(0, 1, 20) 27 | column <- Seq( 28 | SAMPLE_BOOLEAN_COLUMN, 29 | SAMPLE_LONG_COLUMN, 30 | SAMPLE_DOUBLE_COLUMN, 31 | SAMPLE_STRING_COLUMN, 32 | SAMPLE_TIMESTAMP_COLUMN, 33 | SAMPLE_JSON_COLUMN 34 | ) 35 | } yield (precision, scale, column) 36 | }: _* 37 | ) 38 | 39 | private val unsupportedEmbulkColumns = Seq( 40 | SAMPLE_BOOLEAN_COLUMN, 41 | SAMPLE_TIMESTAMP_COLUMN, 42 | SAMPLE_JSON_COLUMN 43 | ) 44 | 45 | def 
isValidScaleAndPrecision(scale: Int, precision: Int): Boolean = 46 | scale >= 0 && scale < precision && precision > 0 47 | 48 | test("throws IllegalArgumentException") { 49 | // format: off 50 | assert(intercept[IllegalArgumentException](DecimalLogicalType(-1, 5)).getMessage.startsWith("requirement failed: Scale must be zero or a positive integer.")) 51 | assert(intercept[IllegalArgumentException](DecimalLogicalType(10, 5)).getMessage.startsWith("requirement failed: Scale must be a positive integer less than the precision.")) 52 | // format: on 53 | } 54 | 55 | test( 56 | "#primitiveType(column) returns PrimitiveTypeName.{INT32, INT64, BINARY} with LogicalType" 57 | ) { 58 | forAll(conditions) { (precision, scale, column) => 59 | whenever(isValidScaleAndPrecision(scale, precision)) { 60 | // format: off 61 | column.getType match { 62 | case _: LongType if 1 <= precision && precision <= 9 => 63 | assert(PrimitiveTypeName.INT32 == DecimalLogicalType(scale, precision).primitiveType(column).getPrimitiveTypeName) 64 | assert(LogicalTypeAnnotation.decimalType(scale, precision) == DecimalLogicalType(scale, precision).primitiveType(column).getLogicalTypeAnnotation) 65 | case _: LongType if 10 <= precision && precision <= 18 => 66 | assert(PrimitiveTypeName.INT64 == DecimalLogicalType(scale, precision).primitiveType(column).getPrimitiveTypeName) 67 | assert(LogicalTypeAnnotation.decimalType(scale, precision) == DecimalLogicalType(scale, precision).primitiveType(column).getLogicalTypeAnnotation) 68 | case _: StringType | _: DoubleType => 69 | assert(PrimitiveTypeName.BINARY == DecimalLogicalType(scale, precision).primitiveType(column).getPrimitiveTypeName) 70 | assert(LogicalTypeAnnotation.decimalType(scale, precision) == DecimalLogicalType(scale, precision).primitiveType(column).getLogicalTypeAnnotation) 71 | case _ => 72 | assert(intercept[ConfigException](DecimalLogicalType(scale, precision).primitiveType(column)).getMessage.startsWith("Unsupported column type: ")) 73 | } 74 | // format: on 75 | } 76 | } 77 | } 78 | 79 | test("#glueDataType(column) returns GlueDataType") { 80 | forAll(conditions) { (precision, scale, column) => 81 | whenever(isValidScaleAndPrecision(scale, precision)) { 82 | // format: off 83 | column.getType match { 84 | case _: LongType | _: StringType | _: DoubleType => 85 | assert(GlueDataType.DECIMAL(precision, scale) == DecimalLogicalType(scale, precision).glueDataType(column)) 86 | case _ => 87 | assert(intercept[ConfigException](DecimalLogicalType(scale, precision).glueDataType(column)).getMessage.startsWith("Unsupported column type: ")) 88 | } 89 | // format: on 90 | } 91 | } 92 | } 93 | 94 | test("#consumeString") { 95 | forAll(conditions) { (precision, scale, _) => 96 | whenever(isValidScaleAndPrecision(scale, precision)) { 97 | newMockRecordConsumer().tap { consumer => 98 | consumer.writingSampleField { 99 | // format: off 100 | assert(intercept[DataException](DecimalLogicalType(scale, precision).consumeString(consumer, "string")).getMessage.startsWith("Failed to cast String: ")) 101 | // format: on 102 | } 103 | } 104 | newMockRecordConsumer().tap { consumer => 105 | consumer.writingSampleField { 106 | DecimalLogicalType(scale, precision).consumeString(consumer, "5.5") 107 | } 108 | assert(consumer.data.head.head.isInstanceOf[Binary]) 109 | if (scale == 0) 110 | assert(consumer.data.head.head == Binary.fromString("6")) 111 | else assert(consumer.data.head.head == Binary.fromString("5.5")) 112 | } 113 | } 114 | } 115 | } 116 | 117 | test("#consumeLong") { 118 | 
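// Long input is expected to be written as INT32 for precision 1..9 and INT64 for precision 10..18;
// larger precisions should be rejected with a ConfigException (see the assertions below).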
forAll(conditions) { (precision, scale, _) => 119 | whenever(isValidScaleAndPrecision(scale, precision) && precision <= 18) { 120 | newMockRecordConsumer().tap { consumer => 121 | consumer.writingSampleField { 122 | DecimalLogicalType(scale, precision) 123 | .consumeLong(consumer, 1L) 124 | } 125 | if (1 <= precision && precision <= 9) { 126 | assert(consumer.data.head.head.isInstanceOf[Int]) 127 | assert(consumer.data.head.head == 1) 128 | } 129 | else { 130 | assert(consumer.data.head.head.isInstanceOf[Long]) 131 | assert(consumer.data.head.head == 1) 132 | } 133 | } 134 | } 135 | whenever(isValidScaleAndPrecision(scale, precision) && precision > 18) { 136 | newMockRecordConsumer().tap { consumer => 137 | consumer.writingSampleField { 138 | // format: off 139 | assert(intercept[ConfigException](DecimalLogicalType(scale, precision).consumeLong(consumer, 1L)).getMessage.startsWith("precision must be 1 <= precision <= 18 when consuming long values but precision is ")) 140 | // format: on 141 | } 142 | } 143 | } 144 | } 145 | } 146 | 147 | test("#consumeDouble") { 148 | forAll(conditions) { (precision, scale, _) => 149 | whenever(isValidScaleAndPrecision(scale, precision)) { 150 | newMockRecordConsumer().tap { consumer => 151 | consumer.writingSampleField { 152 | DecimalLogicalType(scale, precision) 153 | .consumeDouble(consumer, 1.1d) 154 | } 155 | assert(consumer.data.head.head.isInstanceOf[Binary]) 156 | if (scale == 0) 157 | assert(consumer.data.head.head == Binary.fromString("1")) 158 | else assert(consumer.data.head.head == Binary.fromString("1.1")) 159 | } 160 | } 161 | } 162 | } 163 | 164 | test("#consume{Boolean,Timestamp,Json} are unsupported.") { 165 | def assertUnsupportedConsume(f: RecordConsumer => Unit) = 166 | newMockRecordConsumer().tap { consumer => 167 | consumer.writingSampleField { 168 | // format: off 169 | assert(intercept[ConfigException](f(consumer)).getMessage.endsWith("is unsupported.")) 170 | // format: on 171 | } 172 | } 173 | assertUnsupportedConsume(DecimalLogicalType(5, 10).consumeBoolean(_, true)) 174 | assertUnsupportedConsume( 175 | DecimalLogicalType(5, 10).consumeTimestamp(_, null, null) 176 | ) 177 | assertUnsupportedConsume(DecimalLogicalType(5, 10).consumeJson(_, null)) 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import org.apache.parquet.schema.LogicalTypeAnnotation 4 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 5 | import org.embulk.spi.Schema 6 | import org.embulk.spi.`type`.Types 7 | import org.embulk.spi.time.{Timestamp, TimestampFormatter, TimestampParser} 8 | import org.msgpack.value.Value 9 | 10 | import scala.util.chaining._ 11 | 12 | class TestS3ParquetOutputPlugin extends EmbulkPluginTestHelper { 13 | 14 | test("minimal default case") { 15 | val schema: Schema = Schema 16 | .builder() 17 | .add("c0", Types.BOOLEAN) 18 | .add("c1", Types.LONG) 19 | .add("c2", Types.DOUBLE) 20 | .add("c3", Types.STRING) 21 | .add("c4", Types.TIMESTAMP) 22 | .add("c5", Types.JSON) 23 | .build() 24 | // scalafmt: { maxColumn = 200 } 25 | val parser = TimestampParser.of("%Y-%m-%d %H:%M:%S.%N %z", "UTC") 26 | val data: Seq[Seq[Any]] = Seq( 27 | Seq(true, 0L, 0.0d, "c212c89f91", parser.parse("2017-10-22 19:53:31.000000 +0900"), json("""{"a":0,"b":"00"}""")), 28 | Seq(false, 1L, 
-0.5d, "aaaaa", parser.parse("2017-10-22 19:53:31.000000 +0900"), json("""{"a":1,"b":"11"}""")), 29 | Seq(false, 2L, 1.5d, "90823c6a1f", parser.parse("2017-10-23 23:42:43.000000 +0900"), json("""{"a":2,"b":"22"}""")), 30 | Seq(true, 3L, 0.44d, "", parser.parse("2017-10-22 06:12:13.000000 +0900"), json("""{"a":3,"b":"33","c":3.3}""")), 31 | Seq(false, 9999L, 10000.33333d, "e56a40571c", parser.parse("2017-10-23 04:59:16.000000 +0900"), json("""{"a":4,"b":"44","c":4.4,"d":true}""")) 32 | ) 33 | // scalafmt: { maxColumn = 80 } 34 | 35 | val result: Seq[Seq[AnyRef]] = 36 | runOutput( 37 | newDefaultConfig, 38 | schema, 39 | data, 40 | messageTypeTest = { messageType => 41 | // format: off 42 | assert(PrimitiveTypeName.BOOLEAN == messageType.getColumns.get(0).getPrimitiveType.getPrimitiveTypeName) 43 | assert(PrimitiveTypeName.INT64 == messageType.getColumns.get(1).getPrimitiveType.getPrimitiveTypeName) 44 | assert(PrimitiveTypeName.DOUBLE == messageType.getColumns.get(2).getPrimitiveType.getPrimitiveTypeName) 45 | assert(PrimitiveTypeName.BINARY == messageType.getColumns.get(3).getPrimitiveType.getPrimitiveTypeName) 46 | assert(PrimitiveTypeName.BINARY == messageType.getColumns.get(4).getPrimitiveType.getPrimitiveTypeName) 47 | assert(PrimitiveTypeName.BINARY == messageType.getColumns.get(5).getPrimitiveType.getPrimitiveTypeName) 48 | 49 | assert(null == messageType.getColumns.get(0).getPrimitiveType.getLogicalTypeAnnotation) 50 | assert(null == messageType.getColumns.get(1).getPrimitiveType.getLogicalTypeAnnotation) 51 | assert(null == messageType.getColumns.get(2).getPrimitiveType.getLogicalTypeAnnotation) 52 | 53 | assert(LogicalTypeAnnotation.stringType() == messageType.getColumns.get(3).getPrimitiveType.getLogicalTypeAnnotation) 54 | assert(LogicalTypeAnnotation.stringType() == messageType.getColumns.get(4).getPrimitiveType.getLogicalTypeAnnotation) 55 | assert(LogicalTypeAnnotation.stringType() == messageType.getColumns.get(5).getPrimitiveType.getLogicalTypeAnnotation) 56 | // format: on 57 | } 58 | ) 59 | 60 | assert(result.size == 5) 61 | data.indices.foreach { i => 62 | data(i).indices.foreach { j => 63 | data(i)(j) match { 64 | case timestamp: Timestamp => 65 | val formatter = 66 | TimestampFormatter.of("%Y-%m-%d %H:%M:%S.%6N %z", "Asia/Tokyo") 67 | assert( 68 | formatter.format(timestamp) == result(i)(j), 69 | s"A different timestamp value is found (Record Index: $i, Column Index: $j)" 70 | ) 71 | case value: Value => 72 | assert( 73 | value.toJson == result(i)(j), 74 | s"A different json value is found (Record Index: $i, Column Index: $j)" 75 | ) 76 | case _ => 77 | assert( 78 | data(i)(j) == result(i)(j), 79 | s"A different value is found (Record Index: $i, Column Index: $j)" 80 | ) 81 | } 82 | } 83 | } 84 | } 85 | 86 | test("timestamp-millis") { 87 | val schema = Schema.builder().add("c0", Types.TIMESTAMP).build() 88 | val data: Seq[Seq[Timestamp]] = Seq( 89 | Seq(Timestamp.ofEpochMilli(111_111_111L)), 90 | Seq(Timestamp.ofEpochMilli(222_222_222L)), 91 | Seq(Timestamp.ofEpochMilli(333_333_333L)) 92 | ) 93 | val cfg = newDefaultConfig.merge( 94 | loadConfigSourceFromYamlString(""" 95 | |type_options: 96 | | timestamp: 97 | | logical_type: "timestamp-millis" 98 | |""".stripMargin) 99 | ) 100 | 101 | val result: Seq[Seq[AnyRef]] = runOutput( 102 | cfg, 103 | schema, 104 | data, 105 | messageTypeTest = { messageType => 106 | // format: off 107 | assert(PrimitiveTypeName.INT64 == messageType.getColumns.get(0).getPrimitiveType.getPrimitiveTypeName) 108 | 
assert(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS) == messageType.getColumns.get(0).getPrimitiveType.getLogicalTypeAnnotation) 109 | // format: on 110 | } 111 | ) 112 | 113 | assert(data.size == result.size) 114 | data.indices.foreach { i => 115 | assert { 116 | data(i).head.toEpochMilli == result(i).head.asInstanceOf[Long] 117 | } 118 | } 119 | } 120 | 121 | test("timestamp-micros") { 122 | val schema = Schema.builder().add("c0", Types.TIMESTAMP).build() 123 | val data: Seq[Seq[Timestamp]] = Seq( 124 | Seq(Timestamp.ofEpochSecond(111_111_111L, 111_111_000L)), 125 | Seq(Timestamp.ofEpochSecond(222_222_222L, 222_222_222L)), 126 | Seq(Timestamp.ofEpochSecond(333_333_333L, 333_000L)) 127 | ) 128 | val cfg = newDefaultConfig.merge( 129 | loadConfigSourceFromYamlString(""" 130 | |type_options: 131 | | timestamp: 132 | | logical_type: "timestamp-micros" 133 | |""".stripMargin) 134 | ) 135 | 136 | val result: Seq[Seq[AnyRef]] = runOutput( 137 | cfg, 138 | schema, 139 | data, 140 | messageTypeTest = { messageType => 141 | // format: off 142 | assert(PrimitiveTypeName.INT64 == messageType.getColumns.get(0).getPrimitiveType.getPrimitiveTypeName) 143 | assert(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS) == messageType.getColumns.get(0).getPrimitiveType.getLogicalTypeAnnotation) 144 | // format: on 145 | } 146 | ) 147 | 148 | assert(data.size == result.size) 149 | data.indices.foreach { i => 150 | // format: off 151 | assert( 152 | data(i).head.pipe(ts => (ts.getEpochSecond * 1_000_000L) + (ts.getNano / 1_000L)) == result(i).head.asInstanceOf[Long] 153 | ) 154 | // format: on 155 | } 156 | } 157 | 158 | test("timestamp-nanos") { 159 | val schema = Schema.builder().add("c0", Types.TIMESTAMP).build() 160 | val data: Seq[Seq[Timestamp]] = Seq( 161 | Seq(Timestamp.ofEpochSecond(111_111_111L, 111_111_000L)), 162 | Seq(Timestamp.ofEpochSecond(222_222_222L, 222_222_222L)), 163 | Seq(Timestamp.ofEpochSecond(333_333_333L, 333_000L)) 164 | ) 165 | val cfg = newDefaultConfig.merge( 166 | loadConfigSourceFromYamlString(""" 167 | |type_options: 168 | | timestamp: 169 | | logical_type: "timestamp-nanos" 170 | |""".stripMargin) 171 | ) 172 | 173 | val result: Seq[Seq[AnyRef]] = runOutput( 174 | cfg, 175 | schema, 176 | data, 177 | messageTypeTest = { messageType => 178 | // format: off 179 | assert(PrimitiveTypeName.INT64 == messageType.getColumns.get(0).getPrimitiveType.getPrimitiveTypeName) 180 | assert(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.NANOS) == messageType.getColumns.get(0).getPrimitiveType.getLogicalTypeAnnotation) 181 | // format: on 182 | } 183 | ) 184 | 185 | assert(data.size == result.size) 186 | data.indices.foreach { i => 187 | // format: off 188 | assert(data(i).head.pipe(ts => (ts.getEpochSecond * 1_000_000_000L) + ts.getNano) == result(i).head.asInstanceOf[Long]) 189 | // format: on 190 | } 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/EmbulkPluginTestHelper.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import java.io.File 4 | import java.nio.file.{Files, Path} 5 | import java.util.concurrent.ExecutionException 6 | 7 | import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials} 8 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration 9 | import 
com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder} 10 | import com.amazonaws.services.s3.model.ObjectListing 11 | import com.amazonaws.services.s3.transfer.{ 12 | TransferManager, 13 | TransferManagerBuilder 14 | } 15 | import com.google.inject.{Binder, Guice, Module, Stage} 16 | import org.apache.avro.generic.GenericRecord 17 | import org.apache.hadoop.conf.Configuration 18 | import org.apache.hadoop.fs.{Path => HadoopPath} 19 | import org.apache.parquet.avro.AvroReadSupport 20 | import org.apache.parquet.hadoop.{ParquetFileReader, ParquetReader} 21 | import org.apache.parquet.hadoop.util.HadoopInputFile 22 | import org.apache.parquet.schema.MessageType 23 | import org.embulk.{TestPluginSourceModule, TestUtilityModule} 24 | import org.embulk.config.{ 25 | ConfigLoader, 26 | ConfigSource, 27 | DataSourceImpl, 28 | ModelManager, 29 | TaskSource 30 | } 31 | import org.embulk.exec.{ 32 | ExecModule, 33 | ExtensionServiceLoaderModule, 34 | SystemConfigModule 35 | } 36 | import org.embulk.jruby.JRubyScriptingModule 37 | import org.embulk.plugin.{ 38 | BuiltinPluginSourceModule, 39 | InjectedPluginSource, 40 | PluginClassLoaderModule 41 | } 42 | import org.embulk.spi.{Exec, ExecSession, OutputPlugin, PageTestUtils, Schema} 43 | import org.embulk.spi.json.JsonParser 44 | import org.msgpack.value.Value 45 | import org.scalatest.funsuite.AnyFunSuite 46 | import org.scalatest.BeforeAndAfter 47 | import org.scalatest.diagrams.Diagrams 48 | 49 | import scala.util.Using 50 | 51 | object EmbulkPluginTestHelper { 52 | 53 | case class TestRuntimeModule() extends Module { 54 | 55 | override def configure(binder: Binder): Unit = { 56 | val systemConfig = new DataSourceImpl(null) 57 | new SystemConfigModule(systemConfig).configure(binder) 58 | new ExecModule(systemConfig).configure(binder) 59 | new ExtensionServiceLoaderModule(systemConfig).configure(binder) 60 | new BuiltinPluginSourceModule().configure(binder) 61 | new JRubyScriptingModule(systemConfig).configure(binder) 62 | new PluginClassLoaderModule().configure(binder) 63 | new TestUtilityModule().configure(binder) 64 | new TestPluginSourceModule().configure(binder) 65 | InjectedPluginSource.registerPluginTo( 66 | binder, 67 | classOf[OutputPlugin], 68 | "s3_parquet", 69 | classOf[S3ParquetOutputPlugin] 70 | ) 71 | } 72 | } 73 | 74 | def getExecSession: ExecSession = { 75 | val injector = 76 | Guice.createInjector(Stage.PRODUCTION, TestRuntimeModule()) 77 | val execConfig = new DataSourceImpl( 78 | injector.getInstance(classOf[ModelManager]) 79 | ) 80 | ExecSession.builder(injector).fromExecConfig(execConfig).build() 81 | } 82 | } 83 | 84 | abstract class EmbulkPluginTestHelper 85 | extends AnyFunSuite 86 | with BeforeAndAfter 87 | with Diagrams { 88 | import implicits._ 89 | 90 | private var exec: ExecSession = _ 91 | 92 | val TEST_S3_ENDPOINT: String = "http://localhost:4566" 93 | val TEST_S3_REGION: String = "us-east-1" 94 | val TEST_S3_ACCESS_KEY_ID: String = "test" 95 | val TEST_S3_SECRET_ACCESS_KEY: String = "test" 96 | val TEST_BUCKET_NAME: String = "my-bucket" 97 | val TEST_PATH_PREFIX: String = "path/to/parquet-" 98 | 99 | before { 100 | exec = EmbulkPluginTestHelper.getExecSession 101 | 102 | withLocalStackS3Client(_.createBucket(TEST_BUCKET_NAME)) 103 | } 104 | after { 105 | exec.cleanup() 106 | exec = null 107 | 108 | withLocalStackS3Client { cli => 109 | @scala.annotation.tailrec 110 | def rmRecursive(listing: ObjectListing): Unit = { 111 | listing.getObjectSummaries.foreach(o => 112 | cli.deleteObject(TEST_BUCKET_NAME, o.getKey) 
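// every object in the current listing page is deleted; truncated listings are followed recursively below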
113 | ) 114 | if (listing.isTruncated) 115 | rmRecursive(cli.listNextBatchOfObjects(listing)) 116 | } 117 | rmRecursive(cli.listObjects(TEST_BUCKET_NAME)) 118 | } 119 | withLocalStackS3Client(_.deleteBucket(TEST_BUCKET_NAME)) 120 | } 121 | 122 | def execDoWith[A](f: => A): A = 123 | try Exec.doWith(exec, () => f) 124 | catch { 125 | case ex: ExecutionException => throw ex.getCause 126 | } 127 | 128 | def runOutput( 129 | outConfig: ConfigSource, 130 | schema: Schema, 131 | data: Seq[Seq[Any]], 132 | messageTypeTest: MessageType => Unit = { _ => } 133 | ): Seq[Seq[AnyRef]] = { 134 | execDoWith { 135 | val plugin = 136 | exec.getInjector.getInstance(classOf[S3ParquetOutputPlugin]) 137 | plugin.transaction( 138 | outConfig, 139 | schema, 140 | 1, 141 | (taskSource: TaskSource) => { 142 | Using.resource(plugin.open(taskSource, schema, 0)) { output => 143 | try { 144 | PageTestUtils 145 | .buildPage( 146 | exec.getBufferAllocator, 147 | schema, 148 | data.flatten: _* 149 | ) 150 | .foreach(output.add) 151 | output.commit() 152 | } 153 | catch { 154 | case ex: Throwable => 155 | output.abort() 156 | throw ex 157 | } 158 | } 159 | Seq.empty 160 | } 161 | ) 162 | } 163 | 164 | readS3Parquet(TEST_BUCKET_NAME, TEST_PATH_PREFIX, messageTypeTest) 165 | } 166 | 167 | private def withLocalStackS3Client[A](f: AmazonS3 => A): A = { 168 | val client: AmazonS3 = AmazonS3ClientBuilder.standard 169 | .withEndpointConfiguration( 170 | new EndpointConfiguration(TEST_S3_ENDPOINT, TEST_S3_REGION) 171 | ) 172 | .withCredentials( 173 | new AWSStaticCredentialsProvider( 174 | new BasicAWSCredentials( 175 | TEST_S3_ACCESS_KEY_ID, 176 | TEST_S3_SECRET_ACCESS_KEY 177 | ) 178 | ) 179 | ) 180 | .withPathStyleAccessEnabled(true) 181 | .build() 182 | 183 | try f(client) 184 | finally client.shutdown() 185 | } 186 | 187 | private def readS3Parquet( 188 | bucket: String, 189 | prefix: String, 190 | messageTypeTest: MessageType => Unit = { _ => } 191 | ): Seq[Seq[AnyRef]] = { 192 | val tmpDir: Path = Files.createTempDirectory("embulk-output-parquet") 193 | withLocalStackS3Client { s3 => 194 | val xfer: TransferManager = TransferManagerBuilder 195 | .standard() 196 | .withS3Client(s3) 197 | .build() 198 | try xfer 199 | .downloadDirectory(bucket, prefix, tmpDir.toFile) 200 | .waitForCompletion() 201 | finally xfer.shutdownNow() 202 | } 203 | 204 | def listFiles(file: File): Seq[File] = { 205 | file 206 | .listFiles() 207 | .flatMap(f => 208 | if (f.isFile) Seq(f) 209 | else listFiles(f) 210 | ) 211 | .toSeq 212 | } 213 | 214 | listFiles(tmpDir.toFile) 215 | .map(_.getAbsolutePath) 216 | .foldLeft(Seq[Seq[AnyRef]]()) { 217 | (result: Seq[Seq[AnyRef]], path: String) => 218 | result ++ readParquetFile(path, messageTypeTest) 219 | } 220 | } 221 | 222 | private def readParquetFile( 223 | pathString: String, 224 | messageTypeTest: MessageType => Unit = { _ => } 225 | ): Seq[Seq[AnyRef]] = { 226 | Using.resource( 227 | ParquetFileReader.open( 228 | HadoopInputFile 229 | .fromPath(new HadoopPath(pathString), new Configuration()) 230 | ) 231 | ) { reader => messageTypeTest(reader.getFileMetaData.getSchema) } 232 | 233 | val reader: ParquetReader[GenericRecord] = ParquetReader 234 | .builder( 235 | new AvroReadSupport[GenericRecord](), 236 | new HadoopPath(pathString) 237 | ) 238 | .build() 239 | 240 | Iterator 241 | .continually(reader.read()) 242 | .takeWhile(_ != null) 243 | .map(record => record.getSchema.getFields.map(f => record.get(f.name()))) 244 | .toSeq 245 | } 246 | 247 | def loadConfigSourceFromYamlString(yaml: String): 
ConfigSource = 248 | new ConfigLoader(exec.getModelManager).fromYamlString(yaml) 249 | 250 | def newDefaultConfig: ConfigSource = 251 | loadConfigSourceFromYamlString( 252 | s""" 253 | |endpoint: $TEST_S3_ENDPOINT 254 | |bucket: $TEST_BUCKET_NAME 255 | |path_prefix: $TEST_PATH_PREFIX 256 | |auth_method: basic 257 | |access_key_id: $TEST_S3_ACCESS_KEY_ID 258 | |secret_access_key: $TEST_S3_SECRET_ACCESS_KEY 259 | |path_style_access_enabled: true 260 | |default_timezone: Asia/Tokyo 261 | |""".stripMargin 262 | ) 263 | 264 | def json(str: String): Value = new JsonParser().parse(str) 265 | } 266 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestTimeLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.ZoneId 4 | 5 | import org.apache.parquet.io.api.RecordConsumer 6 | import org.apache.parquet.schema.LogicalTypeAnnotation 7 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.{ 8 | MICROS, 9 | MILLIS, 10 | NANOS 11 | } 12 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 13 | import org.embulk.config.ConfigException 14 | import org.embulk.output.s3_parquet.catalog.GlueDataType 15 | import org.embulk.spi.DataException 16 | import org.embulk.spi.time.Timestamp 17 | import org.scalatest.diagrams.Diagrams 18 | import org.scalatest.funsuite.AnyFunSuite 19 | import org.scalatest.prop.TableDrivenPropertyChecks 20 | 21 | import scala.util.chaining._ 22 | 23 | class TestTimeLogicalType 24 | extends AnyFunSuite 25 | with ParquetColumnTypeTestHelper 26 | with TableDrivenPropertyChecks 27 | with Diagrams { 28 | 29 | private val conditions = Table( 30 | ("isAdjustedToUtc", "timeUnit", "timeZone", "column"), { 31 | for { 32 | isAdjustedToUtc <- Seq(true, false) 33 | timeUnit <- Seq(MILLIS, MICROS, NANOS) 34 | timeZone <- Seq(ZoneId.of("UTC"), ZoneId.of("Asia/Tokyo")) 35 | column <- Seq( 36 | SAMPLE_BOOLEAN_COLUMN, 37 | SAMPLE_LONG_COLUMN, 38 | SAMPLE_DOUBLE_COLUMN, 39 | SAMPLE_STRING_COLUMN, 40 | SAMPLE_TIMESTAMP_COLUMN, 41 | SAMPLE_JSON_COLUMN 42 | ) 43 | } yield (isAdjustedToUtc, timeUnit, timeZone, column) 44 | }: _* 45 | ) 46 | 47 | private val unsupportedEmbulkColumns = Seq( 48 | SAMPLE_BOOLEAN_COLUMN, 49 | SAMPLE_DOUBLE_COLUMN, 50 | SAMPLE_STRING_COLUMN, 51 | SAMPLE_JSON_COLUMN 52 | ) 53 | 54 | test( 55 | "#primitiveType(column) returns PrimitiveTypeName.{INT32,INT64} with LogicalType" 56 | ) { 57 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, column) => 58 | whenever(unsupportedEmbulkColumns.contains(column)) { 59 | // format: off 60 | assert(intercept[ConfigException](TimeLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).primitiveType(column)).getMessage.startsWith("Unsupported column type: ")) 61 | // format: on 62 | } 63 | 64 | whenever(!unsupportedEmbulkColumns.contains(column)) { 65 | val expectedPrimitiveTypeName = 66 | if (timeUnit === MILLIS) PrimitiveTypeName.INT32 67 | else PrimitiveTypeName.INT64 68 | // format: off 69 | assert(expectedPrimitiveTypeName == TimeLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).primitiveType(column).getPrimitiveTypeName) 70 | assert(LogicalTypeAnnotation.timeType(isAdjustedToUtc, timeUnit) == TimeLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).primitiveType(column).getLogicalTypeAnnotation) 71 
| // format: on 72 | } 73 | } 74 | } 75 | 76 | test("#glueDataType(column) returns GlueDataType") { 77 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, column) => 78 | whenever(unsupportedEmbulkColumns.contains(column)) { 79 | // format: off 80 | assert(intercept[ConfigException](TimeLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).glueDataType(column)).getMessage.startsWith("Unsupported column type: ")) 81 | // format: on 82 | } 83 | whenever(!unsupportedEmbulkColumns.contains(column)) { 84 | val expectedGlueDataType = 85 | if (timeUnit === MILLIS) GlueDataType.INT 86 | else GlueDataType.BIGINT 87 | // format: off 88 | assert(expectedGlueDataType == TimeLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).glueDataType(column)) 89 | // format: on 90 | } 91 | } 92 | } 93 | 94 | test("#consumeLong") { 95 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, _) => 96 | timeUnit match { 97 | case MILLIS => 98 | newMockRecordConsumer().tap { consumer => 99 | consumer.writingSampleField { 100 | TimeLogicalType( 101 | isAdjustedToUtc = isAdjustedToUtc, 102 | timeUnit = timeUnit, 103 | timeZone = timeZone 104 | ).consumeLong(consumer, 5) 105 | } 106 | assert(consumer.data.head.head.isInstanceOf[Int]) 107 | assert(consumer.data.head.head == 5) 108 | } 109 | newMockRecordConsumer().tap { consumer => 110 | consumer.writingSampleField { 111 | // format: off 112 | assert(intercept[DataException](TimeLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).consumeLong(consumer, Long.MaxValue)).getMessage.startsWith("Failed to cast Long: ")) 113 | // format: on 114 | } 115 | } 116 | case MICROS | NANOS => 117 | newMockRecordConsumer().tap { consumer => 118 | consumer.writingSampleField { 119 | TimeLogicalType( 120 | isAdjustedToUtc = isAdjustedToUtc, 121 | timeUnit = timeUnit, 122 | timeZone = timeZone 123 | ).consumeLong(consumer, 5) 124 | } 125 | assert(consumer.data.head.head.isInstanceOf[Long]) 126 | assert(consumer.data.head.head == 5L) 127 | } 128 | newMockRecordConsumer().tap { consumer => 129 | consumer.writingSampleField { 130 | TimeLogicalType( 131 | isAdjustedToUtc = isAdjustedToUtc, 132 | timeUnit = timeUnit, 133 | timeZone = timeZone 134 | ).consumeLong(consumer, Long.MaxValue) 135 | } 136 | assert(consumer.data.head.head.isInstanceOf[Long]) 137 | assert(consumer.data.head.head == Long.MaxValue) 138 | } 139 | } 140 | } 141 | } 142 | 143 | test("#consumeTimestamp") { 144 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, _) => 145 | timeUnit match { 146 | case MILLIS => 147 | val v = Timestamp.ofEpochMilli(Int.MaxValue) 148 | newMockRecordConsumer().tap { consumer => 149 | consumer.writingSampleField { 150 | TimeLogicalType( 151 | isAdjustedToUtc = isAdjustedToUtc, 152 | timeUnit = timeUnit, 153 | timeZone = timeZone 154 | ).consumeTimestamp(consumer, v, null) 155 | } 156 | assert(consumer.data.head.head.isInstanceOf[Int]) 157 | if (timeZone.getId == "Asia/Tokyo" && !isAdjustedToUtc) 158 | assert(consumer.data.head.head == 19883647) 159 | else // UTC 160 | assert(consumer.data.head.head == 73883647) 161 | } 162 | case MICROS => 163 | val v = Timestamp.ofEpochMilli(Int.MaxValue) 164 | newMockRecordConsumer().tap { consumer => 165 | consumer.writingSampleField { 166 | TimeLogicalType( 167 | isAdjustedToUtc = isAdjustedToUtc, 168 | timeUnit = timeUnit, 169 | timeZone = timeZone 170 | ).consumeTimestamp(consumer, v, null) 171 | } 172 | 
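// Int.MaxValue ms is 24 days + 73,883,647 ms, so the expected UTC time-of-day is 73,883,647 ms
// (19,883,647 ms when shifted to Asia/Tokyo, UTC+9, and wrapped into one day); MICROS/NANOS scale these by 1,000 / 1,000,000.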
assert(consumer.data.head.head.isInstanceOf[Long]) 173 | if (timeZone.getId == "Asia/Tokyo" && !isAdjustedToUtc) 174 | assert(consumer.data.head.head == 19883647000L) 175 | else // UTC 176 | assert(consumer.data.head.head == 73883647000L) 177 | } 178 | case NANOS => 179 | val v = Timestamp.ofEpochMilli(Int.MaxValue) 180 | newMockRecordConsumer().tap { consumer => 181 | consumer.writingSampleField { 182 | TimeLogicalType( 183 | isAdjustedToUtc = isAdjustedToUtc, 184 | timeUnit = timeUnit, 185 | timeZone = timeZone 186 | ).consumeTimestamp(consumer, v, null) 187 | } 188 | assert(consumer.data.head.head.isInstanceOf[Long]) 189 | if (timeZone.getId == "Asia/Tokyo" && !isAdjustedToUtc) 190 | assert(consumer.data.head.head == 19883647000000L) 191 | else // UTC 192 | assert(consumer.data.head.head == 73883647000000L) 193 | } 194 | } 195 | 196 | } 197 | } 198 | 199 | test("#consume{Boolean,Double,String,Json} are unsupported.") { 200 | def assertUnsupportedConsume(f: RecordConsumer => Unit) = 201 | newMockRecordConsumer().tap { consumer => 202 | consumer.writingSampleField { 203 | // format: off 204 | assert(intercept[ConfigException](f(consumer)).getMessage.endsWith("is unsupported.")) 205 | // format: on 206 | } 207 | } 208 | 209 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, _) => 210 | val t = 211 | TimeLogicalType( 212 | isAdjustedToUtc = isAdjustedToUtc, 213 | timeUnit = timeUnit, 214 | timeZone = timeZone 215 | ) 216 | assertUnsupportedConsume(t.consumeBoolean(_, true)) 217 | assertUnsupportedConsume(t.consumeDouble(_, 0.0d)) 218 | assertUnsupportedConsume(t.consumeString(_, null)) 219 | assertUnsupportedConsume(t.consumeJson(_, null)) 220 | } 221 | } 222 | 223 | } 224 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetColumnType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.ZoneId 4 | import java.util.{Locale, Optional} 5 | 6 | import org.apache.parquet.format.ConvertedType 7 | import org.apache.parquet.io.api.RecordConsumer 8 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit 9 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.{ 10 | MICROS, 11 | MILLIS, 12 | NANOS 13 | } 14 | import org.apache.parquet.schema.PrimitiveType 15 | import org.embulk.config.{ 16 | Config, 17 | ConfigDefault, 18 | ConfigException, 19 | ConfigSource, 20 | Task => EmbulkTask 21 | } 22 | import org.embulk.output.s3_parquet.catalog.GlueDataType 23 | import org.embulk.output.s3_parquet.implicits 24 | import org.embulk.spi.{Column, DataException, Exec} 25 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 26 | import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption 27 | import org.msgpack.value.Value 28 | import org.slf4j.{Logger, LoggerFactory} 29 | 30 | import scala.util.{Failure, Success, Try} 31 | import scala.util.chaining._ 32 | 33 | object ParquetColumnType { 34 | 35 | import implicits._ 36 | 37 | private val logger: Logger = 38 | LoggerFactory.getLogger(classOf[ParquetColumnType]) 39 | 40 | trait Task extends EmbulkTask with TimestampColumnOption { 41 | @Config("logical_type") 42 | @ConfigDefault("null") 43 | def getLogicalType: Optional[LogicalTypeOption] 44 | } 45 | 46 | trait LogicalTypeOption extends EmbulkTask { 47 | @Config("name") 48 | def getName: String 49 | 50 | @Config("scale") 51 | @ConfigDefault("null") 52 | def getScale: 
Optional[Int] 53 | 54 | @Config("precision") 55 | @ConfigDefault("null") 56 | def getPrecision: Optional[Int] 57 | 58 | @Config("bit_width") 59 | @ConfigDefault("null") 60 | def getBitWidth: Optional[Int] 61 | 62 | @Config("is_signed") 63 | @ConfigDefault("null") 64 | def getIsSigned: Optional[Boolean] 65 | 66 | @Config("is_adjusted_to_utc") 67 | @ConfigDefault("null") 68 | def getIsAdjustedToUtc: Optional[Boolean] 69 | 70 | @Config("time_unit") 71 | @ConfigDefault("null") 72 | def getTimeUnit: Optional[TimeUnit] 73 | } 74 | 75 | object LogicalTypeOption { 76 | case class ConfigBuilder private () { 77 | case class Attributes private ( 78 | name: Option[String] = None, 79 | precision: Option[Int] = None, 80 | scale: Option[Int] = None, 81 | bitWidth: Option[Int] = None, 82 | isSigned: Option[Boolean] = None, 83 | isAdjustedToUtc: Option[Boolean] = None, 84 | timeUnit: Option[TimeUnit] = None 85 | ) { 86 | def toOnelineYaml: String = { 87 | val builder = Seq.newBuilder[String] 88 | name.foreach(v => builder.addOne(s"name: ${v}")) 89 | precision.foreach(v => builder.addOne(s"precision: ${v}")) 90 | scale.foreach(v => builder.addOne(s"scale: ${v}")) 91 | bitWidth.foreach(v => builder.addOne(s"bit_width: ${v}")) 92 | isSigned.foreach(v => builder.addOne(s"is_signed: ${v}")) 93 | isAdjustedToUtc.foreach(v => 94 | builder.addOne(s"is_adjusted_to_utc: ${v}") 95 | ) 96 | timeUnit.foreach(tu => builder.addOne(s"time_unit: ${tu.name()}")) 97 | "{" + builder.result().mkString(", ") + "}" 98 | } 99 | 100 | def build(): ConfigSource = { 101 | val c = Exec.newConfigSource() 102 | name.foreach(c.set("name", _)) 103 | precision.foreach(c.set("precision", _)) 104 | scale.foreach(c.set("scale", _)) 105 | bitWidth.foreach(c.set("bit_width", _)) 106 | isSigned.foreach(c.set("is_signed", _)) 107 | isAdjustedToUtc.foreach(c.set("is_adjusted_to_utc", _)) 108 | timeUnit.foreach(tu => c.set("time_unit", tu.name())) 109 | c 110 | } 111 | } 112 | var attrs: Attributes = Attributes() 113 | 114 | def name(name: String): ConfigBuilder = 115 | this.tap(_ => attrs = attrs.copy(name = Option(name))) 116 | def scale(scale: Int): ConfigBuilder = 117 | this.tap(_ => attrs = attrs.copy(scale = Option(scale))) 118 | def precision(precision: Int): ConfigBuilder = 119 | this.tap(_ => attrs = attrs.copy(precision = Option(precision))) 120 | def bitWidth(bitWidth: Int): ConfigBuilder = 121 | this.tap(_ => attrs = attrs.copy(bitWidth = Option(bitWidth))) 122 | def isSigned(isSigned: Boolean): ConfigBuilder = 123 | this.tap(_ => attrs = attrs.copy(isSigned = Option(isSigned))) 124 | def isAdjustedToUtc(isAdjustedToUtc: Boolean): ConfigBuilder = 125 | this.tap(_ => 126 | attrs = attrs.copy(isAdjustedToUtc = Option(isAdjustedToUtc)) 127 | ) 128 | def timeUnit(timeUnit: TimeUnit): ConfigBuilder = 129 | this.tap(_ => attrs = attrs.copy(timeUnit = Option(timeUnit))) 130 | 131 | def toOnelineYaml: String = attrs.toOnelineYaml 132 | 133 | def build(): ConfigSource = attrs.build() 134 | } 135 | 136 | def builder(): ConfigBuilder = ConfigBuilder() 137 | } 138 | 139 | def loadConfig(c: ConfigSource): Task = { 140 | if (c.has("logical_type")) { 141 | Try(c.get(classOf[String], "logical_type")).foreach { v => 142 | logger.warn( 143 | "[DEPRECATED] Now, it is deprecated to use the \"logical_type\" option in this usage." + 144 | " Use \"converted_type\" instead." 
145 | ) 146 | logger.warn( 147 | s"[DEPRECATED] Translate {logical_type: $v} => {converted_type: $v}" 148 | ) 149 | c.remove("logical_type") 150 | c.set("converted_type", v) 151 | } 152 | } 153 | if (c.has("converted_type")) { 154 | if (c.has("logical_type")) 155 | throw new ConfigException( 156 | "\"converted_type\" and \"logical_type\" options cannot be used at the same time." 157 | ) 158 | Try(c.get(classOf[String], "converted_type")) match { 159 | case Success(convertedType) => 160 | val logicalTypeConfig: ConfigSource = 161 | translateConvertedType2LogicalType(convertedType) 162 | c.setNested("logical_type", logicalTypeConfig) 163 | case Failure(ex) => 164 | throw new ConfigException( 165 | "The value of \"converted_type\" option must be string.", 166 | ex 167 | ) 168 | } 169 | } 170 | c.loadConfig(classOf[Task]) 171 | } 172 | 173 | private def translateConvertedType2LogicalType( 174 | convertedType: String 175 | ): ConfigSource = { 176 | val builder = LogicalTypeOption.builder() 177 | val normalizedConvertedType: String = normalizeConvertedType(convertedType) 178 | if (normalizedConvertedType == "TIMESTAMP_NANOS") { 179 | builder.name("timestamp").isAdjustedToUtc(true).timeUnit(NANOS) 180 | logger.warn( 181 | s"[DEPRECATED] $convertedType is deprecated because this is not one of" + 182 | s" ConvertedTypes actually. Please use 'logical_type: ${builder.toOnelineYaml}'" 183 | ) 184 | } 185 | else { 186 | 187 | ConvertedType.valueOf(normalizedConvertedType) match { 188 | case ConvertedType.UTF8 => builder.name("string") 189 | case ConvertedType.DATE => builder.name("date") 190 | case ConvertedType.TIME_MILLIS => 191 | builder.name("time").isAdjustedToUtc(true).timeUnit(MILLIS) 192 | case ConvertedType.TIME_MICROS => 193 | builder.name("time").isAdjustedToUtc(true).timeUnit(MICROS) 194 | case ConvertedType.TIMESTAMP_MILLIS => 195 | builder.name("timestamp").isAdjustedToUtc(true).timeUnit(MILLIS) 196 | case ConvertedType.TIMESTAMP_MICROS => 197 | builder.name("timestamp").isAdjustedToUtc(true).timeUnit(MICROS) 198 | case ConvertedType.UINT_8 => 199 | builder.name("int").bitWidth(8).isSigned(false) 200 | case ConvertedType.UINT_16 => 201 | builder.name("int").bitWidth(16).isSigned(false) 202 | case ConvertedType.UINT_32 => 203 | builder.name("int").bitWidth(32).isSigned(false) 204 | case ConvertedType.UINT_64 => 205 | builder.name("int").bitWidth(64).isSigned(false) 206 | case ConvertedType.INT_8 => 207 | builder.name("int").bitWidth(8).isSigned(true) 208 | case ConvertedType.INT_16 => 209 | builder.name("int").bitWidth(16).isSigned(true) 210 | case ConvertedType.INT_32 => 211 | builder.name("int").bitWidth(32).isSigned(true) 212 | case ConvertedType.INT_64 => 213 | builder.name("int").bitWidth(64).isSigned(true) 214 | case ConvertedType.JSON => builder.name("json") 215 | case _ => 216 | // MAP, MAP_KEY_VALUE, LIST, ENUM, DECIMAL, BSON, INTERVAL 217 | throw new ConfigException( 218 | s"converted_type: $convertedType is not supported." 
219 | ) 220 | } 221 | } 222 | logger.info( 223 | s"Translate {converted_type: $convertedType} => {logical_type: ${builder.toOnelineYaml}}" 224 | ) 225 | builder.build() 226 | } 227 | 228 | private def normalizeConvertedType(convertedType: String): String = { 229 | convertedType 230 | .toUpperCase(Locale.ENGLISH) 231 | .replaceAll("-", "_") 232 | .replaceAll("INT(\\d)", "INT_$1") 233 | } 234 | 235 | def fromTask(task: Task): Option[LogicalTypeProxy] = { 236 | task.getLogicalType.map { o => 237 | LogicalTypeProxy( 238 | name = o.getName, 239 | scale = o.getScale, 240 | precision = o.getPrecision, 241 | bitWidth = o.getBitWidth, 242 | isSigned = o.getIsSigned, 243 | isAdjustedToUtc = o.getIsAdjustedToUtc, 244 | timeUnit = o.getTimeUnit, 245 | timeZone = task.getTimeZoneId.map(ZoneId.of) 246 | ) 247 | } 248 | } 249 | } 250 | 251 | trait ParquetColumnType { 252 | def primitiveType(column: Column): PrimitiveType 253 | def glueDataType(column: Column): GlueDataType 254 | def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit 255 | def consumeString(consumer: RecordConsumer, v: String): Unit 256 | def consumeLong(consumer: RecordConsumer, v: Long): Unit 257 | def consumeDouble(consumer: RecordConsumer, v: Double): Unit 258 | def consumeTimestamp( 259 | consumer: RecordConsumer, 260 | v: Timestamp, 261 | formatter: TimestampFormatter 262 | ): Unit 263 | def consumeJson(consumer: RecordConsumer, v: Value): Unit 264 | def newUnsupportedMethodException(methodName: String) = 265 | new ConfigException(s"${getClass.getName}#$methodName is unsupported.") 266 | 267 | protected def consumeLongAsInteger( 268 | consumer: RecordConsumer, 269 | v: Long 270 | ): Unit = { 271 | if (v < Int.MinValue || v > Int.MaxValue) 272 | throw new DataException( 273 | s"Failed to cast Long: $v to Int, " + 274 | s"because $v exceeds ${Int.MaxValue} (Int.MaxValue) or ${Int.MinValue} (Int.MinValue)" 275 | ) 276 | consumer.addInteger(v.toInt) 277 | } 278 | } 279 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # S3 Parquet output plugin for Embulk 2 | 3 | [![Release CI Status Badge](https://github.com/civitaspo/embulk-output-s3_parquet/workflows/Release%20CI/badge.svg)](https://github.com/civitaspo/embulk-output-s3_parquet/actions?query=workflow%3A%22Release+CI%22) [![Test CI Status Badge](https://github.com/civitaspo/embulk-output-s3_parquet/workflows/Test%20CI/badge.svg)](https://github.com/civitaspo/embulk-output-s3_parquet/actions?query=workflow%3A%22Test+CI%22) 4 | 5 | [Embulk](https://github.com/embulk/embulk/) output plugin to dump records as [Apache Parquet](https://parquet.apache.org/) files on S3. 6 | 7 | ## Overview 8 | 9 | * **Plugin type**: output 10 | * **Load all or nothing**: no 11 | * **Resume supported**: no 12 | * **Cleanup supported**: yes 13 | 14 | ## Configuration 15 | 16 | - **bucket**: s3 bucket name (string, required) 17 | - **path_prefix**: prefix of target keys (string, optional) 18 | - **sequence_format**: format of the sequence number of the output files (string, default: `"%03d.%02d."`) 19 | - **sequence_format** formats task index and sequence number in a task. 
20 | - **file_ext**: path suffix of the output files (string, default: `"parquet"`) 21 | - **compression_codec**: compression codec for parquet file (`"uncompressed"`,`"snappy"`,`"gzip"`,`"lzo"`,`"brotli"`,`"lz4"` or `"zstd"`, default: `"uncompressed"`) 22 | - **default_timestamp_format**: default timestamp format (string, default: `"%Y-%m-%d %H:%M:%S.%6N %z"`) 23 | - **default_timezone**: default timezone (string, default: `"UTC"`) 24 | - **column_options**: a map whose keys are names of columns, and values are configurations with the following parameters (optional) 25 | - **timezone**: timezone if type of this column is timestamp. If not set, **default_timezone** is used. (string, optional) 26 | - **format**: timestamp format if type of this column is timestamp. If not set, **default_timestamp_format** is used. (string, optional) 27 | - **converted_type**: a Parquet converted type name (`timestamp-millis`, `timestamp-micros`, `timestamp-nanos`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional) 28 | - **logical_type**: **[DEPRECATED: Use **converted_type** instead]** a Parquet converted type name (`timestamp-millis`, `timestamp-micros`, `timestamp-nanos`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional) 29 | - **logical_type**: configuration for the detailed logical type. See [Logical Type Specification](https://github.com/apache/parquet-format/blob/apache-parquet-format-2.7.0/LogicalTypes.md) (optional) 30 | - **name**: The name of logical type (`"date"`, `"decimal"`, `"int"`, `"json"`, `"time"`, `"timestamp"`) (string, required) 31 | - **bit_width**: The bit width for `"int"` logical type (Allowed bit width values are `8`, `16`, `32`, `64`). (int, default: `64`) 32 | - **is_signed**: Signed or not for `"int"` logical type (boolean, default: `true`) 33 | - **scale**: The scale for `"decimal"` logical type (int, default: `0`) 34 | - **precision**: The precision for `"decimal"` logical type (int, default: `0`) 35 | - **is_adjusted_to_utc**: (boolean, default: `true`) 36 | - **time_unit**: The precision for `"time"` or `"timestamp"` logical type (Allowed values are `"MILLIS"`, `"MICROS"`, `"NANOS"`) 37 | - **canned_acl**: grants one of [canned ACLs](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#CannedACL) for created objects (string, default: `private`) 38 | - **block_size**: The block size is the size of a row group being buffered in memory. This limits the memory usage when writing. Larger values will improve the I/O when reading but consume more memory when writing. (int, default: `134217728` (128MB)) 39 | - **page_size**: The page size is for compression. When reading, each page can be decompressed independently. A block is composed of pages. The page is the smallest unit that must be read fully to access a single record. If this value is too small, the compression will deteriorate. (int, default: `1048576` (1MB)) 40 | - **max_padding_size**: The max size (bytes) to write as padding and the min size of a row group (int, default: `8388608` (8MB)) 41 | - **enable_dictionary_encoding**: Enables or disables dictionary encoding. (boolean, default: `true`) 42 | - **auth_method**: name of mechanism to authenticate requests (`"basic"`, `"env"`, `"instance"`, `"profile"`, `"properties"`, `"anonymous"`, `"session"`, `"assume_role"`, `"web_identity_token"`, default: `"default"`) 43 | - `"basic"`: uses **access_key_id** and **secret_access_key** to authenticate.
44 | - `"env"`: uses `AWS_ACCESS_KEY_ID` (or `AWS_ACCESS_KEY`) and `AWS_SECRET_KEY` (or `AWS_SECRET_ACCESS_KEY`) environment variables. 45 | - `"instance"`: uses EC2 instance profile or attached ECS task role. 46 | - `"profile"`: uses credentials written in a file. The format of the file is as follows, where `[...]` is a profile name. 47 | ``` 48 | [default] 49 | aws_access_key_id=YOUR_ACCESS_KEY_ID 50 | aws_secret_access_key=YOUR_SECRET_ACCESS_KEY 51 | 52 | [profile2] 53 | ... 54 | ``` 55 | - `"properties"`: uses the `aws.accessKeyId` and `aws.secretKey` Java system properties. 56 | - `"anonymous"`: uses anonymous access. This auth method can access only public files. 57 | - `"session"`: uses temporarily generated **access_key_id**, **secret_access_key** and **session_token**. 58 | - `"assume_role"`: uses temporarily generated credentials by assuming the **role_arn** role. 59 | - `"web_identity_token"`: uses temporarily generated credentials by assuming the **role_arn** role with a web identity. 60 | - `"default"`: uses the AWS SDK's default strategy to look up available credentials from the runtime environment. This method behaves like the combination of the following methods. 61 | 1. `"env"` 62 | 1. `"properties"` 63 | 1. `"profile"` 64 | 1. `"instance"` 65 | - **profile_file**: path to a profiles file. this is optionally used when **auth_method** is `"profile"`. (string, default: given by the `AWS_CREDENTIAL_PROFILES_FILE` environment variable, or `~/.aws/credentials`). 66 | - **profile_name**: name of a profile. this is optionally used when **auth_method** is `"profile"`. (string, default: `"default"`) 67 | - **access_key_id**: aws access key id. this is required when **auth_method** is `"basic"` or `"session"`. (string, optional) 68 | - **secret_access_key**: aws secret access key. this is required when **auth_method** is `"basic"` or `"session"`. (string, optional) 69 | - **session_token**: aws session token. this is required when **auth_method** is `"session"`. (string, optional) 70 | - **role_arn**: arn of the role to assume. this is required when **auth_method** is `"assume_role"` or `"web_identity_token"`. (string, optional) 71 | - **role_session_name**: an identifier for the assumed role session. this is required when **auth_method** is `"assume_role"` or `"web_identity_token"`. (string, optional) 72 | - **role_external_id**: a unique identifier that is used by third parties when assuming roles in their customers' accounts. this is optionally used for **auth_method**: `"assume_role"`. (string, optional) 73 | - **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional) 74 | - **web_identity_token_file**: the absolute path to the web identity token file. this is required when **auth_method** is `"web_identity_token"`. (string, optional) 75 | - **scope_down_policy**: an iam policy in json format. this is optionally used for **auth_method**: `"assume_role"`. (string, optional) 76 | - **catalog**: Register a table if this option is specified (optional) 77 | - **catalog_id**: glue data catalog id if you use a catalog different from the account/region default catalog. (string, optional) 78 | - **database**: The name of the database (string, required) 79 | - **table**: The name of the table (string, required) 80 | - **column_options**: key-value pairs where the key is a column name and the value is options for that column. (string to options map, default: `{}`) 81 | - **type**: type of column when this plugin creates new tables (e.g.
`string`, `bigint`) (string, default: depends on the input embulk column type, or the parquet logical type. See the tables below) 82 | 83 | |embulk column type|glue data type| 84 | |:---|:---| 85 | |long|bigint| 86 | |boolean|boolean| 87 | |double|double| 88 | |string|string| 89 | |timestamp|string| 90 | |json|string| 91 | 92 | |parquet converted type|glue data type|note| 93 | |:---|:---|:---| 94 | |timestamp-millis|timestamp|| 95 | |timestamp-micros|long|Glue cannot recognize timestamp-micros.| 96 | |timestamp-nanos|long|Glue cannot recognize timestamp-nanos.| 97 | |int8|tinyint|| 98 | |int16|smallint|| 99 | |int32|int|| 100 | |int64|bigint|| 101 | |uint8|smallint|Glue tinyint has a minimum value of -2^7 and a maximum value of 2^7-1.| 102 | |uint16|int|Glue smallint has a minimum value of -2^15 and a maximum value of 2^15-1.| 103 | |uint32|bigint|Glue int has a minimum value of -2^31 and a maximum value of 2^31-1.| 104 | |uint64|ConfigException|Glue bigint supports only a 64-bit signed integer.| 105 | |json|string|| 106 | 107 | - **operation_if_exists**: operation if the table already exists. Available operations are `"delete"` and `"skip"` (string, default: `"delete"`) 108 | - **endpoint**: The AWS Service endpoint (string, optional) 109 | - **region**: The AWS region (string, optional) 110 | - **http_proxy**: Settings for using an HTTP proxy when accessing AWS. (optional) 111 | - **host** proxy host (string, required) 112 | - **port** proxy port (int, optional) 113 | - **protocol** proxy protocol (string, default: `"https"`) 114 | - **user** proxy user (string, optional) 115 | - **password** proxy password (string, optional) 116 | - **buffer_dir**: buffer directory for parquet files to be uploaded to S3 (string, default: creates a temporary directory) 117 | - **type_options**: a map whose keys are names of embulk types (`boolean`, `long`, `double`, `string`, `timestamp`, `json`), and values are configurations with the following parameters (optional) 118 | - **converted_type**: a Parquet converted type name (`timestamp-millis`, `timestamp-micros`, `timestamp-nanos`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional) 119 | - **logical_type**: **[DEPRECATED: Use **converted_type** instead]** a Parquet converted type name (`timestamp-millis`, `timestamp-micros`, `timestamp-nanos`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional) 120 | - **logical_type**: configuration for the detailed logical type. See [Logical Type Specification](https://github.com/apache/parquet-format/blob/apache-parquet-format-2.7.0/LogicalTypes.md) (optional) 121 | - **name**: The name of logical type (`"date"`, `"decimal"`, `"int"`, `"json"`, `"time"`, `"timestamp"`) (string, required) 122 | - **bit_width**: The bit width for `"int"` logical type (Allowed bit width values are `8`, `16`, `32`, `64`). (int, default: `64`) 123 | - **is_signed**: Signed or not for `"int"` logical type (boolean, default: `true`) 124 | - **scale**: The scale for `"decimal"` logical type (int, default: `0`) 125 | - **precision**: The precision for `"decimal"` logical type (int, default: `0`) 126 | - **is_adjusted_to_utc**: (boolean, default: `true`) 127 | - **time_unit**: The precision for `"time"` or `"timestamp"` logical type (Allowed values are `"MILLIS"`, `"MICROS"`, `"NANOS"`) 128 | 129 | 130 | ## Example 131 | 132 | ```yaml 133 | out: 134 | type: s3_parquet 135 | bucket: my-bucket 136 | path_prefix: path/to/my-obj.
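  # The optional keys below are an illustrative sketch only; they are not part of the
  # original example. The option names come from the Configuration section above, but
  # the role ARN, the column name "payload", and the database/table names are assumptions.
  # auth_method: assume_role
  # role_arn: arn:aws:iam::123456789012:role/your-embulk-role
  # role_session_name: embulk-output-s3_parquet
  # column_options:
  #   payload: {logical_type: {name: json}}
  # catalog:
  #   database: your_database
  #   table: your_table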
137 | file_ext: snappy.parquet 138 | compression_codec: snappy 139 | default_timezone: Asia/Tokyo 140 | canned_acl: bucket-owner-full-control 141 | ``` 142 | 143 | ## Note 144 | 145 | * This plugin implements the Parquet [LogicalTypes](https://github.com/apache/parquet-format/blob/apache-parquet-format-2.8.0/LogicalTypes.md) as much as possible, but it does not implement all of them. 146 | * Some kinds of LogicalTypes may not be supported by your middleware. Be careful when specifying a logical type name. 147 | 148 | ## Development 149 | 150 | ### Run example: 151 | 152 | ```shell 153 | $ ./run_s3_local.sh 154 | $ ./example/prepare_s3_bucket.sh 155 | $ ./gradlew gem 156 | $ embulk run example/config.yml -Ibuild/gemContents/lib 157 | ``` 158 | 159 | ### Run test: 160 | 161 | ```shell 162 | $ ./run_s3_local.sh 163 | $ ./gradlew scalatest 164 | ``` 165 | 166 | ### Build 167 | 168 | ``` 169 | $ ./gradlew gem --write-locks # -t to watch file changes and rebuild continuously 170 | ``` 171 | 172 | ### Release gem: 173 | Fix [build.gradle](./build.gradle), then 174 | 175 | 176 | ```shell 177 | $ ./gradlew gemPush 178 | ``` 179 | 180 | ## ChangeLog 181 | 182 | [CHANGELOG.md](./CHANGELOG.md) 183 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestIntLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.schema.LogicalTypeAnnotation 4 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 5 | import org.embulk.config.ConfigException 6 | import org.embulk.output.s3_parquet.catalog.GlueDataType 7 | import org.embulk.spi.DataException 8 | import org.scalatest.diagrams.Diagrams 9 | import org.scalatest.funsuite.AnyFunSuite 10 | import org.scalatest.prop.TableDrivenPropertyChecks 11 | 12 | import scala.util.chaining._ 13 | class TestIntLogicalType 14 | extends AnyFunSuite 15 | with ParquetColumnTypeTestHelper 16 | with TableDrivenPropertyChecks 17 | with Diagrams { 18 | 19 | private val conditions = Table( 20 | ("bitWidth", "isSigned", "column"), { 21 | for { 22 | bitWidth <- Seq(8, 16, 32, 64) 23 | isSigned <- Seq(true, false) 24 | column <- Seq( 25 | SAMPLE_BOOLEAN_COLUMN, 26 | SAMPLE_LONG_COLUMN, 27 | SAMPLE_DOUBLE_COLUMN, 28 | SAMPLE_STRING_COLUMN, 29 | SAMPLE_TIMESTAMP_COLUMN, 30 | SAMPLE_JSON_COLUMN 31 | ) 32 | } yield (bitWidth, isSigned, column) 33 | }: _* 34 | ) 35 | 36 | private val unsupportedEmbulkColumns = Seq( 37 | SAMPLE_TIMESTAMP_COLUMN, 38 | SAMPLE_JSON_COLUMN 39 | ) 40 | 41 | private def isINT32(bitWidth: Int): Boolean = bitWidth < 64 42 | 43 | test( 44 | "#primitiveType(column) returns PrimitiveTypeName.INT32 with LogicalType" 45 | ) { 46 | forAll(conditions) { (bitWidth, isSigned, column) => 47 | whenever(isINT32(bitWidth) && !unsupportedEmbulkColumns.contains(column)) { 48 | val logicalType = 49 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 50 | // format: off 51 | assert(PrimitiveTypeName.INT32 == logicalType.primitiveType(column).getPrimitiveTypeName) 52 | assert(LogicalTypeAnnotation.intType(bitWidth, isSigned) == logicalType.primitiveType(column).getLogicalTypeAnnotation) 53 | // format: on 54 | } 55 | } 56 | } 57 | 58 | test( 59 | "#primitiveType(column) returns PrimitiveTypeName.INT64 with LogicalType" 60 | ) { 61 | forAll(conditions) { (bitWidth, isSigned, column) => 62 | whenever(!isINT32(bitWidth) &&
!unsupportedEmbulkColumns.contains(column)) { 63 | val logicalType = 64 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 65 | // format: off 66 | assert(PrimitiveTypeName.INT64 == logicalType.primitiveType(column).getPrimitiveTypeName) 67 | assert(LogicalTypeAnnotation.intType(bitWidth, isSigned) == logicalType.primitiveType(column).getLogicalTypeAnnotation) 68 | // format: on 69 | } 70 | } 71 | } 72 | 73 | test( 74 | s"#primitiveType(column) cannot return any PrimitiveType when embulk column type is one of (${unsupportedEmbulkColumns 75 | .map(_.getType.getName) 76 | .mkString(",")})" 77 | ) { 78 | forAll(conditions) { (bitWidth, isSigned, column) => 79 | whenever(unsupportedEmbulkColumns.contains(column)) { 80 | // format: off 81 | assert(intercept[ConfigException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).primitiveType(column)).getMessage.startsWith("Unsupported column type: ")) 82 | // format: on 83 | } 84 | } 85 | } 86 | 87 | test("#glueDataType(column) returns GlueDataType") { 88 | forAll(conditions) { (bitWidth, isSigned, column) => 89 | whenever(!unsupportedEmbulkColumns.contains(column)) { 90 | def assertGlueDataType(expected: GlueDataType) = { 91 | // format: off 92 | assert(expected == IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).glueDataType(column)) 93 | // format: on 94 | } 95 | if (isSigned) { 96 | bitWidth match { 97 | case 8 => assertGlueDataType(GlueDataType.TINYINT) 98 | case 16 => assertGlueDataType(GlueDataType.SMALLINT) 99 | case 32 => assertGlueDataType(GlueDataType.INT) 100 | case 64 => assertGlueDataType(GlueDataType.BIGINT) 101 | case _ => fail() 102 | } 103 | } 104 | else { 105 | bitWidth match { 106 | case 8 => assertGlueDataType(GlueDataType.SMALLINT) 107 | case 16 => assertGlueDataType(GlueDataType.INT) 108 | case 32 => assertGlueDataType(GlueDataType.BIGINT) 109 | case 64 => assertGlueDataType(GlueDataType.BIGINT) 110 | case _ => fail() 111 | } 112 | } 113 | } 114 | } 115 | } 116 | 117 | test( 118 | s"#glueDataType(column) cannot return any GlueDataType when embulk column type is one of (${unsupportedEmbulkColumns 119 | .map(_.getType.getName) 120 | .mkString(",")})" 121 | ) { 122 | forAll(conditions) { (bitWidth, isSigned, column) => 123 | whenever(unsupportedEmbulkColumns.contains(column)) { 124 | // format: off 125 | assert(intercept[ConfigException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).glueDataType(column)).getMessage.startsWith("Unsupported column type: ")) 126 | // format: on 127 | } 128 | } 129 | } 130 | 131 | test("#consumeBoolean (INT32)") { 132 | forAll(conditions) { (bitWidth, isSigned, _) => 133 | whenever(isINT32(bitWidth)) { 134 | newMockRecordConsumer().tap { consumer => 135 | consumer.writingSampleField { 136 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 137 | .consumeBoolean(consumer, true) 138 | } 139 | assert(consumer.data.head.head.isInstanceOf[Int]) 140 | assert(consumer.data.head.head == 1) 141 | } 142 | newMockRecordConsumer().tap { consumer => 143 | consumer.writingSampleField { 144 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 145 | .consumeBoolean(consumer, false) 146 | } 147 | assert(consumer.data.head.head.isInstanceOf[Int]) 148 | assert(consumer.data.head.head == 0) 149 | } 150 | } 151 | } 152 | } 153 | 154 | test("#consumeBoolean (INT64)") { 155 | forAll(conditions) { (bitWidth, isSigned, _) => 156 | whenever(!isINT32(bitWidth)) { 157 | newMockRecordConsumer().tap { consumer => 158 | consumer.writingSampleField { 159 | IntLogicalType(bitWidth = 
bitWidth, isSigned = isSigned) 160 | .consumeBoolean(consumer, true) 161 | } 162 | assert(consumer.data.head.head.isInstanceOf[Long]) 163 | assert(consumer.data.head.head == 1L) 164 | } 165 | newMockRecordConsumer().tap { consumer => 166 | consumer.writingSampleField { 167 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 168 | .consumeBoolean(consumer, false) 169 | } 170 | assert(consumer.data.head.head.isInstanceOf[Long]) 171 | assert(consumer.data.head.head == 0L) 172 | } 173 | } 174 | } 175 | } 176 | 177 | test("#consumeString (INT32)") { 178 | forAll(conditions) { (bitWidth, isSigned, _) => 179 | whenever(isINT32(bitWidth)) { 180 | newMockRecordConsumer().tap { consumer => 181 | consumer.writingSampleField { 182 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 183 | .consumeString(consumer, "1") 184 | } 185 | assert(consumer.data.head.head.isInstanceOf[Int]) 186 | assert(consumer.data.head.head == 1) 187 | } 188 | newMockRecordConsumer().tap { consumer => 189 | consumer.writingSampleField { 190 | // format: off 191 | assert(intercept[DataException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeString(consumer, "string")).getMessage.startsWith("Failed to cast String: ")) 192 | // format: on 193 | } 194 | } 195 | } 196 | } 197 | } 198 | 199 | test("#consumeString (INT64)") { 200 | forAll(conditions) { (bitWidth, isSigned, _) => 201 | whenever(!isINT32(bitWidth)) { 202 | newMockRecordConsumer().tap { consumer => 203 | consumer.writingSampleField { 204 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 205 | .consumeString(consumer, "1") 206 | } 207 | assert(consumer.data.head.head.isInstanceOf[Long]) 208 | assert(consumer.data.head.head == 1L) 209 | } 210 | newMockRecordConsumer().tap { consumer => 211 | consumer.writingSampleField { 212 | // format: off 213 | assert(intercept[DataException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeString(consumer, "string")).getMessage.startsWith("Failed to cast String: ")) 214 | // format: on 215 | } 216 | } 217 | } 218 | } 219 | } 220 | 221 | test("#consumeLong (INT32)") { 222 | forAll(conditions) { (bitWidth, isSigned, _) => 223 | whenever(isINT32(bitWidth)) { 224 | newMockRecordConsumer().tap { consumer => 225 | consumer.writingSampleField { 226 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 227 | .consumeLong(consumer, 1L) 228 | } 229 | assert(consumer.data.head.head.isInstanceOf[Int]) 230 | assert(consumer.data.head.head == 1) 231 | } 232 | newMockRecordConsumer().tap { consumer => 233 | consumer.writingSampleField { 234 | // format: off 235 | assert(intercept[DataException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeLong(consumer, Long.MaxValue)).getMessage.startsWith("The value is out of the range: that is ")) 236 | // format: on 237 | } 238 | } 239 | } 240 | } 241 | } 242 | 243 | test("#consumeLong (INT64)") { 244 | forAll(conditions) { (bitWidth, isSigned, _) => 245 | whenever(!isINT32(bitWidth)) { 246 | newMockRecordConsumer().tap { consumer => 247 | consumer.writingSampleField { 248 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 249 | .consumeLong(consumer, 1L) 250 | } 251 | assert(consumer.data.head.head.isInstanceOf[Long]) 252 | assert(consumer.data.head.head == 1L) 253 | } 254 | newMockRecordConsumer().tap { consumer => 255 | consumer.writingSampleField { 256 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 257 | .consumeLong(consumer, Long.MaxValue) 258 | } 259 | assert(consumer.data.head.head.isInstanceOf[Long]) 260 
| assert(consumer.data.head.head == Long.MaxValue) 261 | } 262 | } 263 | } 264 | } 265 | 266 | test("#consumeDouble (INT32)") { 267 | forAll(conditions) { (bitWidth, isSigned, _) => 268 | whenever(isINT32(bitWidth)) { 269 | newMockRecordConsumer().tap { consumer => 270 | consumer.writingSampleField { 271 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 272 | .consumeDouble(consumer, 1.4d) 273 | } 274 | assert(consumer.data.head.head.isInstanceOf[Int]) 275 | assert(consumer.data.head.head == 1) 276 | } 277 | newMockRecordConsumer().tap { consumer => 278 | consumer.writingSampleField { 279 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 280 | .consumeDouble(consumer, 1.5d) 281 | } 282 | assert(consumer.data.head.head.isInstanceOf[Int]) 283 | assert(consumer.data.head.head == 2) 284 | } 285 | 286 | newMockRecordConsumer().tap { consumer => 287 | consumer.writingSampleField { 288 | // format: off 289 | assert(intercept[DataException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeDouble(consumer, Double.MaxValue)).getMessage.startsWith("The value is out of the range: that is ")) 290 | // format: on 291 | } 292 | } 293 | } 294 | } 295 | } 296 | 297 | test("#consumeDouble (INT64)") { 298 | forAll(conditions) { (bitWidth, isSigned, _) => 299 | whenever(!isINT32(bitWidth)) { 300 | newMockRecordConsumer().tap { consumer => 301 | consumer.writingSampleField { 302 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 303 | .consumeDouble(consumer, 1.4d) 304 | } 305 | assert(consumer.data.head.head.isInstanceOf[Long]) 306 | assert(consumer.data.head.head == 1L) 307 | } 308 | newMockRecordConsumer().tap { consumer => 309 | consumer.writingSampleField { 310 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 311 | .consumeDouble(consumer, 1.5d) 312 | } 313 | assert(consumer.data.head.head.isInstanceOf[Long]) 314 | assert(consumer.data.head.head == 2L) 315 | } 316 | newMockRecordConsumer().tap { consumer => 317 | consumer.writingSampleField { 318 | // format: off 319 | assert(intercept[DataException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeDouble(consumer, Double.MaxValue)).getMessage.startsWith("The value is out of the range: ")) 320 | // format: on 321 | } 322 | } 323 | } 324 | } 325 | } 326 | 327 | test("#consumeTimestamp is unsupported") { 328 | forAll(conditions) { (bitWidth, isSigned, _) => 329 | newMockRecordConsumer().tap { consumer => 330 | consumer.writingSampleField { 331 | // format: off 332 | assert(intercept[ConfigException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeTimestamp(consumer, null, null)).getMessage.endsWith("is unsupported.")) 333 | // format: on 334 | } 335 | } 336 | } 337 | } 338 | test("#consumeJson is unsupported") { 339 | forAll(conditions) { (bitWidth, isSigned, _) => 340 | newMockRecordConsumer().tap { consumer => 341 | consumer.writingSampleField { 342 | // format: off 343 | assert(intercept[ConfigException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeJson(consumer, null)).getMessage.endsWith("is unsupported.")) 344 | // format: on 345 | } 346 | } 347 | } 348 | } 349 | } 350 | --------------------------------------------------------------------------------