├── .github ├── FUNDING.yml └── workflows │ ├── test.yml │ └── release.yml ├── settings.gradle ├── gradle ├── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties └── dependency-locks │ └── embulkPluginRuntime.lockfile ├── run_s3_local.sh ├── example ├── prepare_s3_bucket.sh ├── data.tsv ├── config.yml ├── with_logicaltypes.yml └── with_catalog.yml ├── .gitignore ├── .scalafmt.conf ├── src ├── main │ └── scala │ │ └── org │ │ └── embulk │ │ └── output │ │ └── s3_parquet │ │ ├── ContextClassLoaderSwapper.scala │ │ ├── aws │ │ ├── AwsClientConfiguration.scala │ │ ├── HttpProxy.scala │ │ ├── AwsEndpointConfiguration.scala │ │ ├── Aws.scala │ │ ├── AwsS3Configuration.scala │ │ └── AwsCredentials.scala │ │ ├── implicits.scala │ │ ├── S3ParquetPageOutput.scala │ │ ├── parquet │ │ ├── DateLogicalType.scala │ │ ├── JsonLogicalType.scala │ │ ├── DefaultColumnType.scala │ │ ├── LogicalTypeProxy.scala │ │ ├── TimestampLogicalType.scala │ │ ├── DecimalLogicalType.scala │ │ ├── TimeLogicalType.scala │ │ ├── IntLogicalType.scala │ │ ├── ParquetFileWriteSupport.scala │ │ └── ParquetColumnType.scala │ │ ├── catalog │ │ ├── GlueDataType.scala │ │ └── CatalogRegistrator.scala │ │ ├── PluginTask.scala │ │ └── S3ParquetOutputPlugin.scala └── test │ └── scala │ └── org │ └── embulk │ └── output │ └── s3_parquet │ ├── parquet │ ├── ParquetColumnTypeTestHelper.scala │ ├── MockParquetRecordConsumer.scala │ ├── TestJsonLogicalType.scala │ ├── TestDefaultColumnType.scala │ ├── TestDateLogicalType.scala │ ├── TestTimestampLogicalType.scala │ ├── TestDecimalLogicalType.scala │ ├── TestTimeLogicalType.scala │ └── TestIntLogicalType.scala │ ├── TestS3ParquetOutputPluginConfigException.scala │ ├── TestS3ParquetOutputPlugin.scala │ └── EmbulkPluginTestHelper.scala ├── LICENSE.txt ├── gradlew.bat ├── CHANGELOG.md ├── gradlew └── README.md /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: civitaspo 2 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'embulk-output-s3_parquet' 2 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/civitaspo/embulk-output-s3_parquet/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /run_s3_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | docker run -it -d --rm \ 4 | -p 4566:4566 \ 5 | -e SERVICES=s3 \ 6 | localstack/localstack 7 | 8 | -------------------------------------------------------------------------------- /example/prepare_s3_bucket.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | aws s3 mb s3://example \ 4 | --endpoint-url http://localhost:4566 \ 5 | --region us-east-1 6 | 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | /pkg/ 3 | /tmp/ 4 | *.gemspec 5 | .gradle/ 6 | /classpath/ 7 | build/ 8 | .idea 9 | /.settings/ 10 | /.metadata/ 11 | .classpath 12 | .project 13 | 
-------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | # https://scalameta.org/scalafmt/#Configuration 2 | 3 | version = "2.4.2" 4 | newlines.alwaysBeforeElseAfterCurlyIf = true 5 | assumeStandardLibraryStripMargin = true 6 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.3-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /example/data.tsv: -------------------------------------------------------------------------------- 1 | 0 c20ef94602 c212c89f91 2017-10-24 03:54:35 +0900 {"a":0,"b":"99"} 2 | 1 330a9fc33a e25b33b616 2017-10-22 19:53:31 +0900 {"a":1,"b":"a9"} 3 | 2 707b3b7588 90823c6a1f 2017-10-23 23:42:43 +0900 {"a":2,"b":"96"} 4 | 3 8d8288e66f 2017-10-22 06:12:13 +0900 {"a":3,"b":"86"} 5 | 4 c54d8b6481 e56a40571c 2017-10-23 04:59:16 +0900 {"a":4,"b":"d2"} 6 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test CI 2 | 3 | on: 4 | - push 5 | 6 | jobs: 7 | test: 8 | 9 | runs-on: ubuntu-latest 10 | services: 11 | localstack: 12 | image: localstack/localstack 13 | ports: 14 | - 4566:4566 15 | env: 16 | SERVICES: s3 17 | 18 | steps: 19 | - uses: actions/checkout@v1 20 | - name: Set up JDK 1.8 21 | uses: actions/setup-java@v1 22 | with: 23 | java-version: 1.8 24 | - name: scalafmt 25 | run: ./gradlew spotlessCheck 26 | - name: scalatest 27 | run: ./gradlew scalatest 28 | 29 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/ContextClassLoaderSwapper.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | // WARNING: This object should be used for limited purposes only. 
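// Swaps the current thread's context class loader for the class loader that loaded the
// given class, runs the block, and always restores the original afterwards.
// `usingPluginClass` wraps the Parquet writer and S3 transfer calls elsewhere in this plugin
// so that libraries which resolve classes through the context class loader (for example,
// Hadoop's reflection-based class lookups) can find the classes bundled with this plugin.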
4 | object ContextClassLoaderSwapper { 5 | 6 | def using[A](klass: Class[_])(f: => A): A = { 7 | val currentTread = Thread.currentThread() 8 | val original = currentTread.getContextClassLoader 9 | val target = klass.getClassLoader 10 | currentTread.setContextClassLoader(target) 11 | try f 12 | finally currentTread.setContextClassLoader(original) 13 | } 14 | 15 | def usingPluginClass[A](f: => A): A = { 16 | using(classOf[S3ParquetOutputPlugin])(f) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /example/config.yml: -------------------------------------------------------------------------------- 1 | 2 | in: 3 | type: file 4 | path_prefix: ./example/data.tsv 5 | parser: 6 | type: csv 7 | delimiter: "\t" 8 | skip_header_lines: 0 9 | null_string: "" 10 | columns: 11 | - { name: id, type: long } 12 | - { name: description, type: string } 13 | - { name: name, type: string } 14 | - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z"} 15 | - { name: payload, type: json} 16 | stop_on_invalid_record: true 17 | 18 | out: 19 | type: s3_parquet 20 | bucket: example 21 | region: us-east-1 22 | endpoint: http://127.0.0.1:4566 23 | path_prefix: path/to/my-obj. 24 | file_ext: snappy.parquet 25 | compression_codec: snappy 26 | default_timezone: Asia/Tokyo 27 | canned_acl: bucket-owner-full-control 28 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/ParquetColumnTypeTestHelper.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.embulk.spi.Column 4 | import org.embulk.spi.`type`.Types 5 | 6 | trait ParquetColumnTypeTestHelper { 7 | 8 | val SAMPLE_BOOLEAN_COLUMN: Column = new Column(0, "a", Types.BOOLEAN) 9 | val SAMPLE_LONG_COLUMN: Column = new Column(0, "a", Types.LONG) 10 | val SAMPLE_DOUBLE_COLUMN: Column = new Column(0, "a", Types.DOUBLE) 11 | val SAMPLE_STRING_COLUMN: Column = new Column(0, "a", Types.STRING) 12 | val SAMPLE_TIMESTAMP_COLUMN: Column = new Column(0, "a", Types.TIMESTAMP) 13 | val SAMPLE_JSON_COLUMN: Column = new Column(0, "a", Types.JSON) 14 | 15 | def newMockRecordConsumer(): MockParquetRecordConsumer = 16 | MockParquetRecordConsumer() 17 | } 18 | -------------------------------------------------------------------------------- /example/with_logicaltypes.yml: -------------------------------------------------------------------------------- 1 | 2 | in: 3 | type: file 4 | path_prefix: ./example/data.tsv 5 | parser: 6 | type: csv 7 | delimiter: "\t" 8 | skip_header_lines: 0 9 | null_string: "" 10 | columns: 11 | - { name: id, type: long } 12 | - { name: description, type: string } 13 | - { name: name, type: string } 14 | - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z"} 15 | - { name: payload, type: json} 16 | stop_on_invalid_record: true 17 | 18 | out: 19 | type: s3_parquet 20 | bucket: example 21 | region: us-east-1 22 | endpoint: http://127.0.0.1:4566 23 | path_prefix: path/to/my-obj-2. 
24 | file_ext: snappy.parquet 25 | compression_codec: snappy 26 | default_timezone: Asia/Tokyo 27 | canned_acl: bucket-owner-full-control 28 | column_options: 29 | id: 30 | logical_type: "uint64" 31 | type_options: 32 | timestamp: 33 | logical_type: "timestamp-millis" 34 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release CI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | release: 10 | 11 | runs-on: ubuntu-latest 12 | services: 13 | localstack: 14 | image: localstack/localstack 15 | ports: 16 | - 4566:4566 17 | env: 18 | SERVICES: s3 19 | 20 | steps: 21 | - uses: actions/checkout@v1 22 | - name: Set up JDK 1.8 23 | uses: actions/setup-java@v1 24 | with: 25 | java-version: 1.8 26 | - name: scalafmt 27 | run: ./gradlew spotlessCheck 28 | - name: scalatest 29 | run: ./gradlew scalatest 30 | - name: Release the new gem 31 | run: | 32 | mkdir -p $HOME/.gem 33 | touch $HOME/.gem/credentials 34 | chmod 0600 $HOME/.gem/credentials 35 | printf -- "---\n:rubygems_api_key: ${RUBYGEMS_API_KEY}\n" > $HOME/.gem/credentials 36 | ./gradlew gemPush 37 | env: 38 | RUBYGEMS_API_KEY: ${{secrets.RUBYGEMS_API_KEY}} 39 | -------------------------------------------------------------------------------- /example/with_catalog.yml: -------------------------------------------------------------------------------- 1 | 2 | in: 3 | type: file 4 | path_prefix: ./example/data.tsv 5 | parser: 6 | type: csv 7 | delimiter: "\t" 8 | skip_header_lines: 0 9 | null_string: "" 10 | columns: 11 | - { name: id, type: long } 12 | - { name: description, type: string } 13 | - { name: name, type: string } 14 | - { name: t, type: timestamp, format: "%Y-%m-%d %H:%M:%S %z"} 15 | - { name: payload, type: json} 16 | stop_on_invalid_record: true 17 | 18 | out: 19 | type: s3_parquet 20 | bucket: example 21 | region: us-east-1 22 | endpoint: http://127.0.0.1:4566 23 | path_prefix: path/to/my-obj-2. 
24 | file_ext: snappy.parquet 25 | compression_codec: snappy 26 | default_timezone: Asia/Tokyo 27 | canned_acl: bucket-owner-full-control 28 | column_options: 29 | id: 30 | logical_type: "int64" 31 | payload: 32 | logical_type: "json" 33 | type_options: 34 | timestamp: 35 | logical_type: "timestamp-millis" 36 | catalog: 37 | database: example_db 38 | table: example_tbl 39 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/aws/AwsClientConfiguration.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.aws 2 | 3 | import java.util.Optional 4 | 5 | import com.amazonaws.ClientConfiguration 6 | import com.amazonaws.client.builder.AwsClientBuilder 7 | import org.embulk.config.{Config, ConfigDefault} 8 | import org.embulk.output.s3_parquet.aws.AwsClientConfiguration.Task 9 | 10 | object AwsClientConfiguration { 11 | 12 | trait Task { 13 | 14 | @Config("http_proxy") 15 | @ConfigDefault("null") 16 | def getHttpProxy: Optional[HttpProxy.Task] 17 | 18 | } 19 | 20 | def apply(task: Task): AwsClientConfiguration = { 21 | new AwsClientConfiguration(task) 22 | } 23 | } 24 | 25 | class AwsClientConfiguration(task: Task) { 26 | 27 | def configureAwsClientBuilder[S <: AwsClientBuilder[S, T], T]( 28 | builder: AwsClientBuilder[S, T] 29 | ): Unit = { 30 | task.getHttpProxy.ifPresent { v => 31 | val cc = new ClientConfiguration 32 | HttpProxy(v).configureClientConfiguration(cc) 33 | builder.setClientConfiguration(cc) 34 | } 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 Takahiro Nakayama 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/aws/HttpProxy.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.aws 2 | 3 | import java.util.Optional 4 | 5 | import com.amazonaws.{ClientConfiguration, Protocol} 6 | import org.embulk.config.{Config, ConfigDefault, ConfigException} 7 | import org.embulk.output.s3_parquet.aws.HttpProxy.Task 8 | 9 | object HttpProxy { 10 | 11 | trait Task { 12 | 13 | @Config("host") 14 | @ConfigDefault("null") 15 | def getHost: Optional[String] 16 | 17 | @Config("port") 18 | @ConfigDefault("null") 19 | def getPort: Optional[Int] 20 | 21 | @Config("protocol") 22 | @ConfigDefault("\"https\"") 23 | def getProtocol: String 24 | 25 | @Config("user") 26 | @ConfigDefault("null") 27 | def getUser: Optional[String] 28 | 29 | @Config("password") 30 | @ConfigDefault("null") 31 | def getPassword: Optional[String] 32 | 33 | } 34 | 35 | def apply(task: Task): HttpProxy = { 36 | new HttpProxy(task) 37 | } 38 | 39 | } 40 | 41 | class HttpProxy(task: Task) { 42 | 43 | def configureClientConfiguration(cc: ClientConfiguration): Unit = { 44 | task.getHost.ifPresent(v => cc.setProxyHost(v)) 45 | task.getPort.ifPresent(v => cc.setProxyPort(v)) 46 | 47 | Protocol.values.find(p => p.name().equals(task.getProtocol)) match { 48 | case Some(v) => 49 | cc.setProtocol(v) 50 | case None => 51 | throw new ConfigException( 52 | s"'${task.getProtocol}' is unsupported: `protocol` must be one of [${Protocol.values 53 | .map(v => s"'$v'") 54 | .mkString(", ")}]." 55 | ) 56 | } 57 | 58 | task.getUser.ifPresent(v => cc.setProxyUsername(v)) 59 | task.getPassword.ifPresent(v => cc.setProxyPassword(v)) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/aws/AwsEndpointConfiguration.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.aws 2 | 3 | import java.util.Optional 4 | 5 | import com.amazonaws.client.builder.AwsClientBuilder 6 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration 7 | import com.amazonaws.regions.{DefaultAwsRegionProviderChain, Regions} 8 | import org.embulk.config.{Config, ConfigDefault} 9 | import org.embulk.output.s3_parquet.aws.AwsEndpointConfiguration.Task 10 | 11 | import scala.util.Try 12 | 13 | object AwsEndpointConfiguration { 14 | 15 | trait Task { 16 | 17 | @Config("endpoint") 18 | @ConfigDefault("null") 19 | def getEndpoint: Optional[String] 20 | 21 | @Config("region") 22 | @ConfigDefault("null") 23 | def getRegion: Optional[String] 24 | 25 | } 26 | 27 | def apply(task: Task): AwsEndpointConfiguration = { 28 | new AwsEndpointConfiguration(task) 29 | } 30 | } 31 | 32 | class AwsEndpointConfiguration(task: Task) { 33 | 34 | def configureAwsClientBuilder[S <: AwsClientBuilder[S, T], T]( 35 | builder: AwsClientBuilder[S, T] 36 | ): Unit = { 37 | if (task.getRegion.isPresent && task.getEndpoint.isPresent) { 38 | val ec = 39 | new EndpointConfiguration(task.getEndpoint.get, task.getRegion.get) 40 | builder.setEndpointConfiguration(ec) 41 | } 42 | else if (task.getRegion.isPresent && !task.getEndpoint.isPresent) { 43 | builder.setRegion(task.getRegion.get) 44 | } 45 | else if (!task.getRegion.isPresent && task.getEndpoint.isPresent) { 46 | val r: String = Try(new DefaultAwsRegionProviderChain().getRegion) 47 | 
.getOrElse(Regions.DEFAULT_REGION.getName) 48 | val e: String = task.getEndpoint.get 49 | val ec = new EndpointConfiguration(e, r) 50 | builder.setEndpointConfiguration(ec) 51 | } 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/aws/Aws.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.aws 2 | 3 | import com.amazonaws.client.builder.AwsClientBuilder 4 | import com.amazonaws.services.glue.{AWSGlue, AWSGlueClientBuilder} 5 | import com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder} 6 | import com.amazonaws.services.s3.transfer.{ 7 | TransferManager, 8 | TransferManagerBuilder 9 | } 10 | 11 | object Aws { 12 | 13 | trait Task 14 | extends AwsCredentials.Task 15 | with AwsEndpointConfiguration.Task 16 | with AwsClientConfiguration.Task 17 | with AwsS3Configuration.Task 18 | 19 | def apply(task: Task): Aws = { 20 | new Aws(task) 21 | } 22 | 23 | } 24 | 25 | class Aws(task: Aws.Task) { 26 | 27 | def withS3[A](f: AmazonS3 => A): A = { 28 | val builder: AmazonS3ClientBuilder = AmazonS3ClientBuilder.standard() 29 | AwsS3Configuration(task).configureAmazonS3ClientBuilder(builder) 30 | val svc = createService(builder) 31 | try f(svc) 32 | finally svc.shutdown() 33 | } 34 | 35 | def withTransferManager[A](f: TransferManager => A): A = { 36 | withS3 { s3 => 37 | val svc = TransferManagerBuilder.standard().withS3Client(s3).build() 38 | try f(svc) 39 | finally svc.shutdownNow(false) 40 | } 41 | } 42 | 43 | def withGlue[A](f: AWSGlue => A): A = { 44 | val builder: AWSGlueClientBuilder = AWSGlueClientBuilder.standard() 45 | val svc = createService(builder) 46 | try f(svc) 47 | finally svc.shutdown() 48 | } 49 | 50 | def createService[S <: AwsClientBuilder[S, T], T]( 51 | builder: AwsClientBuilder[S, T] 52 | ): T = { 53 | AwsEndpointConfiguration(task).configureAwsClientBuilder(builder) 54 | AwsClientConfiguration(task).configureAwsClientBuilder(builder) 55 | builder.setCredentials(AwsCredentials(task).createAwsCredentialsProvider) 56 | 57 | builder.build() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/implicits.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import java.util.{Optional, Iterator => JIterator, List => JList, Map => JMap} 4 | 5 | import com.google.common.base.{Optional => GoogleOptional} 6 | 7 | import scala.jdk.CollectionConverters._ 8 | import scala.language.implicitConversions 9 | 10 | case object implicits { 11 | implicit def JList2Seq[A](a: JList[A]): Seq[A] = a.asScala.toSeq 12 | implicit def Seq2JList[A](a: Seq[A]): JList[A] = a.asJava 13 | implicit def JIte2Ite[A](a: JIterator[A]): Iterator[A] = a.asScala 14 | implicit def Ite2JIte[A](a: Iterator[A]): JIterator[A] = a.asJava 15 | 16 | implicit def OptionalJList2OptionSeq[A]( 17 | a: Optional[JList[A]] 18 | ): Option[Seq[A]] = a.map(JList2Seq(_)) 19 | 20 | implicit def OptionSeq2OptionalJList[A]( 21 | a: Option[Seq[A]] 22 | ): Optional[JList[A]] = a.map(Seq2JList) 23 | implicit def JMap2Map[K, V](a: JMap[K, V]): Map[K, V] = a.asScala.toMap 24 | implicit def Map2JMap[K, V](a: Map[K, V]): JMap[K, V] = a.asJava 25 | 26 | implicit def OptionalJMap2OptionMap[K, V]( 27 | a: Optional[JMap[K, V]] 28 | ): Option[Map[K, V]] = a.map(JMap2Map(_)) 29 | 30 | implicit def 
OptionMap2Optional2JMap[K, V]( 31 | a: Option[Map[K, V]] 32 | ): Optional[JMap[K, V]] = a.map(Map2JMap) 33 | 34 | implicit def Optional2Option[A](a: Optional[A]): Option[A] = 35 | if (a.isPresent) Some(a.get()) else None 36 | 37 | implicit def Option2Optional[A](a: Option[A]): Optional[A] = a match { 38 | case Some(v) => Optional.of(v) 39 | case None => Optional.empty() 40 | } 41 | 42 | implicit def GoogleOptional2Option[A](a: GoogleOptional[A]): Option[A] = 43 | Option(a.orNull()) 44 | 45 | implicit def Option2GoogleOptional[A](a: Option[A]): GoogleOptional[A] = 46 | a match { 47 | case Some(v) => GoogleOptional.of(v) 48 | case None => GoogleOptional.absent() 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPluginConfigException.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import org.embulk.config.ConfigException 4 | import org.embulk.spi.Schema 5 | import org.embulk.spi.`type`.Types 6 | 7 | class TestS3ParquetOutputPluginConfigException extends EmbulkPluginTestHelper { 8 | 9 | test( 10 | "Throw ConfigException when un-convertible types are defined in type_options" 11 | ) { 12 | val schema = Schema.builder().add("c0", Types.STRING).build() 13 | val data: Seq[Seq[String]] = Seq( 14 | Seq("a") 15 | ) 16 | val cfg = newDefaultConfig.merge( 17 | loadConfigSourceFromYamlString(""" 18 | |type_options: 19 | | string: 20 | | logical_type: "timestamp-millis" 21 | |""".stripMargin) 22 | ) 23 | val caught = intercept[ConfigException](runOutput(cfg, schema, data)) 24 | assert(caught.isInstanceOf[ConfigException]) 25 | assert(caught.getMessage.startsWith("Unsupported column type: ")) 26 | } 27 | 28 | test( 29 | "Throw ConfigException when un-convertible types are defined in column_options" 30 | ) { 31 | val schema = Schema.builder().add("c0", Types.STRING).build() 32 | val data: Seq[Seq[String]] = Seq( 33 | Seq("a") 34 | ) 35 | val cfg = newDefaultConfig.merge( 36 | loadConfigSourceFromYamlString(""" 37 | |column_options: 38 | | c0: 39 | | logical_type: "timestamp-millis" 40 | |""".stripMargin) 41 | ) 42 | val caught = intercept[ConfigException](runOutput(cfg, schema, data)) 43 | assert(caught.isInstanceOf[ConfigException]) 44 | assert(caught.getMessage.startsWith("Unsupported column type: ")) 45 | 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/S3ParquetPageOutput.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import java.io.File 4 | import java.nio.file.{Files, Paths} 5 | 6 | import com.amazonaws.services.s3.transfer.{TransferManager, Upload} 7 | import com.amazonaws.services.s3.transfer.model.UploadResult 8 | import org.apache.parquet.hadoop.ParquetWriter 9 | import org.embulk.config.TaskReport 10 | import org.embulk.output.s3_parquet.aws.Aws 11 | import org.embulk.spi.{Exec, Page, PageReader, TransactionalPageOutput} 12 | 13 | case class S3ParquetPageOutput( 14 | outputLocalFile: String, 15 | reader: PageReader, 16 | writer: ParquetWriter[PageReader], 17 | aws: Aws, 18 | destBucket: String, 19 | destKey: String 20 | ) extends TransactionalPageOutput { 21 | 22 | private var isClosed: Boolean = false 23 | 24 | override def add(page: Page): Unit = { 25 | reader.setPage(page) 26 | while 
(reader.nextRecord()) { 27 | ContextClassLoaderSwapper.usingPluginClass { 28 | writer.write(reader) 29 | } 30 | } 31 | } 32 | 33 | override def finish(): Unit = {} 34 | 35 | override def close(): Unit = { 36 | synchronized { 37 | if (!isClosed) { 38 | ContextClassLoaderSwapper.usingPluginClass { 39 | writer.close() 40 | } 41 | isClosed = true 42 | } 43 | } 44 | } 45 | 46 | override def abort(): Unit = { 47 | close() 48 | cleanup() 49 | } 50 | 51 | override def commit(): TaskReport = { 52 | close() 53 | val result: UploadResult = ContextClassLoaderSwapper.usingPluginClass { 54 | aws.withTransferManager { xfer: TransferManager => 55 | val upload: Upload = 56 | xfer.upload(destBucket, destKey, new File(outputLocalFile)) 57 | upload.waitForUploadResult() 58 | } 59 | } 60 | cleanup() 61 | Exec 62 | .newTaskReport() 63 | .set("bucket", result.getBucketName) 64 | .set("key", result.getKey) 65 | .set("etag", result.getETag) 66 | .set("version_id", result.getVersionId) 67 | } 68 | 69 | private def cleanup(): Unit = { 70 | Files.delete(Paths.get(outputLocalFile)) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/aws/AwsS3Configuration.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.aws 2 | 3 | import java.util.Optional 4 | 5 | import com.amazonaws.services.s3.AmazonS3ClientBuilder 6 | import org.embulk.config.{Config, ConfigDefault} 7 | import org.embulk.output.s3_parquet.aws.AwsS3Configuration.Task 8 | 9 | /* 10 | * These are advanced settings, so write no documentation. 11 | */ 12 | object AwsS3Configuration { 13 | 14 | trait Task { 15 | 16 | @Config("accelerate_mode_enabled") 17 | @ConfigDefault("null") 18 | def getAccelerateModeEnabled: Optional[Boolean] 19 | 20 | @Config("chunked_encoding_disabled") 21 | @ConfigDefault("null") 22 | def getChunkedEncodingDisabled: Optional[Boolean] 23 | 24 | @Config("dualstack_enabled") 25 | @ConfigDefault("null") 26 | def getDualstackEnabled: Optional[Boolean] 27 | 28 | @Config("force_global_bucket_access_enabled") 29 | @ConfigDefault("null") 30 | def getForceGlobalBucketAccessEnabled: Optional[Boolean] 31 | 32 | @Config("path_style_access_enabled") 33 | @ConfigDefault("null") 34 | def getPathStyleAccessEnabled: Optional[Boolean] 35 | 36 | @Config("payload_signing_enabled") 37 | @ConfigDefault("null") 38 | def getPayloadSigningEnabled: Optional[Boolean] 39 | 40 | } 41 | 42 | def apply(task: Task): AwsS3Configuration = { 43 | new AwsS3Configuration(task) 44 | } 45 | } 46 | 47 | class AwsS3Configuration(task: Task) { 48 | 49 | def configureAmazonS3ClientBuilder(builder: AmazonS3ClientBuilder): Unit = { 50 | task.getAccelerateModeEnabled.ifPresent(v => 51 | builder.setAccelerateModeEnabled(v) 52 | ) 53 | task.getChunkedEncodingDisabled.ifPresent(v => 54 | builder.setChunkedEncodingDisabled(v) 55 | ) 56 | task.getDualstackEnabled.ifPresent(v => builder.setDualstackEnabled(v)) 57 | task.getForceGlobalBucketAccessEnabled.ifPresent(v => 58 | builder.setForceGlobalBucketAccessEnabled(v) 59 | ) 60 | task.getPathStyleAccessEnabled.ifPresent(v => 61 | builder.setPathStyleAccessEnabled(v) 62 | ) 63 | task.getPayloadSigningEnabled.ifPresent(v => 64 | builder.setPayloadSigningEnabled(v) 65 | ) 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/MockParquetRecordConsumer.scala: 
-------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.io.api.{Binary, RecordConsumer} 4 | 5 | case class MockParquetRecordConsumer() extends RecordConsumer { 6 | case class Data private (messages: Seq[Message] = Seq()) { 7 | def toData: Seq[Seq[Any]] = messages.map(_.toData) 8 | } 9 | case class Message private (fields: Seq[Field] = Seq()) { 10 | def toData: Seq[Any] = { 11 | val maxIndex: Int = fields.maxBy(_.index).index 12 | val raw: Map[Int, Any] = fields.map(f => f.index -> f.value).toMap 13 | 0.to(maxIndex).map(idx => raw.get(idx).orNull) 14 | } 15 | } 16 | case class Field private (index: Int = 0, value: Any = null) 17 | 18 | private var _data: Data = Data() 19 | private var _message: Message = Message() 20 | private var _field: Field = Field() 21 | 22 | override def startMessage(): Unit = _message = Message() 23 | override def endMessage(): Unit = 24 | _data = _data.copy(messages = _data.messages :+ _message) 25 | override def startField(field: String, index: Int): Unit = 26 | _field = Field(index = index) 27 | override def endField(field: String, index: Int): Unit = 28 | _message = _message.copy(fields = _message.fields :+ _field) 29 | override def startGroup(): Unit = throw new UnsupportedOperationException 30 | override def endGroup(): Unit = throw new UnsupportedOperationException 31 | override def addInteger(value: Int): Unit = 32 | _field = _field.copy(value = value) 33 | override def addLong(value: Long): Unit = _field = _field.copy(value = value) 34 | override def addBoolean(value: Boolean): Unit = 35 | _field = _field.copy(value = value) 36 | override def addBinary(value: Binary): Unit = 37 | _field = _field.copy(value = value) 38 | override def addFloat(value: Float): Unit = 39 | _field = _field.copy(value = value) 40 | override def addDouble(value: Double): Unit = 41 | _field = _field.copy(value = value) 42 | 43 | def writingMessage(f: => Unit): Unit = { 44 | startMessage() 45 | f 46 | endMessage() 47 | } 48 | def writingField(field: String, index: Int)(f: => Unit): Unit = { 49 | startField(field, index) 50 | f 51 | endField(field, index) 52 | } 53 | def writingSampleField(f: => Unit): Unit = { 54 | writingMessage { 55 | writingField("a", 0)(f) 56 | } 57 | } 58 | def data: Seq[Seq[Any]] = _data.toData 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/DateLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.{Duration, Instant} 4 | 5 | import org.apache.parquet.io.api.RecordConsumer 6 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 7 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 8 | import org.embulk.config.ConfigException 9 | import org.embulk.output.s3_parquet.catalog.GlueDataType 10 | import org.embulk.spi.`type`.{ 11 | BooleanType, 12 | DoubleType, 13 | JsonType, 14 | LongType, 15 | StringType, 16 | TimestampType 17 | } 18 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 19 | import org.embulk.spi.Column 20 | import org.msgpack.value.Value 21 | 22 | object DateLogicalType extends ParquetColumnType { 23 | override def primitiveType(column: Column): PrimitiveType = { 24 | column.getType match { 25 | case _: LongType | _: TimestampType => 26 | Types 27 | 
.optional(PrimitiveTypeName.INT32) 28 | .as(LogicalTypeAnnotation.dateType()) 29 | .named(column.getName) 30 | case _: BooleanType | _: DoubleType | _: StringType | _: JsonType | _ => 31 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 32 | } 33 | } 34 | 35 | override def glueDataType(column: Column): GlueDataType = 36 | column.getType match { 37 | case _: LongType | _: TimestampType => GlueDataType.DATE 38 | case _: BooleanType | _: DoubleType | _: StringType | _: JsonType | _ => 39 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 40 | } 41 | 42 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 43 | throw newUnsupportedMethodException("consumeBoolean") 44 | 45 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 46 | throw newUnsupportedMethodException("consumeString") 47 | 48 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 49 | consumeLongAsInteger(consumer, v) 50 | 51 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 52 | throw newUnsupportedMethodException("consumeDouble") 53 | 54 | override def consumeTimestamp( 55 | consumer: RecordConsumer, 56 | v: Timestamp, 57 | formatter: TimestampFormatter 58 | ): Unit = 59 | consumeLongAsInteger( 60 | consumer, 61 | Duration.between(Instant.EPOCH, v.getInstant).toDays 62 | ) 63 | 64 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 65 | throw newUnsupportedMethodException("consumeJson") 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/JsonLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | import org.apache.parquet.io.api.{Binary, RecordConsumer} 3 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 4 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 5 | import org.embulk.config.ConfigException 6 | import org.embulk.output.s3_parquet.catalog.GlueDataType 7 | import org.embulk.spi.Column 8 | import org.embulk.spi.`type`.{ 9 | BooleanType, 10 | DoubleType, 11 | JsonType, 12 | LongType, 13 | StringType, 14 | TimestampType 15 | } 16 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 17 | import org.msgpack.value.{Value, ValueFactory} 18 | import org.slf4j.{Logger, LoggerFactory} 19 | 20 | object JsonLogicalType extends ParquetColumnType { 21 | private val logger: Logger = LoggerFactory.getLogger(JsonLogicalType.getClass) 22 | override def primitiveType(column: Column): PrimitiveType = 23 | column.getType match { 24 | case _: BooleanType | _: LongType | _: DoubleType | _: StringType | 25 | _: JsonType => 26 | Types 27 | .optional(PrimitiveTypeName.BINARY) 28 | .as(LogicalTypeAnnotation.jsonType()) 29 | .named(column.getName) 30 | case _: TimestampType | _ => 31 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 32 | } 33 | 34 | override def glueDataType(column: Column): GlueDataType = 35 | column.getType match { 36 | case _: BooleanType | _: LongType | _: DoubleType | _: StringType | 37 | _: JsonType => 38 | warningWhenConvertingJsonToGlueType(GlueDataType.STRING) 39 | GlueDataType.STRING 40 | case _: TimestampType | _ => 41 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 42 | } 43 | 44 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 45 | consumeJson(consumer, 
ValueFactory.newBoolean(v)) 46 | 47 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 48 | consumeJson(consumer, ValueFactory.newString(v)) 49 | 50 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 51 | consumeJson(consumer, ValueFactory.newInteger(v)) 52 | 53 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 54 | consumeJson(consumer, ValueFactory.newFloat(v)) 55 | 56 | override def consumeTimestamp( 57 | consumer: RecordConsumer, 58 | v: Timestamp, 59 | formatter: TimestampFormatter 60 | ): Unit = throw newUnsupportedMethodException("consumeTimestamp") 61 | 62 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 63 | consumer.addBinary(Binary.fromString(v.toJson)) 64 | 65 | private def warningWhenConvertingJsonToGlueType( 66 | glueType: GlueDataType 67 | ): Unit = { 68 | logger.warn( 69 | s"json is converted" + 70 | s" to Glue ${glueType.name} but this is not represented correctly, because Glue" + 71 | s" does not support json type. Please use `catalog.column_options` to define the type." 72 | ) 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/DefaultColumnType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.io.api.{Binary, RecordConsumer} 4 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 5 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 6 | import org.embulk.config.ConfigException 7 | import org.embulk.output.s3_parquet.catalog.GlueDataType 8 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 9 | import org.embulk.spi.Column 10 | import org.embulk.spi.`type`.{ 11 | BooleanType, 12 | DoubleType, 13 | JsonType, 14 | LongType, 15 | StringType, 16 | TimestampType 17 | } 18 | import org.msgpack.value.Value 19 | 20 | object DefaultColumnType extends ParquetColumnType { 21 | override def primitiveType(column: Column): PrimitiveType = 22 | column.getType match { 23 | case _: BooleanType => 24 | Types.optional(PrimitiveTypeName.BOOLEAN).named(column.getName) 25 | case _: LongType => 26 | Types.optional(PrimitiveTypeName.INT64).named(column.getName) 27 | case _: DoubleType => 28 | Types.optional(PrimitiveTypeName.DOUBLE).named(column.getName) 29 | case _: StringType => 30 | Types 31 | .optional(PrimitiveTypeName.BINARY) 32 | .as(LogicalTypeAnnotation.stringType()) 33 | .named(column.getName) 34 | case _: TimestampType => 35 | Types 36 | .optional(PrimitiveTypeName.BINARY) 37 | .as(LogicalTypeAnnotation.stringType()) 38 | .named(column.getName) 39 | case _: JsonType => 40 | Types 41 | .optional(PrimitiveTypeName.BINARY) 42 | .as(LogicalTypeAnnotation.stringType()) 43 | .named(column.getName) 44 | case _ => 45 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 46 | } 47 | 48 | override def glueDataType(column: Column): GlueDataType = 49 | column.getType match { 50 | case _: BooleanType => 51 | GlueDataType.BOOLEAN 52 | case _: LongType => 53 | GlueDataType.BIGINT 54 | case _: DoubleType => 55 | GlueDataType.DOUBLE 56 | case _: StringType | _: TimestampType | _: JsonType => 57 | GlueDataType.STRING 58 | case _ => 59 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 60 | } 61 | 62 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 63 | 
consumer.addBoolean(v) 64 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 65 | consumer.addBinary(Binary.fromString(v)) 66 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 67 | consumer.addLong(v) 68 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 69 | consumer.addDouble(v) 70 | override def consumeTimestamp( 71 | consumer: RecordConsumer, 72 | v: Timestamp, 73 | formatter: TimestampFormatter 74 | ): Unit = consumer.addBinary(Binary.fromString(formatter.format(v))) 75 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 76 | consumer.addBinary(Binary.fromString(v.toJson)) 77 | } 78 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto init 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto init 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :init 68 | @rem Get command-line arguments, handling Windows variants 69 | 70 | if not "%OS%" == "Windows_NT" goto win9xME_args 71 | 72 | :win9xME_args 73 | @rem Slurp the command line arguments. 
74 | set CMD_LINE_ARGS= 75 | set _SKIP=2 76 | 77 | :win9xME_args_slurp 78 | if "x%~1" == "x" goto execute 79 | 80 | set CMD_LINE_ARGS=%* 81 | 82 | :execute 83 | @rem Setup the command line 84 | 85 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 86 | 87 | @rem Execute Gradle 88 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 89 | 90 | :end 91 | @rem End local scope for the variables with windows NT shell 92 | if "%ERRORLEVEL%"=="0" goto mainEnd 93 | 94 | :fail 95 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 96 | rem the _cmd.exe /c_ return code! 97 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 98 | exit /b 1 99 | 100 | :mainEnd 101 | if "%OS%"=="Windows_NT" endlocal 102 | 103 | :omega 104 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/LogicalTypeProxy.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.ZoneId 4 | import java.util.Locale 5 | 6 | import org.apache.parquet.io.api.RecordConsumer 7 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit 8 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS 9 | import org.apache.parquet.schema.PrimitiveType 10 | import org.embulk.config.ConfigException 11 | import org.embulk.output.s3_parquet.catalog.GlueDataType 12 | import org.embulk.spi.Column 13 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 14 | import org.msgpack.value.Value 15 | 16 | object LogicalTypeProxy { 17 | private val DEFAULT_SCALE: Int = 0 18 | private val DEFAULT_BID_WIDTH: Int = 64 19 | private val DEFAULT_IS_SIGNED: Boolean = true 20 | private val DEFAULT_IS_ADJUSTED_TO_UTC: Boolean = true 21 | private val DEFAULT_TIME_UNIT: TimeUnit = MILLIS 22 | private val DEFAULT_TIME_ZONE: ZoneId = ZoneId.of("UTC") 23 | } 24 | 25 | case class LogicalTypeProxy( 26 | name: String, 27 | scale: Option[Int] = None, 28 | precision: Option[Int] = None, 29 | bitWidth: Option[Int] = None, 30 | isSigned: Option[Boolean] = None, 31 | isAdjustedToUtc: Option[Boolean] = None, 32 | timeUnit: Option[TimeUnit] = None, 33 | timeZone: Option[ZoneId] = None 34 | ) extends ParquetColumnType { 35 | private def getScale: Int = scale.getOrElse(LogicalTypeProxy.DEFAULT_SCALE) 36 | private def getPrecision: Int = precision.getOrElse { 37 | throw new ConfigException("\"precision\" must be set.") 38 | } 39 | private def getBidWith: Int = 40 | bitWidth.getOrElse(LogicalTypeProxy.DEFAULT_BID_WIDTH) 41 | private def getIsSigned: Boolean = 42 | isSigned.getOrElse(LogicalTypeProxy.DEFAULT_IS_SIGNED) 43 | private def getIsAdjustedToUtc: Boolean = 44 | isAdjustedToUtc.getOrElse(LogicalTypeProxy.DEFAULT_IS_ADJUSTED_TO_UTC) 45 | private def getTimeUnit: TimeUnit = 46 | timeUnit.getOrElse(LogicalTypeProxy.DEFAULT_TIME_UNIT) 47 | private def getTimeZone: ZoneId = 48 | timeZone.getOrElse(LogicalTypeProxy.DEFAULT_TIME_ZONE) 49 | 50 | lazy val logicalType: ParquetColumnType = { 51 | name.toUpperCase(Locale.ENGLISH) match { 52 | case "INT" => IntLogicalType(getBidWith, getIsSigned) 53 | case "TIMESTAMP" => 54 | TimestampLogicalType(getIsAdjustedToUtc, getTimeUnit, getTimeZone) 55 | case "TIME" => 56 | TimeLogicalType(getIsAdjustedToUtc, getTimeUnit, getTimeZone) 57 | case "DECIMAL" => 
DecimalLogicalType(getScale, getPrecision) 58 | case "DATE" => DateLogicalType 59 | case "JSON" => JsonLogicalType 60 | case _ => 61 | throw new ConfigException(s"Unsupported logical_type.name: $name.") 62 | } 63 | } 64 | 65 | override def primitiveType(column: Column): PrimitiveType = 66 | logicalType.primitiveType(column) 67 | override def glueDataType(column: Column): GlueDataType = 68 | logicalType.glueDataType(column) 69 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 70 | logicalType.consumeBoolean(consumer, v) 71 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 72 | logicalType.consumeString(consumer, v) 73 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 74 | logicalType.consumeLong(consumer, v) 75 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 76 | logicalType.consumeDouble(consumer, v) 77 | override def consumeTimestamp( 78 | consumer: RecordConsumer, 79 | v: Timestamp, 80 | formatter: TimestampFormatter 81 | ): Unit = logicalType.consumeTimestamp(consumer, v, formatter) 82 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 83 | logicalType.consumeJson(consumer, v) 84 | } 85 | -------------------------------------------------------------------------------- /gradle/dependency-locks/embulkPluginRuntime.lockfile: -------------------------------------------------------------------------------- 1 | # This is a Gradle generated file for dependency locking. 2 | # Manual edits can break the build and are not advised. 3 | # This file is expected to be part of source control. 4 | asm:asm:3.1 5 | ch.qos.reload4j:reload4j:1.2.19 6 | com.amazonaws:aws-java-sdk-core:1.11.769 7 | com.amazonaws:aws-java-sdk-glue:1.11.769 8 | com.amazonaws:aws-java-sdk-kms:1.11.769 9 | com.amazonaws:aws-java-sdk-s3:1.11.769 10 | com.amazonaws:aws-java-sdk-sts:1.11.769 11 | com.amazonaws:jmespath-java:1.11.769 12 | com.fasterxml.jackson.dataformat:jackson-dataformat-cbor:2.6.7 13 | com.fasterxml.woodstox:woodstox-core:5.3.0 14 | com.github.spotbugs:spotbugs-annotations:3.1.9 15 | com.github.stephenc.jcip:jcip-annotations:1.0-1 16 | com.google.code.findbugs:jsr305:3.0.2 17 | com.google.code.gson:gson:2.2.4 18 | com.google.protobuf:protobuf-java:2.5.0 19 | com.jamesmurty.utils:java-xmlbuilder:0.4 20 | com.jcraft:jsch:0.1.55 21 | com.nimbusds:nimbus-jose-jwt:7.9 22 | com.sun.jersey:jersey-core:1.9 23 | com.sun.jersey:jersey-json:1.9 24 | com.sun.jersey:jersey-server:1.9 25 | com.sun.xml.bind:jaxb-impl:2.2.3-1 26 | com.thoughtworks.paranamer:paranamer:2.3 27 | commons-beanutils:commons-beanutils:1.9.4 28 | commons-cli:commons-cli:1.2 29 | commons-codec:commons-codec:1.11 30 | commons-collections:commons-collections:3.2.2 31 | commons-configuration:commons-configuration:1.6 32 | commons-digester:commons-digester:1.8 33 | commons-io:commons-io:2.5 34 | commons-lang:commons-lang:2.6 35 | commons-logging:commons-logging:1.2 36 | commons-net:commons-net:3.1 37 | commons-pool:commons-pool:1.6 38 | io.netty:netty:3.10.6.Final 39 | javax.activation:activation:1.1 40 | javax.annotation:javax.annotation-api:1.3.2 41 | javax.servlet.jsp:jsp-api:2.1 42 | javax.servlet:servlet-api:2.5 43 | javax.xml.bind:jaxb-api:2.2.2 44 | javax.xml.stream:stax-api:1.0-2 45 | jline:jline:0.9.94 46 | log4j:log4j:1.2.17 47 | net.java.dev.jets3t:jets3t:0.9.0 48 | net.minidev:accessors-smart:1.2 49 | net.minidev:json-smart:2.3 50 | org.apache.avro:avro:1.7.7 51 | org.apache.commons:commons-compress:1.21 52 | 
org.apache.commons:commons-math3:3.1.1 53 | org.apache.curator:curator-client:2.13.0 54 | org.apache.curator:curator-framework:2.13.0 55 | org.apache.curator:curator-recipes:2.13.0 56 | org.apache.directory.api:api-asn1-api:1.0.0-M20 57 | org.apache.directory.api:api-util:1.0.0-M20 58 | org.apache.directory.server:apacheds-i18n:2.0.0-M15 59 | org.apache.directory.server:apacheds-kerberos-codec:2.0.0-M15 60 | org.apache.hadoop:hadoop-annotations:2.10.2 61 | org.apache.hadoop:hadoop-auth:2.10.2 62 | org.apache.hadoop:hadoop-common:2.10.2 63 | org.apache.htrace:htrace-core4:4.1.0-incubating 64 | org.apache.httpcomponents:httpclient:4.5.13 65 | org.apache.httpcomponents:httpcore:4.4.13 66 | org.apache.parquet:parquet-column:1.11.0 67 | org.apache.parquet:parquet-common:1.11.0 68 | org.apache.parquet:parquet-encoding:1.11.0 69 | org.apache.parquet:parquet-format-structures:1.11.0 70 | org.apache.parquet:parquet-format:2.7.0 71 | org.apache.parquet:parquet-hadoop:1.11.0 72 | org.apache.parquet:parquet-jackson:1.11.0 73 | org.apache.yetus:audience-annotations:0.11.0 74 | org.apache.zookeeper:zookeeper:3.4.14 75 | org.codehaus.jackson:jackson-core-asl:1.9.13 76 | org.codehaus.jackson:jackson-jaxrs:1.8.3 77 | org.codehaus.jackson:jackson-mapper-asl:1.9.13 78 | org.codehaus.jackson:jackson-xc:1.8.3 79 | org.codehaus.jettison:jettison:1.1 80 | org.codehaus.woodstox:stax2-api:4.2.1 81 | org.mortbay.jetty:jetty-sslengine:6.1.26 82 | org.mortbay.jetty:jetty-util:6.1.26 83 | org.mortbay.jetty:jetty:6.1.26 84 | org.mortbay.jetty:servlet-api:2.5-20081211 85 | org.ow2.asm:asm:5.0.4 86 | org.scala-lang:scala-library:2.13.1 87 | org.slf4j:slf4j-log4j12:1.7.25 88 | org.slf4j:slf4j-reload4j:1.7.36 89 | org.xerial.snappy:snappy-java:1.1.7.3 90 | software.amazon.ion:ion-java:1.0.2 91 | xmlenc:xmlenc:0.52 92 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/TimestampLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.ZoneId 4 | 5 | import org.apache.parquet.io.api.RecordConsumer 6 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 7 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit 8 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.{ 9 | MICROS, 10 | MILLIS, 11 | NANOS 12 | } 13 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 14 | import org.embulk.config.ConfigException 15 | import org.embulk.output.s3_parquet.catalog.GlueDataType 16 | import org.embulk.spi.`type`.{ 17 | BooleanType, 18 | DoubleType, 19 | JsonType, 20 | LongType, 21 | StringType, 22 | TimestampType 23 | } 24 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 25 | import org.embulk.spi.Column 26 | import org.msgpack.value.Value 27 | import org.slf4j.{Logger, LoggerFactory} 28 | 29 | case class TimestampLogicalType( 30 | isAdjustedToUtc: Boolean, 31 | timeUnit: TimeUnit, 32 | timeZone: ZoneId 33 | ) extends ParquetColumnType { 34 | private val logger: Logger = 35 | LoggerFactory.getLogger(classOf[TimestampLogicalType]) 36 | 37 | override def primitiveType(column: Column): PrimitiveType = 38 | column.getType match { 39 | case _: LongType | _: TimestampType => 40 | Types 41 | .optional(PrimitiveTypeName.INT64) 42 | .as(LogicalTypeAnnotation.timestampType(isAdjustedToUtc, timeUnit)) 43 | .named(column.getName) 44 | case _: BooleanType | _: DoubleType | _: 
StringType | _: JsonType | _ => 45 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 46 | } 47 | 48 | override def glueDataType(column: Column): GlueDataType = 49 | column.getType match { 50 | case _: LongType | _: TimestampType => 51 | timeUnit match { 52 | case MILLIS => GlueDataType.TIMESTAMP 53 | case MICROS | NANOS => 54 | warningWhenConvertingTimestampToGlueType(GlueDataType.BIGINT) 55 | GlueDataType.BIGINT 56 | } 57 | case _: BooleanType | _: DoubleType | _: StringType | _: JsonType | _ => 58 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 59 | } 60 | 61 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 62 | throw newUnsupportedMethodException("consumeBoolean") 63 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 64 | throw newUnsupportedMethodException("consumeString") 65 | 66 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 67 | consumer.addLong(v) 68 | 69 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 70 | throw newUnsupportedMethodException("consumeDouble") 71 | 72 | override def consumeTimestamp( 73 | consumer: RecordConsumer, 74 | v: Timestamp, 75 | formatter: TimestampFormatter 76 | ): Unit = timeUnit match { 77 | case MILLIS => consumer.addLong(v.toEpochMilli) 78 | case MICROS => 79 | consumer.addLong(v.getEpochSecond * 1_000_000L + (v.getNano / 1_000L)) 80 | case NANOS => 81 | consumer.addLong(v.getEpochSecond * 1_000_000_000L + v.getNano) 82 | } 83 | 84 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 85 | throw newUnsupportedMethodException("consumeJson") 86 | 87 | private def warningWhenConvertingTimestampToGlueType( 88 | glueType: GlueDataType 89 | ): Unit = 90 | logger.warn( 91 | s"timestamp(isAdjustedToUtc = $isAdjustedToUtc, timeUnit = $timeUnit) is converted" + 92 | s" to Glue ${glueType.name} but this is not represented correctly, because Glue" + 93 | s" does not support time type. Please use `catalog.column_options` to define the type." 94 | ) 95 | } 96 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/catalog/GlueDataType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.catalog 2 | 3 | // https://docs.aws.amazon.com/athena/latest/ug/data-types.html 4 | 5 | sealed abstract class GlueDataType(val name: String) 6 | object GlueDataType { 7 | sealed abstract class AbstractIntGlueDataType(name: String, val bitWidth: Int) 8 | extends GlueDataType(name) 9 | 10 | // BOOLEAN – Values are true and false. 11 | case object BOOLEAN extends GlueDataType("boolean") 12 | // TINYINT – A 8-bit signed INTEGER in two’s complement format, with a minimum value of -27 and a maximum value of 27-1. 13 | case object TINYINT extends AbstractIntGlueDataType("tinyint", bitWidth = 8) 14 | // SMALLINT – A 16-bit signed INTEGER in two’s complement format, with a minimum value of -215 and a maximum value of 215-1. 15 | case object SMALLINT 16 | extends AbstractIntGlueDataType("smallint", bitWidth = 16) 17 | // INT and INTEGER – Athena combines two different implementations of the integer data type, as follows: 18 | // * INT – In Data Definition Language (DDL) queries, Athena uses the INT data type. 19 | // * INTEGER – In DML queries, Athena uses the INTEGER data type. 
INTEGER is represented as a 32-bit signed value in two's complement format, with a minimum value of -2^31 and a maximum value of 2^31-1. 20 | case object INT extends AbstractIntGlueDataType("int", bitWidth = 32) 21 | // BIGINT – A 64-bit signed INTEGER in two’s complement format, with a minimum value of -2^63 and a maximum value of 2^63-1. 22 | case object BIGINT extends AbstractIntGlueDataType("bigint", bitWidth = 64) 23 | // DOUBLE – A 64-bit double-precision floating point number. 24 | case object DOUBLE extends GlueDataType("double") 25 | // FLOAT – A 32-bit single-precision floating point number. Equivalent to the REAL in Presto. 26 | case object FLOAT extends GlueDataType("float") 27 | // DECIMAL(precision, scale) – precision is the total number of digits. scale (optional) is the number of digits in fractional part with a default of 0. For example, use these type definitions: DECIMAL(11,5), DECIMAL(15). 28 | case class DECIMAL(precision: Int, scale: Int) 29 | extends GlueDataType(s"decimal($precision,$scale)") 30 | // STRING – A string literal enclosed in single or double quotes. For more information, see STRING Hive Data Type. 31 | case object STRING extends GlueDataType("string") 32 | // CHAR – Fixed length character data, with a specified length between 1 and 255, such as char(10). For more information, see CHAR Hive Data Type. 33 | case class CHAR(length: Int) extends GlueDataType(s"char($length)") 34 | // VARCHAR – Variable length character data, with a specified length between 1 and 65535, such as varchar(10). For more information, see VARCHAR Hive Data Type. 35 | case class VARCHAR(length: Int) extends GlueDataType(s"varchar($length)") 36 | // BINARY – Used for data in Parquet. 37 | case object BINARY extends GlueDataType("binary") 38 | // DATE – A date in UNIX format, such as YYYY-MM-DD. 39 | case object DATE extends GlueDataType("date") 40 | // TIMESTAMP – Date and time instant in the UNIX format, such as yyyy-mm-dd hh:mm:ss[.f...]. For example, TIMESTAMP '2008-09-15 03:04:05.324'. This format uses the session time zone.
41 | case object TIMESTAMP extends GlueDataType("timestamp") 42 | // ARRAY 43 | case class ARRAY(dataType: GlueDataType) 44 | extends GlueDataType(s"array<${dataType.name}>") 45 | // MAP 46 | case class MAP(keyDataType: GlueDataType, valueDataType: GlueDataType) 47 | extends GlueDataType(s"map<${keyDataType.name},${valueDataType.name}>") 48 | // STRUCT 49 | case class STRUCT(struct: Map[String, GlueDataType]) 50 | extends GlueDataType({ 51 | val columns = struct 52 | .map { 53 | case (columnName, glueType) => s"$columnName : ${glueType.name}" 54 | } 55 | s"struct<${columns.mkString(",")}>" 56 | }) 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/PluginTask.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import java.util.{Locale, MissingFormatArgumentException, Optional} 4 | 5 | import com.amazonaws.services.s3.model.CannedAccessControlList 6 | import org.apache.parquet.hadoop.metadata.CompressionCodecName 7 | import org.embulk.config.{ 8 | Config, 9 | ConfigDefault, 10 | ConfigException, 11 | ConfigSource, 12 | Task, 13 | TaskSource 14 | } 15 | import org.embulk.output.s3_parquet.aws.Aws 16 | import org.embulk.output.s3_parquet.catalog.CatalogRegistrator 17 | import org.embulk.output.s3_parquet.parquet.ParquetFileWriteSupport 18 | 19 | trait PluginTask extends Task with ParquetFileWriteSupport.Task with Aws.Task { 20 | 21 | @Config("bucket") 22 | def getBucket: String 23 | 24 | @Config("path_prefix") 25 | @ConfigDefault("\"\"") 26 | def getPathPrefix: String 27 | 28 | @Config("sequence_format") 29 | @ConfigDefault("\"%03d.%02d.\"") 30 | def getSequenceFormat: String 31 | 32 | @Config("file_ext") 33 | @ConfigDefault("\"parquet\"") 34 | def getFileExt: String 35 | 36 | @Config("compression_codec") 37 | @ConfigDefault("\"uncompressed\"") 38 | def getCompressionCodecString: String 39 | 40 | def getCompressionCodec: CompressionCodecName 41 | def setCompressionCodec(v: CompressionCodecName): Unit 42 | 43 | @Config("canned_acl") 44 | @ConfigDefault("\"private\"") 45 | def getCannedAclString: String 46 | 47 | def getCannedAcl: CannedAccessControlList 48 | def setCannedAcl(v: CannedAccessControlList): Unit 49 | 50 | @Config("block_size") 51 | @ConfigDefault("null") 52 | def getBlockSize: Optional[Int] 53 | 54 | @Config("page_size") 55 | @ConfigDefault("null") 56 | def getPageSize: Optional[Int] 57 | 58 | @Config("max_padding_size") 59 | @ConfigDefault("null") 60 | def getMaxPaddingSize: Optional[Int] 61 | 62 | @Config("enable_dictionary_encoding") 63 | @ConfigDefault("null") 64 | def getEnableDictionaryEncoding: Optional[Boolean] 65 | 66 | @Config("buffer_dir") 67 | @ConfigDefault("null") 68 | def getBufferDir: Optional[String] 69 | 70 | @Config("catalog") 71 | @ConfigDefault("null") 72 | def getCatalog: Optional[CatalogRegistrator.Task] 73 | } 74 | 75 | object PluginTask { 76 | 77 | def loadConfig(config: ConfigSource): PluginTask = { 78 | val task = config.loadConfig(classOf[PluginTask]) 79 | // sequence_format 80 | try task.getSequenceFormat.format(0, 0) 81 | catch { 82 | case e: MissingFormatArgumentException => 83 | throw new ConfigException( 84 | s"Invalid sequence_format: ${task.getSequenceFormat}", 85 | e 86 | ) 87 | } 88 | 89 | // compression_codec 90 | CompressionCodecName 91 | .values() 92 | .find( 93 | _.name() 94 | .toLowerCase(Locale.ENGLISH) 95 | .equals(task.getCompressionCodecString) 96 | ) match { 97 | 
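      // A hedged illustration of the two validations in this method, using
      // hypothetical configuration values (not taken from this repository's examples):
      //
      //   sequence_format: "%03d.%02d."      -> format(1, 0) yields "001.00.", so a
      //     task's object key becomes "<path_prefix>001.00.<file_ext>" in open().
      //   sequence_format: "%03d.%02d.%02d." -> format(0, 0) raises
      //     MissingFormatArgumentException, re-thrown above as a ConfigException.
      //   compression_codec: "gzip"          -> matches CompressionCodecName.GZIP here;
      //     an unknown value such as "zip" falls through to the ConfigException below,
      //     whose message lists every supported codec name in lower case.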
case Some(v) => task.setCompressionCodec(v) 98 | case None => 99 | val unsupported: String = task.getCompressionCodecString 100 | val supported: String = CompressionCodecName 101 | .values() 102 | .map(v => s"'${v.name().toLowerCase}'") 103 | .mkString(", ") 104 | throw new ConfigException( 105 | s"'$unsupported' is unsupported: `compression_codec` must be one of [$supported]." 106 | ) 107 | } 108 | 109 | // canned_acl 110 | CannedAccessControlList 111 | .values() 112 | .find(_.toString.equals(task.getCannedAclString)) match { 113 | case Some(v) => task.setCannedAcl(v) 114 | case None => 115 | val unsupported: String = task.getCannedAclString 116 | val supported: String = CannedAccessControlList 117 | .values() 118 | .map(v => s"'${v.toString}'") 119 | .mkString(", ") 120 | throw new ConfigException( 121 | s"'$unsupported' is unsupported: `canned_acl` must be one of [$supported]." 122 | ) 123 | } 124 | 125 | ParquetFileWriteSupport.configure(task) 126 | task 127 | } 128 | 129 | def loadTask(taskSource: TaskSource): PluginTask = 130 | taskSource.loadTask(classOf[PluginTask]) 131 | 132 | } 133 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/DecimalLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.math.{MathContext, RoundingMode => JRoundingMode} 4 | 5 | import org.apache.parquet.io.api.{Binary, RecordConsumer} 6 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 7 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 8 | import org.embulk.config.ConfigException 9 | import org.embulk.output.s3_parquet.catalog.GlueDataType 10 | import org.embulk.spi.{Column, DataException} 11 | import org.embulk.spi.`type`.{ 12 | BooleanType, 13 | DoubleType, 14 | JsonType, 15 | LongType, 16 | StringType, 17 | TimestampType 18 | } 19 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 20 | import org.msgpack.value.Value 21 | 22 | import scala.math.BigDecimal.RoundingMode 23 | 24 | case class DecimalLogicalType(scale: Int, precision: Int) 25 | extends ParquetColumnType { 26 | // ref. https://github.com/apache/parquet-format/blob/apache-parquet-format-2.8.0/LogicalTypes.md#decimal 27 | require(scale >= 0, "Scale must be zero or a positive integer.") 28 | require( 29 | scale < precision, 30 | "Scale must be a positive integer less than the precision." 31 | ) 32 | require( 33 | precision > 0, 34 | "Precision is required and must be a non-zero positive integer." 
35 | ) 36 | 37 | override def primitiveType(column: Column): PrimitiveType = 38 | column.getType match { 39 | case _: LongType if 1 <= precision && precision <= 9 => 40 | Types 41 | .optional(PrimitiveTypeName.INT32) 42 | .as(LogicalTypeAnnotation.decimalType(scale, precision)) 43 | .named(column.getName) 44 | case _: LongType if 10 <= precision && precision <= 18 => 45 | Types 46 | .optional(PrimitiveTypeName.INT64) 47 | .as(LogicalTypeAnnotation.decimalType(scale, precision)) 48 | .named(column.getName) 49 | case _: StringType | _: DoubleType => 50 | Types 51 | .optional(PrimitiveTypeName.BINARY) 52 | .as(LogicalTypeAnnotation.decimalType(scale, precision)) 53 | .named(column.getName) 54 | case _: BooleanType | _: TimestampType | _: JsonType | _ => 55 | throw new ConfigException( 56 | s"Unsupported column type: ${column.getName} (scale: $scale, precision: $precision)" 57 | ) 58 | } 59 | 60 | override def glueDataType(column: Column): GlueDataType = 61 | column.getType match { 62 | case _: StringType | _: LongType | _: DoubleType => 63 | GlueDataType.DECIMAL(scale = scale, precision = precision) 64 | case _: BooleanType | _: TimestampType | _: JsonType | _ => 65 | throw new ConfigException( 66 | s"Unsupported column type: ${column.getName} (scale: $scale, precision: $precision)" 67 | ) 68 | } 69 | 70 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 71 | throw newUnsupportedMethodException("consumeBoolean") 72 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 73 | try consumeBigDecimal(consumer, BigDecimal.exact(v)) 74 | catch { 75 | case ex: NumberFormatException => 76 | throw new DataException(s"Failed to cast String: $v to BigDecimal.", ex) 77 | } 78 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 79 | if (1 <= precision && precision <= 9) consumeLongAsInteger(consumer, v) 80 | else if (10 <= precision && precision <= 18) consumer.addLong(v) 81 | else 82 | throw new ConfigException( 83 | s"precision must be 1 <= precision <= 18 when consuming long values but precision is $precision." 84 | ) 85 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 86 | consumeBigDecimal(consumer, BigDecimal.exact(v)) 87 | override def consumeTimestamp( 88 | consumer: RecordConsumer, 89 | v: Timestamp, 90 | formatter: TimestampFormatter 91 | ): Unit = throw newUnsupportedMethodException("consumeTimestamp") 92 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 93 | throw newUnsupportedMethodException("consumeJson") 94 | 95 | private def consumeBigDecimal(consumer: RecordConsumer, v: BigDecimal): Unit = 96 | // TODO: Make RoundingMode configurable? 
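    // A hedged worked example of the HALF_UP conversion below, assuming the
    // hypothetical parameters scale = 2 and precision = 4:
    //   BigDecimal("12.345").setScale(2, RoundingMode.HALF_UP)   // 12.35
    //     .round(new MathContext(4, JRoundingMode.HALF_UP))      // 12.35 (already 4 significant digits)
    //     .toString()                                            // "12.35", stored as a BINARY-backed decimal
    // Note that this binary path is reached only for string/double inputs; long
    // inputs go through consumeLong above and are stored as INT32 (precision 1..9)
    // or INT64 (precision 10..18).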
97 | consumer.addBinary( 98 | Binary.fromString( 99 | v.setScale(scale, RoundingMode.HALF_UP) 100 | .round(new MathContext(precision, JRoundingMode.HALF_UP)) 101 | .toString() 102 | ) 103 | ) 104 | } 105 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/TimeLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.{OffsetTime, ZoneId} 4 | import java.time.temporal.ChronoField.{MICRO_OF_DAY, MILLI_OF_DAY, NANO_OF_DAY} 5 | 6 | import org.apache.parquet.io.api.RecordConsumer 7 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 8 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit 9 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.{ 10 | MICROS, 11 | MILLIS, 12 | NANOS 13 | } 14 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 15 | import org.embulk.config.ConfigException 16 | import org.embulk.output.s3_parquet.catalog.GlueDataType 17 | import org.embulk.spi.Column 18 | import org.embulk.spi.`type`.{ 19 | BooleanType, 20 | DoubleType, 21 | JsonType, 22 | LongType, 23 | StringType, 24 | TimestampType 25 | } 26 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 27 | import org.msgpack.value.Value 28 | import org.slf4j.{Logger, LoggerFactory} 29 | 30 | case class TimeLogicalType( 31 | isAdjustedToUtc: Boolean, 32 | timeUnit: TimeUnit, 33 | timeZone: ZoneId 34 | ) extends ParquetColumnType { 35 | private val logger: Logger = LoggerFactory.getLogger(classOf[TimeLogicalType]) 36 | private val UTC: ZoneId = ZoneId.of("UTC") 37 | 38 | override def primitiveType(column: Column): PrimitiveType = 39 | column.getType match { 40 | case _: LongType | _: TimestampType => 41 | Types 42 | .optional(timeUnit match { 43 | case MILLIS => PrimitiveTypeName.INT32 44 | case MICROS | NANOS => PrimitiveTypeName.INT64 45 | }) 46 | .as(LogicalTypeAnnotation.timeType(isAdjustedToUtc, timeUnit)) 47 | .named(column.getName) 48 | case _: BooleanType | _: DoubleType | _: StringType | _: JsonType | _ => 49 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 50 | } 51 | 52 | override def glueDataType(column: Column): GlueDataType = 53 | column.getType match { 54 | case _: LongType | _: TimestampType => 55 | timeUnit match { 56 | case MILLIS => 57 | warningWhenConvertingTimeToGlueType(GlueDataType.INT) 58 | GlueDataType.INT 59 | case MICROS | NANOS => 60 | warningWhenConvertingTimeToGlueType(GlueDataType.BIGINT) 61 | GlueDataType.BIGINT 62 | } 63 | case _: BooleanType | _: DoubleType | _: StringType | _: JsonType | _ => 64 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 65 | } 66 | 67 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 68 | throw newUnsupportedMethodException("consumeBoolean") 69 | 70 | override def consumeString(consumer: RecordConsumer, v: String): Unit = 71 | throw newUnsupportedMethodException("consumeString") 72 | 73 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 74 | timeUnit match { 75 | case MILLIS => consumeLongAsInteger(consumer, v) 76 | case MICROS | NANOS => consumer.addLong(v) 77 | } 78 | 79 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 80 | throw newUnsupportedMethodException("consumeDouble") 81 | 82 | override def consumeTimestamp( 83 | consumer: RecordConsumer, 84 | v: Timestamp, 85 | 
formatter: TimestampFormatter 86 | ): Unit = { 87 | // * `TIME` with precision `MILLIS` is used for millisecond precision. 88 | // It must annotate an `int32` that stores the number of milliseconds after midnight. 89 | // * `TIME` with precision `MICROS` is used for microsecond precision. 90 | // It must annotate an `int64` that stores the number of microseconds after midnight. 91 | // * `TIME` with precision `NANOS` is used for nanosecond precision. 92 | // It must annotate an `int64` that stores the number of nanoseconds after midnight. 93 | // 94 | // ref. https://github.com/apache/parquet-format/blob/apache-parquet-format-2.7.0/LogicalTypes.md#time 95 | val zoneId = if (isAdjustedToUtc) UTC else timeZone 96 | val offsetTime: OffsetTime = OffsetTime.ofInstant(v.getInstant, zoneId) 97 | timeUnit match { 98 | case MILLIS => 99 | consumeLongAsInteger(consumer, offsetTime.get(MILLI_OF_DAY)) 100 | case MICROS => 101 | consumer.addLong(offsetTime.getLong(MICRO_OF_DAY)) 102 | case NANOS => 103 | consumer.addLong(offsetTime.getLong(NANO_OF_DAY)) 104 | } 105 | } 106 | 107 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 108 | throw newUnsupportedMethodException("consumeJson") 109 | 110 | private def warningWhenConvertingTimeToGlueType( 111 | glueType: GlueDataType 112 | ): Unit = 113 | logger.warn( 114 | s"time(isAdjustedToUtc = $isAdjustedToUtc, timeUnit = $timeUnit) is converted to Glue" + 115 | s" ${glueType.name} but this is not represented correctly, because Glue does not" + 116 | s" support time type. Please use `catalog.column_options` to define the type." 117 | ) 118 | } 119 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/S3ParquetOutputPlugin.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import java.nio.file.{Files, Paths} 4 | import java.util.{List => JList} 5 | 6 | import org.apache.parquet.column.ParquetProperties 7 | import org.apache.parquet.hadoop.ParquetWriter 8 | import org.embulk.config.{ConfigDiff, ConfigSource, TaskReport, TaskSource} 9 | import org.embulk.output.s3_parquet.aws.Aws 10 | import org.embulk.output.s3_parquet.catalog.CatalogRegistrator 11 | import org.embulk.output.s3_parquet.parquet.ParquetFileWriteSupport 12 | import org.embulk.spi.{ 13 | Exec, 14 | OutputPlugin, 15 | PageReader, 16 | Schema, 17 | TransactionalPageOutput 18 | } 19 | import org.slf4j.{Logger, LoggerFactory} 20 | 21 | class S3ParquetOutputPlugin extends OutputPlugin { 22 | 23 | import implicits._ 24 | 25 | val logger: Logger = LoggerFactory.getLogger(classOf[S3ParquetOutputPlugin]) 26 | 27 | override def transaction( 28 | config: ConfigSource, 29 | schema: Schema, 30 | taskCount: Int, 31 | control: OutputPlugin.Control 32 | ): ConfigDiff = { 33 | val task: PluginTask = PluginTask.loadConfig(config) 34 | val support: ParquetFileWriteSupport = ParquetFileWriteSupport(task, schema) 35 | support.showOutputSchema(logger) 36 | control.run(task.dump) 37 | 38 | task.getCatalog.ifPresent { catalog => 39 | val location = 40 | s"s3://${task.getBucket}/${task.getPathPrefix.replaceFirst("(.*/)[^/]+$", "$1")}" 41 | val cr = CatalogRegistrator.fromTask( 42 | task = catalog, 43 | aws = Aws(task), 44 | schema = schema, 45 | location = location, 46 | compressionCodec = task.getCompressionCodec, 47 | defaultGlueTypes = 48 | support.parquetSchema.transform((k, v) => v.glueDataType(k)) 49 | ) 50 | 
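      // A hedged example of the `location` computed above, with hypothetical values:
      // bucket = "example" and path_prefix = "logs/out/data_" give
      //   location = "s3://example/logs/out/"
      // because replaceFirst("(.*/)[^/]+$", "$1") keeps everything up to the last "/"
      // and drops the trailing file-name prefix. The registration below is wrapped in
      // ContextClassLoaderSwapper.usingPluginClass so the catalog (Glue) calls run with
      // the plugin's class loader (see the 0.5.2 changelog entry, #51).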
ContextClassLoaderSwapper.usingPluginClass { 51 | cr.run() 52 | } 53 | } 54 | 55 | Exec.newConfigDiff 56 | } 57 | 58 | override def resume( 59 | taskSource: TaskSource, 60 | schema: Schema, 61 | taskCount: Int, 62 | control: OutputPlugin.Control 63 | ): ConfigDiff = { 64 | throw new UnsupportedOperationException( 65 | "s3_parquet output plugin does not support resuming" 66 | ) 67 | } 68 | 69 | override def cleanup( 70 | taskSource: TaskSource, 71 | schema: Schema, 72 | taskCount: Int, 73 | successTaskReports: JList[TaskReport] 74 | ): Unit = { 75 | successTaskReports.foreach { tr => 76 | logger.info( 77 | s"Created: s3://${tr.get(classOf[String], "bucket")}/${tr.get(classOf[String], "key")}, " 78 | + s"version_id: ${tr.get(classOf[String], "version_id", null)}, " 79 | + s"etag: ${tr.get(classOf[String], "etag", null)}" 80 | ) 81 | } 82 | } 83 | 84 | override def open( 85 | taskSource: TaskSource, 86 | schema: Schema, 87 | taskIndex: Int 88 | ): TransactionalPageOutput = { 89 | val task = PluginTask.loadTask(taskSource) 90 | val bufferDir: String = task.getBufferDir.getOrElse( 91 | Files.createTempDirectory("embulk-output-s3_parquet-").toString 92 | ) 93 | val bufferFile: String = Paths 94 | .get(bufferDir, s"embulk-output-s3_parquet-task-$taskIndex-0.parquet") 95 | .toString 96 | val destS3bucket: String = task.getBucket 97 | val destS3Key: String = 98 | s"${task.getPathPrefix}${task.getSequenceFormat.format(taskIndex, 0)}${task.getFileExt}" 99 | 100 | val pageReader: PageReader = new PageReader(schema) 101 | val aws: Aws = Aws(task) 102 | val parquetWriter: ParquetWriter[PageReader] = 103 | ContextClassLoaderSwapper.usingPluginClass { 104 | ParquetFileWriteSupport(task, schema) 105 | .newWriterBuilder(bufferFile) 106 | .withCompressionCodec(task.getCompressionCodec) 107 | .withDictionaryEncoding( 108 | task.getEnableDictionaryEncoding.orElse( 109 | ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED 110 | ) 111 | ) 112 | .withDictionaryPageSize( 113 | task.getPageSize.orElse( 114 | ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE 115 | ) 116 | ) 117 | .withMaxPaddingSize( 118 | task.getMaxPaddingSize.orElse( 119 | ParquetWriter.MAX_PADDING_SIZE_DEFAULT 120 | ) 121 | ) 122 | .withPageSize( 123 | task.getPageSize.orElse(ParquetProperties.DEFAULT_PAGE_SIZE) 124 | ) 125 | .withRowGroupSize( 126 | task.getBlockSize.orElse(ParquetWriter.DEFAULT_BLOCK_SIZE) 127 | ) 128 | .withValidation(ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED) 129 | .withWriteMode( 130 | org.apache.parquet.hadoop.ParquetFileWriter.Mode.CREATE 131 | ) 132 | .withWriterVersion(ParquetProperties.DEFAULT_WRITER_VERSION) 133 | .build() 134 | } 135 | 136 | logger.info( 137 | s"Local Buffer File: $bufferFile, Destination: s3://$destS3bucket/$destS3Key" 138 | ) 139 | 140 | S3ParquetPageOutput( 141 | bufferFile, 142 | pageReader, 143 | parquetWriter, 144 | aws, 145 | destS3bucket, 146 | destS3Key 147 | ) 148 | } 149 | 150 | } 151 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestJsonLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.io.api.Binary 4 | import org.apache.parquet.schema.LogicalTypeAnnotation 5 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 6 | import org.embulk.config.ConfigException 7 | import org.embulk.output.s3_parquet.catalog.GlueDataType 8 | import org.embulk.spi.`type`.{ 9 | 
BooleanType, 10 | DoubleType, 11 | JsonType, 12 | LongType, 13 | StringType 14 | } 15 | import org.embulk.spi.json.JsonParser 16 | import org.embulk.spi.time.TimestampFormatter 17 | import org.msgpack.value.ValueFactory 18 | import org.scalatest.diagrams.Diagrams 19 | import org.scalatest.funsuite.AnyFunSuite 20 | import org.scalatest.prop.TableDrivenPropertyChecks 21 | 22 | import scala.util.chaining._ 23 | 24 | class TestJsonLogicalType 25 | extends AnyFunSuite 26 | with ParquetColumnTypeTestHelper 27 | with TableDrivenPropertyChecks 28 | with Diagrams { 29 | 30 | private val conditions = Table( 31 | "column", 32 | Seq( 33 | SAMPLE_BOOLEAN_COLUMN, 34 | SAMPLE_LONG_COLUMN, 35 | SAMPLE_DOUBLE_COLUMN, 36 | SAMPLE_STRING_COLUMN, 37 | SAMPLE_TIMESTAMP_COLUMN, 38 | SAMPLE_JSON_COLUMN 39 | ): _* 40 | ) 41 | 42 | test( 43 | "#primitiveType(column) returns PrimitiveTypeName.{BOOLEAN,INT64,DOUBLE,BINARY} with LogicalType" 44 | ) { 45 | forAll(conditions) { column => 46 | // format: off 47 | column.getType match { 48 | case _: BooleanType | _: LongType | _: DoubleType | _: StringType | 49 | _: JsonType => 50 | assert(PrimitiveTypeName.BINARY == JsonLogicalType.primitiveType(column).getPrimitiveTypeName) 51 | assert(LogicalTypeAnnotation.jsonType() == JsonLogicalType.primitiveType(column).getLogicalTypeAnnotation) 52 | case _ => 53 | assert(intercept[ConfigException](JsonLogicalType.primitiveType(column)).getMessage.startsWith("Unsupported column type: ")) 54 | } 55 | // format: on 56 | } 57 | } 58 | 59 | test("#glueDataType(column) returns GlueDataType") { 60 | forAll(conditions) { column => 61 | // format: off 62 | column.getType match { 63 | case _: BooleanType | _: LongType | _: DoubleType | _: StringType | 64 | _: JsonType => 65 | assert(GlueDataType.STRING == JsonLogicalType.glueDataType(column)) 66 | case _ => 67 | assert(intercept[ConfigException](JsonLogicalType.glueDataType(column)).getMessage.startsWith("Unsupported column type: ")) 68 | } 69 | // format: on 70 | } 71 | } 72 | 73 | test("#consumeBoolean") { 74 | newMockRecordConsumer().tap { consumer => 75 | consumer.writingSampleField { 76 | JsonLogicalType.consumeBoolean(consumer, true) 77 | } 78 | // format: off 79 | assert(consumer.data.head.head.isInstanceOf[Binary]) 80 | assert(consumer.data.head.head == Binary.fromString(ValueFactory.newBoolean(true).toJson)) 81 | // format: on 82 | } 83 | } 84 | 85 | test("#consumeString") { 86 | newMockRecordConsumer().tap { consumer => 87 | consumer.writingSampleField { 88 | JsonLogicalType.consumeString(consumer, "string") 89 | } 90 | // format: off 91 | assert(consumer.data.head.head.isInstanceOf[Binary]) 92 | assert(consumer.data.head.head == Binary.fromString(ValueFactory.newString("string").toJson)) 93 | // format: on 94 | } 95 | } 96 | 97 | test("#consumeLong") { 98 | newMockRecordConsumer().tap { consumer => 99 | consumer.writingSampleField { 100 | JsonLogicalType.consumeLong(consumer, Long.MaxValue) 101 | } 102 | // format: off 103 | assert(consumer.data.head.head.isInstanceOf[Binary]) 104 | assert(consumer.data.head.head == Binary.fromString(ValueFactory.newInteger(Long.MaxValue).toJson)) 105 | // format: on 106 | } 107 | } 108 | 109 | test("#consumeDouble") { 110 | newMockRecordConsumer().tap { consumer => 111 | consumer.writingSampleField { 112 | JsonLogicalType.consumeDouble(consumer, Double.MaxValue) 113 | } 114 | // format: off 115 | assert(consumer.data.head.head.isInstanceOf[Binary]) 116 | assert(consumer.data.head.head == 
Binary.fromString(ValueFactory.newFloat(Double.MaxValue).toJson)) 117 | // format: on 118 | } 119 | } 120 | 121 | test("#consumeTimestamp") { 122 | val formatter = TimestampFormatter 123 | .of("%Y-%m-%d %H:%M:%S.%6N %z", "UTC") 124 | newMockRecordConsumer().tap { consumer => 125 | consumer.writingSampleField { 126 | // format: off 127 | assert(intercept[ConfigException](JsonLogicalType.consumeTimestamp(consumer, null, null)).getMessage.endsWith("is unsupported.")) 128 | // format: on 129 | } 130 | } 131 | } 132 | 133 | test("#consumeJson") { 134 | newMockRecordConsumer().tap { consumer => 135 | consumer.writingSampleField { 136 | JsonLogicalType.consumeJson( 137 | consumer, 138 | new JsonParser().parse("""{"a":1,"b":"c","d":5.5,"e":true}""") 139 | ) 140 | } 141 | // format: off 142 | assert(consumer.data.head.head.isInstanceOf[Binary]) 143 | assert(consumer.data.head.head == Binary.fromString("""{"a":1,"b":"c","d":5.5,"e":true}""")) 144 | // format: on 145 | } 146 | } 147 | 148 | } 149 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestDefaultColumnType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.io.api.Binary 4 | import org.apache.parquet.schema.LogicalTypeAnnotation 5 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 6 | import org.embulk.output.s3_parquet.catalog.GlueDataType 7 | import org.embulk.spi.`type`.{ 8 | BooleanType, 9 | DoubleType, 10 | JsonType, 11 | LongType, 12 | StringType, 13 | TimestampType 14 | } 15 | import org.embulk.spi.json.JsonParser 16 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 17 | import org.scalatest.diagrams.Diagrams 18 | import org.scalatest.funsuite.AnyFunSuite 19 | import org.scalatest.prop.TableDrivenPropertyChecks 20 | 21 | import scala.util.chaining._ 22 | 23 | class TestDefaultColumnType 24 | extends AnyFunSuite 25 | with ParquetColumnTypeTestHelper 26 | with TableDrivenPropertyChecks 27 | with Diagrams { 28 | 29 | private val conditions = Table( 30 | "column", 31 | Seq( 32 | SAMPLE_BOOLEAN_COLUMN, 33 | SAMPLE_LONG_COLUMN, 34 | SAMPLE_DOUBLE_COLUMN, 35 | SAMPLE_STRING_COLUMN, 36 | SAMPLE_TIMESTAMP_COLUMN, 37 | SAMPLE_JSON_COLUMN 38 | ): _* 39 | ) 40 | 41 | test( 42 | "#primitiveType(column) returns PrimitiveTypeName.{BOOLEAN,INT64,DOUBLE,BINARY}" 43 | ) { 44 | forAll(conditions) { column => 45 | // format: off 46 | column.getType match { 47 | case _: BooleanType => 48 | assert(PrimitiveTypeName.BOOLEAN == DefaultColumnType.primitiveType(column).getPrimitiveTypeName) 49 | assert(null == DefaultColumnType.primitiveType(column).getLogicalTypeAnnotation) 50 | case _: LongType => 51 | assert(PrimitiveTypeName.INT64 == DefaultColumnType.primitiveType(column).getPrimitiveTypeName) 52 | assert(null == DefaultColumnType.primitiveType(column).getLogicalTypeAnnotation) 53 | case _: DoubleType => 54 | assert(PrimitiveTypeName.DOUBLE == DefaultColumnType.primitiveType(column).getPrimitiveTypeName) 55 | assert(null == DefaultColumnType.primitiveType(column).getLogicalTypeAnnotation) 56 | case _: StringType | _: TimestampType | _: JsonType => 57 | assert(PrimitiveTypeName.BINARY == DefaultColumnType.primitiveType(column).getPrimitiveTypeName) 58 | assert(LogicalTypeAnnotation.stringType() == DefaultColumnType.primitiveType(column).getLogicalTypeAnnotation) 59 | case _ => 60 | fail() 61 | } 62 | // format: on 63 | } 64 | 
} 65 | 66 | test("#glueDataType(column) returns GlueDataType") { 67 | forAll(conditions) { column => 68 | // format: off 69 | column.getType match { 70 | case _: BooleanType => 71 | assert(GlueDataType.BOOLEAN == DefaultColumnType.glueDataType(column)) 72 | case _: LongType => 73 | assert(GlueDataType.BIGINT == DefaultColumnType.glueDataType(column)) 74 | case _: DoubleType => 75 | assert(GlueDataType.DOUBLE == DefaultColumnType.glueDataType(column)) 76 | case _: StringType | _: TimestampType | _: JsonType => 77 | assert(GlueDataType.STRING == DefaultColumnType.glueDataType(column)) 78 | case _ => 79 | fail() 80 | } 81 | // format: on 82 | } 83 | } 84 | 85 | test("#consumeBoolean") { 86 | newMockRecordConsumer().tap { consumer => 87 | consumer.writingSampleField { 88 | DefaultColumnType.consumeBoolean(consumer, true) 89 | } 90 | assert(consumer.data.head.head.isInstanceOf[Boolean]) 91 | assert(consumer.data.head.head == true) 92 | } 93 | } 94 | 95 | test("#consumeString") { 96 | newMockRecordConsumer().tap { consumer => 97 | consumer.writingSampleField { 98 | DefaultColumnType.consumeString(consumer, "string") 99 | } 100 | assert(consumer.data.head.head.isInstanceOf[Binary]) 101 | assert(consumer.data.head.head == Binary.fromString("string")) 102 | } 103 | } 104 | 105 | test("#consumeLong") { 106 | newMockRecordConsumer().tap { consumer => 107 | consumer.writingSampleField { 108 | DefaultColumnType.consumeLong(consumer, Long.MaxValue) 109 | } 110 | assert(consumer.data.head.head.isInstanceOf[Long]) 111 | assert(consumer.data.head.head == Long.MaxValue) 112 | } 113 | } 114 | 115 | test("#consumeDouble") { 116 | newMockRecordConsumer().tap { consumer => 117 | consumer.writingSampleField { 118 | DefaultColumnType.consumeDouble(consumer, Double.MaxValue) 119 | } 120 | assert(consumer.data.head.head.isInstanceOf[Double]) 121 | assert(consumer.data.head.head == Double.MaxValue) 122 | } 123 | } 124 | 125 | test("#consumeTimestamp") { 126 | val formatter = TimestampFormatter 127 | .of("%Y-%m-%d %H:%M:%S.%6N %z", "UTC") 128 | newMockRecordConsumer().tap { consumer => 129 | consumer.writingSampleField { 130 | DefaultColumnType.consumeTimestamp( 131 | consumer, 132 | Timestamp.ofEpochMilli(Int.MaxValue), 133 | formatter 134 | ) 135 | } 136 | // format: off 137 | assert(consumer.data.head.head.isInstanceOf[Binary]) 138 | assert(consumer.data.head.head == Binary.fromString("1970-01-25 20:31:23.647000 +0000")) 139 | // format: on 140 | } 141 | } 142 | 143 | test("#consumeJson") { 144 | newMockRecordConsumer().tap { consumer => 145 | consumer.writingSampleField { 146 | DefaultColumnType.consumeJson( 147 | consumer, 148 | new JsonParser().parse("""{"a":1,"b":"c","d":5.5,"e":true}""") 149 | ) 150 | } 151 | // format: off 152 | assert(consumer.data.head.head.isInstanceOf[Binary]) 153 | assert(consumer.data.head.head == Binary.fromString("""{"a":1,"b":"c","d":5.5,"e":true}""")) 154 | // format: on 155 | } 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/aws/AwsCredentials.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.aws 2 | 3 | import java.util.Optional 4 | 5 | import com.amazonaws.auth.{ 6 | AnonymousAWSCredentials, 7 | AWSCredentialsProvider, 8 | AWSStaticCredentialsProvider, 9 | BasicAWSCredentials, 10 | BasicSessionCredentials, 11 | DefaultAWSCredentialsProviderChain, 12 | EC2ContainerCredentialsProviderWrapper, 13 | 
EnvironmentVariableCredentialsProvider, 14 | STSAssumeRoleSessionCredentialsProvider, 15 | SystemPropertiesCredentialsProvider, 16 | WebIdentityTokenCredentialsProvider 17 | } 18 | import com.amazonaws.auth.profile.{ 19 | ProfileCredentialsProvider, 20 | ProfilesConfigFile 21 | } 22 | import org.embulk.config.{Config, ConfigDefault, ConfigException} 23 | import org.embulk.output.s3_parquet.aws.AwsCredentials.Task 24 | import org.embulk.spi.unit.LocalFile 25 | 26 | object AwsCredentials { 27 | 28 | trait Task { 29 | 30 | @Config("auth_method") 31 | @ConfigDefault("\"default\"") 32 | def getAuthMethod: String 33 | 34 | @Config("access_key_id") 35 | @ConfigDefault("null") 36 | def getAccessKeyId: Optional[String] 37 | 38 | @Config("secret_access_key") 39 | @ConfigDefault("null") 40 | def getSecretAccessKey: Optional[String] 41 | 42 | @Config("session_token") 43 | @ConfigDefault("null") 44 | def getSessionToken: Optional[String] 45 | 46 | @Config("profile_file") 47 | @ConfigDefault("null") 48 | def getProfileFile: Optional[LocalFile] 49 | 50 | @Config("profile_name") 51 | @ConfigDefault("\"default\"") 52 | def getProfileName: String 53 | 54 | @Config("role_arn") 55 | @ConfigDefault("null") 56 | def getRoleArn: Optional[String] 57 | 58 | @Config("role_session_name") 59 | @ConfigDefault("null") 60 | def getRoleSessionName: Optional[String] 61 | 62 | @Config("role_external_id") 63 | @ConfigDefault("null") 64 | def getRoleExternalId: Optional[String] 65 | 66 | @Config("role_session_duration_seconds") 67 | @ConfigDefault("null") 68 | def getRoleSessionDurationSeconds: Optional[Int] 69 | 70 | @Config("scope_down_policy") 71 | @ConfigDefault("null") 72 | def getScopeDownPolicy: Optional[String] 73 | 74 | @Config("web_identity_token_file") 75 | @ConfigDefault("null") 76 | def getWebIdentityTokenFile: Optional[String] 77 | } 78 | 79 | def apply(task: Task): AwsCredentials = { 80 | new AwsCredentials(task) 81 | } 82 | } 83 | 84 | class AwsCredentials(task: Task) { 85 | 86 | def createAwsCredentialsProvider: AWSCredentialsProvider = { 87 | task.getAuthMethod match { 88 | case "basic" => 89 | new AWSStaticCredentialsProvider( 90 | new BasicAWSCredentials( 91 | getRequiredOption(task.getAccessKeyId, "access_key_id"), 92 | getRequiredOption(task.getSecretAccessKey, "secret_access_key") 93 | ) 94 | ) 95 | 96 | case "env" => 97 | new EnvironmentVariableCredentialsProvider 98 | 99 | case "instance" => 100 | // NOTE: combination of InstanceProfileCredentialsProvider and ContainerCredentialsProvider 101 | new EC2ContainerCredentialsProviderWrapper 102 | 103 | case "profile" => 104 | if (task.getProfileFile.isPresent) { 105 | val pf: ProfilesConfigFile = new ProfilesConfigFile( 106 | task.getProfileFile.get().getFile 107 | ) 108 | new ProfileCredentialsProvider(pf, task.getProfileName) 109 | } 110 | else new ProfileCredentialsProvider(task.getProfileName) 111 | 112 | case "properties" => 113 | new SystemPropertiesCredentialsProvider 114 | 115 | case "anonymous" => 116 | new AWSStaticCredentialsProvider(new AnonymousAWSCredentials) 117 | 118 | case "session" => 119 | new AWSStaticCredentialsProvider( 120 | new BasicSessionCredentials( 121 | getRequiredOption(task.getAccessKeyId, "access_key_id"), 122 | getRequiredOption(task.getSecretAccessKey, "secret_access_key"), 123 | getRequiredOption(task.getSessionToken, "session_token") 124 | ) 125 | ) 126 | 127 | case "assume_role" => 128 | // NOTE: Are http_proxy, endpoint, region required when assuming role? 
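        // A hedged sketch of an `assume_role` configuration this branch expects;
        // the ARN and names below are hypothetical, not taken from this repository:
        //
        //   auth_method: assume_role
        //   role_arn: arn:aws:iam::123456789012:role/embulk-example
        //   role_session_name: embulk-output-s3_parquet
        //   # optional: role_external_id, role_session_duration_seconds, scope_down_policy
        //
        // `role_arn` and `role_session_name` are mandatory here (getRequiredOption);
        // the three optional settings are applied to the builder only when present.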
129 | val builder = new STSAssumeRoleSessionCredentialsProvider.Builder( 130 | getRequiredOption(task.getRoleArn, "role_arn"), 131 | getRequiredOption(task.getRoleSessionName, "role_session_name") 132 | ) 133 | task.getRoleExternalId.ifPresent(v => builder.withExternalId(v)) 134 | task.getRoleSessionDurationSeconds.ifPresent(v => 135 | builder.withRoleSessionDurationSeconds(v) 136 | ) 137 | task.getScopeDownPolicy.ifPresent(v => builder.withScopeDownPolicy(v)) 138 | 139 | builder.build() 140 | 141 | case "web_identity_token" => 142 | WebIdentityTokenCredentialsProvider 143 | .builder() 144 | .roleArn(getRequiredOption(task.getRoleArn, "role_arn")) 145 | .roleSessionName( 146 | getRequiredOption(task.getRoleSessionName, "role_session_name") 147 | ) 148 | .webIdentityTokenFile( 149 | getRequiredOption( 150 | task.getWebIdentityTokenFile, 151 | "web_identity_token_file" 152 | ) 153 | ) 154 | .build() 155 | 156 | case "default" => 157 | new DefaultAWSCredentialsProviderChain 158 | 159 | case am => 160 | throw new ConfigException( 161 | s"'$am' is unsupported: `auth_method` must be one of ['basic', 'env', 'instance', 'profile', 'properties', 'anonymous', 'session', 'assume_role', 'default']." 162 | ) 163 | } 164 | } 165 | 166 | private def getRequiredOption[A](o: Optional[A], name: String): A = { 167 | o.orElseThrow(() => 168 | new ConfigException( 169 | s"`$name` must be set when `auth_method` is ${task.getAuthMethod}." 170 | ) 171 | ) 172 | } 173 | 174 | } 175 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 0.5.3 (2024-06-28) 2 | ================== 3 | 4 | * [Enhancement] [#55](https://github.com/civitaspo/embulk-output-s3_parquet/pull/55) Replace parquet-tools with parquet-avro 5 | * [Enhancement] [#57](https://github.com/civitaspo/embulk-output-s3_parquet/pull/57) Upgrade hadoop-common library to resolve CVE-2021-37404 6 | 7 | 8 | 0.5.2 (2020-10-12) 9 | ================== 10 | 11 | * [Fix] [#51](https://github.com/civitaspo/embulk-output-s3_parquet/pull/51) Use PluginClassLoader when oparating catalog. 12 | 13 | 0.5.1 (2020-06-24) 14 | ================== 15 | 16 | * [Fix] [#47](https://github.com/civitaspo/embulk-output-s3_parquet/pull/47) Use lower case without any space for Glue data type. 17 | 18 | 0.5.0 (2020-05-25) 19 | ================== 20 | 21 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Introduce the new usage of **column_options.logical_type**, **type_options.logical_type** to configure more detailed logical types. 22 | * [Deprecated] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) The old usage of **column_options.logical_type**, **type_options.logical_type** is deprecated. Use **column_options.converted_type**, **type_options.converted_type** instead. 23 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Support casting boolean, double, string, timestamp, json to the int logical type. 24 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Support casting long to the timestamp logical type. 25 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Support the decimal logical type. (close [#44](https://github.com/civitaspo/embulk-output-s3_parquet/issues/44)) 26 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Support the time logical type. 
27 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Support the date logical type. 28 | * [New Feature] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Support is_adjusted_to_utc = false for the timestamp logical type. 29 | * [Fix] [#45](https://github.com/civitaspo/embulk-output-s3_parquet/pull/45) Fix the issue 'Logical type int{8,16,32} don't work' (close [#43](https://github.com/civitaspo/embulk-output-s3_parquet/issues/43)) 30 | * [Enhancement] Add lots of tests. 31 | 32 | 0.4.2 (2020-04-30) 33 | ================== 34 | 35 | * [Enhancement] [#40](https://github.com/civitaspo/embulk-output-s3_parquet/pull/40) Check combinations with embulk-type and logical-type strictly. 36 | 37 | 0.4.1 (2020-04-30) 38 | ================== 39 | 40 | * [Enhancement] [#37](https://github.com/civitaspo/embulk-output-s3_parquet/pull/37) Rewrite the integration tests to make writing and reading tests easier & Use Diagrams for all test cases. 41 | * [Enhancement] [#38](https://github.com/civitaspo/embulk-output-s3_parquet/pull/38) Make all column types enable to use LogicalTypeHandler. 42 | * [Enhancement] [#38](https://github.com/civitaspo/embulk-output-s3_parquet/pull/38) Make parquet schema testable. 43 | * [New Feature] [#38](https://github.com/civitaspo/embulk-output-s3_parquet/pull/38) Support timestamp-nanos. 44 | 45 | 0.4.0 (2020-04-28) 46 | ================== 47 | 48 | * [Enhancement] [#35](https://github.com/civitaspo/embulk-output-s3_parquet/pull/35) Fix deprecation warnings. 49 | 50 | 51 | 0.3.0 (2020-04-26) 52 | ================== 53 | 54 | * [Enhancement] [#27](https://github.com/civitaspo/embulk-output-s3_parquet/pull/27) Github Actions releases automatically when a new release tag pushed instead of releasing from local. 55 | * [HotFix] [#29](https://github.com/civitaspo/embulk-output-s3_parquet/pull/29) Do not skip the CI when a tag is pushed. 56 | * [Enhancement] [#28](https://github.com/civitaspo/embulk-output-s3_parquet/pull/28) Apply the "org.embulk.embulk-plugins" Gradle plugin. 57 | 58 | 0.2.0 (2020-03-10) 59 | ================== 60 | 61 | * [Enhancement] [#23](https://github.com/civitaspo/embulk-output-s3_parquet/pull/23) Limit the usage of swapping ContextClassLoader 62 | * [BugFix] [#24](https://github.com/civitaspo/embulk-output-s3_parquet/pull/24) Use basic credentials correctly 63 | * [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update gradle 4.1 -> 6.1 64 | * [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update parquet-{column,common,encoding,hadoop,jackson,tools} 1.10.1 -> 1.11.0 with the latest parquet-format 2.4.0 -> 2.7.0 65 | * [parquet-format CHANGELOG](https://github.com/apache/parquet-format/blob/master/CHANGES.md) 66 | * [parquet-mr CHANGELOG](https://github.com/apache/parquet-mr/blob/apache-parquet-1.11.0/CHANGES.md#version-1110) 67 | * [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update aws-java-sdk 1.11.676 -> 1.11.739 68 | * [Enhancement] [#20](https://github.com/civitaspo/embulk-output-s3_parquet/pull/20) Update embulk 0.9.20 -> 0.9.23 with embulk-deps-{config,buffer} 69 | * [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Use scalafmt instead of the Intellij formatter. 70 | * [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Use scalafmt in CI. 
71 | * [Enhancement] [#19](https://github.com/civitaspo/embulk-output-s3_parquet/pull/19) Enable to run examples locally with some prepared scripts. 72 | 73 | 0.1.0 (2019-11-17) 74 | ================== 75 | 76 | * [New Feature] Support Logical Types older representations(OriginalTypes) #12 77 | * [Enhancement] Add Github Actions CI settings #13 78 | * [Enhancement] Support LogicalTypes for Glue Data Catalog #14 79 | * [Enhancement] Update dependencies #15 80 | * [New Feature] Support `auth_method: web_identity_token` #15 81 | 82 | 0.0.3 (2019-07-17) 83 | ================== 84 | 85 | * [New Feature] Add `catalog` option to register a new table that has data created by `s3_parquet` plugin. 86 | * [Enhancement] Update dependencies. 87 | 88 | 0.0.2 (2019-01-21) 89 | ================== 90 | 91 | * [Fix] Close local buffer files before uploading even if lots of pages exist. 92 | 93 | 0.0.1 (2019-01-18) 94 | ================== 95 | 96 | * First Release 97 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/catalog/CatalogRegistrator.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.catalog 2 | 3 | import java.util.{Optional, Map => JMap} 4 | 5 | import com.amazonaws.services.glue.model.{ 6 | Column, 7 | CreateTableRequest, 8 | DeleteTableRequest, 9 | GetTableRequest, 10 | SerDeInfo, 11 | StorageDescriptor, 12 | TableInput 13 | } 14 | import org.apache.parquet.hadoop.metadata.CompressionCodecName 15 | import org.embulk.config.{Config, ConfigDefault, ConfigException} 16 | import org.embulk.output.s3_parquet.aws.Aws 17 | import org.embulk.output.s3_parquet.implicits 18 | import org.embulk.spi.{Schema, Column => EmbulkColumn} 19 | import org.slf4j.{Logger, LoggerFactory} 20 | 21 | import scala.util.Try 22 | 23 | object CatalogRegistrator { 24 | 25 | trait Task extends org.embulk.config.Task { 26 | @Config("catalog_id") 27 | @ConfigDefault("null") 28 | def getCatalogId: Optional[String] 29 | 30 | @Config("database") 31 | def getDatabase: String 32 | 33 | @Config("table") 34 | def getTable: String 35 | 36 | @Config("column_options") 37 | @ConfigDefault("{}") 38 | def getColumnOptions: JMap[String, ColumnOption] 39 | 40 | @Config("operation_if_exists") 41 | @ConfigDefault("\"delete\"") 42 | def getOperationIfExists: String 43 | } 44 | 45 | trait ColumnOption { 46 | @Config("type") 47 | def getType: String 48 | } 49 | 50 | import implicits._ 51 | 52 | def fromTask( 53 | task: CatalogRegistrator.Task, 54 | aws: Aws, 55 | schema: Schema, 56 | location: String, 57 | compressionCodec: CompressionCodecName, 58 | defaultGlueTypes: Map[EmbulkColumn, GlueDataType] = Map.empty 59 | ): CatalogRegistrator = 60 | CatalogRegistrator( 61 | aws = aws, 62 | catalogId = task.getCatalogId, 63 | database = task.getDatabase, 64 | table = task.getTable, 65 | operationIfExists = task.getOperationIfExists, 66 | location = location, 67 | compressionCodec = compressionCodec, 68 | schema = schema, 69 | columnOptions = task.getColumnOptions, 70 | defaultGlueTypes = defaultGlueTypes 71 | ) 72 | } 73 | 74 | case class CatalogRegistrator( 75 | aws: Aws, 76 | catalogId: Option[String] = None, 77 | database: String, 78 | table: String, 79 | operationIfExists: String, 80 | location: String, 81 | compressionCodec: CompressionCodecName, 82 | schema: Schema, 83 | columnOptions: Map[String, CatalogRegistrator.ColumnOption], 84 | defaultGlueTypes: Map[EmbulkColumn, GlueDataType] = 
Map.empty 85 | ) { 86 | 87 | import implicits._ 88 | 89 | private val logger: Logger = 90 | LoggerFactory.getLogger(classOf[CatalogRegistrator]) 91 | 92 | def run(): Unit = { 93 | if (doesTableExists()) { 94 | operationIfExists match { 95 | case "skip" => 96 | logger.info( 97 | s"Skip to register the table: ${database}.${table}" 98 | ) 99 | return 100 | 101 | case "delete" => 102 | logger.info(s"Delete the table: ${database}.${table}") 103 | deleteTable() 104 | 105 | case unknown => 106 | throw new ConfigException(s"Unsupported operation: $unknown") 107 | } 108 | } 109 | registerNewParquetTable() 110 | showNewTableInfo() 111 | } 112 | 113 | def showNewTableInfo(): Unit = { 114 | val req = new GetTableRequest() 115 | catalogId.foreach(req.setCatalogId) 116 | req.setDatabaseName(database) 117 | req.setName(table) 118 | 119 | val t = aws.withGlue(_.getTable(req)).getTable 120 | logger.info(s"Created a table: ${t.toString}") 121 | } 122 | 123 | def doesTableExists(): Boolean = { 124 | val req = new GetTableRequest() 125 | catalogId.foreach(req.setCatalogId) 126 | req.setDatabaseName(database) 127 | req.setName(table) 128 | 129 | Try(aws.withGlue(_.getTable(req))).isSuccess 130 | } 131 | 132 | def deleteTable(): Unit = { 133 | val req = new DeleteTableRequest() 134 | catalogId.foreach(req.setCatalogId) 135 | req.setDatabaseName(database) 136 | req.setName(table) 137 | aws.withGlue(_.deleteTable(req)) 138 | } 139 | 140 | def registerNewParquetTable(): Unit = { 141 | logger.info(s"Create a new table: ${database}.${table}") 142 | val req = new CreateTableRequest() 143 | catalogId.foreach(req.setCatalogId) 144 | req.setDatabaseName(database) 145 | req.setTableInput( 146 | new TableInput() 147 | .withName(table) 148 | .withDescription("Created by embulk-output-s3_parquet") 149 | .withTableType("EXTERNAL_TABLE") 150 | .withParameters( 151 | Map( 152 | "EXTERNAL" -> "TRUE", 153 | "classification" -> "parquet", 154 | "parquet.compression" -> compressionCodec.name() 155 | ) 156 | ) 157 | .withStorageDescriptor( 158 | new StorageDescriptor() 159 | .withColumns(getGlueSchema: _*) 160 | .withLocation(location) 161 | .withCompressed(isCompressed) 162 | .withInputFormat( 163 | "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat" 164 | ) 165 | .withOutputFormat( 166 | "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat" 167 | ) 168 | .withSerdeInfo( 169 | new SerDeInfo() 170 | .withSerializationLibrary( 171 | "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe" 172 | ) 173 | .withParameters(Map("serialization.format" -> "1")) 174 | ) 175 | ) 176 | ) 177 | aws.withGlue(_.createTable(req)) 178 | } 179 | 180 | private def getGlueSchema: Seq[Column] = { 181 | schema.getColumns.map { c: EmbulkColumn => 182 | new Column() 183 | .withName(c.getName) 184 | .withType( 185 | columnOptions 186 | .get(c.getName) 187 | .map(_.getType) 188 | .getOrElse(defaultGlueTypes(c).name) 189 | ) 190 | } 191 | } 192 | 193 | private def isCompressed: Boolean = { 194 | !compressionCodec.equals(CompressionCodecName.UNCOMPRESSED) 195 | } 196 | 197 | } 198 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestDateLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.schema.LogicalTypeAnnotation 4 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 5 | import 
org.embulk.config.ConfigException 6 | import org.embulk.output.s3_parquet.catalog.GlueDataType 7 | import org.embulk.spi.DataException 8 | import org.embulk.spi.time.Timestamp 9 | import org.scalatest.diagrams.Diagrams 10 | import org.scalatest.funsuite.AnyFunSuite 11 | import org.scalatest.prop.TableDrivenPropertyChecks 12 | 13 | import scala.util.chaining._ 14 | 15 | class TestDateLogicalType 16 | extends AnyFunSuite 17 | with ParquetColumnTypeTestHelper 18 | with TableDrivenPropertyChecks 19 | with Diagrams { 20 | 21 | private val conditions = Table( 22 | "column", 23 | Seq( 24 | SAMPLE_BOOLEAN_COLUMN, 25 | SAMPLE_LONG_COLUMN, 26 | SAMPLE_DOUBLE_COLUMN, 27 | SAMPLE_STRING_COLUMN, 28 | SAMPLE_TIMESTAMP_COLUMN, 29 | SAMPLE_JSON_COLUMN 30 | ): _* 31 | ) 32 | 33 | private val unsupportedEmbulkColumns = Seq( 34 | SAMPLE_BOOLEAN_COLUMN, 35 | SAMPLE_DOUBLE_COLUMN, 36 | SAMPLE_STRING_COLUMN, 37 | SAMPLE_JSON_COLUMN 38 | ) 39 | 40 | test( 41 | "#primitiveType(column) returns PrimitiveTypeName.INT32 with LogicalType" 42 | ) { 43 | forAll(conditions) { column => 44 | whenever(!unsupportedEmbulkColumns.contains(column)) { 45 | // format: off 46 | assert(PrimitiveTypeName.INT32 == DateLogicalType.primitiveType(column).getPrimitiveTypeName) 47 | assert(LogicalTypeAnnotation.dateType() == DateLogicalType.primitiveType(column).getLogicalTypeAnnotation) 48 | // format: on 49 | } 50 | } 51 | } 52 | 53 | test( 54 | s"#primitiveType(column) cannot return any PrimitiveType when embulk column type is one of (${unsupportedEmbulkColumns 55 | .map(_.getType.getName) 56 | .mkString(",")})" 57 | ) { 58 | forAll(conditions) { column => 59 | whenever(unsupportedEmbulkColumns.contains(column)) { 60 | // format: off 61 | assert(intercept[ConfigException](DateLogicalType.primitiveType(column)).getMessage.startsWith("Unsupported column type: ")) 62 | // format: on 63 | } 64 | } 65 | } 66 | 67 | test("#glueDataType(column) returns GlueDataType") { 68 | forAll(conditions) { column => 69 | whenever(!unsupportedEmbulkColumns.contains(column)) { 70 | assert(GlueDataType.DATE == DateLogicalType.glueDataType(column)) 71 | } 72 | } 73 | } 74 | 75 | test( 76 | s"#glueDataType(column) cannot return any GlueDataType when embulk column type is one of (${unsupportedEmbulkColumns 77 | .map(_.getType.getName) 78 | .mkString(",")})" 79 | ) { 80 | forAll(conditions) { column => 81 | whenever(unsupportedEmbulkColumns.contains(column)) { 82 | // format: off 83 | assert(intercept[ConfigException](DateLogicalType.glueDataType(column)).getMessage.startsWith("Unsupported column type: ")) 84 | // format: on 85 | } 86 | } 87 | } 88 | 89 | test("#consumeBoolean") { 90 | newMockRecordConsumer().tap { consumer => 91 | consumer.writingSampleField { 92 | // format: off 93 | assert(intercept[ConfigException](DateLogicalType.consumeBoolean(consumer, true)).getMessage.endsWith("is unsupported.")) 94 | // format: on 95 | } 96 | } 97 | } 98 | 99 | test("#consumeString") { 100 | newMockRecordConsumer().tap { consumer => 101 | consumer.writingSampleField { 102 | // format: off 103 | assert(intercept[ConfigException](DateLogicalType.consumeString(consumer, "")).getMessage.endsWith("is unsupported.")) 104 | // format: on 105 | } 106 | } 107 | } 108 | 109 | test("#consumeLong") { 110 | newMockRecordConsumer().tap { consumer => 111 | consumer.writingSampleField { 112 | DateLogicalType.consumeLong(consumer, 1L) 113 | } 114 | assert(consumer.data.head.head.isInstanceOf[Int]) 115 | assert(consumer.data.head.head == 1) 116 | } 117 | newMockRecordConsumer().tap { 
consumer => 118 | consumer.writingSampleField { 119 | // format: off 120 | assert(intercept[DataException](DateLogicalType.consumeLong(consumer, Long.MaxValue)).getMessage.startsWith("Failed to cast Long: ")) 121 | // format: on 122 | } 123 | } 124 | } 125 | 126 | test("#consumeDouble") { 127 | newMockRecordConsumer().tap { consumer => 128 | consumer.writingSampleField { 129 | // format: off 130 | assert(intercept[ConfigException](DateLogicalType.consumeDouble(consumer, 0.0d)).getMessage.endsWith("is unsupported.")) 131 | // format: on 132 | } 133 | 134 | } 135 | } 136 | 137 | test("#consumeTimestamp") { 138 | newMockRecordConsumer().tap { consumer => 139 | consumer.writingSampleField { 140 | DateLogicalType.consumeTimestamp( 141 | consumer, 142 | Timestamp.ofEpochSecond(24 * 60 * 60), // 1day 143 | null 144 | ) 145 | } 146 | assert(consumer.data.head.head.isInstanceOf[Int]) 147 | assert(consumer.data.head.head == 1) 148 | } 149 | newMockRecordConsumer().tap { consumer => 150 | consumer.writingSampleField { 151 | // NOTE: See. java.time.Instant#MAX_SECOND 152 | val instantMaxEpochSeconds = 31556889864403199L 153 | // format: off 154 | assert(intercept[DataException](DateLogicalType.consumeTimestamp(consumer, Timestamp.ofEpochSecond(instantMaxEpochSeconds), null)).getMessage.startsWith("Failed to cast Long: ")) 155 | // format: on 156 | } 157 | } 158 | newMockRecordConsumer().tap { consumer => 159 | consumer.writingSampleField { 160 | // NOTE: See. java.time.Instant#MIN_SECOND 161 | val instantMinEpochSeconds = -31557014167219200L 162 | // format: off 163 | assert(intercept[DataException](DateLogicalType.consumeTimestamp(consumer, Timestamp.ofEpochSecond(instantMinEpochSeconds), null)).getMessage.startsWith("Failed to cast Long: ")) 164 | // format: on 165 | } 166 | } 167 | } 168 | 169 | test("#consumeJson") { 170 | newMockRecordConsumer().tap { consumer => 171 | consumer.writingSampleField { 172 | // format: off 173 | assert(intercept[ConfigException](DateLogicalType.consumeJson(consumer, null)).getMessage.endsWith("is unsupported.")) 174 | // format: on 175 | } 176 | } 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # 4 | # Copyright 2015 the original author or authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | ## 21 | ## Gradle start up script for UN*X 22 | ## 23 | ############################################################################## 24 | 25 | # Attempt to set APP_HOME 26 | # Resolve links: $0 may be a link 27 | PRG="$0" 28 | # Need this for relative symlinks. 
29 | while [ -h "$PRG" ] ; do 30 | ls=`ls -ld "$PRG"` 31 | link=`expr "$ls" : '.*-> \(.*\)$'` 32 | if expr "$link" : '/.*' > /dev/null; then 33 | PRG="$link" 34 | else 35 | PRG=`dirname "$PRG"`"/$link" 36 | fi 37 | done 38 | SAVED="`pwd`" 39 | cd "`dirname \"$PRG\"`/" >/dev/null 40 | APP_HOME="`pwd -P`" 41 | cd "$SAVED" >/dev/null 42 | 43 | APP_NAME="Gradle" 44 | APP_BASE_NAME=`basename "$0"` 45 | 46 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 48 | 49 | # Use the maximum available, or set MAX_FD != -1 to use that value. 50 | MAX_FD="maximum" 51 | 52 | warn () { 53 | echo "$*" 54 | } 55 | 56 | die () { 57 | echo 58 | echo "$*" 59 | echo 60 | exit 1 61 | } 62 | 63 | # OS specific support (must be 'true' or 'false'). 64 | cygwin=false 65 | msys=false 66 | darwin=false 67 | nonstop=false 68 | case "`uname`" in 69 | CYGWIN* ) 70 | cygwin=true 71 | ;; 72 | Darwin* ) 73 | darwin=true 74 | ;; 75 | MINGW* ) 76 | msys=true 77 | ;; 78 | NONSTOP* ) 79 | nonstop=true 80 | ;; 81 | esac 82 | 83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 84 | 85 | # Determine the Java command to use to start the JVM. 86 | if [ -n "$JAVA_HOME" ] ; then 87 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 88 | # IBM's JDK on AIX uses strange locations for the executables 89 | JAVACMD="$JAVA_HOME/jre/sh/java" 90 | else 91 | JAVACMD="$JAVA_HOME/bin/java" 92 | fi 93 | if [ ! -x "$JAVACMD" ] ; then 94 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 95 | 96 | Please set the JAVA_HOME variable in your environment to match the 97 | location of your Java installation." 98 | fi 99 | else 100 | JAVACMD="java" 101 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 102 | 103 | Please set the JAVA_HOME variable in your environment to match the 104 | location of your Java installation." 105 | fi 106 | 107 | # Increase the maximum file descriptors if we can. 108 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 109 | MAX_FD_LIMIT=`ulimit -H -n` 110 | if [ $? -eq 0 ] ; then 111 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 112 | MAX_FD="$MAX_FD_LIMIT" 113 | fi 114 | ulimit -n $MAX_FD 115 | if [ $? 
-ne 0 ] ; then 116 | warn "Could not set maximum file descriptor limit: $MAX_FD" 117 | fi 118 | else 119 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 120 | fi 121 | fi 122 | 123 | # For Darwin, add options to specify how the application appears in the dock 124 | if $darwin; then 125 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 126 | fi 127 | 128 | # For Cygwin or MSYS, switch paths to Windows format before running java 129 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then 130 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 131 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 132 | JAVACMD=`cygpath --unix "$JAVACMD"` 133 | 134 | # We build the pattern for arguments to be converted via cygpath 135 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 136 | SEP="" 137 | for dir in $ROOTDIRSRAW ; do 138 | ROOTDIRS="$ROOTDIRS$SEP$dir" 139 | SEP="|" 140 | done 141 | OURCYGPATTERN="(^($ROOTDIRS))" 142 | # Add a user-defined pattern to the cygpath arguments 143 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 144 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 145 | fi 146 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 147 | i=0 148 | for arg in "$@" ; do 149 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 150 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 151 | 152 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 153 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 154 | else 155 | eval `echo args$i`="\"$arg\"" 156 | fi 157 | i=`expr $i + 1` 158 | done 159 | case $i in 160 | 0) set -- ;; 161 | 1) set -- "$args0" ;; 162 | 2) set -- "$args0" "$args1" ;; 163 | 3) set -- "$args0" "$args1" "$args2" ;; 164 | 4) set -- "$args0" "$args1" "$args2" "$args3" ;; 165 | 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 166 | 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 167 | 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 168 | 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 169 | 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 170 | esac 171 | fi 172 | 173 | # Escape application args 174 | save () { 175 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 176 | echo " " 177 | } 178 | APP_ARGS=`save "$@"` 179 | 180 | # Collect all arguments for the java command, following the shell quoting and substitution rules 181 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 182 | 183 | exec "$JAVACMD" "$@" 184 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestTimestampLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.ZoneId 4 | 5 | import org.apache.parquet.io.api.RecordConsumer 6 | import org.apache.parquet.schema.LogicalTypeAnnotation 7 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.{ 8 | MICROS, 9 | MILLIS, 10 | NANOS 11 | } 12 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 13 | import org.embulk.config.ConfigException 14 | import org.embulk.output.s3_parquet.catalog.GlueDataType 15 | import 
org.embulk.spi.time.Timestamp 16 | import org.scalatest.diagrams.Diagrams 17 | import org.scalatest.funsuite.AnyFunSuite 18 | import org.scalatest.prop.TableDrivenPropertyChecks 19 | 20 | import scala.util.chaining._ 21 | 22 | class TestTimestampLogicalType 23 | extends AnyFunSuite 24 | with ParquetColumnTypeTestHelper 25 | with TableDrivenPropertyChecks 26 | with Diagrams { 27 | 28 | private val conditions = Table( 29 | ("isAdjustedToUtc", "timeUnit", "timeZone", "column"), { 30 | for { 31 | isAdjustedToUtc <- Seq(true, false) 32 | timeUnit <- Seq(MILLIS, MICROS, NANOS) 33 | timeZone <- Seq(ZoneId.of("UTC"), ZoneId.of("Asia/Tokyo")) 34 | column <- Seq( 35 | SAMPLE_BOOLEAN_COLUMN, 36 | SAMPLE_LONG_COLUMN, 37 | SAMPLE_DOUBLE_COLUMN, 38 | SAMPLE_STRING_COLUMN, 39 | SAMPLE_TIMESTAMP_COLUMN, 40 | SAMPLE_JSON_COLUMN 41 | ) 42 | } yield (isAdjustedToUtc, timeUnit, timeZone, column) 43 | }: _* 44 | ) 45 | 46 | private val unsupportedEmbulkColumns = Seq( 47 | SAMPLE_BOOLEAN_COLUMN, 48 | SAMPLE_DOUBLE_COLUMN, 49 | SAMPLE_STRING_COLUMN, 50 | SAMPLE_JSON_COLUMN 51 | ) 52 | 53 | test( 54 | "#primitiveType(column) returns PrimitiveTypeName.INT64 with LogicalType" 55 | ) { 56 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, column) => 57 | whenever(unsupportedEmbulkColumns.contains(column)) { 58 | // format: off 59 | assert(intercept[ConfigException](TimestampLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).primitiveType(column)).getMessage.startsWith("Unsupported column type: ")) 60 | // format: on 61 | } 62 | 63 | whenever(!unsupportedEmbulkColumns.contains(column)) { 64 | // format: off 65 | assert(PrimitiveTypeName.INT64 == TimestampLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).primitiveType(column).getPrimitiveTypeName) 66 | assert(LogicalTypeAnnotation.timestampType(isAdjustedToUtc, timeUnit) == TimestampLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).primitiveType(column).getLogicalTypeAnnotation) 67 | // format: on 68 | } 69 | } 70 | } 71 | 72 | test("#glueDataType(column) returns GlueDataType") { 73 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, column) => 74 | whenever(unsupportedEmbulkColumns.contains(column)) { 75 | // format: off 76 | assert(intercept[ConfigException](TimestampLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).glueDataType(column)).getMessage.startsWith("Unsupported column type: ")) 77 | // format: on 78 | } 79 | whenever(!unsupportedEmbulkColumns.contains(column)) { 80 | val expectedGlueDataType = 81 | if (timeUnit === MILLIS) GlueDataType.TIMESTAMP 82 | else GlueDataType.BIGINT 83 | // format: off 84 | assert(expectedGlueDataType == TimestampLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).glueDataType(column)) 85 | // format: on 86 | } 87 | } 88 | } 89 | 90 | test("#consumeLong") { 91 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, _) => 92 | newMockRecordConsumer().tap { consumer => 93 | consumer.writingSampleField { 94 | TimestampLogicalType( 95 | isAdjustedToUtc = isAdjustedToUtc, 96 | timeUnit = timeUnit, 97 | timeZone = timeZone 98 | ).consumeLong(consumer, 5) 99 | } 100 | assert(consumer.data.head.head.isInstanceOf[Long]) 101 | assert(consumer.data.head.head == 5L) 102 | } 103 | newMockRecordConsumer().tap { consumer => 104 | consumer.writingSampleField { 105 | TimestampLogicalType( 106 | isAdjustedToUtc = isAdjustedToUtc,
107 | timeUnit = timeUnit, 108 | timeZone = timeZone 109 | ).consumeLong(consumer, Long.MaxValue) 110 | } 111 | assert(consumer.data.head.head.isInstanceOf[Long]) 112 | assert(consumer.data.head.head == Long.MaxValue) 113 | } 114 | } 115 | } 116 | 117 | test("#consumeTimestamp") { 118 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, _) => 119 | timeUnit match { 120 | case MILLIS => 121 | val v = Timestamp.ofEpochMilli(Int.MaxValue) 122 | newMockRecordConsumer().tap { consumer => 123 | consumer.writingSampleField { 124 | TimestampLogicalType( 125 | isAdjustedToUtc = isAdjustedToUtc, 126 | timeUnit = timeUnit, 127 | timeZone = timeZone 128 | ).consumeTimestamp(consumer, v, null) 129 | } 130 | assert(consumer.data.head.head.isInstanceOf[Long]) 131 | assert(consumer.data.head.head == Int.MaxValue) 132 | } 133 | case MICROS => 134 | val v = Timestamp.ofEpochMilli(Int.MaxValue) 135 | newMockRecordConsumer().tap { consumer => 136 | consumer.writingSampleField { 137 | TimestampLogicalType( 138 | isAdjustedToUtc = isAdjustedToUtc, 139 | timeUnit = timeUnit, 140 | timeZone = timeZone 141 | ).consumeTimestamp(consumer, v, null) 142 | } 143 | assert(consumer.data.head.head.isInstanceOf[Long]) 144 | 145 | assert(consumer.data.head.head == Int.MaxValue * 1_000L) 146 | } 147 | case NANOS => 148 | val v = Timestamp.ofEpochMilli(Int.MaxValue) 149 | newMockRecordConsumer().tap { consumer => 150 | consumer.writingSampleField { 151 | TimestampLogicalType( 152 | isAdjustedToUtc = isAdjustedToUtc, 153 | timeUnit = timeUnit, 154 | timeZone = timeZone 155 | ).consumeTimestamp(consumer, v, null) 156 | } 157 | assert(consumer.data.head.head.isInstanceOf[Long]) 158 | assert(consumer.data.head.head == Int.MaxValue * 1_000_000L) 159 | } 160 | } 161 | 162 | } 163 | } 164 | 165 | test("#consume{Boolean,Double,String,Json} are unsupported.") { 166 | def assertUnsupportedConsume(f: RecordConsumer => Unit) = 167 | newMockRecordConsumer().tap { consumer => 168 | consumer.writingSampleField { 169 | // format: off 170 | assert(intercept[ConfigException](f(consumer)).getMessage.endsWith("is unsupported.")) 171 | // format: on 172 | } 173 | } 174 | 175 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, _) => 176 | val t = 177 | TimestampLogicalType( 178 | isAdjustedToUtc = isAdjustedToUtc, 179 | timeUnit = timeUnit, 180 | timeZone = timeZone 181 | ) 182 | assertUnsupportedConsume(t.consumeBoolean(_, true)) 183 | assertUnsupportedConsume(t.consumeDouble(_, 0.0d)) 184 | assertUnsupportedConsume(t.consumeString(_, null)) 185 | assertUnsupportedConsume(t.consumeJson(_, null)) 186 | } 187 | } 188 | 189 | } 190 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/IntLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.io.api.RecordConsumer 4 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types} 5 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 6 | import org.embulk.config.ConfigException 7 | import org.embulk.output.s3_parquet.catalog.GlueDataType 8 | import org.embulk.output.s3_parquet.catalog.GlueDataType.AbstractIntGlueDataType 9 | import org.embulk.spi.{Column, DataException} 10 | import org.embulk.spi.`type`.{ 11 | BooleanType, 12 | DoubleType, 13 | JsonType, 14 | LongType, 15 | StringType, 16 | TimestampType 17 | } 18 | import 
org.embulk.spi.time.{Timestamp, TimestampFormatter} 19 | import org.msgpack.value.Value 20 | import org.slf4j.{Logger, LoggerFactory} 21 | 22 | import scala.math.BigDecimal.RoundingMode 23 | 24 | case class IntLogicalType(bitWidth: Int, isSigned: Boolean) 25 | extends ParquetColumnType { 26 | require( 27 | Seq(8, 16, 32, 64).contains(bitWidth), 28 | s"bitWidth value must be one of (8, 16, 32, 64)." 29 | ) 30 | 31 | private val logger: Logger = LoggerFactory.getLogger(classOf[IntLogicalType]) 32 | 33 | private val SIGNED_64BIT_INT_MAX_VALUE = BigInt("9223372036854775807") 34 | private val SIGNED_64BIT_INT_MIN_VALUE = BigInt("-9223372036854775808") 35 | private val SIGNED_32BIT_INT_MAX_VALUE = BigInt("2147483647") 36 | private val SIGNED_32BIT_INT_MIN_VALUE = BigInt("-2147483648") 37 | private val SIGNED_16BIT_INT_MAX_VALUE = BigInt("32767") 38 | private val SIGNED_16BIT_INT_MIN_VALUE = BigInt("-32768") 39 | private val SIGNED_8BIT_INT_MAX_VALUE = BigInt("127") 40 | private val SIGNED_8BIT_INT_MIN_VALUE = BigInt("-128") 41 | private val UNSIGNED_64BIT_INT_MAX_VALUE = BigInt("18446744073709551615") 42 | private val UNSIGNED_64BIT_INT_MIN_VALUE = BigInt("0") 43 | private val UNSIGNED_32BIT_INT_MAX_VALUE = BigInt("4294967295") 44 | private val UNSIGNED_32BIT_INT_MIN_VALUE = BigInt("0") 45 | private val UNSIGNED_16BIT_INT_MAX_VALUE = BigInt("65535") 46 | private val UNSIGNED_16BIT_INT_MIN_VALUE = BigInt("0") 47 | private val UNSIGNED_8BIT_INT_MAX_VALUE = BigInt("255") 48 | private val UNSIGNED_8BIT_INT_MIN_VALUE = BigInt("0") 49 | 50 | private def isINT32: Boolean = bitWidth < 64 51 | 52 | override def primitiveType(column: Column): PrimitiveType = 53 | column.getType match { 54 | case _: BooleanType | _: LongType | _: DoubleType | _: StringType => 55 | Types 56 | .optional( 57 | if (isINT32) PrimitiveTypeName.INT32 58 | else PrimitiveTypeName.INT64 59 | ) 60 | .as(LogicalTypeAnnotation.intType(bitWidth, isSigned)) 61 | .named(column.getName) 62 | case _: TimestampType | _: JsonType | _ => 63 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 64 | } 65 | 66 | override def glueDataType(column: Column): GlueDataType = 67 | column.getType match { 68 | case _: BooleanType | _: LongType | _: DoubleType | _: StringType => 69 | (bitWidth, isSigned) match { 70 | case (8, true) => GlueDataType.TINYINT 71 | case (16, true) => GlueDataType.SMALLINT 72 | case (32, true) => GlueDataType.INT 73 | case (64, true) => GlueDataType.BIGINT 74 | case (8, false) => 75 | warningWhenConvertingUnsignedIntegerToGlueType( 76 | GlueDataType.SMALLINT 77 | ) 78 | GlueDataType.SMALLINT 79 | case (16, false) => 80 | warningWhenConvertingUnsignedIntegerToGlueType(GlueDataType.INT) 81 | GlueDataType.INT 82 | case (32, false) => 83 | warningWhenConvertingUnsignedIntegerToGlueType(GlueDataType.BIGINT) 84 | GlueDataType.BIGINT 85 | case (64, false) => 86 | warningWhenConvertingUnsignedIntegerToGlueType(GlueDataType.BIGINT) 87 | GlueDataType.BIGINT 88 | case (_, _) => 89 | throw new ConfigException( 90 | s"Unsupported column type: ${column.getName} (bitWidth: $bitWidth, isSigned: $isSigned)" 91 | ) 92 | } 93 | case _: TimestampType | _: JsonType | _ => 94 | throw new ConfigException(s"Unsupported column type: ${column.getName}") 95 | } 96 | 97 | override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit = 98 | if (isINT32) 99 | consumer.addInteger( 100 | if (v) 1 101 | else 0 102 | ) 103 | else 104 | consumer.addLong( 105 | if (v) 1 106 | else 0 107 | ) 108 | 109 | override def 
consumeString(consumer: RecordConsumer, v: String): Unit = 110 | try consumeBigDecimal(consumer, BigDecimal.exact(v)) 111 | catch { 112 | case ex: NumberFormatException => 113 | throw new DataException(s"Failed to cast String: $v to BigDecimal.", ex) 114 | } 115 | override def consumeLong(consumer: RecordConsumer, v: Long): Unit = 116 | consumeBigInt(consumer, BigInt(v)) 117 | override def consumeDouble(consumer: RecordConsumer, v: Double): Unit = 118 | consumeBigDecimal(consumer, BigDecimal.exact(v)) 119 | override def consumeTimestamp( 120 | consumer: RecordConsumer, 121 | v: Timestamp, 122 | formatter: TimestampFormatter 123 | ): Unit = throw newUnsupportedMethodException("consumeTimestamp") 124 | override def consumeJson(consumer: RecordConsumer, v: Value): Unit = 125 | throw newUnsupportedMethodException("consumeJson") 126 | 127 | private def warningWhenConvertingUnsignedIntegerToGlueType( 128 | glueType: AbstractIntGlueDataType 129 | ): Unit = { 130 | logger.warn { 131 | s"int(bit_width = $bitWidth, is_signed $isSigned) is converted to Glue ${glueType.name}" + 132 | s" but this is not represented correctly, because the Glue ${glueType.name} represents" + 133 | s" a ${glueType.bitWidth}-bit signed integer. Please use `catalog.column_options` to define the type." 134 | } 135 | } 136 | 137 | private def consumeBigDecimal(consumer: RecordConsumer, v: BigDecimal): Unit = 138 | // TODO: Make RoundingMode configurable? 139 | consumeBigInt(consumer, v.setScale(0, RoundingMode.HALF_UP).toBigInt) 140 | 141 | private def consumeBigInt(consumer: RecordConsumer, v: BigInt): Unit = { 142 | def consume(min: BigInt, max: BigInt): Unit = 143 | if (min <= v && v <= max) 144 | if (isINT32) consumer.addInteger(v.toInt) 145 | else consumer.addLong(v.toLong) 146 | else 147 | throw new DataException( 148 | s"The value is out of the range: that is '$min <= value <= $max'" + 149 | s" in the case of int(bit_width = $bitWidth, is_signed $isSigned)" + 150 | s", but the value is $v." 151 | ) 152 | (bitWidth, isSigned) match { 153 | case (8, true) => 154 | consume(SIGNED_8BIT_INT_MIN_VALUE, SIGNED_8BIT_INT_MAX_VALUE) 155 | case (16, true) => 156 | consume(SIGNED_16BIT_INT_MIN_VALUE, SIGNED_16BIT_INT_MAX_VALUE) 157 | case (32, true) => 158 | consume(SIGNED_32BIT_INT_MIN_VALUE, SIGNED_32BIT_INT_MAX_VALUE) 159 | case (64, true) => 160 | consume(SIGNED_64BIT_INT_MIN_VALUE, SIGNED_64BIT_INT_MAX_VALUE) 161 | case (8, false) => 162 | consume(UNSIGNED_8BIT_INT_MIN_VALUE, UNSIGNED_8BIT_INT_MAX_VALUE) 163 | case (16, false) => 164 | consume(UNSIGNED_16BIT_INT_MIN_VALUE, UNSIGNED_16BIT_INT_MAX_VALUE) 165 | case (32, false) => 166 | consume(UNSIGNED_32BIT_INT_MIN_VALUE, UNSIGNED_32BIT_INT_MAX_VALUE) 167 | case (64, false) => 168 | consume(UNSIGNED_64BIT_INT_MIN_VALUE, UNSIGNED_64BIT_INT_MAX_VALUE) 169 | case _ => 170 | throw new ConfigException( 171 | s"int(bit_width = $bitWidth, is_signed $isSigned) is unsupported." 
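// NOTE: This fallback should be unreachable in practice: the constructor's require()
// already restricts bitWidth to 8/16/32/64 and isSigned is a Boolean, so every
// combination is matched by the cases above.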
172 | ) 173 | } 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetFileWriteSupport.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.lang.{StringBuilder => JStringBuilder} 4 | import java.util.{Map => JMap} 5 | 6 | import org.apache.hadoop.conf.Configuration 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.parquet.hadoop.api.WriteSupport 9 | import org.apache.parquet.hadoop.api.WriteSupport.WriteContext 10 | import org.apache.parquet.hadoop.ParquetWriter 11 | import org.apache.parquet.io.api.RecordConsumer 12 | import org.apache.parquet.schema.MessageType 13 | import org.embulk.config.{ 14 | Config, 15 | ConfigDefault, 16 | ConfigException, 17 | ConfigSource, 18 | Task => EmbulkTask 19 | } 20 | import org.embulk.output.s3_parquet.implicits 21 | import org.embulk.output.s3_parquet.parquet.ParquetFileWriteSupport.WriterBuilder 22 | import org.embulk.spi.{Column, ColumnVisitor, PageReader, Schema} 23 | import org.embulk.spi.`type`.{TimestampType, Type, Types} 24 | import org.embulk.spi.time.TimestampFormatter 25 | import org.embulk.spi.util.Timestamps 26 | import org.slf4j.Logger 27 | 28 | object ParquetFileWriteSupport { 29 | 30 | import implicits._ 31 | 32 | trait Task extends TimestampFormatter.Task with EmbulkTask { 33 | @Config("column_options") 34 | @ConfigDefault("{}") 35 | def getRawColumnOptions: JMap[String, ConfigSource] 36 | 37 | def getColumnOptions: JMap[String, ParquetColumnType.Task] 38 | def setColumnOptions( 39 | columnOptions: JMap[String, ParquetColumnType.Task] 40 | ): Unit 41 | 42 | @Config("type_options") 43 | @ConfigDefault("{}") 44 | def getRawTypeOptions: JMap[String, ConfigSource] 45 | 46 | def getTypeOptions: JMap[String, ParquetColumnType.Task] 47 | def setTypeOptions(typeOptions: JMap[String, ParquetColumnType.Task]): Unit 48 | } 49 | 50 | case class WriterBuilder(path: Path, writeSupport: ParquetFileWriteSupport) 51 | extends ParquetWriter.Builder[PageReader, WriterBuilder](path) { 52 | override def self(): WriterBuilder = this 53 | override def getWriteSupport( 54 | conf: Configuration 55 | ): WriteSupport[PageReader] = writeSupport 56 | } 57 | 58 | def configure(task: Task): Unit = { 59 | task.setColumnOptions(task.getRawColumnOptions.map { 60 | case (columnName, config) => 61 | columnName -> ParquetColumnType.loadConfig(config) 62 | }) 63 | task.setTypeOptions(task.getRawTypeOptions.map { 64 | case (columnType, config) => 65 | columnType -> ParquetColumnType.loadConfig(config) 66 | }) 67 | } 68 | 69 | private def validateTask(task: Task, schema: Schema): Unit = { 70 | if (task.getColumnOptions == null || task.getTypeOptions == null) 71 | assert(false) 72 | 73 | task.getTypeOptions.keys.foreach( 74 | embulkType 75 | ) // throw ConfigException if unknown type name is found. 76 | 77 | task.getColumnOptions.foreach { 78 | case (c: String, t: ParquetColumnType.Task) => 79 | val column: Column = schema.lookupColumn(c) // throw ConfigException if columnName does not exist. 80 | 81 | if (t.getFormat.isDefined || t.getTimeZoneId.isDefined) { 82 | if (!column.getType.isInstanceOf[TimestampType]) { 83 | // NOTE: Warning is better instead of throwing. 
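// Rejects "format"/"timezone" settings on non-timestamp columns, since those options only affect timestamp formatting.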
84 | throw new ConfigException( 85 | s"The type of column{name:${column.getName},type:${column.getType.getName}} is not 'timestamp'," + 86 | " but timestamp options (\"format\" or \"timezone\") are set." 87 | ) 88 | } 89 | } 90 | } 91 | } 92 | 93 | private def embulkType(typeName: String): Type = { 94 | Seq( 95 | Types.BOOLEAN, 96 | Types.STRING, 97 | Types.LONG, 98 | Types.DOUBLE, 99 | Types.TIMESTAMP, 100 | Types.JSON 101 | ).foreach { embulkType => 102 | if (embulkType.getName.equals(typeName)) return embulkType 103 | } 104 | throw new ConfigException(s"Unknown embulk type: $typeName.") 105 | } 106 | 107 | def apply(task: Task, schema: Schema): ParquetFileWriteSupport = { 108 | validateTask(task, schema) 109 | 110 | val parquetSchema: Map[Column, ParquetColumnType] = schema.getColumns.map { 111 | c: Column => 112 | c -> task.getColumnOptions.toMap 113 | .get(c.getName) 114 | .orElse(task.getTypeOptions.toMap.get(c.getType.getName)) 115 | .flatMap(ParquetColumnType.fromTask) 116 | .getOrElse(DefaultColumnType) 117 | }.toMap 118 | val timestampFormatters: Seq[TimestampFormatter] = Timestamps 119 | .newTimestampColumnFormatters(task, schema, task.getColumnOptions) 120 | new ParquetFileWriteSupport(schema, parquetSchema, timestampFormatters) 121 | } 122 | } 123 | 124 | case class ParquetFileWriteSupport private ( 125 | schema: Schema, 126 | parquetSchema: Map[Column, ParquetColumnType], 127 | timestampFormatters: Seq[TimestampFormatter] 128 | ) extends WriteSupport[PageReader] { 129 | 130 | import implicits._ 131 | 132 | private val messageType: MessageType = 133 | new MessageType("embulk", schema.getColumns.map { c => 134 | parquetSchema(c).primitiveType(c) 135 | }) 136 | 137 | private var current: RecordConsumer = _ 138 | 139 | def showOutputSchema(logger: Logger): Unit = { 140 | val sb = new JStringBuilder() 141 | sb.append("=== Output Parquet Schema ===\n") 142 | messageType.writeToStringBuilder(sb, null) // NOTE: indent is not used. 143 | sb.append("=============================\n") 144 | sb.toString.split("\n").foreach(logger.info) 145 | } 146 | 147 | override def init(configuration: Configuration): WriteContext = { 148 | val metadata: Map[String, String] = Map.empty // NOTE: When is this used? 
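// Presumably this map would end up as extra key/value metadata in the Parquet file footer; nothing extra is recorded here.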
149 | new WriteContext(messageType, metadata) 150 | } 151 | 152 | override def prepareForWrite(recordConsumer: RecordConsumer): Unit = 153 | current = recordConsumer 154 | 155 | override def write(record: PageReader): Unit = { 156 | writingRecord { 157 | schema.visitColumns(new ColumnVisitor { 158 | override def booleanColumn(column: Column): Unit = nullOr(column) { 159 | parquetSchema(column) 160 | .consumeBoolean(current, record.getBoolean(column)) 161 | } 162 | override def longColumn(column: Column): Unit = nullOr(column) { 163 | parquetSchema(column).consumeLong(current, record.getLong(column)) 164 | } 165 | override def doubleColumn(column: Column): Unit = nullOr(column) { 166 | parquetSchema(column).consumeDouble(current, record.getDouble(column)) 167 | } 168 | override def stringColumn(column: Column): Unit = nullOr(column) { 169 | parquetSchema(column).consumeString(current, record.getString(column)) 170 | } 171 | override def timestampColumn(column: Column): Unit = nullOr(column) { 172 | parquetSchema(column).consumeTimestamp( 173 | current, 174 | record.getTimestamp(column), 175 | timestampFormatters(column.getIndex) 176 | ) 177 | } 178 | override def jsonColumn(column: Column): Unit = nullOr(column) { 179 | parquetSchema(column).consumeJson(current, record.getJson(column)) 180 | } 181 | private def nullOr(column: Column)(f: => Unit): Unit = 182 | if (!record.isNull(column)) writingColumn(column)(f) 183 | }) 184 | } 185 | } 186 | 187 | private def writingRecord(f: => Unit): Unit = { 188 | current.startMessage() 189 | f 190 | current.endMessage() 191 | } 192 | 193 | private def writingColumn(column: Column)(f: => Unit): Unit = { 194 | current.startField(column.getName, column.getIndex) 195 | f 196 | current.endField(column.getName, column.getIndex) 197 | } 198 | 199 | def newWriterBuilder(pathString: String): WriterBuilder = 200 | WriterBuilder(new Path(pathString), this) 201 | } 202 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestDecimalLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.io.api.{Binary, RecordConsumer} 4 | import org.apache.parquet.schema.LogicalTypeAnnotation 5 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 6 | import org.embulk.config.ConfigException 7 | import org.embulk.output.s3_parquet.catalog.GlueDataType 8 | import org.embulk.spi.`type`.{DoubleType, LongType, StringType} 9 | import org.embulk.spi.DataException 10 | import org.scalatest.diagrams.Diagrams 11 | import org.scalatest.funsuite.AnyFunSuite 12 | import org.scalatest.prop.TableDrivenPropertyChecks 13 | 14 | import scala.util.chaining._ 15 | 16 | class TestDecimalLogicalType 17 | extends AnyFunSuite 18 | with ParquetColumnTypeTestHelper 19 | with TableDrivenPropertyChecks 20 | with Diagrams { 21 | 22 | private val conditions = Table( 23 | ("precision", "scale", "column"), { 24 | for { 25 | precision <- Seq(1, 9, 10, 18, 19) 26 | scale <- Seq(0, 1, 20) 27 | column <- Seq( 28 | SAMPLE_BOOLEAN_COLUMN, 29 | SAMPLE_LONG_COLUMN, 30 | SAMPLE_DOUBLE_COLUMN, 31 | SAMPLE_STRING_COLUMN, 32 | SAMPLE_TIMESTAMP_COLUMN, 33 | SAMPLE_JSON_COLUMN 34 | ) 35 | } yield (precision, scale, column) 36 | }: _* 37 | ) 38 | 39 | private val unsupportedEmbulkColumns = Seq( 40 | SAMPLE_BOOLEAN_COLUMN, 41 | SAMPLE_TIMESTAMP_COLUMN, 42 | SAMPLE_JSON_COLUMN 43 | ) 44 | 45 | def 
isValidScaleAndPrecision(scale: Int, precision: Int): Boolean = 46 | scale >= 0 && scale < precision && precision > 0 47 | 48 | test("throws IllegalArgumentException") { 49 | // format: off 50 | assert(intercept[IllegalArgumentException](DecimalLogicalType(-1, 5)).getMessage.startsWith("requirement failed: Scale must be zero or a positive integer.")) 51 | assert(intercept[IllegalArgumentException](DecimalLogicalType(10, 5)).getMessage.startsWith("requirement failed: Scale must be a positive integer less than the precision.")) 52 | // format: on 53 | } 54 | 55 | test( 56 | "#primitiveType(column) returns PrimitiveTypeName.{INT32, INT64, BINARY} with LogicalType" 57 | ) { 58 | forAll(conditions) { (precision, scale, column) => 59 | whenever(isValidScaleAndPrecision(scale, precision)) { 60 | // format: off 61 | column.getType match { 62 | case _: LongType if 1 <= precision && precision <= 9 => 63 | assert(PrimitiveTypeName.INT32 == DecimalLogicalType(scale, precision).primitiveType(column).getPrimitiveTypeName) 64 | assert(LogicalTypeAnnotation.decimalType(scale, precision) == DecimalLogicalType(scale, precision).primitiveType(column).getLogicalTypeAnnotation) 65 | case _: LongType if 10 <= precision && precision <= 18 => 66 | assert(PrimitiveTypeName.INT64 == DecimalLogicalType(scale, precision).primitiveType(column).getPrimitiveTypeName) 67 | assert(LogicalTypeAnnotation.decimalType(scale, precision) == DecimalLogicalType(scale, precision).primitiveType(column).getLogicalTypeAnnotation) 68 | case _: StringType | _: DoubleType => 69 | assert(PrimitiveTypeName.BINARY == DecimalLogicalType(scale, precision).primitiveType(column).getPrimitiveTypeName) 70 | assert(LogicalTypeAnnotation.decimalType(scale, precision) == DecimalLogicalType(scale, precision).primitiveType(column).getLogicalTypeAnnotation) 71 | case _ => 72 | assert(intercept[ConfigException](DecimalLogicalType(scale, precision).primitiveType(column)).getMessage.startsWith("Unsupported column type: ")) 73 | } 74 | // format: on 75 | } 76 | } 77 | } 78 | 79 | test("#glueDataType(column) returns GlueDataType") { 80 | forAll(conditions) { (precision, scale, column) => 81 | whenever(isValidScaleAndPrecision(scale, precision)) { 82 | // format: off 83 | column.getType match { 84 | case _: LongType | _: StringType | _: DoubleType => 85 | assert(GlueDataType.DECIMAL(precision, scale) == DecimalLogicalType(scale, precision).glueDataType(column)) 86 | case _ => 87 | assert(intercept[ConfigException](DecimalLogicalType(scale, precision).glueDataType(column)).getMessage.startsWith("Unsupported column type: ")) 88 | } 89 | // format: on 90 | } 91 | } 92 | } 93 | 94 | test("#consumeString") { 95 | forAll(conditions) { (precision, scale, _) => 96 | whenever(isValidScaleAndPrecision(scale, precision)) { 97 | newMockRecordConsumer().tap { consumer => 98 | consumer.writingSampleField { 99 | // format: off 100 | assert(intercept[DataException](DecimalLogicalType(scale, precision).consumeString(consumer, "string")).getMessage.startsWith("Failed to cast String: ")) 101 | // format: on 102 | } 103 | } 104 | newMockRecordConsumer().tap { consumer => 105 | consumer.writingSampleField { 106 | DecimalLogicalType(scale, precision).consumeString(consumer, "5.5") 107 | } 108 | assert(consumer.data.head.head.isInstanceOf[Binary]) 109 | if (scale == 0) 110 | assert(consumer.data.head.head == Binary.fromString("6")) 111 | else assert(consumer.data.head.head == Binary.fromString("5.5")) 112 | } 113 | } 114 | } 115 | } 116 | 117 | test("#consumeLong") { 118 | 
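// Long input is expected to be written as INT32 for precision 1..9 and INT64 for precision 10..18;
// larger precisions should be rejected with a ConfigException (see the assertions below).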
forAll(conditions) { (precision, scale, _) => 119 | whenever(isValidScaleAndPrecision(scale, precision) && precision <= 18) { 120 | newMockRecordConsumer().tap { consumer => 121 | consumer.writingSampleField { 122 | DecimalLogicalType(scale, precision) 123 | .consumeLong(consumer, 1L) 124 | } 125 | if (1 <= precision && precision <= 9) { 126 | assert(consumer.data.head.head.isInstanceOf[Int]) 127 | assert(consumer.data.head.head == 1) 128 | } 129 | else { 130 | assert(consumer.data.head.head.isInstanceOf[Long]) 131 | assert(consumer.data.head.head == 1) 132 | } 133 | } 134 | } 135 | whenever(isValidScaleAndPrecision(scale, precision) && precision > 18) { 136 | newMockRecordConsumer().tap { consumer => 137 | consumer.writingSampleField { 138 | // format: off 139 | assert(intercept[ConfigException](DecimalLogicalType(scale, precision).consumeLong(consumer, 1L)).getMessage.startsWith("precision must be 1 <= precision <= 18 when consuming long values but precision is ")) 140 | // format: on 141 | } 142 | } 143 | } 144 | } 145 | } 146 | 147 | test("#consumeDouble") { 148 | forAll(conditions) { (precision, scale, _) => 149 | whenever(isValidScaleAndPrecision(scale, precision)) { 150 | newMockRecordConsumer().tap { consumer => 151 | consumer.writingSampleField { 152 | DecimalLogicalType(scale, precision) 153 | .consumeDouble(consumer, 1.1d) 154 | } 155 | assert(consumer.data.head.head.isInstanceOf[Binary]) 156 | if (scale == 0) 157 | assert(consumer.data.head.head == Binary.fromString("1")) 158 | else assert(consumer.data.head.head == Binary.fromString("1.1")) 159 | } 160 | } 161 | } 162 | } 163 | 164 | test("#consume{Boolean,Timestamp,Json} are unsupported.") { 165 | def assertUnsupportedConsume(f: RecordConsumer => Unit) = 166 | newMockRecordConsumer().tap { consumer => 167 | consumer.writingSampleField { 168 | // format: off 169 | assert(intercept[ConfigException](f(consumer)).getMessage.endsWith("is unsupported.")) 170 | // format: on 171 | } 172 | } 173 | assertUnsupportedConsume(DecimalLogicalType(5, 10).consumeBoolean(_, true)) 174 | assertUnsupportedConsume( 175 | DecimalLogicalType(5, 10).consumeTimestamp(_, null, null) 176 | ) 177 | assertUnsupportedConsume(DecimalLogicalType(5, 10).consumeJson(_, null)) 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/TestS3ParquetOutputPlugin.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import org.apache.parquet.schema.LogicalTypeAnnotation 4 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 5 | import org.embulk.spi.Schema 6 | import org.embulk.spi.`type`.Types 7 | import org.embulk.spi.time.{Timestamp, TimestampFormatter, TimestampParser} 8 | import org.msgpack.value.Value 9 | 10 | import scala.util.chaining._ 11 | 12 | class TestS3ParquetOutputPlugin extends EmbulkPluginTestHelper { 13 | 14 | test("minimal default case") { 15 | val schema: Schema = Schema 16 | .builder() 17 | .add("c0", Types.BOOLEAN) 18 | .add("c1", Types.LONG) 19 | .add("c2", Types.DOUBLE) 20 | .add("c3", Types.STRING) 21 | .add("c4", Types.TIMESTAMP) 22 | .add("c5", Types.JSON) 23 | .build() 24 | // scalafmt: { maxColumn = 200 } 25 | val parser = TimestampParser.of("%Y-%m-%d %H:%M:%S.%N %z", "UTC") 26 | val data: Seq[Seq[Any]] = Seq( 27 | Seq(true, 0L, 0.0d, "c212c89f91", parser.parse("2017-10-22 19:53:31.000000 +0900"), json("""{"a":0,"b":"00"}""")), 28 | Seq(false, 1L, 
-0.5d, "aaaaa", parser.parse("2017-10-22 19:53:31.000000 +0900"), json("""{"a":1,"b":"11"}""")), 29 | Seq(false, 2L, 1.5d, "90823c6a1f", parser.parse("2017-10-23 23:42:43.000000 +0900"), json("""{"a":2,"b":"22"}""")), 30 | Seq(true, 3L, 0.44d, "", parser.parse("2017-10-22 06:12:13.000000 +0900"), json("""{"a":3,"b":"33","c":3.3}""")), 31 | Seq(false, 9999L, 10000.33333d, "e56a40571c", parser.parse("2017-10-23 04:59:16.000000 +0900"), json("""{"a":4,"b":"44","c":4.4,"d":true}""")) 32 | ) 33 | // scalafmt: { maxColumn = 80 } 34 | 35 | val result: Seq[Seq[AnyRef]] = 36 | runOutput( 37 | newDefaultConfig, 38 | schema, 39 | data, 40 | messageTypeTest = { messageType => 41 | // format: off 42 | assert(PrimitiveTypeName.BOOLEAN == messageType.getColumns.get(0).getPrimitiveType.getPrimitiveTypeName) 43 | assert(PrimitiveTypeName.INT64 == messageType.getColumns.get(1).getPrimitiveType.getPrimitiveTypeName) 44 | assert(PrimitiveTypeName.DOUBLE == messageType.getColumns.get(2).getPrimitiveType.getPrimitiveTypeName) 45 | assert(PrimitiveTypeName.BINARY == messageType.getColumns.get(3).getPrimitiveType.getPrimitiveTypeName) 46 | assert(PrimitiveTypeName.BINARY == messageType.getColumns.get(4).getPrimitiveType.getPrimitiveTypeName) 47 | assert(PrimitiveTypeName.BINARY == messageType.getColumns.get(5).getPrimitiveType.getPrimitiveTypeName) 48 | 49 | assert(null == messageType.getColumns.get(0).getPrimitiveType.getLogicalTypeAnnotation) 50 | assert(null == messageType.getColumns.get(1).getPrimitiveType.getLogicalTypeAnnotation) 51 | assert(null == messageType.getColumns.get(2).getPrimitiveType.getLogicalTypeAnnotation) 52 | 53 | assert(LogicalTypeAnnotation.stringType() == messageType.getColumns.get(3).getPrimitiveType.getLogicalTypeAnnotation) 54 | assert(LogicalTypeAnnotation.stringType() == messageType.getColumns.get(4).getPrimitiveType.getLogicalTypeAnnotation) 55 | assert(LogicalTypeAnnotation.stringType() == messageType.getColumns.get(5).getPrimitiveType.getLogicalTypeAnnotation) 56 | // format: on 57 | } 58 | ) 59 | 60 | assert(result.size == 5) 61 | data.indices.foreach { i => 62 | data(i).indices.foreach { j => 63 | data(i)(j) match { 64 | case timestamp: Timestamp => 65 | val formatter = 66 | TimestampFormatter.of("%Y-%m-%d %H:%M:%S.%6N %z", "Asia/Tokyo") 67 | assert( 68 | formatter.format(timestamp) == result(i)(j), 69 | s"A different timestamp value is found (Record Index: $i, Column Index: $j)" 70 | ) 71 | case value: Value => 72 | assert( 73 | value.toJson == result(i)(j), 74 | s"A different json value is found (Record Index: $i, Column Index: $j)" 75 | ) 76 | case _ => 77 | assert( 78 | data(i)(j) == result(i)(j), 79 | s"A different value is found (Record Index: $i, Column Index: $j)" 80 | ) 81 | } 82 | } 83 | } 84 | } 85 | 86 | test("timestamp-millis") { 87 | val schema = Schema.builder().add("c0", Types.TIMESTAMP).build() 88 | val data: Seq[Seq[Timestamp]] = Seq( 89 | Seq(Timestamp.ofEpochMilli(111_111_111L)), 90 | Seq(Timestamp.ofEpochMilli(222_222_222L)), 91 | Seq(Timestamp.ofEpochMilli(333_333_333L)) 92 | ) 93 | val cfg = newDefaultConfig.merge( 94 | loadConfigSourceFromYamlString(""" 95 | |type_options: 96 | | timestamp: 97 | | logical_type: "timestamp-millis" 98 | |""".stripMargin) 99 | ) 100 | 101 | val result: Seq[Seq[AnyRef]] = runOutput( 102 | cfg, 103 | schema, 104 | data, 105 | messageTypeTest = { messageType => 106 | // format: off 107 | assert(PrimitiveTypeName.INT64 == messageType.getColumns.get(0).getPrimitiveType.getPrimitiveTypeName) 108 | 
assert(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS) == messageType.getColumns.get(0).getPrimitiveType.getLogicalTypeAnnotation) 109 | // format: on 110 | } 111 | ) 112 | 113 | assert(data.size == result.size) 114 | data.indices.foreach { i => 115 | assert { 116 | data(i).head.toEpochMilli == result(i).head.asInstanceOf[Long] 117 | } 118 | } 119 | } 120 | 121 | test("timestamp-micros") { 122 | val schema = Schema.builder().add("c0", Types.TIMESTAMP).build() 123 | val data: Seq[Seq[Timestamp]] = Seq( 124 | Seq(Timestamp.ofEpochSecond(111_111_111L, 111_111_000L)), 125 | Seq(Timestamp.ofEpochSecond(222_222_222L, 222_222_222L)), 126 | Seq(Timestamp.ofEpochSecond(333_333_333L, 333_000L)) 127 | ) 128 | val cfg = newDefaultConfig.merge( 129 | loadConfigSourceFromYamlString(""" 130 | |type_options: 131 | | timestamp: 132 | | logical_type: "timestamp-micros" 133 | |""".stripMargin) 134 | ) 135 | 136 | val result: Seq[Seq[AnyRef]] = runOutput( 137 | cfg, 138 | schema, 139 | data, 140 | messageTypeTest = { messageType => 141 | // format: off 142 | assert(PrimitiveTypeName.INT64 == messageType.getColumns.get(0).getPrimitiveType.getPrimitiveTypeName) 143 | assert(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS) == messageType.getColumns.get(0).getPrimitiveType.getLogicalTypeAnnotation) 144 | // format: on 145 | } 146 | ) 147 | 148 | assert(data.size == result.size) 149 | data.indices.foreach { i => 150 | // format: off 151 | assert( 152 | data(i).head.pipe(ts => (ts.getEpochSecond * 1_000_000L) + (ts.getNano / 1_000L)) == result(i).head.asInstanceOf[Long] 153 | ) 154 | // format: on 155 | } 156 | } 157 | 158 | test("timestamp-nanos") { 159 | val schema = Schema.builder().add("c0", Types.TIMESTAMP).build() 160 | val data: Seq[Seq[Timestamp]] = Seq( 161 | Seq(Timestamp.ofEpochSecond(111_111_111L, 111_111_000L)), 162 | Seq(Timestamp.ofEpochSecond(222_222_222L, 222_222_222L)), 163 | Seq(Timestamp.ofEpochSecond(333_333_333L, 333_000L)) 164 | ) 165 | val cfg = newDefaultConfig.merge( 166 | loadConfigSourceFromYamlString(""" 167 | |type_options: 168 | | timestamp: 169 | | logical_type: "timestamp-nanos" 170 | |""".stripMargin) 171 | ) 172 | 173 | val result: Seq[Seq[AnyRef]] = runOutput( 174 | cfg, 175 | schema, 176 | data, 177 | messageTypeTest = { messageType => 178 | // format: off 179 | assert(PrimitiveTypeName.INT64 == messageType.getColumns.get(0).getPrimitiveType.getPrimitiveTypeName) 180 | assert(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.NANOS) == messageType.getColumns.get(0).getPrimitiveType.getLogicalTypeAnnotation) 181 | // format: on 182 | } 183 | ) 184 | 185 | assert(data.size == result.size) 186 | data.indices.foreach { i => 187 | // format: off 188 | assert(data(i).head.pipe(ts => (ts.getEpochSecond * 1_000_000_000L) + ts.getNano) == result(i).head.asInstanceOf[Long]) 189 | // format: on 190 | } 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/EmbulkPluginTestHelper.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet 2 | 3 | import java.io.File 4 | import java.nio.file.{Files, Path} 5 | import java.util.concurrent.ExecutionException 6 | 7 | import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials} 8 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration 9 | import 
com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder} 10 | import com.amazonaws.services.s3.model.ObjectListing 11 | import com.amazonaws.services.s3.transfer.{ 12 | TransferManager, 13 | TransferManagerBuilder 14 | } 15 | import com.google.inject.{Binder, Guice, Module, Stage} 16 | import org.apache.avro.generic.GenericRecord 17 | import org.apache.hadoop.conf.Configuration 18 | import org.apache.hadoop.fs.{Path => HadoopPath} 19 | import org.apache.parquet.avro.AvroReadSupport 20 | import org.apache.parquet.hadoop.{ParquetFileReader, ParquetReader} 21 | import org.apache.parquet.hadoop.util.HadoopInputFile 22 | import org.apache.parquet.schema.MessageType 23 | import org.embulk.{TestPluginSourceModule, TestUtilityModule} 24 | import org.embulk.config.{ 25 | ConfigLoader, 26 | ConfigSource, 27 | DataSourceImpl, 28 | ModelManager, 29 | TaskSource 30 | } 31 | import org.embulk.exec.{ 32 | ExecModule, 33 | ExtensionServiceLoaderModule, 34 | SystemConfigModule 35 | } 36 | import org.embulk.jruby.JRubyScriptingModule 37 | import org.embulk.plugin.{ 38 | BuiltinPluginSourceModule, 39 | InjectedPluginSource, 40 | PluginClassLoaderModule 41 | } 42 | import org.embulk.spi.{Exec, ExecSession, OutputPlugin, PageTestUtils, Schema} 43 | import org.embulk.spi.json.JsonParser 44 | import org.msgpack.value.Value 45 | import org.scalatest.funsuite.AnyFunSuite 46 | import org.scalatest.BeforeAndAfter 47 | import org.scalatest.diagrams.Diagrams 48 | 49 | import scala.util.Using 50 | 51 | object EmbulkPluginTestHelper { 52 | 53 | case class TestRuntimeModule() extends Module { 54 | 55 | override def configure(binder: Binder): Unit = { 56 | val systemConfig = new DataSourceImpl(null) 57 | new SystemConfigModule(systemConfig).configure(binder) 58 | new ExecModule(systemConfig).configure(binder) 59 | new ExtensionServiceLoaderModule(systemConfig).configure(binder) 60 | new BuiltinPluginSourceModule().configure(binder) 61 | new JRubyScriptingModule(systemConfig).configure(binder) 62 | new PluginClassLoaderModule().configure(binder) 63 | new TestUtilityModule().configure(binder) 64 | new TestPluginSourceModule().configure(binder) 65 | InjectedPluginSource.registerPluginTo( 66 | binder, 67 | classOf[OutputPlugin], 68 | "s3_parquet", 69 | classOf[S3ParquetOutputPlugin] 70 | ) 71 | } 72 | } 73 | 74 | def getExecSession: ExecSession = { 75 | val injector = 76 | Guice.createInjector(Stage.PRODUCTION, TestRuntimeModule()) 77 | val execConfig = new DataSourceImpl( 78 | injector.getInstance(classOf[ModelManager]) 79 | ) 80 | ExecSession.builder(injector).fromExecConfig(execConfig).build() 81 | } 82 | } 83 | 84 | abstract class EmbulkPluginTestHelper 85 | extends AnyFunSuite 86 | with BeforeAndAfter 87 | with Diagrams { 88 | import implicits._ 89 | 90 | private var exec: ExecSession = _ 91 | 92 | val TEST_S3_ENDPOINT: String = "http://localhost:4566" 93 | val TEST_S3_REGION: String = "us-east-1" 94 | val TEST_S3_ACCESS_KEY_ID: String = "test" 95 | val TEST_S3_SECRET_ACCESS_KEY: String = "test" 96 | val TEST_BUCKET_NAME: String = "my-bucket" 97 | val TEST_PATH_PREFIX: String = "path/to/parquet-" 98 | 99 | before { 100 | exec = EmbulkPluginTestHelper.getExecSession 101 | 102 | withLocalStackS3Client(_.createBucket(TEST_BUCKET_NAME)) 103 | } 104 | after { 105 | exec.cleanup() 106 | exec = null 107 | 108 | withLocalStackS3Client { cli => 109 | @scala.annotation.tailrec 110 | def rmRecursive(listing: ObjectListing): Unit = { 111 | listing.getObjectSummaries.foreach(o => 112 | cli.deleteObject(TEST_BUCKET_NAME, o.getKey) 
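// every object in the current listing page is deleted; truncated listings are followed recursively below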
113 | ) 114 | if (listing.isTruncated) 115 | rmRecursive(cli.listNextBatchOfObjects(listing)) 116 | } 117 | rmRecursive(cli.listObjects(TEST_BUCKET_NAME)) 118 | } 119 | withLocalStackS3Client(_.deleteBucket(TEST_BUCKET_NAME)) 120 | } 121 | 122 | def execDoWith[A](f: => A): A = 123 | try Exec.doWith(exec, () => f) 124 | catch { 125 | case ex: ExecutionException => throw ex.getCause 126 | } 127 | 128 | def runOutput( 129 | outConfig: ConfigSource, 130 | schema: Schema, 131 | data: Seq[Seq[Any]], 132 | messageTypeTest: MessageType => Unit = { _ => } 133 | ): Seq[Seq[AnyRef]] = { 134 | execDoWith { 135 | val plugin = 136 | exec.getInjector.getInstance(classOf[S3ParquetOutputPlugin]) 137 | plugin.transaction( 138 | outConfig, 139 | schema, 140 | 1, 141 | (taskSource: TaskSource) => { 142 | Using.resource(plugin.open(taskSource, schema, 0)) { output => 143 | try { 144 | PageTestUtils 145 | .buildPage( 146 | exec.getBufferAllocator, 147 | schema, 148 | data.flatten: _* 149 | ) 150 | .foreach(output.add) 151 | output.commit() 152 | } 153 | catch { 154 | case ex: Throwable => 155 | output.abort() 156 | throw ex 157 | } 158 | } 159 | Seq.empty 160 | } 161 | ) 162 | } 163 | 164 | readS3Parquet(TEST_BUCKET_NAME, TEST_PATH_PREFIX, messageTypeTest) 165 | } 166 | 167 | private def withLocalStackS3Client[A](f: AmazonS3 => A): A = { 168 | val client: AmazonS3 = AmazonS3ClientBuilder.standard 169 | .withEndpointConfiguration( 170 | new EndpointConfiguration(TEST_S3_ENDPOINT, TEST_S3_REGION) 171 | ) 172 | .withCredentials( 173 | new AWSStaticCredentialsProvider( 174 | new BasicAWSCredentials( 175 | TEST_S3_ACCESS_KEY_ID, 176 | TEST_S3_SECRET_ACCESS_KEY 177 | ) 178 | ) 179 | ) 180 | .withPathStyleAccessEnabled(true) 181 | .build() 182 | 183 | try f(client) 184 | finally client.shutdown() 185 | } 186 | 187 | private def readS3Parquet( 188 | bucket: String, 189 | prefix: String, 190 | messageTypeTest: MessageType => Unit = { _ => } 191 | ): Seq[Seq[AnyRef]] = { 192 | val tmpDir: Path = Files.createTempDirectory("embulk-output-parquet") 193 | withLocalStackS3Client { s3 => 194 | val xfer: TransferManager = TransferManagerBuilder 195 | .standard() 196 | .withS3Client(s3) 197 | .build() 198 | try xfer 199 | .downloadDirectory(bucket, prefix, tmpDir.toFile) 200 | .waitForCompletion() 201 | finally xfer.shutdownNow() 202 | } 203 | 204 | def listFiles(file: File): Seq[File] = { 205 | file 206 | .listFiles() 207 | .flatMap(f => 208 | if (f.isFile) Seq(f) 209 | else listFiles(f) 210 | ) 211 | .toSeq 212 | } 213 | 214 | listFiles(tmpDir.toFile) 215 | .map(_.getAbsolutePath) 216 | .foldLeft(Seq[Seq[AnyRef]]()) { 217 | (result: Seq[Seq[AnyRef]], path: String) => 218 | result ++ readParquetFile(path, messageTypeTest) 219 | } 220 | } 221 | 222 | private def readParquetFile( 223 | pathString: String, 224 | messageTypeTest: MessageType => Unit = { _ => } 225 | ): Seq[Seq[AnyRef]] = { 226 | Using.resource( 227 | ParquetFileReader.open( 228 | HadoopInputFile 229 | .fromPath(new HadoopPath(pathString), new Configuration()) 230 | ) 231 | ) { reader => messageTypeTest(reader.getFileMetaData.getSchema) } 232 | 233 | val reader: ParquetReader[GenericRecord] = ParquetReader 234 | .builder( 235 | new AvroReadSupport[GenericRecord](), 236 | new HadoopPath(pathString) 237 | ) 238 | .build() 239 | 240 | Iterator 241 | .continually(reader.read()) 242 | .takeWhile(_ != null) 243 | .map(record => record.getSchema.getFields.map(f => record.get(f.name()))) 244 | .toSeq 245 | } 246 | 247 | def loadConfigSourceFromYamlString(yaml: String): 
ConfigSource = 248 | new ConfigLoader(exec.getModelManager).fromYamlString(yaml) 249 | 250 | def newDefaultConfig: ConfigSource = 251 | loadConfigSourceFromYamlString( 252 | s""" 253 | |endpoint: $TEST_S3_ENDPOINT 254 | |bucket: $TEST_BUCKET_NAME 255 | |path_prefix: $TEST_PATH_PREFIX 256 | |auth_method: basic 257 | |access_key_id: $TEST_S3_ACCESS_KEY_ID 258 | |secret_access_key: $TEST_S3_SECRET_ACCESS_KEY 259 | |path_style_access_enabled: true 260 | |default_timezone: Asia/Tokyo 261 | |""".stripMargin 262 | ) 263 | 264 | def json(str: String): Value = new JsonParser().parse(str) 265 | } 266 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestTimeLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.ZoneId 4 | 5 | import org.apache.parquet.io.api.RecordConsumer 6 | import org.apache.parquet.schema.LogicalTypeAnnotation 7 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.{ 8 | MICROS, 9 | MILLIS, 10 | NANOS 11 | } 12 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 13 | import org.embulk.config.ConfigException 14 | import org.embulk.output.s3_parquet.catalog.GlueDataType 15 | import org.embulk.spi.DataException 16 | import org.embulk.spi.time.Timestamp 17 | import org.scalatest.diagrams.Diagrams 18 | import org.scalatest.funsuite.AnyFunSuite 19 | import org.scalatest.prop.TableDrivenPropertyChecks 20 | 21 | import scala.util.chaining._ 22 | 23 | class TestTimeLogicalType 24 | extends AnyFunSuite 25 | with ParquetColumnTypeTestHelper 26 | with TableDrivenPropertyChecks 27 | with Diagrams { 28 | 29 | private val conditions = Table( 30 | ("isAdjustedToUtc", "timeUnit", "timeZone", "column"), { 31 | for { 32 | isAdjustedToUtc <- Seq(true, false) 33 | timeUnit <- Seq(MILLIS, MICROS, NANOS) 34 | timeZone <- Seq(ZoneId.of("UTC"), ZoneId.of("Asia/Tokyo")) 35 | column <- Seq( 36 | SAMPLE_BOOLEAN_COLUMN, 37 | SAMPLE_LONG_COLUMN, 38 | SAMPLE_DOUBLE_COLUMN, 39 | SAMPLE_STRING_COLUMN, 40 | SAMPLE_TIMESTAMP_COLUMN, 41 | SAMPLE_JSON_COLUMN 42 | ) 43 | } yield (isAdjustedToUtc, timeUnit, timeZone, column) 44 | }: _* 45 | ) 46 | 47 | private val unsupportedEmbulkColumns = Seq( 48 | SAMPLE_BOOLEAN_COLUMN, 49 | SAMPLE_DOUBLE_COLUMN, 50 | SAMPLE_STRING_COLUMN, 51 | SAMPLE_JSON_COLUMN 52 | ) 53 | 54 | test( 55 | "#primitiveType(column) returns PrimitiveTypeName.{INT32,INT64} with LogicalType" 56 | ) { 57 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, column) => 58 | whenever(unsupportedEmbulkColumns.contains(column)) { 59 | // format: off 60 | assert(intercept[ConfigException](TimeLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).primitiveType(column)).getMessage.startsWith("Unsupported column type: ")) 61 | // format: on 62 | } 63 | 64 | whenever(!unsupportedEmbulkColumns.contains(column)) { 65 | val expectedPrimitiveTypeName = 66 | if (timeUnit === MILLIS) PrimitiveTypeName.INT32 67 | else PrimitiveTypeName.INT64 68 | // format: off 69 | assert(expectedPrimitiveTypeName == TimeLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).primitiveType(column).getPrimitiveTypeName) 70 | assert(LogicalTypeAnnotation.timeType(isAdjustedToUtc, timeUnit) == TimeLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).primitiveType(column).getLogicalTypeAnnotation) 71 
| // format: on 72 | } 73 | } 74 | } 75 | 76 | test("#glueDataType(column) returns GlueDataType") { 77 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, column) => 78 | whenever(unsupportedEmbulkColumns.contains(column)) { 79 | // format: off 80 | assert(intercept[ConfigException](TimeLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).glueDataType(column)).getMessage.startsWith("Unsupported column type: ")) 81 | // format: on 82 | } 83 | whenever(!unsupportedEmbulkColumns.contains(column)) { 84 | val expectedGlueDataType = 85 | if (timeUnit === MILLIS) GlueDataType.INT 86 | else GlueDataType.BIGINT 87 | // format: off 88 | assert(expectedGlueDataType == TimeLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).glueDataType(column)) 89 | // format: on 90 | } 91 | } 92 | } 93 | 94 | test("#consumeLong") { 95 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, _) => 96 | timeUnit match { 97 | case MILLIS => 98 | newMockRecordConsumer().tap { consumer => 99 | consumer.writingSampleField { 100 | TimeLogicalType( 101 | isAdjustedToUtc = isAdjustedToUtc, 102 | timeUnit = timeUnit, 103 | timeZone = timeZone 104 | ).consumeLong(consumer, 5) 105 | } 106 | assert(consumer.data.head.head.isInstanceOf[Int]) 107 | assert(consumer.data.head.head == 5) 108 | } 109 | newMockRecordConsumer().tap { consumer => 110 | consumer.writingSampleField { 111 | // format: off 112 | assert(intercept[DataException](TimeLogicalType(isAdjustedToUtc = isAdjustedToUtc, timeUnit = timeUnit, timeZone = timeZone).consumeLong(consumer, Long.MaxValue)).getMessage.startsWith("Failed to cast Long: ")) 113 | // format: on 114 | } 115 | } 116 | case MICROS | NANOS => 117 | newMockRecordConsumer().tap { consumer => 118 | consumer.writingSampleField { 119 | TimeLogicalType( 120 | isAdjustedToUtc = isAdjustedToUtc, 121 | timeUnit = timeUnit, 122 | timeZone = timeZone 123 | ).consumeLong(consumer, 5) 124 | } 125 | assert(consumer.data.head.head.isInstanceOf[Long]) 126 | assert(consumer.data.head.head == 5L) 127 | } 128 | newMockRecordConsumer().tap { consumer => 129 | consumer.writingSampleField { 130 | TimeLogicalType( 131 | isAdjustedToUtc = isAdjustedToUtc, 132 | timeUnit = timeUnit, 133 | timeZone = timeZone 134 | ).consumeLong(consumer, Long.MaxValue) 135 | } 136 | assert(consumer.data.head.head.isInstanceOf[Long]) 137 | assert(consumer.data.head.head == Long.MaxValue) 138 | } 139 | } 140 | } 141 | } 142 | 143 | test("#consumeTimestamp") { 144 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, _) => 145 | timeUnit match { 146 | case MILLIS => 147 | val v = Timestamp.ofEpochMilli(Int.MaxValue) 148 | newMockRecordConsumer().tap { consumer => 149 | consumer.writingSampleField { 150 | TimeLogicalType( 151 | isAdjustedToUtc = isAdjustedToUtc, 152 | timeUnit = timeUnit, 153 | timeZone = timeZone 154 | ).consumeTimestamp(consumer, v, null) 155 | } 156 | assert(consumer.data.head.head.isInstanceOf[Int]) 157 | if (timeZone.getId == "Asia/Tokyo" && !isAdjustedToUtc) 158 | assert(consumer.data.head.head == 19883647) 159 | else // UTC 160 | assert(consumer.data.head.head == 73883647) 161 | } 162 | case MICROS => 163 | val v = Timestamp.ofEpochMilli(Int.MaxValue) 164 | newMockRecordConsumer().tap { consumer => 165 | consumer.writingSampleField { 166 | TimeLogicalType( 167 | isAdjustedToUtc = isAdjustedToUtc, 168 | timeUnit = timeUnit, 169 | timeZone = timeZone 170 | ).consumeTimestamp(consumer, v, null) 171 | } 172 | 
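// Int.MaxValue ms is 24 days + 73,883,647 ms, so the expected UTC time-of-day is 73,883,647 ms
// (19,883,647 ms when shifted to Asia/Tokyo, UTC+9, and wrapped into one day); MICROS/NANOS scale these by 1,000 / 1,000,000.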
assert(consumer.data.head.head.isInstanceOf[Long]) 173 | if (timeZone.getId == "Asia/Tokyo" && !isAdjustedToUtc) 174 | assert(consumer.data.head.head == 19883647000L) 175 | else // UTC 176 | assert(consumer.data.head.head == 73883647000L) 177 | } 178 | case NANOS => 179 | val v = Timestamp.ofEpochMilli(Int.MaxValue) 180 | newMockRecordConsumer().tap { consumer => 181 | consumer.writingSampleField { 182 | TimeLogicalType( 183 | isAdjustedToUtc = isAdjustedToUtc, 184 | timeUnit = timeUnit, 185 | timeZone = timeZone 186 | ).consumeTimestamp(consumer, v, null) 187 | } 188 | assert(consumer.data.head.head.isInstanceOf[Long]) 189 | if (timeZone.getId == "Asia/Tokyo" && !isAdjustedToUtc) 190 | assert(consumer.data.head.head == 19883647000000L) 191 | else // UTC 192 | assert(consumer.data.head.head == 73883647000000L) 193 | } 194 | } 195 | 196 | } 197 | } 198 | 199 | test("#consume{Boolean,Double,String,Json} are unsupported.") { 200 | def assertUnsupportedConsume(f: RecordConsumer => Unit) = 201 | newMockRecordConsumer().tap { consumer => 202 | consumer.writingSampleField { 203 | // format: off 204 | assert(intercept[ConfigException](f(consumer)).getMessage.endsWith("is unsupported.")) 205 | // format: on 206 | } 207 | } 208 | 209 | forAll(conditions) { (isAdjustedToUtc, timeUnit, timeZone, _) => 210 | val t = 211 | TimeLogicalType( 212 | isAdjustedToUtc = isAdjustedToUtc, 213 | timeUnit = timeUnit, 214 | timeZone = timeZone 215 | ) 216 | assertUnsupportedConsume(t.consumeBoolean(_, true)) 217 | assertUnsupportedConsume(t.consumeDouble(_, 0.0d)) 218 | assertUnsupportedConsume(t.consumeString(_, null)) 219 | assertUnsupportedConsume(t.consumeJson(_, null)) 220 | } 221 | } 222 | 223 | } 224 | -------------------------------------------------------------------------------- /src/main/scala/org/embulk/output/s3_parquet/parquet/ParquetColumnType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import java.time.ZoneId 4 | import java.util.{Locale, Optional} 5 | 6 | import org.apache.parquet.format.ConvertedType 7 | import org.apache.parquet.io.api.RecordConsumer 8 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit 9 | import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.{ 10 | MICROS, 11 | MILLIS, 12 | NANOS 13 | } 14 | import org.apache.parquet.schema.PrimitiveType 15 | import org.embulk.config.{ 16 | Config, 17 | ConfigDefault, 18 | ConfigException, 19 | ConfigSource, 20 | Task => EmbulkTask 21 | } 22 | import org.embulk.output.s3_parquet.catalog.GlueDataType 23 | import org.embulk.output.s3_parquet.implicits 24 | import org.embulk.spi.{Column, DataException, Exec} 25 | import org.embulk.spi.time.{Timestamp, TimestampFormatter} 26 | import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption 27 | import org.msgpack.value.Value 28 | import org.slf4j.{Logger, LoggerFactory} 29 | 30 | import scala.util.{Failure, Success, Try} 31 | import scala.util.chaining._ 32 | 33 | object ParquetColumnType { 34 | 35 | import implicits._ 36 | 37 | private val logger: Logger = 38 | LoggerFactory.getLogger(classOf[ParquetColumnType]) 39 | 40 | trait Task extends EmbulkTask with TimestampColumnOption { 41 | @Config("logical_type") 42 | @ConfigDefault("null") 43 | def getLogicalType: Optional[LogicalTypeOption] 44 | } 45 | 46 | trait LogicalTypeOption extends EmbulkTask { 47 | @Config("name") 48 | def getName: String 49 | 50 | @Config("scale") 51 | @ConfigDefault("null") 52 | def getScale: 
Optional[Int] 53 | 54 | @Config("precision") 55 | @ConfigDefault("null") 56 | def getPrecision: Optional[Int] 57 | 58 | @Config("bit_width") 59 | @ConfigDefault("null") 60 | def getBitWidth: Optional[Int] 61 | 62 | @Config("is_signed") 63 | @ConfigDefault("null") 64 | def getIsSigned: Optional[Boolean] 65 | 66 | @Config("is_adjusted_to_utc") 67 | @ConfigDefault("null") 68 | def getIsAdjustedToUtc: Optional[Boolean] 69 | 70 | @Config("time_unit") 71 | @ConfigDefault("null") 72 | def getTimeUnit: Optional[TimeUnit] 73 | } 74 | 75 | object LogicalTypeOption { 76 | case class ConfigBuilder private () { 77 | case class Attributes private ( 78 | name: Option[String] = None, 79 | precision: Option[Int] = None, 80 | scale: Option[Int] = None, 81 | bitWidth: Option[Int] = None, 82 | isSigned: Option[Boolean] = None, 83 | isAdjustedToUtc: Option[Boolean] = None, 84 | timeUnit: Option[TimeUnit] = None 85 | ) { 86 | def toOnelineYaml: String = { 87 | val builder = Seq.newBuilder[String] 88 | name.foreach(v => builder.addOne(s"name: ${v}")) 89 | precision.foreach(v => builder.addOne(s"precision: ${v}")) 90 | scale.foreach(v => builder.addOne(s"scale: ${v}")) 91 | bitWidth.foreach(v => builder.addOne(s"bit_width: ${v}")) 92 | isSigned.foreach(v => builder.addOne(s"is_signed: ${v}")) 93 | isAdjustedToUtc.foreach(v => 94 | builder.addOne(s"is_adjusted_to_utc: ${v}") 95 | ) 96 | timeUnit.foreach(tu => builder.addOne(s"time_unit: ${tu.name()}")) 97 | "{" + builder.result().mkString(", ") + "}" 98 | } 99 | 100 | def build(): ConfigSource = { 101 | val c = Exec.newConfigSource() 102 | name.foreach(c.set("name", _)) 103 | precision.foreach(c.set("precision", _)) 104 | scale.foreach(c.set("scale", _)) 105 | bitWidth.foreach(c.set("bit_width", _)) 106 | isSigned.foreach(c.set("is_signed", _)) 107 | isAdjustedToUtc.foreach(c.set("is_adjusted_to_utc", _)) 108 | timeUnit.foreach(tu => c.set("time_unit", tu.name())) 109 | c 110 | } 111 | } 112 | var attrs: Attributes = Attributes() 113 | 114 | def name(name: String): ConfigBuilder = 115 | this.tap(_ => attrs = attrs.copy(name = Option(name))) 116 | def scale(scale: Int): ConfigBuilder = 117 | this.tap(_ => attrs = attrs.copy(scale = Option(scale))) 118 | def precision(precision: Int): ConfigBuilder = 119 | this.tap(_ => attrs = attrs.copy(precision = Option(precision))) 120 | def bitWidth(bitWidth: Int): ConfigBuilder = 121 | this.tap(_ => attrs = attrs.copy(bitWidth = Option(bitWidth))) 122 | def isSigned(isSigned: Boolean): ConfigBuilder = 123 | this.tap(_ => attrs = attrs.copy(isSigned = Option(isSigned))) 124 | def isAdjustedToUtc(isAdjustedToUtc: Boolean): ConfigBuilder = 125 | this.tap(_ => 126 | attrs = attrs.copy(isAdjustedToUtc = Option(isAdjustedToUtc)) 127 | ) 128 | def timeUnit(timeUnit: TimeUnit): ConfigBuilder = 129 | this.tap(_ => attrs = attrs.copy(timeUnit = Option(timeUnit))) 130 | 131 | def toOnelineYaml: String = attrs.toOnelineYaml 132 | 133 | def build(): ConfigSource = attrs.build() 134 | } 135 | 136 | def builder(): ConfigBuilder = ConfigBuilder() 137 | } 138 | 139 | def loadConfig(c: ConfigSource): Task = { 140 | if (c.has("logical_type")) { 141 | Try(c.get(classOf[String], "logical_type")).foreach { v => 142 | logger.warn( 143 | "[DEPRECATED] Now, it is deprecated to use the \"logical_type\" option in this usage." + 144 | " Use \"converted_type\" instead." 
145 | ) 146 | logger.warn( 147 | s"[DEPRECATED] Translate {logical_type: $v} => {converted_type: $v}" 148 | ) 149 | c.remove("logical_type") 150 | c.set("converted_type", v) 151 | } 152 | } 153 | if (c.has("converted_type")) { 154 | if (c.has("logical_type")) 155 | throw new ConfigException( 156 | "\"converted_type\" and \"logical_type\" options cannot be used at the same time." 157 | ) 158 | Try(c.get(classOf[String], "converted_type")) match { 159 | case Success(convertedType) => 160 | val logicalTypeConfig: ConfigSource = 161 | translateConvertedType2LogicalType(convertedType) 162 | c.setNested("logical_type", logicalTypeConfig) 163 | case Failure(ex) => 164 | throw new ConfigException( 165 | "The value of \"converted_type\" option must be string.", 166 | ex 167 | ) 168 | } 169 | } 170 | c.loadConfig(classOf[Task]) 171 | } 172 | 173 | private def translateConvertedType2LogicalType( 174 | convertedType: String 175 | ): ConfigSource = { 176 | val builder = LogicalTypeOption.builder() 177 | val normalizedConvertedType: String = normalizeConvertedType(convertedType) 178 | if (normalizedConvertedType == "TIMESTAMP_NANOS") { 179 | builder.name("timestamp").isAdjustedToUtc(true).timeUnit(NANOS) 180 | logger.warn( 181 | s"[DEPRECATED] $convertedType is deprecated because this is not one of" + 182 | s" ConvertedTypes actually. Please use 'logical_type: ${builder.toOnelineYaml}'" 183 | ) 184 | } 185 | else { 186 | 187 | ConvertedType.valueOf(normalizedConvertedType) match { 188 | case ConvertedType.UTF8 => builder.name("string") 189 | case ConvertedType.DATE => builder.name("date") 190 | case ConvertedType.TIME_MILLIS => 191 | builder.name("time").isAdjustedToUtc(true).timeUnit(MILLIS) 192 | case ConvertedType.TIME_MICROS => 193 | builder.name("time").isAdjustedToUtc(true).timeUnit(MICROS) 194 | case ConvertedType.TIMESTAMP_MILLIS => 195 | builder.name("timestamp").isAdjustedToUtc(true).timeUnit(MILLIS) 196 | case ConvertedType.TIMESTAMP_MICROS => 197 | builder.name("timestamp").isAdjustedToUtc(true).timeUnit(MICROS) 198 | case ConvertedType.UINT_8 => 199 | builder.name("int").bitWidth(8).isSigned(false) 200 | case ConvertedType.UINT_16 => 201 | builder.name("int").bitWidth(16).isSigned(false) 202 | case ConvertedType.UINT_32 => 203 | builder.name("int").bitWidth(32).isSigned(false) 204 | case ConvertedType.UINT_64 => 205 | builder.name("int").bitWidth(64).isSigned(false) 206 | case ConvertedType.INT_8 => 207 | builder.name("int").bitWidth(8).isSigned(true) 208 | case ConvertedType.INT_16 => 209 | builder.name("int").bitWidth(16).isSigned(true) 210 | case ConvertedType.INT_32 => 211 | builder.name("int").bitWidth(32).isSigned(true) 212 | case ConvertedType.INT_64 => 213 | builder.name("int").bitWidth(64).isSigned(true) 214 | case ConvertedType.JSON => builder.name("json") 215 | case _ => 216 | // MAP, MAP_KEY_VALUE, LIST, ENUM, DECIMAL, BSON, INTERVAL 217 | throw new ConfigException( 218 | s"converted_type: $convertedType is not supported." 
219 | ) 220 | } 221 | } 222 | logger.info( 223 | s"Translate {converted_type: $convertedType} => {logical_type: ${builder.toOnelineYaml}}" 224 | ) 225 | builder.build() 226 | } 227 | 228 | private def normalizeConvertedType(convertedType: String): String = { 229 | convertedType 230 | .toUpperCase(Locale.ENGLISH) 231 | .replaceAll("-", "_") 232 | .replaceAll("INT(\\d)", "INT_$1") 233 | } 234 | 235 | def fromTask(task: Task): Option[LogicalTypeProxy] = { 236 | task.getLogicalType.map { o => 237 | LogicalTypeProxy( 238 | name = o.getName, 239 | scale = o.getScale, 240 | precision = o.getPrecision, 241 | bitWidth = o.getBitWidth, 242 | isSigned = o.getIsSigned, 243 | isAdjustedToUtc = o.getIsAdjustedToUtc, 244 | timeUnit = o.getTimeUnit, 245 | timeZone = task.getTimeZoneId.map(ZoneId.of) 246 | ) 247 | } 248 | } 249 | } 250 | 251 | trait ParquetColumnType { 252 | def primitiveType(column: Column): PrimitiveType 253 | def glueDataType(column: Column): GlueDataType 254 | def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit 255 | def consumeString(consumer: RecordConsumer, v: String): Unit 256 | def consumeLong(consumer: RecordConsumer, v: Long): Unit 257 | def consumeDouble(consumer: RecordConsumer, v: Double): Unit 258 | def consumeTimestamp( 259 | consumer: RecordConsumer, 260 | v: Timestamp, 261 | formatter: TimestampFormatter 262 | ): Unit 263 | def consumeJson(consumer: RecordConsumer, v: Value): Unit 264 | def newUnsupportedMethodException(methodName: String) = 265 | new ConfigException(s"${getClass.getName}#$methodName is unsupported.") 266 | 267 | protected def consumeLongAsInteger( 268 | consumer: RecordConsumer, 269 | v: Long 270 | ): Unit = { 271 | if (v < Int.MinValue || v > Int.MaxValue) 272 | throw new DataException( 273 | s"Failed to cast Long: $v to Int, " + 274 | s"because $v exceeds ${Int.MaxValue} (Int.MaxValue) or ${Int.MinValue} (Int.MinValue)" 275 | ) 276 | consumer.addInteger(v.toInt) 277 | } 278 | } 279 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # S3 Parquet output plugin for Embulk 2 | 3 | [![Release CI Status Badge](https://github.com/civitaspo/embulk-output-s3_parquet/workflows/Release%20CI/badge.svg)](https://github.com/civitaspo/embulk-output-s3_parquet/actions?query=workflow%3A%22Release+CI%22) [![Test CI Status Badge](https://github.com/civitaspo/embulk-output-s3_parquet/workflows/Test%20CI/badge.svg)](https://github.com/civitaspo/embulk-output-s3_parquet/actions?query=workflow%3A%22Test+CI%22) 4 | 5 | [Embulk](https://github.com/embulk/embulk/) output plugin to dump records as [Apache Parquet](https://parquet.apache.org/) files on S3. 6 | 7 | ## Overview 8 | 9 | * **Plugin type**: output 10 | * **Load all or nothing**: no 11 | * **Resume supported**: no 12 | * **Cleanup supported**: yes 13 | 14 | ## Configuration 15 | 16 | - **bucket**: s3 bucket name (string, required) 17 | - **path_prefix**: prefix of target keys (string, optional) 18 | - **sequence_format**: format of the sequence number of the output files (string, default: `"%03d.%02d."`) 19 | - **sequence_format** formats task index and sequence number in a task. 
20 | - **file_ext**: path suffix of the output files (string, default: `"parquet"`) 21 | - **compression_codec**: compression codec for parquet file (`"uncompressed"`,`"snappy"`,`"gzip"`,`"lzo"`,`"brotli"`,`"lz4"` or `"zstd"`, default: `"uncompressed"`) 22 | - **default_timestamp_format**: default timestamp format (string, default: `"%Y-%m-%d %H:%M:%S.%6N %z"`) 23 | - **default_timezone**: default timezone (string, default: `"UTC"`) 24 | - **column_options**: a map whose keys are names of columns, and values are configurations with the following parameters (optional) 25 | - **timezone**: timezone if type of this column is timestamp. If not set, **default_timezone** is used. (string, optional) 26 | - **format**: timestamp format if type of this column is timestamp. If not set, **default_timestamp_format** is used. (string, optional) 27 | - **converted_type**: a Parquet converted type name (`timestamp-millis`, `timestamp-micros`, `timestamp-nanos`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional) 28 | - **logical_type**: **[DEPRECATED: Use **converted_type** instead]** a Parquet converted type name (`timestamp-millis`, `timestamp-micros`, `timestamp-nanos`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional) 29 | - **logical_type**: configuration for the detailed logical type. See [Logical Type Specification](https://github.com/apache/parquet-format/blob/apache-parquet-format-2.7.0/LogicalTypes.md) (optional) 30 | - **name**: The name of logical type (`"date"`, `"decimal"`, `"int"`, `"json"`, `"time"`, `"timestamp"`) (string, required) 31 | - **bit_width**: The bit width for `"int"` logical type (Allowed bit width values are `8`, `16`, `32`, `64`). (int, default: `64`) 32 | - **is_signed**: Signed or not for `"int"` logical type (boolean, default: `true`) 33 | - **scale**: The scale for `"decimal"` logical type (int, default: `0`) 34 | - **precision**: The precision for `"decimal"` logical type (int, default: `0`) 35 | - **is_adjusted_to_utc**: (boolean, default: `true`) 36 | - **time_unit**: The precision for `"time"` or `"timestamp"` logical type (Allowed values are `"MILLIS"`, `"MICROS"`, `"NANOS"`) 37 | - **canned_acl**: grants one of [canned ACLs](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#CannedACL) for created objects (string, default: `private`) 38 | - **block_size**: The block size is the size of a row group being buffered in memory. This limits the memory usage when writing. Larger values will improve the I/O when reading but consume more memory when writing. (int, default: `134217728` (128MB)) 39 | - **page_size**: The page size is for compression. When reading, each page can be decompressed independently. A block is composed of pages. The page is the smallest unit that must be read fully to access a single record. If this value is too small, the compression will deteriorate. (int, default: `1048576` (1MB)) 40 | - **max_padding_size**: The max size (bytes) to write as padding and the min size of a row group (int, default: `8388608` (8MB)) 41 | - **enable_dictionary_encoding**: Enables or disables dictionary encoding. (boolean, default: `true`) 42 | - **auth_method**: name of mechanism to authenticate requests (`"basic"`, `"env"`, `"instance"`, `"profile"`, `"properties"`, `"anonymous"`, `"session"`, `"assume_role"`, `"web_identity_token"`, default: `"default"`) 43 | - `"basic"`: uses **access_key_id** and **secret_access_key** to authenticate.
44 | - `"env"`: uses `AWS_ACCESS_KEY_ID` (or `AWS_ACCESS_KEY`) and `AWS_SECRET_KEY` (or `AWS_SECRET_ACCESS_KEY`) environment variables. 45 | - `"instance"`: uses EC2 instance profile or attached ECS task role. 46 | - `"profile"`: uses credentials written in a file. The format of the file is as follows, where `[...]` is a profile name. 47 | ``` 48 | [default] 49 | aws_access_key_id=YOUR_ACCESS_KEY_ID 50 | aws_secret_access_key=YOUR_SECRET_ACCESS_KEY 51 | 52 | [profile2] 53 | ... 54 | ``` 55 | - `"properties"`: uses the `aws.accessKeyId` and `aws.secretKey` Java system properties. 56 | - `"anonymous"`: uses anonymous access. This auth method can access only public files. 57 | - `"session"`: uses temporarily generated **access_key_id**, **secret_access_key** and **session_token**. 58 | - `"assume_role"`: uses temporarily generated credentials by assuming the **role_arn** role. 59 | - `"web_identity_token"`: uses temporarily generated credentials by assuming the **role_arn** role with a web identity. 60 | - `"default"`: uses the AWS SDK's default strategy to look up available credentials from the runtime environment. This method behaves like the combination of the following methods. 61 | 1. `"env"` 62 | 1. `"properties"` 63 | 1. `"profile"` 64 | 1. `"instance"` 65 | - **profile_file**: path to a profiles file. this is optionally used when **auth_method** is `"profile"`. (string, default: given by the `AWS_CREDENTIAL_PROFILES_FILE` environment variable, or `~/.aws/credentials`). 66 | - **profile_name**: name of a profile. this is optionally used when **auth_method** is `"profile"`. (string, default: `"default"`) 67 | - **access_key_id**: aws access key id. this is required when **auth_method** is `"basic"` or `"session"`. (string, optional) 68 | - **secret_access_key**: aws secret access key. this is required when **auth_method** is `"basic"` or `"session"`. (string, optional) 69 | - **session_token**: aws session token. this is required when **auth_method** is `"session"`. (string, optional) 70 | - **role_arn**: arn of the role to assume. this is required when **auth_method** is `"assume_role"` or `"web_identity_token"`. (string, optional) 71 | - **role_session_name**: an identifier for the assumed role session. this is required when **auth_method** is `"assume_role"` or `"web_identity_token"`. (string, optional) 72 | - **role_external_id**: a unique identifier that is used by third parties when assuming roles in their customers' accounts. this is optionally used for **auth_method**: `"assume_role"`. (string, optional) 73 | - **role_session_duration_seconds**: duration, in seconds, of the role session. this is optionally used for **auth_method**: `"assume_role"`. (int, optional) 74 | - **web_identity_token_file**: the absolute path to the web identity token file. this is required when **auth_method** is `"web_identity_token"`. (string, optional) 75 | - **scope_down_policy**: an iam policy in json format. this is optionally used for **auth_method**: `"assume_role"`. (string, optional) 76 | - **catalog**: Register a table if this option is specified (optional) 77 | - **catalog_id**: glue data catalog id if you use a catalog different from the account/region default catalog. (string, optional) 78 | - **database**: The name of the database (string, required) 79 | - **table**: The name of the table (string, required) 80 | - **column_options**: key-value pairs where the key is a column name and the value is options for that column. (string to options map, default: `{}`) 81 | - **type**: type of column when this plugin creates new tables (e.g.
`string`, `bigint`) (string, default: depends on the input embulk column type, or the parquet logical type. See the tables below) 82 | 83 | |embulk column type|glue data type| 84 | |:---|:---| 85 | |long|bigint| 86 | |boolean|boolean| 87 | |double|double| 88 | |string|string| 89 | |timestamp|string| 90 | |json|string| 91 | 92 | |parquet converted type|glue data type|note| 93 | |:---|:---|:---| 94 | |timestamp-millis|timestamp|| 95 | |timestamp-micros|long|Glue cannot recognize timestamp-micros.| 96 | |timestamp-nanos|long|Glue cannot recognize timestamp-nanos.| 97 | |int8|tinyint|| 98 | |int16|smallint|| 99 | |int32|int|| 100 | |int64|bigint|| 101 | |uint8|smallint|Glue tinyint has a minimum value of -2^7 and a maximum value of 2^7-1.| 102 | |uint16|int|Glue smallint has a minimum value of -2^15 and a maximum value of 2^15-1.| 103 | |uint32|bigint|Glue int has a minimum value of -2^31 and a maximum value of 2^31-1.| 104 | |uint64|ConfigException|Glue bigint supports only a 64-bit signed integer.| 105 | |json|string|| 106 | 107 | - **operation_if_exists**: operation if the table already exists. Available operations are `"delete"` and `"skip"` (string, default: `"delete"`) 108 | - **endpoint**: The AWS Service endpoint (string, optional) 109 | - **region**: The AWS region (string, optional) 110 | - **http_proxy**: Settings for using an HTTP proxy when accessing AWS. (optional) 111 | - **host** proxy host (string, required) 112 | - **port** proxy port (int, optional) 113 | - **protocol** proxy protocol (string, default: `"https"`) 114 | - **user** proxy user (string, optional) 115 | - **password** proxy password (string, optional) 116 | - **buffer_dir**: buffer directory for parquet files to be uploaded to S3 (string, default: creates a temporary directory) 117 | - **type_options**: a map whose keys are names of embulk types (`boolean`, `long`, `double`, `string`, `timestamp`, `json`), and values are configurations with the following parameters (optional) 118 | - **converted_type**: a Parquet converted type name (`timestamp-millis`, `timestamp-micros`, `timestamp-nanos`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional) 119 | - **logical_type**: **[DEPRECATED: Use **converted_type** instead]** a Parquet converted type name (`timestamp-millis`, `timestamp-micros`, `timestamp-nanos`, `json`, `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, `uint64`) (string, optional) 120 | - **logical_type**: configuration for the detailed logical type. See [Logical Type Specification](https://github.com/apache/parquet-format/blob/apache-parquet-format-2.7.0/LogicalTypes.md) (optional) 121 | - **name**: The name of logical type (`"date"`, `"decimal"`, `"int"`, `"json"`, `"time"`, `"timestamp"`) (string, required) 122 | - **bit_width**: The bit width for `"int"` logical type (Allowed bit width values are `8`, `16`, `32`, `64`). (int, default: `64`) 123 | - **is_signed**: Signed or not for `"int"` logical type (boolean, default: `true`) 124 | - **scale**: The scale for `"decimal"` logical type (int, default: `0`) 125 | - **precision**: The precision for `"decimal"` logical type (int, default: `0`) 126 | - **is_adjusted_to_utc**: (boolean, default: `true`) 127 | - **time_unit**: The precision for `"time"` or `"timestamp"` logical type (Allowed values are `"MILLIS"`, `"MICROS"`, `"NANOS"`) 128 | 129 | 130 | ## Example 131 | 132 | ```yaml 133 | out: 134 | type: s3_parquet 135 | bucket: my-bucket 136 | path_prefix: path/to/my-obj.
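  # The optional keys below are an illustrative sketch only; they are not part of the
  # original example. The option names come from the Configuration section above, but
  # the role ARN, the column name "payload", and the database/table names are assumptions.
  # auth_method: assume_role
  # role_arn: arn:aws:iam::123456789012:role/your-embulk-role
  # role_session_name: embulk-output-s3_parquet
  # column_options:
  #   payload: {logical_type: {name: json}}
  # catalog:
  #   database: your_database
  #   table: your_table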
137 | file_ext: snappy.parquet 138 | compression_codec: snappy 139 | default_timezone: Asia/Tokyo 140 | canned_acl: bucket-owner-full-control 141 | ``` 142 | 143 | ## Note 144 | 145 | * This plugin implements the Parquet [LogicalTypes](https://github.com/apache/parquet-format/blob/apache-parquet-format-2.8.0/LogicalTypes.md) as much as possible, but it does not implement all of them. 146 | * Some kinds of LogicalTypes may not be supported by your middleware. Be careful when specifying a logical type name. 147 | 148 | ## Development 149 | 150 | ### Run example: 151 | 152 | ```shell 153 | $ ./run_s3_local.sh 154 | $ ./example/prepare_s3_bucket.sh 155 | $ ./gradlew gem 156 | $ embulk run example/config.yml -Ibuild/gemContents/lib 157 | ``` 158 | 159 | ### Run test: 160 | 161 | ```shell 162 | $ ./run_s3_local.sh 163 | $ ./gradlew scalatest 164 | ``` 165 | 166 | ### Build 167 | 168 | ``` 169 | $ ./gradlew gem --write-locks # -t to watch file changes and rebuild continuously 170 | ``` 171 | 172 | ### Release gem: 173 | Fix [build.gradle](./build.gradle), then 174 | 175 | 176 | ```shell 177 | $ ./gradlew gemPush 178 | ``` 179 | 180 | ## ChangeLog 181 | 182 | [CHANGELOG.md](./CHANGELOG.md) 183 | -------------------------------------------------------------------------------- /src/test/scala/org/embulk/output/s3_parquet/parquet/TestIntLogicalType.scala: -------------------------------------------------------------------------------- 1 | package org.embulk.output.s3_parquet.parquet 2 | 3 | import org.apache.parquet.schema.LogicalTypeAnnotation 4 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName 5 | import org.embulk.config.ConfigException 6 | import org.embulk.output.s3_parquet.catalog.GlueDataType 7 | import org.embulk.spi.DataException 8 | import org.scalatest.diagrams.Diagrams 9 | import org.scalatest.funsuite.AnyFunSuite 10 | import org.scalatest.prop.TableDrivenPropertyChecks 11 | 12 | import scala.util.chaining._ 13 | class TestIntLogicalType 14 | extends AnyFunSuite 15 | with ParquetColumnTypeTestHelper 16 | with TableDrivenPropertyChecks 17 | with Diagrams { 18 | 19 | private val conditions = Table( 20 | ("bitWidth", "isSigned", "column"), { 21 | for { 22 | bitWidth <- Seq(8, 16, 32, 64) 23 | isSigned <- Seq(true, false) 24 | column <- Seq( 25 | SAMPLE_BOOLEAN_COLUMN, 26 | SAMPLE_LONG_COLUMN, 27 | SAMPLE_DOUBLE_COLUMN, 28 | SAMPLE_STRING_COLUMN, 29 | SAMPLE_TIMESTAMP_COLUMN, 30 | SAMPLE_JSON_COLUMN 31 | ) 32 | } yield (bitWidth, isSigned, column) 33 | }: _* 34 | ) 35 | 36 | private val unsupportedEmbulkColumns = Seq( 37 | SAMPLE_TIMESTAMP_COLUMN, 38 | SAMPLE_JSON_COLUMN 39 | ) 40 | 41 | private def isINT32(bitWidth: Int): Boolean = bitWidth < 64 42 | 43 | test( 44 | "#primitiveType(column) returns PrimitiveTypeName.INT32 with LogicalType" 45 | ) { 46 | forAll(conditions) { (bitWidth, isSigned, column) => 47 | whenever(isINT32(bitWidth) && !unsupportedEmbulkColumns.contains(column)) { 48 | val logicalType = 49 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 50 | // format: off 51 | assert(PrimitiveTypeName.INT32 == logicalType.primitiveType(column).getPrimitiveTypeName) 52 | assert(LogicalTypeAnnotation.intType(bitWidth, isSigned) == logicalType.primitiveType(column).getLogicalTypeAnnotation) 53 | // format: on 54 | } 55 | } 56 | } 57 | 58 | test( 59 | "#primitiveType(column) returns PrimitiveTypeName.INT64 with LogicalType" 60 | ) { 61 | forAll(conditions) { (bitWidth, isSigned, column) => 62 | whenever(!isINT32(bitWidth) &&
!unsupportedEmbulkColumns.contains(column)) { 63 | val logicalType = 64 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 65 | // format: off 66 | assert(PrimitiveTypeName.INT64 == logicalType.primitiveType(column).getPrimitiveTypeName) 67 | assert(LogicalTypeAnnotation.intType(bitWidth, isSigned) == logicalType.primitiveType(column).getLogicalTypeAnnotation) 68 | // format: on 69 | } 70 | } 71 | } 72 | 73 | test( 74 | s"#primitiveType(column) cannot return any PrimitiveType when embulk column type is one of (${unsupportedEmbulkColumns 75 | .map(_.getType.getName) 76 | .mkString(",")})" 77 | ) { 78 | forAll(conditions) { (bitWidth, isSigned, column) => 79 | whenever(unsupportedEmbulkColumns.contains(column)) { 80 | // format: off 81 | assert(intercept[ConfigException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).primitiveType(column)).getMessage.startsWith("Unsupported column type: ")) 82 | // format: on 83 | } 84 | } 85 | } 86 | 87 | test("#glueDataType(column) returns GlueDataType") { 88 | forAll(conditions) { (bitWidth, isSigned, column) => 89 | whenever(!unsupportedEmbulkColumns.contains(column)) { 90 | def assertGlueDataType(expected: GlueDataType) = { 91 | // format: off 92 | assert(expected == IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).glueDataType(column)) 93 | // format: on 94 | } 95 | if (isSigned) { 96 | bitWidth match { 97 | case 8 => assertGlueDataType(GlueDataType.TINYINT) 98 | case 16 => assertGlueDataType(GlueDataType.SMALLINT) 99 | case 32 => assertGlueDataType(GlueDataType.INT) 100 | case 64 => assertGlueDataType(GlueDataType.BIGINT) 101 | case _ => fail() 102 | } 103 | } 104 | else { 105 | bitWidth match { 106 | case 8 => assertGlueDataType(GlueDataType.SMALLINT) 107 | case 16 => assertGlueDataType(GlueDataType.INT) 108 | case 32 => assertGlueDataType(GlueDataType.BIGINT) 109 | case 64 => assertGlueDataType(GlueDataType.BIGINT) 110 | case _ => fail() 111 | } 112 | } 113 | } 114 | } 115 | } 116 | 117 | test( 118 | s"#glueDataType(column) cannot return any GlueDataType when embulk column type is one of (${unsupportedEmbulkColumns 119 | .map(_.getType.getName) 120 | .mkString(",")})" 121 | ) { 122 | forAll(conditions) { (bitWidth, isSigned, column) => 123 | whenever(unsupportedEmbulkColumns.contains(column)) { 124 | // format: off 125 | assert(intercept[ConfigException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).glueDataType(column)).getMessage.startsWith("Unsupported column type: ")) 126 | // format: on 127 | } 128 | } 129 | } 130 | 131 | test("#consumeBoolean (INT32)") { 132 | forAll(conditions) { (bitWidth, isSigned, _) => 133 | whenever(isINT32(bitWidth)) { 134 | newMockRecordConsumer().tap { consumer => 135 | consumer.writingSampleField { 136 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 137 | .consumeBoolean(consumer, true) 138 | } 139 | assert(consumer.data.head.head.isInstanceOf[Int]) 140 | assert(consumer.data.head.head == 1) 141 | } 142 | newMockRecordConsumer().tap { consumer => 143 | consumer.writingSampleField { 144 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 145 | .consumeBoolean(consumer, false) 146 | } 147 | assert(consumer.data.head.head.isInstanceOf[Int]) 148 | assert(consumer.data.head.head == 0) 149 | } 150 | } 151 | } 152 | } 153 | 154 | test("#consumeBoolean (INT64)") { 155 | forAll(conditions) { (bitWidth, isSigned, _) => 156 | whenever(!isINT32(bitWidth)) { 157 | newMockRecordConsumer().tap { consumer => 158 | consumer.writingSampleField { 159 | IntLogicalType(bitWidth = 
bitWidth, isSigned = isSigned) 160 | .consumeBoolean(consumer, true) 161 | } 162 | assert(consumer.data.head.head.isInstanceOf[Long]) 163 | assert(consumer.data.head.head == 1L) 164 | } 165 | newMockRecordConsumer().tap { consumer => 166 | consumer.writingSampleField { 167 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 168 | .consumeBoolean(consumer, false) 169 | } 170 | assert(consumer.data.head.head.isInstanceOf[Long]) 171 | assert(consumer.data.head.head == 0L) 172 | } 173 | } 174 | } 175 | } 176 | 177 | test("#consumeString (INT32)") { 178 | forAll(conditions) { (bitWidth, isSigned, _) => 179 | whenever(isINT32(bitWidth)) { 180 | newMockRecordConsumer().tap { consumer => 181 | consumer.writingSampleField { 182 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 183 | .consumeString(consumer, "1") 184 | } 185 | assert(consumer.data.head.head.isInstanceOf[Int]) 186 | assert(consumer.data.head.head == 1) 187 | } 188 | newMockRecordConsumer().tap { consumer => 189 | consumer.writingSampleField { 190 | // format: off 191 | assert(intercept[DataException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeString(consumer, "string")).getMessage.startsWith("Failed to cast String: ")) 192 | // format: on 193 | } 194 | } 195 | } 196 | } 197 | } 198 | 199 | test("#consumeString (INT64)") { 200 | forAll(conditions) { (bitWidth, isSigned, _) => 201 | whenever(!isINT32(bitWidth)) { 202 | newMockRecordConsumer().tap { consumer => 203 | consumer.writingSampleField { 204 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 205 | .consumeString(consumer, "1") 206 | } 207 | assert(consumer.data.head.head.isInstanceOf[Long]) 208 | assert(consumer.data.head.head == 1L) 209 | } 210 | newMockRecordConsumer().tap { consumer => 211 | consumer.writingSampleField { 212 | // format: off 213 | assert(intercept[DataException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeString(consumer, "string")).getMessage.startsWith("Failed to cast String: ")) 214 | // format: on 215 | } 216 | } 217 | } 218 | } 219 | } 220 | 221 | test("#consumeLong (INT32)") { 222 | forAll(conditions) { (bitWidth, isSigned, _) => 223 | whenever(isINT32(bitWidth)) { 224 | newMockRecordConsumer().tap { consumer => 225 | consumer.writingSampleField { 226 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 227 | .consumeLong(consumer, 1L) 228 | } 229 | assert(consumer.data.head.head.isInstanceOf[Int]) 230 | assert(consumer.data.head.head == 1) 231 | } 232 | newMockRecordConsumer().tap { consumer => 233 | consumer.writingSampleField { 234 | // format: off 235 | assert(intercept[DataException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeLong(consumer, Long.MaxValue)).getMessage.startsWith("The value is out of the range: that is ")) 236 | // format: on 237 | } 238 | } 239 | } 240 | } 241 | } 242 | 243 | test("#consumeLong (INT64)") { 244 | forAll(conditions) { (bitWidth, isSigned, _) => 245 | whenever(!isINT32(bitWidth)) { 246 | newMockRecordConsumer().tap { consumer => 247 | consumer.writingSampleField { 248 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 249 | .consumeLong(consumer, 1L) 250 | } 251 | assert(consumer.data.head.head.isInstanceOf[Long]) 252 | assert(consumer.data.head.head == 1L) 253 | } 254 | newMockRecordConsumer().tap { consumer => 255 | consumer.writingSampleField { 256 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 257 | .consumeLong(consumer, Long.MaxValue) 258 | } 259 | assert(consumer.data.head.head.isInstanceOf[Long]) 260 
| assert(consumer.data.head.head == Long.MaxValue) 261 | } 262 | } 263 | } 264 | } 265 | 266 | test("#consumeDouble (INT32)") { 267 | forAll(conditions) { (bitWidth, isSigned, _) => 268 | whenever(isINT32(bitWidth)) { 269 | newMockRecordConsumer().tap { consumer => 270 | consumer.writingSampleField { 271 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 272 | .consumeDouble(consumer, 1.4d) 273 | } 274 | assert(consumer.data.head.head.isInstanceOf[Int]) 275 | assert(consumer.data.head.head == 1) 276 | } 277 | newMockRecordConsumer().tap { consumer => 278 | consumer.writingSampleField { 279 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 280 | .consumeDouble(consumer, 1.5d) 281 | } 282 | assert(consumer.data.head.head.isInstanceOf[Int]) 283 | assert(consumer.data.head.head == 2) 284 | } 285 | 286 | newMockRecordConsumer().tap { consumer => 287 | consumer.writingSampleField { 288 | // format: off 289 | assert(intercept[DataException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeDouble(consumer, Double.MaxValue)).getMessage.startsWith("The value is out of the range: that is ")) 290 | // format: on 291 | } 292 | } 293 | } 294 | } 295 | } 296 | 297 | test("#consumeDouble (INT64)") { 298 | forAll(conditions) { (bitWidth, isSigned, _) => 299 | whenever(!isINT32(bitWidth)) { 300 | newMockRecordConsumer().tap { consumer => 301 | consumer.writingSampleField { 302 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 303 | .consumeDouble(consumer, 1.4d) 304 | } 305 | assert(consumer.data.head.head.isInstanceOf[Long]) 306 | assert(consumer.data.head.head == 1L) 307 | } 308 | newMockRecordConsumer().tap { consumer => 309 | consumer.writingSampleField { 310 | IntLogicalType(bitWidth = bitWidth, isSigned = isSigned) 311 | .consumeDouble(consumer, 1.5d) 312 | } 313 | assert(consumer.data.head.head.isInstanceOf[Long]) 314 | assert(consumer.data.head.head == 2L) 315 | } 316 | newMockRecordConsumer().tap { consumer => 317 | consumer.writingSampleField { 318 | // format: off 319 | assert(intercept[DataException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeDouble(consumer, Double.MaxValue)).getMessage.startsWith("The value is out of the range: ")) 320 | // format: on 321 | } 322 | } 323 | } 324 | } 325 | } 326 | 327 | test("#consumeTimestamp is unsupported") { 328 | forAll(conditions) { (bitWidth, isSigned, _) => 329 | newMockRecordConsumer().tap { consumer => 330 | consumer.writingSampleField { 331 | // format: off 332 | assert(intercept[ConfigException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeTimestamp(consumer, null, null)).getMessage.endsWith("is unsupported.")) 333 | // format: on 334 | } 335 | } 336 | } 337 | } 338 | test("#consumeJson is unsupported") { 339 | forAll(conditions) { (bitWidth, isSigned, _) => 340 | newMockRecordConsumer().tap { consumer => 341 | consumer.writingSampleField { 342 | // format: off 343 | assert(intercept[ConfigException](IntLogicalType(bitWidth = bitWidth, isSigned = isSigned).consumeJson(consumer, null)).getMessage.endsWith("is unsupported.")) 344 | // format: on 345 | } 346 | } 347 | } 348 | } 349 | } 350 | --------------------------------------------------------------------------------