├── .circleci
└── config.yml
├── .github
└── FUNDING.yml
├── .gitignore
├── .java-version
├── .jvmopts
├── .scalafmt.conf
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── akkaPekko
└── src
│ ├── it
│ └── scala
│ │ └── com
│ │ └── github
│ │ └── mjakubowski84
│ │ └── parquet4s
│ │ └── ParquetStreamsITSpec.scala
│ └── main
│ ├── scala-akka-jvm
│ └── com
│ │ └── github
│ │ └── mjakubowski84
│ │ └── parquet4s
│ │ └── ScalaCompat.scala
│ ├── scala-pekko-jvm
│ └── com
│ │ └── github
│ │ └── mjakubowski84
│ │ └── parquet4s
│ │ └── ScalaCompat.scala
│ └── scala
│ └── com
│ └── github
│ └── mjakubowski84
│ └── parquet4s
│ ├── ParquetPartitioningFlow.scala
│ ├── ParquetSource.scala
│ ├── ParquetStreams.scala
│ └── SingleFileParquetSink.scala
├── akkaPekkoBenchmarks
└── src
│ └── main
│ └── scala
│ └── com
│ └── github
│ └── mjakubowski84
│ └── parquet4s
│ └── AkkaPekkoBenchmark.scala
├── build.sbt
├── core
└── src
│ ├── it
│ ├── resources
│ │ └── logback-test.xml
│ ├── scala-2.12
│ │ └── com
│ │ │ └── github
│ │ │ └── mjakubowski84
│ │ │ └── parquet4s
│ │ │ ├── CustomTypeITSpec.scala
│ │ │ ├── ParquetWriterAndSparkCompatibilityItSpec.scala
│ │ │ ├── SparkAndParquetReaderCompatibilityItSpec.scala
│ │ │ ├── SparkHelper.scala
│ │ │ ├── TestCaseSupport.scala
│ │ │ ├── TimeEncodingCompatibilityItSpec.scala
│ │ │ ├── TimeEncodingInt64MicrosCompatibilityItSpec.scala
│ │ │ ├── TimeEncodingInt64MillisCompatibilityItSpec.scala
│ │ │ ├── TimeEncodingInt64NanosCompatibilityItSpec.scala
│ │ │ └── TimeEncodingInt96CompatibilityItSpec.scala
│ ├── scala-2.13
│ │ └── com
│ │ │ └── github
│ │ │ └── mjakubowski84
│ │ │ └── parquet4s
│ │ │ ├── CustomTypeITSpec.scala
│ │ │ ├── ParquetWriterAndSparkCompatibilityItSpec.scala
│ │ │ ├── SparkAndParquetReaderCompatibilityItSpec.scala
│ │ │ ├── SparkHelper.scala
│ │ │ ├── TestCaseSupport.scala
│ │ │ ├── TimeEncodingCompatibilityItSpec.scala
│ │ │ ├── TimeEncodingInt64MicrosCompatibilityItSpec.scala
│ │ │ ├── TimeEncodingInt64MillisCompatibilityItSpec.scala
│ │ │ ├── TimeEncodingInt64NanosCompatibilityItSpec.scala
│ │ │ └── TimeEncodingInt96CompatibilityItSpec.scala
│ ├── scala-3
│ │ └── com
│ │ │ └── github
│ │ │ └── mjakubowski84
│ │ │ └── parquet4s
│ │ │ ├── CustomTypeITSpec.scala
│ │ │ └── TestCaseSupport.scala
│ └── scala
│ │ └── com
│ │ └── github
│ │ └── mjakubowski84
│ │ └── parquet4s
│ │ ├── CompatibilityTestCases.scala
│ │ ├── FilteringByListItSpec.scala
│ │ ├── FilteringITSpec.scala
│ │ ├── IOOpsITSpec.scala
│ │ ├── ParquetReaderItSpec.scala
│ │ ├── ParquetWriterAndParquetReaderCompatibilityItSpec.scala
│ │ ├── ParquetWriterItSpec.scala
│ │ ├── ProjectionItSpec.scala
│ │ ├── RecordFilterItSpec.scala
│ │ ├── TestUtils.scala
│ │ └── stats
│ │ ├── CompoundAndPartitionedStatsITSpec.scala
│ │ ├── FileStatsItSpec.scala
│ │ └── FilteredFileStatsItSpec.scala
│ ├── main
│ ├── scala-2.12
│ │ └── com
│ │ │ └── github
│ │ │ └── mjakubowski84
│ │ │ └── parquet4s
│ │ │ ├── ParquetRecordDecoder.scala
│ │ │ ├── ParquetRecordEncoder.scala
│ │ │ ├── ParquetSchemaResolver.scala
│ │ │ ├── ProductDecoders.scala
│ │ │ ├── ProductEncoders.scala
│ │ │ ├── ProductSchemaDefs.scala
│ │ │ └── compat
│ │ │ ├── CursorCompat.scala
│ │ │ ├── IteratorCompat.scala
│ │ │ └── MapCompat.scala
│ ├── scala-2.13
│ │ └── com
│ │ │ └── github
│ │ │ └── mjakubowski84
│ │ │ └── parquet4s
│ │ │ ├── ParquetRecordDecoder.scala
│ │ │ ├── ParquetRecordEncoder.scala
│ │ │ ├── ParquetSchemaResolver.scala
│ │ │ ├── ProductDecoders.scala
│ │ │ ├── ProductEncoders.scala
│ │ │ ├── ProductSchemaDefs.scala
│ │ │ └── compat
│ │ │ ├── CursorCompat.scala
│ │ │ ├── IteratorCompat.scala
│ │ │ └── MapCompat.scala
│ ├── scala-3
│ │ └── com
│ │ │ └── github
│ │ │ └── mjakubowski84
│ │ │ └── parquet4s
│ │ │ ├── ParquetRecordDecoder.scala
│ │ │ ├── ParquetRecordEncoder.scala
│ │ │ ├── ParquetSchemaResolver.scala
│ │ │ ├── ProductDecoders.scala
│ │ │ ├── ProductEncoders.scala
│ │ │ ├── ProductSchemaDefs.scala
│ │ │ └── compat
│ │ │ ├── CursorCompat.scala
│ │ │ ├── IteratorCompat.scala
│ │ │ └── MapCompat.scala
│ └── scala
│ │ └── com
│ │ └── github
│ │ └── mjakubowski84
│ │ └── parquet4s
│ │ ├── ColumnPath.scala
│ │ ├── ColumnProjection.scala
│ │ ├── Cursor.scala
│ │ ├── DecimalFormat.scala
│ │ ├── Filter.scala
│ │ ├── HadoopParquetReader.scala
│ │ ├── IOOps.scala
│ │ ├── InMemoryInputFile.scala
│ │ ├── InMemoryOutputFile.scala
│ │ ├── MetadataWriter.scala
│ │ ├── ParquetIterable.scala
│ │ ├── ParquetIterator.scala
│ │ ├── ParquetReadSupport.scala
│ │ ├── ParquetReader.scala
│ │ ├── ParquetRecord.scala
│ │ ├── ParquetWriter.scala
│ │ ├── PartitionFilter.scala
│ │ ├── Path.scala
│ │ ├── Schema.scala
│ │ ├── Stats.scala
│ │ ├── TimestampFormat.scala
│ │ ├── UDP.scala
│ │ ├── Value.scala
│ │ ├── ValueCodec.scala
│ │ ├── ValueCodecConfiguration.scala
│ │ ├── ValueCodecs.scala
│ │ ├── ValueDecoder.scala
│ │ ├── ValueEncoder.scala
│ │ ├── ValueImplicits.scala
│ │ ├── etl
│ │ ├── CompoundParquetIterable.scala
│ │ ├── InMemoryParquetIterable.scala
│ │ └── Join.scala
│ │ ├── experimental.scala
│ │ └── stats
│ │ ├── CompoundStats.scala
│ │ ├── FileStats.scala
│ │ ├── FilteredFileStats.scala
│ │ ├── InMemoryStats.scala
│ │ ├── LazyDelegateStats.scala
│ │ └── PartitionedFileStats.scala
│ └── test
│ ├── resources
│ └── logback-test.xml
│ └── scala
│ └── com
│ └── github
│ └── mjakubowski84
│ └── parquet4s
│ ├── ColumnPathSpec.scala
│ ├── CursorSpec.scala
│ ├── DecimalFormatSpec.scala
│ ├── DecimalValueSpec.scala
│ ├── FilterSpec.scala
│ ├── IOOpsSpec.scala
│ ├── InMemoryFileSpec.scala
│ ├── ParquetIterableSpec.scala
│ ├── ParquetIteratorSpec.scala
│ ├── ParquetRecordDecoderSpec.scala
│ ├── ParquetRecordEncoderSpec.scala
│ ├── ParquetRecordSpec.scala
│ ├── ParquetSchemaResolverSpec.scala
│ ├── PartitionFilterSpec.scala
│ ├── PartitionTestUtils.scala
│ ├── SkippingParquetSchemaResolverSpec.scala
│ ├── TestCases.scala
│ ├── TimestampFormatSpec.scala
│ ├── ValueCodecsSpec.scala
│ ├── ValueEncodingAndDecodingSpec.scala
│ └── etl
│ └── JoinSpec.scala
├── coreBenchmarks
└── src
│ └── main
│ └── scala
│ └── com
│ └── github
│ └── mjakubowski84
│ └── parquet4s
│ └── CoreBenchmark.scala
├── examples
└── src
│ └── main
│ ├── protobuf
│ └── data.proto
│ ├── resources
│ └── logback.xml
│ ├── scala-akka-jvm
│ └── com
│ │ └── github
│ │ └── mjakubowski84
│ │ └── parquet4s
│ │ └── ScalaKafkaCompat.scala
│ ├── scala-pekko-jvm
│ └── com
│ │ └── github
│ │ └── mjakubowski84
│ │ └── parquet4s
│ │ └── ScalaKafkaCompat.scala
│ └── scala
│ └── com
│ └── github
│ └── mjakubowski84
│ └── parquet4s
│ ├── CustomType.scala
│ ├── akkaPekko
│ ├── CustomAvroWriteAndReadAkkaPekkoApp.scala
│ ├── CustomPartitioningAvroWriteAkkaPekkoApp.scala
│ ├── CustomProtobufWriteAndReadAkkaPekkoApp.scala
│ ├── WriteAndReadAkkaPekkoApp.scala
│ ├── WriteAndReadCustomTypeAkkaPekkoApp.scala
│ ├── WriteAndReadFilteredAkkaPekkoApp.scala
│ ├── WriteAndReadGenericAkkaPekkoApp.scala
│ └── indefinite
│ │ ├── AkkaPekko.scala
│ │ ├── ExampleApp.scala
│ │ ├── Kafka.scala
│ │ ├── Logger.scala
│ │ ├── MessageSink.scala
│ │ ├── MessageSource.scala
│ │ └── RandomDataProducer.scala
│ ├── core
│ ├── ColumnProjectionAndDataConcatenationApp.scala
│ ├── ETLApp.scala
│ ├── WriteAndReadApp.scala
│ ├── WriteAndReadCustomTypeApp.scala
│ ├── WriteAndReadFilteredApp.scala
│ ├── WriteAndReadGenericApp.scala
│ ├── WriteAndReadUsingRecordFilterApp.scala
│ └── WriteIncrementallyAndReadApp.scala
│ ├── fs2
│ ├── CustomAvroPartitioningWriteFS2App.scala
│ ├── CustomAvroWriteAndReadFS2App.scala
│ ├── CustomProtobufWriteAndReadFS2App.scala
│ ├── IndefiniteFS2App.scala
│ ├── WriteAndReadFS2App.scala
│ ├── WriteAndReadFilteredFS2App.scala
│ └── WriteAndReadGenericFS2App.scala
│ └── scalapb
│ ├── WriteAndReadApp.scala
│ └── WriteIncrementallyAndReadApp.scala
├── fs2
└── src
│ ├── it
│ ├── resources
│ │ └── logback-test.xml
│ └── scala
│ │ └── com
│ │ └── github
│ │ └── mjakubowski84
│ │ └── parquet4s
│ │ ├── Fs2ParquetItSpec.scala
│ │ └── parquet
│ │ └── IoITSpec.scala
│ ├── main
│ └── scala
│ │ └── com
│ │ └── github
│ │ └── mjakubowski84
│ │ └── parquet4s
│ │ └── parquet
│ │ ├── io.scala
│ │ ├── logger.scala
│ │ ├── package.scala
│ │ ├── reader.scala
│ │ ├── rotatingWriter.scala
│ │ └── writer.scala
│ └── test
│ └── scala
│ └── com
│ └── github
│ └── mjakubowski84
│ └── parquet4s
│ └── parquet
│ └── IoSpec.scala
├── fs2Benchmarks
└── src
│ └── main
│ └── scala
│ └── com
│ └── github
│ └── mjakubowski84
│ └── parquet4s
│ └── Fs2Benchmark.scala
├── project
├── ActorLibCross.scala
├── Compilation.scala
├── DependecyVersions.scala
├── Documentation.scala
├── Releasing.scala
├── Signing.scala
├── build.properties
├── metals.sbt
└── plugins.sbt
├── s3Test
└── src
│ └── it
│ ├── resources
│ └── logback-test.xml
│ └── scala
│ └── com
│ └── github
│ └── mjakubowski84
│ └── parquet4s
│ └── s3
│ └── S3ItSpec.scala
├── scalapb
└── src
│ ├── main
│ └── scala
│ │ └── com
│ │ └── github
│ │ └── mjakubowski84
│ │ └── parquet4s
│ │ ├── ScalaPBImplicits.scala
│ │ ├── ScalaPBParquetRecordDecoder.scala
│ │ ├── ScalaPBParquetRecordEncoder.scala
│ │ └── ScalaPBParquetSchemaResolver.scala
│ └── test
│ ├── protobuf
│ └── data.proto
│ └── scala
│ └── com
│ └── github
│ └── mjakubowski84
│ └── parquet4s
│ ├── Parquet4sScalaPBAkkaPekkoSpec.scala
│ ├── Parquet4sScalaPBCoreSpec.scala
│ ├── Parquet4sScalaPBFS2Spec.scala
│ ├── Parquet4sScalaPBSpec.scala
│ └── TestData.scala
└── site
└── src
└── main
└── resources
└── docs
├── data
└── menu.yml
├── docs
├── akka.md
├── etl.md
├── examples.md
├── filtering.md
├── fs2.md
├── introduction.md
├── migration.md
├── partitioning.md
├── pekko.md
├── projection.md
├── protobuf.md
├── quick_start.md
├── records_and_schema.md
├── sponsors.md
├── statistics.md
└── storage_types.md
├── images
├── favicon-16x16.png
├── favicon-32x32.png
├── features-header.svg
├── light-navbar-brand.svg
└── light-sidebar-brand.svg
└── index.md
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: mjakubowski84
2 |
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 | *.iml
4 |
5 | .vscode
6 | .metals
7 | .bloop
8 | .idea
9 | .bsp
10 | .sbt
11 |
12 | target
13 | spark-warehouse
14 | project/.plugins.sbt.swp
15 | project/project
16 |
--------------------------------------------------------------------------------
/.java-version:
--------------------------------------------------------------------------------
1 | 17
2 |
--------------------------------------------------------------------------------
/.jvmopts:
--------------------------------------------------------------------------------
1 | -Xmx4G
2 | -Xss2M
3 | -Xms512m
4 | -XX:+UseG1GC
5 | -XX:MaxInlineLevel=20
6 | # --add-opens=java.base/java.lang=ALL-UNNAMED
7 | # --add-opens=java.base/java.lang.invoke=ALL-UNNAMED
8 | # --add-opens=java.base/java.lang.reflect=ALL-UNNAMED
9 | # --add-opens=java.base/java.io=ALL-UNNAMED
10 | # --add-opens=java.base/java.net=ALL-UNNAMED
11 | # --add-opens=java.base/java.nio=ALL-UNNAMED
12 | # --add-opens=java.base/java.util=ALL-UNNAMED
13 | # --add-opens=java.base/java.util.concurrent=ALL-UNNAMED
14 | # --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED
15 | # --add-opens=java.base/sun.nio.ch=ALL-UNNAMED
16 | # --add-opens=java.base/sun.nio.cs=ALL-UNNAMED
17 | # --add-opens=java.base/sun.security.action=ALL-UNNAMED
18 | # --add-opens=java.base/sun.util.calendar=ALL-UNNAMED
19 |
--------------------------------------------------------------------------------
/.scalafmt.conf:
--------------------------------------------------------------------------------
1 | version = 3.9.1
2 | preset = default
3 | maxColumn = 120
4 |
5 | rewrite.rules = [
6 | RedundantBraces,
7 | RedundantParens,
8 | SortModifiers,
9 | PreferCurlyFors,
10 | Imports
11 | ]
12 |
13 | align.preset = more
14 | align.tokens = [
15 | {code = "="},
16 | {code = "=>"},
17 | {code = "<-"}
18 | ]
19 |
20 | runner.dialect = scala213source3
21 | fileOverride {
22 | "glob:**/scala-2.12/**" {
23 | runner.dialect = scala212source3
24 | }
25 | "glob:**/scala-2.13/**" {
26 | runner.dialect = scala213source3
27 | }
28 | "glob:**/scala-3/**" {
29 | runner.dialect = scala3
30 | }
31 | }
32 | project.excludePaths = [
33 | "glob:**/metals.sbt"
34 | ]
35 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to contribute
2 |
3 | I am happy that you are reading this! Your contribution is warmly welcome! Here are the guidelines that will make contributing easier.
4 |
5 | 1. Before contributing, please create a GitHub issue (if one does not exist already) and discuss your case and the proposed solution with the authors.
6 |
7 | 2. Propose your changes via pull request.
8 |
9 | 3. Please write descriptive messages in your commits and pull requests so that your changes are easy to understand.
10 |
11 | 4. Is your change missing a test? Write it!
12 |
13 | 5. Remember to update README.md if necessary.
14 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Marcin Jakubowski
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Parquet4S
2 |
3 |
4 |
5 | Parquet4s is a simple I/O library for [Parquet](https://parquet.apache.org/). It allows you to easily read and write Parquet files in [Scala](https://www.scala-lang.org/).
6 |
7 | Use just a Scala case class to define the schema of your data. No need to use Avro, Protobuf, Thrift, or other data serialisation systems. You can also use generic records if you don't want to use case classes.
8 |
9 | Compatible with files generated with [Apache Spark](https://spark.apache.org/). However, unlike in Spark, you do not have to start a cluster to perform I/O operations.
10 |
11 | Based on the official [Parquet library](https://github.com/apache/parquet-mr), [Hadoop Client](https://github.com/apache/hadoop) and [Shapeless](https://github.com/milessabin/shapeless) (Shapeless is not used in the Scala 3 version).
12 |
13 | As it is based on Hadoop Client, you can connect to any Hadoop-compatible storage like AWS S3 or Google Cloud Storage.
14 |
15 | Integrations for [Akka Streams](https://doc.akka.io/docs/akka/current/stream/index.html), [Pekko Streams](https://pekko.apache.org/docs/pekko/current/stream/index.html), and [FS2](https://fs2.io/).
16 |
17 | Released for Scala 2.12.x, 2.13.x and 3.3.x.
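A minimal sketch of the core read/write API (the `User` class, its data, and the target directory are illustrative):

```scala
import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path}
import java.nio.file.Files

case class User(id: Long, name: String)

val users = Seq(User(1L, "Alice"), User(2L, "Bob"))
// write to a temporary directory; any Hadoop-compatible URI works as well
val path = Path(Path(Files.createTempDirectory("parquet4s-example")), "users.parquet")

// write the whole sequence to a single Parquet file
ParquetWriter.of[User].writeAndClose(path, users)

// read it back; close the iterable to release file handles
val readUsers = ParquetReader.as[User].read(path)
try readUsers.foreach(println)
finally readUsers.close()
```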
18 |
19 | ## Documentation
20 |
21 | Documentation is available [here](https://mjakubowski84.github.io/parquet4s/).
22 |
23 | ## Contributing
24 |
25 | Do you want to contribute? Please read the [contribution guidelines](CONTRIBUTING.md).
26 |
27 | ## Sponsors
28 |
29 | - [calvinlfer](https://github.com/calvinlfer)
30 |
--------------------------------------------------------------------------------
/akkaPekko/src/main/scala-akka-jvm/com/github/mjakubowski84/parquet4s/ScalaCompat.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | object ScalaCompat {
4 |
5 | type NotUsed = akka.NotUsed
6 | def NotUsed = akka.NotUsed
7 |
8 | type Done = akka.Done
9 | def Done = akka.Done
10 |
11 | object stream {
12 | type Attributes = akka.stream.Attributes
13 |
14 | def ActorAttributes = akka.stream.ActorAttributes
15 |
16 | type Inlet[T] = akka.stream.Inlet[T]
17 | def Inlet = akka.stream.Inlet
18 |
19 | type Outlet[T] = akka.stream.Outlet[T]
20 | def Outlet = akka.stream.Outlet
21 |
22 | type Shape = akka.stream.Shape
23 |
24 | type FlowShape[I, O] = akka.stream.FlowShape[I, O]
25 | def FlowShape = akka.stream.FlowShape
26 |
27 | object stage {
28 | type GraphStage[S <: Shape] = akka.stream.stage.GraphStage[S]
29 |
30 | type GraphStageLogic = akka.stream.stage.GraphStageLogic
31 |
32 | type TimerGraphStageLogic = akka.stream.stage.TimerGraphStageLogic
33 |
34 | type InHandler = akka.stream.stage.InHandler
35 |
36 | type OutHandler = akka.stream.stage.OutHandler
37 | }
38 |
39 | object scaladsl {
40 | type Source[Out, Mat] = akka.stream.scaladsl.Source[Out, Mat]
41 | def Source = akka.stream.scaladsl.Source
42 |
43 | type Flow[In, Out, Mat] = akka.stream.scaladsl.Flow[In, Out, Mat]
44 | def Flow[T] = akka.stream.scaladsl.Flow[T]
45 |
46 | def Keep = akka.stream.scaladsl.Keep
47 |
48 | type Sink[In, Mat] = akka.stream.scaladsl.Sink[In, Mat]
49 | def Sink = akka.stream.scaladsl.Sink
50 | }
51 | }
52 |
53 | object pattern {
54 | type AskSupport = akka.pattern.AskSupport
55 | }
56 |
57 | object actor {
58 | type Actor = akka.actor.Actor
59 | def Actor = akka.actor.Actor
60 |
61 | type ActorRef = akka.actor.ActorRef
62 | def ActorRef = akka.actor.ActorRef
63 |
64 | type CoordinatedShutdown = akka.actor.CoordinatedShutdown
65 | def CoordinatedShutdown = akka.actor.CoordinatedShutdown
66 |
67 | type Cancellable = akka.actor.Cancellable
68 |
69 | type Props = akka.actor.Props
70 | def Props = akka.actor.Props
71 |
72 | type Scheduler = akka.actor.Scheduler
73 |
74 | type ActorSystem = akka.actor.ActorSystem
75 | def ActorSystem = akka.actor.ActorSystem
76 | }
77 |
78 | object util {
79 | type Timeout = akka.util.Timeout
80 | def Timeout = akka.util.Timeout
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/akkaPekko/src/main/scala-pekko-jvm/com/github/mjakubowski84/parquet4s/ScalaCompat.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | object ScalaCompat {
4 |
5 | type NotUsed = org.apache.pekko.NotUsed
6 | def NotUsed = org.apache.pekko.NotUsed
7 |
8 | type Done = org.apache.pekko.Done
9 | def Done = org.apache.pekko.Done
10 |
11 | object stream {
12 | type Attributes = org.apache.pekko.stream.Attributes
13 |
14 | def ActorAttributes = org.apache.pekko.stream.ActorAttributes
15 |
16 | type Inlet[T] = org.apache.pekko.stream.Inlet[T]
17 | def Inlet = org.apache.pekko.stream.Inlet
18 |
19 | type Outlet[T] = org.apache.pekko.stream.Outlet[T]
20 | def Outlet = org.apache.pekko.stream.Outlet
21 |
22 | type Shape = org.apache.pekko.stream.Shape
23 |
24 | type FlowShape[I, O] = org.apache.pekko.stream.FlowShape[I, O]
25 | def FlowShape = org.apache.pekko.stream.FlowShape
26 |
27 | object stage {
28 | type GraphStage[S <: Shape] = org.apache.pekko.stream.stage.GraphStage[S]
29 |
30 | type GraphStageLogic = org.apache.pekko.stream.stage.GraphStageLogic
31 |
32 | type TimerGraphStageLogic = org.apache.pekko.stream.stage.TimerGraphStageLogic
33 |
34 | type InHandler = org.apache.pekko.stream.stage.InHandler
35 |
36 | type OutHandler = org.apache.pekko.stream.stage.OutHandler
37 | }
38 |
39 | object scaladsl {
40 | type Source[Out, Mat] = org.apache.pekko.stream.scaladsl.Source[Out, Mat]
41 | def Source = org.apache.pekko.stream.scaladsl.Source
42 |
43 | type Flow[In, Out, Mat] = org.apache.pekko.stream.scaladsl.Flow[In, Out, Mat]
44 | def Flow[T] = org.apache.pekko.stream.scaladsl.Flow[T]
45 |
46 | def Keep = org.apache.pekko.stream.scaladsl.Keep
47 |
48 | type Sink[In, Mat] = org.apache.pekko.stream.scaladsl.Sink[In, Mat]
49 | def Sink = org.apache.pekko.stream.scaladsl.Sink
50 | }
51 | }
52 |
53 | object pattern {
54 | type AskSupport = org.apache.pekko.pattern.AskSupport
55 | }
56 |
57 | object actor {
58 | type Actor = org.apache.pekko.actor.Actor
59 | def Actor = org.apache.pekko.actor.Actor
60 |
61 | type ActorRef = org.apache.pekko.actor.ActorRef
62 | def ActorRef = org.apache.pekko.actor.ActorRef
63 |
64 | type CoordinatedShutdown = org.apache.pekko.actor.CoordinatedShutdown
65 | def CoordinatedShutdown = org.apache.pekko.actor.CoordinatedShutdown
66 |
67 | type Cancellable = org.apache.pekko.actor.Cancellable
68 |
69 | type Props = org.apache.pekko.actor.Props
70 | def Props = org.apache.pekko.actor.Props
71 |
72 | type Scheduler = org.apache.pekko.actor.Scheduler
73 |
74 | type ActorSystem = org.apache.pekko.actor.ActorSystem
75 | def ActorSystem = org.apache.pekko.actor.ActorSystem
76 | }
77 |
78 | object util {
79 | type Timeout = org.apache.pekko.util.Timeout
80 | def Timeout = org.apache.pekko.util.Timeout
81 | }
82 | }
83 |
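Both `ScalaCompat` objects above expose the same aliases, one backed by Akka and one by Pekko. A minimal illustrative sketch (not part of the repository) of how shared code in `akkaPekko/src/main/scala` can be written once against those aliases:

```scala
package com.github.mjakubowski84.parquet4s

import com.github.mjakubowski84.parquet4s.ScalaCompat.NotUsed
import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.{Flow, Source}

// The same source compiles against Akka when built with the scala-akka-jvm
// source directory and against Pekko when built with scala-pekko-jvm.
object SharedStreamSketch {
  val numbers: Source[Int, NotUsed] = Source(1 to 3)
  val doubled: Flow[Int, Int, NotUsed] = Flow[Int].map(_ * 2)
}
```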
--------------------------------------------------------------------------------
/akkaPekko/src/main/scala/com/github/mjakubowski84/parquet4s/ParquetStreams.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | /** Holds factory of Akka Streams / Pekko Streams sources and sinks that allow reading from and writing to Parquet
4 | * files.
5 | */
6 | object ParquetStreams {
7 |
8 | /** Creates a [[com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Source]] that reads Parquet data from
9 | * the specified path. If there are multiple files at path then the order in which files are loaded is determined by
10 | * underlying filesystem. Path can refer to local file, HDFS, AWS S3, Google Storage, Azure, etc. Please refer
11 | * to Hadoop client documentation or your data provider in order to know how to configure the connection. Can
12 | * read also partitioned directories. Filter applies also to partition values. Partition values are set as
13 | * fields in read entities at path defined by partition name. Path can be a simple column name or a dot-separated
14 | * path to nested field. Missing intermediate fields are automatically created for each read record. Allows to
15 | * turn on a projection over original file schema in order to boost read performance if not all columns are
16 | * required to be read. Provides explicit API for both custom data types and generic records.
17 | * @return
18 | * Builder of the source.
19 | */
20 | def fromParquet: ParquetSource.FromParquet = ParquetSource.FromParquetImpl
21 |
22 | /** Creates a [[com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Sink]] that writes Parquet data to
23 | * single file at the specified path (including file name). Path can refer to local file, HDFS, AWS S3, Google
24 | * Storage, Azure, etc. Please refer to Hadoop client documentation or your data provider in order to know how to
25 | * configure the connection. Provides explicit API for both custom data types and generic records.
26 | * @return
27 | * Builder of a sink that writes Parquet file
28 | */
29 | def toParquetSingleFile: SingleFileParquetSink.ToParquet = SingleFileParquetSink.ToParquetImpl
30 |
31 | /** Builds a flow that:
32 | *   - Is designed to write Parquet files indefinitely
33 | *   - Is able to (optionally) partition data by a list of provided fields
34 | *   - Flushes and rotates files after given number of rows is written to the partition or given time period elapses
35 | *   - Outputs incoming message after it is written but can write an effect of provided message transformation. Provides explicit API for both custom data types and generic records.
36 | * @return
37 | * Builder of the flow.
38 | */
39 | def viaParquet: ParquetPartitioningFlow.ViaParquet = ParquetPartitioningFlow.ViaParquetImpl
40 | }
41 |
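A minimal usage sketch of the builders above with Akka (for Pekko, replace the `akka` imports with `org.apache.pekko`); the `User` type and path are illustrative, and the builder calls follow the `as`/`of`/`read`/`write` API referenced in the scaladoc:

```scala
import akka.actor.ActorSystem
import akka.stream.scaladsl.Source
import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path}

object ParquetStreamsSketch extends App {
  case class User(id: Long, name: String)

  implicit val system: ActorSystem = ActorSystem("sketch")
  import system.dispatcher

  val path = Path("/tmp/users.parquet") // illustrative location

  val done = for {
    // write a finite stream of users to a single Parquet file
    _ <- Source(List(User(1L, "Alice"), User(2L, "Bob")))
      .runWith(ParquetStreams.toParquetSingleFile.of[User].write(path))
    // stream the file back and print every record
    _ <- ParquetStreams.fromParquet.as[User].read(path).runForeach(println)
  } yield ()

  done.onComplete(_ => system.terminate())
}
```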
--------------------------------------------------------------------------------
/core/src/it/resources/logback-test.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/ParquetWriterAndSparkCompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.Case.CaseDef
4 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.*
5 | import org.scalatest.BeforeAndAfter
6 | import org.scalatest.freespec.AnyFreeSpec
7 | import org.scalatest.matchers.should.Matchers
8 |
9 | class ParquetWriterAndSparkCompatibilityItSpec extends AnyFreeSpec with Matchers with BeforeAndAfter with SparkHelper {
10 |
11 | before {
12 | clearTemp()
13 | }
14 |
15 | private def runTestCase(testCase: CaseDef): Unit =
16 | testCase.description in {
17 | ParquetWriter.of[testCase.DataType](testCase.encoder, testCase.resolver).writeAndClose(tempPath, testCase.data)
18 | readFromTemp(testCase.typeTag) should contain theSameElementsAs testCase.data
19 | }
20 |
21 | "Spark should be able to read file saved by ParquetWriter if the file contains" -
22 | CompatibilityTestCases.cases(Writer, Spark).foreach(runTestCase)
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/SparkAndParquetReaderCompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.*
4 | import SparkAndParquetReaderCompatibilityItSpec.Partitioned
5 | import org.scalatest.BeforeAndAfter
6 | import org.scalatest.freespec.AnyFreeSpec
7 | import org.scalatest.matchers.should.Matchers
8 | import scala.annotation.nowarn
9 | import scala.util.Using
10 |
11 | object SparkAndParquetReaderCompatibilityItSpec {
12 | case class Partitioned(partition: String, s: String)
13 | }
14 |
15 | @nowarn
16 | class SparkAndParquetReaderCompatibilityItSpec extends AnyFreeSpec with Matchers with BeforeAndAfter with SparkHelper {
17 |
18 | before {
19 | clearTemp()
20 | }
21 |
22 | private def runTestCase(testCase: Case.CaseDef): Unit =
23 | testCase.description in {
24 | writeToTemp(testCase.data)(testCase.typeTag)
25 | Using.resource(ParquetReader.as[testCase.DataType].read(tempPath)(testCase.decoder)) { parquetIterable =>
26 | parquetIterable should contain theSameElementsAs testCase.data
27 | }
28 | }
29 |
30 | "ParquetReader should be able to read file saved by Spark if the file contains" -
31 | CompatibilityTestCases.cases(Spark, Reader).foreach(runTestCase)
32 |
33 | "ParquetReader should read data partitioned by Spark" in {
34 | import sparkSession.implicits.*
35 | val data = Seq(
36 | Partitioned(partition = "a", s = "a"),
37 | Partitioned(partition = "a=1", s = "a")
38 | )
39 | data.toDS().write.partitionBy("partition").parquet(tempPath.toString)
40 | Using.resource(ParquetReader.as[Partitioned].read(tempPath)) { parquetIterable =>
41 | parquetIterable should contain theSameElementsAs data
42 | }
43 | }
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/SparkHelper.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.scalatest.{BeforeAndAfterAll, Suite}
5 |
6 | import scala.reflect.runtime.universe.TypeTag
7 |
8 | trait SparkHelper extends BeforeAndAfterAll with TestUtils {
9 |
10 | this: Suite =>
11 |
12 | sealed trait OutputTimestampType
13 | case object Int96 extends OutputTimestampType
14 | case object TIMESTAMP_MICROS extends OutputTimestampType
15 | case object TIMESTAMP_MILLIS extends OutputTimestampType
16 |
17 | private var sparkStarted = false
18 |
19 | protected def outputTimestampType: OutputTimestampType = Int96
20 |
21 | lazy val sparkSession: SparkSession = {
22 | sparkStarted = true
23 | SparkSession
24 | .builder()
25 | .master("local[2]")
26 | .appName(getClass.getSimpleName)
27 | .config("spark.sql.parquet.outputTimestampType", outputTimestampType.toString)
28 | .config("spark.sql.session.timeZone", "UTC")
29 | .getOrCreate()
30 | }
31 |
32 | override def afterAll(): Unit = {
33 | super.afterAll()
34 | if (sparkStarted) sparkSession.stop()
35 | }
36 |
37 | def writeToTemp[T <: Product: TypeTag](data: Seq[T]): Unit = {
38 | import sparkSession.implicits.*
39 | data.toDS().write.parquet(tempPath.toString)
40 | }
41 |
42 | def readFromTemp[T <: Product: TypeTag]: Seq[T] = {
43 | import sparkSession.implicits.*
44 | sparkSession.read.parquet(tempPath.toString).as[T].collect().toSeq
45 | }
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/TestCaseSupport.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.CompatibilityParty
4 |
5 | import scala.reflect.runtime.universe.TypeTag
6 |
7 | object CompatibilityParty {
8 | sealed trait CompatibilityParty
9 | case object Spark extends CompatibilityParty
10 | case object Reader extends CompatibilityParty
11 | case object Writer extends CompatibilityParty
12 |
13 | val All: Set[CompatibilityParty] = Set(Spark, Reader, Writer)
14 | }
15 |
16 | object Case {
17 |
18 | type CaseDef = Case[? <: Product]
19 |
20 | def apply[T <: Product: TypeTag: ParquetRecordDecoder: ParquetRecordEncoder: ParquetSchemaResolver](
21 | description: String,
22 | data: Seq[T],
23 | compatibilityParties: Set[CompatibilityParty] = CompatibilityParty.All
24 | ): Case[T] =
25 | new Case(
26 | description = description,
27 | compatibilityParties = compatibilityParties,
28 | _data = data,
29 | _decoder = implicitly[ParquetRecordDecoder[T]],
30 | _encoder = implicitly[ParquetRecordEncoder[T]],
31 | _resolver = implicitly[ParquetSchemaResolver[T]],
32 | _typeTag = implicitly[TypeTag[T]]
33 | )
34 | }
35 |
36 | class Case[T <: Product](
37 | val description: String,
38 | val compatibilityParties: Set[CompatibilityParty],
39 | _data: Seq[T],
40 | _decoder: ParquetRecordDecoder[T],
41 | _encoder: ParquetRecordEncoder[T],
42 | _resolver: ParquetSchemaResolver[T],
43 | _typeTag: TypeTag[T]
44 | ) {
45 | type DataType = T
46 | def data: Seq[DataType] = _data
47 | def decoder: ParquetRecordDecoder[DataType] = _decoder
48 | def encoder: ParquetRecordEncoder[DataType] = _encoder
49 | def resolver: ParquetSchemaResolver[DataType] = _resolver
50 | def typeTag: TypeTag[DataType] = _typeTag
51 | }
52 |
53 | trait TestCaseSupport {
54 |
55 | def caseDefinitions: Seq[Case.CaseDef]
56 |
57 | def cases(compatibilityParties: Set[CompatibilityParty] = CompatibilityParty.All): Seq[Case.CaseDef] =
58 | caseDefinitions.filter { caseDefinition =>
59 | compatibilityParties.forall(caseDefinition.compatibilityParties.contains)
60 | }
61 |
62 | def cases(compatibilityParty: CompatibilityParty*): Seq[Case.CaseDef] = cases(compatibilityParty.toSet)
63 |
64 | def only[T: TypeTag]: Case.CaseDef = {
65 | val targetTpe = implicitly[TypeTag[T]].tpe
66 | caseDefinitions
67 | .find(_.typeTag.tpe =:= targetTpe)
68 | .getOrElse(throw new NoSuchElementException(s"Case $targetTpe is not defined"))
69 | }
70 |
71 | }
72 |
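For illustration, a hypothetical set of case definitions in the style consumed by the specs above (the `Primitives` type and the descriptions are invented for this sketch):

```scala
package com.github.mjakubowski84.parquet4s

import com.github.mjakubowski84.parquet4s.CompatibilityParty.*

object ExampleCases extends TestCaseSupport {

  case class Primitives(b: Boolean, i: Int, s: String)

  override val caseDefinitions: Seq[Case.CaseDef] = Seq(
    // exercised by every compatibility party: Spark, Reader and Writer
    Case("data with primitive fields", Seq(Primitives(b = true, i = 1, s = "text"))),
    // restricted to Parquet4s reader/writer round-trips only
    Case(
      description = "reader and writer only data",
      data = Seq(Primitives(b = false, i = 2, s = "other")),
      compatibilityParties = Set(Reader, Writer)
    )
  )
}
```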
--------------------------------------------------------------------------------
/core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/TimeEncodingCompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives
4 | import com.github.mjakubowski84.parquet4s.TimeValueCodecs.*
5 | import org.scalatest.BeforeAndAfter
6 | import org.scalatest.freespec.AnyFreeSpecLike
7 | import org.scalatest.matchers.should.Matchers
8 |
9 | import java.sql.Date
10 | import java.time.{LocalDate, LocalDateTime}
11 | import java.util.TimeZone
12 |
13 | abstract class TimeEncodingCompatibilityItSpec
14 | extends AnyFreeSpecLike
15 | with Matchers
16 | with BeforeAndAfter
17 | with SparkHelper {
18 |
19 | private val newYearMidnight = LocalDateTime.of(2019, 1, 1, 0, 0, 0)
20 | private val newYear = Date.valueOf(LocalDate.of(2019, 1, 1))
21 | private val timeZones = List(
22 | TimeZone.getTimeZone("GMT-1"),
23 | TimeZone.getTimeZone("UTC"),
24 | TimeZone.getTimeZone("GMT+1")
25 | )
26 |
27 | protected val parquetWriter: ParquetWriter.Builder[TimePrimitives] = ParquetWriter.of[TimePrimitives]
28 |
29 | before {
30 | clearTemp()
31 | }
32 |
33 | private def writeWithSpark(data: TimePrimitives): Unit = writeToTemp(Seq(data))
34 |
35 | private def readWithSpark: TimePrimitives = readFromTemp[TimePrimitives].head
36 |
37 | private def writeWithParquet4S(data: TimePrimitives, timeZone: TimeZone): Unit =
38 | parquetWriter
39 | .options(ParquetWriter.Options(timeZone = timeZone))
40 | .writeAndClose(tempPath, Seq(data))
41 |
42 | private def readWithParquet4S(timeZone: TimeZone): TimePrimitives = {
43 | val parquetIterable =
44 | ParquetReader.as[TimePrimitives].options(ParquetReader.Options(timeZone = timeZone)).read(tempPath)
45 | try parquetIterable.head
46 | finally parquetIterable.close()
47 | }
48 |
49 | "For time zone of" -
50 | timeZones.foreach { timeZone =>
51 | val data = TimePrimitives(timestamp = localDateTimeToTimestamp(newYearMidnight, timeZone), date = newYear)
52 | timeZone.getDisplayName - {
53 | "Spark should read data written by Parquet4s" in {
54 | writeWithParquet4S(data, timeZone)
55 | readWithSpark should be(data)
56 | }
57 | "Parquet4s should read data written by Spark" in {
58 | writeWithSpark(data)
59 | readWithParquet4S(timeZone) should be(data)
60 | }
61 | }
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/TimeEncodingInt64MicrosCompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives
4 |
5 | class TimeEncodingInt64MicrosCompatibilityItSpec extends TimeEncodingCompatibilityItSpec {
6 |
7 | import TimestampFormat.Implicits.Micros.*
8 |
9 | override val outputTimestampType: OutputTimestampType = TIMESTAMP_MICROS
10 | override protected val parquetWriter: ParquetWriter.Builder[TimePrimitives] = ParquetWriter.of[TimePrimitives]
11 |
12 | }
13 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/TimeEncodingInt64MillisCompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives
4 |
5 | class TimeEncodingInt64MillisCompatibilityItSpec extends TimeEncodingCompatibilityItSpec {
6 |
7 | import TimestampFormat.Implicits.Millis.*
8 |
9 | override val outputTimestampType: OutputTimestampType = TIMESTAMP_MILLIS
10 | override protected val parquetWriter: ParquetWriter.Builder[TimePrimitives] = ParquetWriter.of[TimePrimitives]
11 |
12 | }
13 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/TimeEncodingInt64NanosCompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives
4 | import com.github.mjakubowski84.parquet4s.TimeValueCodecs.localDateTimeToTimestamp
5 | import org.scalatest.BeforeAndAfter
6 | import org.scalatest.freespec.AnyFreeSpec
7 | import org.scalatest.matchers.should.Matchers
8 |
9 | import java.sql.Date
10 | import java.time.{LocalDate, LocalDateTime}
11 | import java.util.TimeZone
12 | import TimestampFormat.Implicits.Nanos.*
13 |
14 | class TimeEncodingInt64NanosCompatibilityItSpec extends AnyFreeSpec with Matchers with BeforeAndAfter with TestUtils {
15 |
16 | private val newYearMidnight = LocalDateTime.of(2019, 1, 1, 0, 0, 0)
17 | private val newYear = Date.valueOf(LocalDate.of(2019, 1, 1))
18 | private val timeZones = List(
19 | TimeZone.getTimeZone("GMT-1"),
20 | TimeZone.getTimeZone("UTC"),
21 | TimeZone.getTimeZone("GMT+1")
22 | )
23 |
24 | before {
25 | clearTemp()
26 | }
27 |
28 | private def writeWithParquet4S(data: TimePrimitives, timeZone: TimeZone): Unit =
29 | ParquetWriter
30 | .of[TimePrimitives]
31 | .options(ParquetWriter.Options(timeZone = timeZone))
32 | .writeAndClose(tempPath, Seq(data))
33 |
34 | private def readWithParquet4S(timeZone: TimeZone): TimePrimitives = {
35 | val parquetIterable =
36 | ParquetReader.as[TimePrimitives].options(ParquetReader.Options(timeZone = timeZone)).read(tempPath)
37 | try parquetIterable.head
38 | finally parquetIterable.close()
39 | }
40 |
41 | "Parquet4s should read data written with time zone of" -
42 | timeZones.foreach { timeZone =>
43 | val data = TimePrimitives(timestamp = localDateTimeToTimestamp(newYearMidnight, timeZone), date = newYear)
44 | timeZone.getDisplayName in {
45 | writeWithParquet4S(data, timeZone)
46 | readWithParquet4S(timeZone) should be(data)
47 | }
48 | }
49 |
50 | }
51 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/TimeEncodingInt96CompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | class TimeEncodingInt96MillisCompatibilityItSpec extends TimeEncodingCompatibilityItSpec {
4 |
5 | override val outputTimestampType: OutputTimestampType = Int96
6 |
7 | }
8 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/ParquetWriterAndSparkCompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.Case.CaseDef
4 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.*
5 | import org.scalatest.BeforeAndAfter
6 | import org.scalatest.freespec.AnyFreeSpec
7 | import org.scalatest.matchers.should.Matchers
8 |
9 | class ParquetWriterAndSparkCompatibilityItSpec extends AnyFreeSpec with Matchers with BeforeAndAfter with SparkHelper {
10 |
11 | before {
12 | clearTemp()
13 | }
14 |
15 | private def runTestCase(testCase: CaseDef): Unit =
16 | testCase.description in {
17 | ParquetWriter.of[testCase.DataType](testCase.encoder, testCase.resolver).writeAndClose(tempPath, testCase.data)
18 | readFromTemp(testCase.typeTag) should contain theSameElementsAs testCase.data
19 | }
20 |
21 | "Spark should be able to read file saved by ParquetWriter if the file contains" -
22 | CompatibilityTestCases.cases(Writer, Spark).foreach(runTestCase)
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/SparkAndParquetReaderCompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.*
4 | import org.scalatest.BeforeAndAfter
5 | import org.scalatest.freespec.AnyFreeSpec
6 | import org.scalatest.matchers.should.Matchers
7 |
8 | class SparkAndParquetReaderCompatibilityItSpec extends AnyFreeSpec with Matchers with BeforeAndAfter with SparkHelper {
9 |
10 | before {
11 | clearTemp()
12 | }
13 |
14 | private def runTestCase(testCase: Case.CaseDef): Unit =
15 | testCase.description in {
16 | writeToTemp(testCase.data)(testCase.typeTag)
17 | val parquetIterable = ParquetReader.as[testCase.DataType].read(tempPath)(testCase.decoder)
18 | try parquetIterable should contain theSameElementsAs testCase.data
19 | finally parquetIterable.close()
20 | }
21 |
22 | "ParquetReader should be able to read file saved by Spark if the file contains" -
23 | CompatibilityTestCases.cases(Spark, Reader).foreach(runTestCase)
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/SparkHelper.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.scalatest.{BeforeAndAfterAll, Suite}
5 |
6 | import scala.reflect.runtime.universe.TypeTag
7 |
8 | trait SparkHelper extends BeforeAndAfterAll with TestUtils {
9 |
10 | this: Suite =>
11 |
12 | sealed trait OutputTimestampType
13 | case object Int96 extends OutputTimestampType
14 | case object TIMESTAMP_MICROS extends OutputTimestampType
15 | case object TIMESTAMP_MILLIS extends OutputTimestampType
16 |
17 | private var sparkStarted = false
18 |
19 | protected def outputTimestampType: OutputTimestampType = Int96
20 |
21 | lazy val sparkSession: SparkSession = {
22 | sparkStarted = true
23 | SparkSession
24 | .builder()
25 | .master("local[2]")
26 | .appName(getClass.getSimpleName)
27 | .config("spark.sql.parquet.outputTimestampType", outputTimestampType.toString)
28 | .config("spark.sql.session.timeZone", "UTC")
29 | .getOrCreate()
30 | }
31 |
32 | override def afterAll(): Unit = {
33 | super.afterAll()
34 | if (sparkStarted) sparkSession.stop()
35 | }
36 |
37 | def writeToTemp[T <: Product: TypeTag](data: Seq[T]): Unit = {
38 | import sparkSession.implicits.*
39 | data.toDS().write.parquet(tempPath.toString)
40 | }
41 |
42 | def readFromTemp[T <: Product: TypeTag]: Seq[T] = {
43 | import sparkSession.implicits.*
44 | sparkSession.read.parquet(tempPath.toString).as[T].collect().toSeq
45 | }
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/TestCaseSupport.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.CompatibilityParty
4 |
5 | import scala.reflect.runtime.universe.TypeTag
6 |
7 | object CompatibilityParty {
8 | sealed trait CompatibilityParty
9 | case object Spark extends CompatibilityParty
10 | case object Reader extends CompatibilityParty
11 | case object Writer extends CompatibilityParty
12 |
13 | val All: Set[CompatibilityParty] = Set(Spark, Reader, Writer)
14 | }
15 |
16 | object Case {
17 |
18 | type CaseDef = Case[? <: Product]
19 |
20 | def apply[T <: Product: TypeTag: ParquetRecordDecoder: ParquetRecordEncoder: ParquetSchemaResolver](
21 | description: String,
22 | data: Seq[T],
23 | compatibilityParties: Set[CompatibilityParty] = CompatibilityParty.All
24 | ): Case[T] =
25 | new Case(
26 | description = description,
27 | compatibilityParties = compatibilityParties,
28 | _data = data,
29 | _decoder = implicitly[ParquetRecordDecoder[T]],
30 | _encoder = implicitly[ParquetRecordEncoder[T]],
31 | _resolver = implicitly[ParquetSchemaResolver[T]],
32 | _typeTag = implicitly[TypeTag[T]]
33 | )
34 | }
35 |
36 | class Case[T <: Product](
37 | val description: String,
38 | val compatibilityParties: Set[CompatibilityParty],
39 | _data: Seq[T],
40 | _decoder: ParquetRecordDecoder[T],
41 | _encoder: ParquetRecordEncoder[T],
42 | _resolver: ParquetSchemaResolver[T],
43 | _typeTag: TypeTag[T]
44 | ) {
45 | type DataType = T
46 | def data: Seq[DataType] = _data
47 | def decoder: ParquetRecordDecoder[DataType] = _decoder
48 | def encoder: ParquetRecordEncoder[DataType] = _encoder
49 | def resolver: ParquetSchemaResolver[DataType] = _resolver
50 | def typeTag: TypeTag[DataType] = _typeTag
51 | }
52 |
53 | trait TestCaseSupport {
54 |
55 | def caseDefinitions: Seq[Case.CaseDef]
56 |
57 | def cases(compatibilityParties: Set[CompatibilityParty] = CompatibilityParty.All): Seq[Case.CaseDef] =
58 | caseDefinitions.filter { caseDefinition =>
59 | compatibilityParties.forall(caseDefinition.compatibilityParties.contains)
60 | }
61 |
62 | def cases(compatibilityParty: CompatibilityParty*): Seq[Case.CaseDef] = cases(compatibilityParty.toSet)
63 |
64 | def only[T: TypeTag]: Case.CaseDef = {
65 | val targetTpe = implicitly[TypeTag[T]].tpe
66 | caseDefinitions
67 | .find(_.typeTag.tpe =:= targetTpe)
68 | .getOrElse(throw new NoSuchElementException(s"Case $targetTpe is not defined"))
69 | }
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/TimeEncodingCompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives
4 | import com.github.mjakubowski84.parquet4s.TimeValueCodecs.*
5 | import org.scalatest.BeforeAndAfter
6 | import org.scalatest.freespec.AnyFreeSpecLike
7 | import org.scalatest.matchers.should.Matchers
8 |
9 | import java.sql.Date
10 | import java.time.{LocalDate, LocalDateTime}
11 | import java.util.TimeZone
12 |
13 | abstract class TimeEncodingCompatibilityItSpec
14 | extends AnyFreeSpecLike
15 | with Matchers
16 | with BeforeAndAfter
17 | with SparkHelper {
18 |
19 | private val newYearMidnight = LocalDateTime.of(2019, 1, 1, 0, 0, 0)
20 | private val newYear = Date.valueOf(LocalDate.of(2019, 1, 1))
21 | private val timeZones = List(
22 | TimeZone.getTimeZone("GMT-1"),
23 | TimeZone.getTimeZone("UTC"),
24 | TimeZone.getTimeZone("GMT+1")
25 | )
26 |
27 | protected val parquetWriter: ParquetWriter.Builder[TimePrimitives] = ParquetWriter.of[TimePrimitives]
28 |
29 | before {
30 | clearTemp()
31 | }
32 |
33 | private def writeWithSpark(data: TimePrimitives): Unit = writeToTemp(Seq(data))
34 |
35 | private def readWithSpark: TimePrimitives = readFromTemp[TimePrimitives].head
36 |
37 | private def writeWithParquet4S(data: TimePrimitives, timeZone: TimeZone): Unit =
38 | parquetWriter
39 | .options(ParquetWriter.Options(timeZone = timeZone))
40 | .writeAndClose(tempPath, Seq(data))
41 |
42 | private def readWithParquet4S(timeZone: TimeZone): TimePrimitives = {
43 | val parquetIterable =
44 | ParquetReader.as[TimePrimitives].options(ParquetReader.Options(timeZone = timeZone)).read(tempPath)
45 | try parquetIterable.head
46 | finally parquetIterable.close()
47 | }
48 |
49 | "For time zone of" -
50 | timeZones.foreach { timeZone =>
51 | val data = TimePrimitives(timestamp = localDateTimeToTimestamp(newYearMidnight, timeZone), date = newYear)
52 | timeZone.getDisplayName - {
53 | "Spark should read data written by Parquet4s" in {
54 | writeWithParquet4S(data, timeZone)
55 | readWithSpark should be(data)
56 | }
57 | "Parquet4s should read data written by Spark" in {
58 | writeWithSpark(data)
59 | readWithParquet4S(timeZone) should be(data)
60 | }
61 | }
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/TimeEncodingInt64MicrosCompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives
4 |
5 | class TimeEncodingInt64MicrosCompatibilityItSpec extends TimeEncodingCompatibilityItSpec {
6 |
7 | import TimestampFormat.Implicits.Micros.*
8 |
9 | override val outputTimestampType: OutputTimestampType = TIMESTAMP_MICROS
10 | override protected val parquetWriter: ParquetWriter.Builder[TimePrimitives] = ParquetWriter.of[TimePrimitives]
11 |
12 | }
13 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/TimeEncodingInt64MillisCompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives
4 |
5 | class TimeEncodingInt64MillisCompatibilityItSpec extends TimeEncodingCompatibilityItSpec {
6 |
7 | import TimestampFormat.Implicits.Millis.*
8 |
9 | override val outputTimestampType: OutputTimestampType = TIMESTAMP_MILLIS
10 | override protected val parquetWriter: ParquetWriter.Builder[TimePrimitives] = ParquetWriter.of[TimePrimitives]
11 |
12 | }
13 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/TimeEncodingInt64NanosCompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives
4 | import com.github.mjakubowski84.parquet4s.TimeValueCodecs.localDateTimeToTimestamp
5 | import org.scalatest.BeforeAndAfter
6 | import org.scalatest.freespec.AnyFreeSpec
7 | import org.scalatest.matchers.should.Matchers
8 |
9 | import java.sql.Date
10 | import java.time.{LocalDate, LocalDateTime}
11 | import java.util.TimeZone
12 | import TimestampFormat.Implicits.Nanos.*
13 |
14 | class TimeEncodingInt64NanosCompatibilityItSpec extends AnyFreeSpec with Matchers with BeforeAndAfter with TestUtils {
15 |
16 | private val newYearMidnight = LocalDateTime.of(2019, 1, 1, 0, 0, 0)
17 | private val newYear = Date.valueOf(LocalDate.of(2019, 1, 1))
18 | private val timeZones = List(
19 | TimeZone.getTimeZone("GMT-1"),
20 | TimeZone.getTimeZone("UTC"),
21 | TimeZone.getTimeZone("GMT+1")
22 | )
23 |
24 | before {
25 | clearTemp()
26 | }
27 |
28 | private def writeWithParquet4S(data: TimePrimitives, timeZone: TimeZone): Unit =
29 | ParquetWriter
30 | .of[TimePrimitives]
31 | .options(ParquetWriter.Options(timeZone = timeZone))
32 | .writeAndClose(tempPath, Seq(data))
33 |
34 | private def readWithParquet4S(timeZone: TimeZone): TimePrimitives = {
35 | val parquetIterable =
36 | ParquetReader.as[TimePrimitives].options(ParquetReader.Options(timeZone = timeZone)).read(tempPath)
37 | try parquetIterable.head
38 | finally parquetIterable.close()
39 | }
40 |
41 | "Parquet4s should read data written with time zone of" -
42 | timeZones.foreach { timeZone =>
43 | val data = TimePrimitives(timestamp = localDateTimeToTimestamp(newYearMidnight, timeZone), date = newYear)
44 | timeZone.getDisplayName in {
45 | writeWithParquet4S(data, timeZone)
46 | readWithParquet4S(timeZone) should be(data)
47 | }
48 | }
49 |
50 | }
51 |
--------------------------------------------------------------------------------
/core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/TimeEncodingInt96CompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | class TimeEncodingInt96MillisCompatibilityItSpec extends TimeEncodingCompatibilityItSpec {
4 |
5 | override val outputTimestampType: OutputTimestampType = Int96
6 |
7 | }
8 |
--------------------------------------------------------------------------------
/core/src/it/scala-3/com/github/mjakubowski84/parquet4s/TestCaseSupport.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.CompatibilityParty
4 |
5 | object CompatibilityParty {
6 | sealed trait CompatibilityParty
7 | case object Spark extends CompatibilityParty
8 | case object Reader extends CompatibilityParty
9 | case object Writer extends CompatibilityParty
10 |
11 | val All: Set[CompatibilityParty] = Set(Spark, Reader, Writer)
12 | }
13 |
14 | object Case {
15 |
16 | type CaseDef = Case[? <: Product]
17 |
18 | def apply[T <: Product: ParquetRecordDecoder: ParquetRecordEncoder: ParquetSchemaResolver](
19 | description: String,
20 | data: Seq[T],
21 | compatibilityParties: Set[CompatibilityParty] = CompatibilityParty.All
22 | ): Case[T] =
23 | new Case(
24 | description = description,
25 | compatibilityParties = compatibilityParties,
26 | _data = data,
27 | _decoder = implicitly[ParquetRecordDecoder[T]],
28 | _encoder = implicitly[ParquetRecordEncoder[T]],
29 | _resolver = implicitly[ParquetSchemaResolver[T]]
30 | )
31 | }
32 |
33 | class Case[T <: Product](
34 | val description: String,
35 | val compatibilityParties: Set[CompatibilityParty],
36 | _data: Seq[T],
37 | _decoder: ParquetRecordDecoder[T],
38 | _encoder: ParquetRecordEncoder[T],
39 | _resolver: ParquetSchemaResolver[T]
40 | ) {
41 | opaque type DataType = T
42 | def data: Seq[DataType] = _data
43 | def decoder: ParquetRecordDecoder[DataType] = _decoder
44 | def encoder: ParquetRecordEncoder[DataType] = _encoder
45 | def resolver: ParquetSchemaResolver[DataType] = _resolver
46 | }
47 |
48 | trait TestCaseSupport {
49 |
50 | def caseDefinitions: Seq[Case.CaseDef]
51 |
52 | def cases(compatibilityParties: Set[CompatibilityParty] = CompatibilityParty.All): Seq[Case.CaseDef] =
53 | caseDefinitions.filter { caseDefinition =>
54 | compatibilityParties.forall(caseDefinition.compatibilityParties.contains)
55 | }
56 |
57 | def cases(compatibilityParty: CompatibilityParty*): Seq[Case.CaseDef] = cases(compatibilityParty.toSet)
58 |
59 | }
60 |
--------------------------------------------------------------------------------
/core/src/it/scala/com/github/mjakubowski84/parquet4s/ParquetWriterAndParquetReaderCompatibilityItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.Case.CaseDef
4 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.*
5 | import org.scalatest.matchers.should.Matchers
6 | import org.scalatest.BeforeAndAfter
7 | import org.scalatest.freespec.AnyFreeSpec
8 |
9 | class ParquetWriterAndParquetReaderCompatibilityItSpec
10 | extends AnyFreeSpec
11 | with Matchers
12 | with BeforeAndAfter
13 | with TestUtils {
14 |
15 | before {
16 | clearTemp()
17 | }
18 |
19 | private def runTestCase(testCase: CaseDef): Unit =
20 | testCase.description in {
21 | ParquetWriter
22 | .of(testCase.encoder, testCase.resolver)
23 | .writeAndClose(tempPath, testCase.data)
24 | val parquetIterable = ParquetReader.as[testCase.DataType].read(tempPath)(testCase.decoder)
25 | try parquetIterable should contain theSameElementsAs testCase.data
26 | finally parquetIterable.close()
27 | }
28 |
29 | "Spark should be able to read file saved by ParquetWriter if the file contains" -
30 | CompatibilityTestCases.cases(Writer, Reader).foreach(runTestCase)
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/core/src/it/scala/com/github/mjakubowski84/parquet4s/RecordFilterItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.scalatest.flatspec.AnyFlatSpec
4 | import org.scalatest.matchers.should.Matchers
5 | import scala.util.Using
6 |
7 | class RecordFilterSpec extends AnyFlatSpec with Matchers {
8 |
9 | private case class Data(i: Int)
10 |
11 | private val data = (0 to 10).map(Data(_))
12 |
13 | "RecordFilter" should "filter data by record index" in {
14 | val outFile = InMemoryOutputFile(initBufferSize = 1024)
15 | ParquetWriter.of[Data].writeAndClose(outFile, data)
16 | val inFile = outFile.toInputFile
17 | Using.resource(ParquetReader.as[Data].filter(RecordFilter(i => i >= 1 && i < 10)).read(inFile)) { iterable =>
18 | val result = iterable.toVector
19 | result should have size 9
20 | result.head should be(Data(1))
21 | result.last should be(Data(9))
22 | }
23 | }
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/core/src/it/scala/com/github/mjakubowski84/parquet4s/TestUtils.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.apache.hadoop.conf.Configuration
4 | import org.apache.hadoop.fs.{FileSystem, FileUtil}
5 |
6 | import java.io.File
7 | import java.nio.file.Files
8 |
9 | trait TestUtils {
10 |
11 | protected val tempPath: Path = Path(Path(Files.createTempDirectory("example")), "testOutputPath")
12 | protected lazy val configuration = new Configuration()
13 | protected lazy val fileSystem: FileSystem = tempPath.toHadoop.getFileSystem(configuration)
14 | private val tempDir = new File(tempPath.toUri)
15 |
16 | def clearTemp(): Unit = {
17 | FileUtil.fullyDelete(tempDir)
18 | ()
19 | }
20 |
21 | }
22 |
--------------------------------------------------------------------------------
/core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/ParquetRecordDecoder.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import shapeless.labelled.{FieldType, field}
4 | import shapeless.{::, HList, HNil, LabelledGeneric, Lazy, Witness}
5 |
6 | import scala.annotation.{implicitNotFound, unused}
7 | import scala.util.control.NonFatal
8 |
9 | /** Type class that allows decoding instances of [[RowParquetRecord]]
10 | * @tparam T
11 | * represents schema of [[RowParquetRecord]]
12 | */
13 | @implicitNotFound(
14 | "ParquetRecordDecoder. Cannot read data of type ${T}. " +
15 | "Please check if there is implicit ValueDecoder available for each field and subfield of ${T}."
16 | )
17 | trait ParquetRecordDecoder[T] extends MetadataReader {
18 |
19 | /** @param record
20 | * to be decoded to instance of given type
21 | * @param configuration
22 | * [[ValueCodecConfiguration]] used by some codecs
23 | * @return
24 | * instance of product type decoded from record
25 | */
26 | def decode(record: RowParquetRecord, configuration: ValueCodecConfiguration): T
27 |
28 | def setMetadata(@unused metadata: collection.Map[String, String]): Unit = {}
29 | }
30 |
31 | object ParquetRecordDecoder {
32 |
33 | object DecodingException {
34 | def apply(msg: String, cause: Throwable): DecodingException = {
35 | val decodingException = DecodingException(msg)
36 | decodingException.initCause(cause)
37 | decodingException
38 | }
39 | }
40 |
41 | case class DecodingException(msg: String) extends RuntimeException(msg)
42 |
43 | def apply[T](implicit ev: ParquetRecordDecoder[T]): ParquetRecordDecoder[T] = ev
44 |
45 | def decode[T](record: RowParquetRecord, configuration: ValueCodecConfiguration = ValueCodecConfiguration.Default)(
46 | implicit ev: ParquetRecordDecoder[T]
47 | ): T = ev.decode(record, configuration)
48 |
49 | implicit val nilDecoder: ParquetRecordDecoder[HNil] = (_, _) => HNil
50 |
51 | implicit def headValueDecoder[FieldName <: Symbol, Head, Tail <: HList](implicit
52 | witness: Witness.Aux[FieldName],
53 | headDecoder: ValueDecoder[Head],
54 | tailDecoder: ParquetRecordDecoder[Tail]
55 | ): ParquetRecordDecoder[FieldType[FieldName, Head] :: Tail] =
56 | (record: RowParquetRecord, configuration: ValueCodecConfiguration) => {
57 | val fieldName = witness.value.name
58 | val decodedFieldOpt =
59 | try record.get[Head](fieldName, configuration)
60 | catch {
61 | case NonFatal(cause) =>
62 | throw DecodingException(s"Failed to decode field $fieldName of record: $record", cause)
63 | }
64 | decodedFieldOpt match {
65 | case Some(decodedFieldValue) =>
66 | field[FieldName](decodedFieldValue) :: tailDecoder.decode(record, configuration)
67 | case None =>
68 | throw DecodingException(s"Missing required field $fieldName in a record: $record")
69 | }
70 | }
71 |
72 | implicit def genericDecoder[A, R](implicit
73 | gen: LabelledGeneric.Aux[A, R],
74 | decoder: Lazy[ParquetRecordDecoder[R]]
75 | ): ParquetRecordDecoder[A] =
76 | (record: RowParquetRecord, configuration: ValueCodecConfiguration) =>
77 | gen.from(decoder.value.decode(record, configuration))
78 |
79 | }
80 |
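The companion object's apply and decode methods are the public entry points to the derivation above. A minimal sketch of their use, assuming a hypothetical case class User and a RowParquetRecord already obtained from a reader:

  case class User(id: Long, name: String)

  // Summons the decoder derived via LabelledGeneric for the product type.
  val userDecoder: ParquetRecordDecoder[User] = ParquetRecordDecoder[User]

  // Decodes a single record using ValueCodecConfiguration.Default.
  def toUser(record: RowParquetRecord): User = ParquetRecordDecoder.decode[User](record)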
--------------------------------------------------------------------------------
/core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/ProductDecoders.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import shapeless.LowPriority
4 |
5 | import scala.annotation.nowarn
6 |
7 | trait ProductDecoders {
8 |
9 | implicit def productDecoder[T](implicit
10 | @nowarn ev: LowPriority,
11 | decoder: ParquetRecordDecoder[T]
12 | ): OptionalValueDecoder[T] =
13 | (value, configuration) =>
14 | value match {
15 | case record: RowParquetRecord => decoder.decode(record, configuration)
16 | }
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/ProductEncoders.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import shapeless.LowPriority
4 |
5 | import scala.annotation.nowarn
6 |
7 | trait ProductEncoders {
8 | implicit def productEncoder[T](implicit
9 | @nowarn ev: LowPriority,
10 | encoder: ParquetRecordEncoder[T]
11 | ): OptionalValueEncoder[T] =
12 | (data, configuration) => encoder.encode(data, null, configuration)
13 | }
14 |
--------------------------------------------------------------------------------
/core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/ProductSchemaDefs.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import shapeless.LowPriority
4 |
5 | import scala.annotation.nowarn
6 |
7 | trait ProductSchemaDefs {
8 | implicit def productSchema[T](implicit
9 | @nowarn ev: LowPriority,
10 | parquetSchemaResolver: ParquetSchemaResolver[T]
11 | ): TypedSchemaDef[T] =
12 | SchemaDef.group(parquetSchemaResolver.resolveSchema(Cursor.simple)*).withMetadata(SchemaDef.Meta.Generated).typed[T]
13 |
14 | }
15 |
--------------------------------------------------------------------------------
/core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/compat/CursorCompat.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.compat
2 |
3 | import com.github.mjakubowski84.parquet4s.Cursor
4 | import shapeless.Witness
5 |
6 | trait CursorCompat {
7 |
8 | this: Cursor =>
9 |
10 | /** @tparam FieldName
11 | * symbol of the field to which the cursor shall advance
12 | * @return
13 | * a new cursor, or None if advancing to the given field is disallowed
14 | */
15 | def advance[FieldName <: Symbol: Witness.Aux]: Option[Cursor] =
16 | advanceByFieldName(implicitly[Witness.Aux[FieldName]].value.name)
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/compat/IteratorCompat.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.compat
2 |
3 | import scala.collection.AbstractIterator
4 |
5 | object IteratorCompat {
6 |
7 | private class UnfoldIterator[A, S](init: S, f: S => Option[(A, S)]) extends AbstractIterator[A] {
8 | private var state: S = init
9 | private var nextValue: Option[(A, S)] = null
10 |
11 | override def hasNext: Boolean = {
12 | if (nextValue == null) {
13 | nextValue = f(state)
14 | }
15 | nextValue.isDefined
16 | }
17 |
18 | override def next(): A =
19 | if (hasNext) {
20 | val (out, nextState) = nextValue.get
21 | state = nextState
22 | nextValue = null
23 | out
24 | } else {
25 | Iterator.empty.next()
26 | }
27 |
28 | }
29 |
30 | @inline
31 | def unfold[A, S](init: S)(f: S => Option[(A, S)]): Iterator[A] = new UnfoldIterator[A, S](init, f)
32 | }
33 |
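A quick illustration of the semantics of IteratorCompat.unfold, which backports Iterator.unfold to Scala 2.12 (the 2.13 and Scala 3 sources further below delegate to the standard library directly):

  import com.github.mjakubowski84.parquet4s.compat.IteratorCompat

  // The state function is applied repeatedly until it returns None;
  // each Some((element, nextState)) emits one element.
  val iterator = IteratorCompat.unfold(0)(s => if (s < 3) Some((s, s + 1)) else None)

  iterator.toList // List(0, 1, 2)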
--------------------------------------------------------------------------------
/core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/compat/MapCompat.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.compat
2 |
3 | import com.github.mjakubowski84.parquet4s.{MapParquetRecord, Value}
4 |
5 | object MapCompat {
6 | @inline
7 | def remove[K, V](map: Map[K, V], key: K): Map[K, V] = map - key
8 | }
9 |
10 | trait MapCompat {
11 |
12 | this: MapParquetRecord =>
13 |
14 | /** Removes a single entry from this map.
15 | *
16 | * @param key
17 | * the key of the entry to remove.
18 | * @return
19 | * a new [[MapParquetRecord]] with the entry removed
20 | */
21 | override def -(key: Value): MapParquetRecord =
22 | new MapParquetRecord(MapCompat.remove(entries, key))
23 |
24 | /** Adds a single entry to this map.
25 | *
26 | * @param entry
27 | * the element to add
28 | * @return
29 | * map of inner entries with the new entry added
30 | */
31 | override def +[V1 >: Value](entry: (Value, V1)): Map[Value, V1] =
32 | entries + entry
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/ParquetRecordDecoder.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import shapeless.labelled.{FieldType, field}
4 | import shapeless.{::, HList, HNil, LabelledGeneric, Lazy, Witness}
5 |
6 | import scala.annotation.{implicitNotFound, unused}
7 | import scala.util.control.NonFatal
8 |
9 | /** Type class that allows decoding instances of [[RowParquetRecord]]
10 | * @tparam T
11 | * represents schema of [[RowParquetRecord]]
12 | */
13 | @implicitNotFound(
14 | "ParquetRecordDecoder. Cannot read data of type ${T}. " +
15 | "Please check if there is implicit ValueDecoder available for each field and subfield of ${T}."
16 | )
17 | trait ParquetRecordDecoder[T] extends MetadataReader {
18 |
19 | /** @param record
20 | * to be decoded to instance of given type
21 | * @param configuration
22 | * [[ValueCodecConfiguration]] used by some codecs
23 | * @return
24 | * instance of product type decoded from record
25 | */
26 | def decode(record: RowParquetRecord, configuration: ValueCodecConfiguration): T
27 |
28 | override def setMetadata(@unused metadata: collection.Map[String, String]): Unit = {}
29 | }
30 |
31 | object ParquetRecordDecoder {
32 |
33 | object DecodingException {
34 | def apply(msg: String, cause: Throwable): DecodingException = {
35 | val decodingException = DecodingException(msg)
36 | decodingException.initCause(cause)
37 | decodingException
38 | }
39 | }
40 |
41 | case class DecodingException(msg: String) extends RuntimeException(msg)
42 |
43 | def apply[T](implicit ev: ParquetRecordDecoder[T]): ParquetRecordDecoder[T] = ev
44 |
45 | def decode[T](record: RowParquetRecord, configuration: ValueCodecConfiguration = ValueCodecConfiguration.Default)(
46 | implicit ev: ParquetRecordDecoder[T]
47 | ): T = ev.decode(record, configuration)
48 |
49 | implicit val nilDecoder: ParquetRecordDecoder[HNil] = (_, _) => HNil
50 |
51 | implicit def headValueDecoder[FieldName <: Symbol, Head, Tail <: HList](implicit
52 | witness: Witness.Aux[FieldName],
53 | headDecoder: ValueDecoder[Head],
54 | tailDecoder: ParquetRecordDecoder[Tail]
55 | ): ParquetRecordDecoder[FieldType[FieldName, Head] :: Tail] =
56 | (record: RowParquetRecord, configuration: ValueCodecConfiguration) => {
57 | val fieldName = witness.value.name
58 | val decodedFieldOpt =
59 | try record.get[Head](fieldName, configuration)
60 | catch {
61 | case NonFatal(cause) =>
62 | throw DecodingException(s"Failed to decode field $fieldName of record: $record", cause)
63 | }
64 | decodedFieldOpt match {
65 | case Some(decodedFieldValue) =>
66 | field[FieldName](decodedFieldValue) :: tailDecoder.decode(record, configuration)
67 | case None =>
68 | throw DecodingException(s"Missing required field $fieldName in a record: $record")
69 | }
70 | }
71 |
72 | implicit def genericDecoder[A, R](implicit
73 | gen: LabelledGeneric.Aux[A, R],
74 | decoder: Lazy[ParquetRecordDecoder[R]]
75 | ): ParquetRecordDecoder[A] =
76 | (record: RowParquetRecord, configuration: ValueCodecConfiguration) =>
77 | gen.from(decoder.value.decode(record, configuration))
78 |
79 | }
80 |
--------------------------------------------------------------------------------
/core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/ProductDecoders.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import shapeless.LowPriority
4 |
5 | import scala.annotation.nowarn
6 |
7 | trait ProductDecoders {
8 |
9 | implicit def productDecoder[T](implicit
10 | @nowarn
11 | ev: LowPriority,
12 | decoder: ParquetRecordDecoder[T]
13 | ): OptionalValueDecoder[T] =
14 | (value, configuration) =>
15 | value match {
16 | case record: RowParquetRecord => decoder.decode(record, configuration)
17 | }
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/ProductEncoders.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import shapeless.LowPriority
4 |
5 | import scala.annotation.nowarn
6 |
7 | trait ProductEncoders {
8 | implicit def productEncoder[T](implicit
9 | @nowarn ev: LowPriority,
10 | encoder: ParquetRecordEncoder[T]
11 | ): OptionalValueEncoder[T] =
12 | (data, configuration) => encoder.encode(data, null, configuration)
13 | }
14 |
--------------------------------------------------------------------------------
/core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/ProductSchemaDefs.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import shapeless.LowPriority
4 |
5 | import scala.annotation.nowarn
6 |
7 | trait ProductSchemaDefs {
8 | implicit def productSchema[T](implicit
9 | @nowarn ev: LowPriority,
10 | parquetSchemaResolver: ParquetSchemaResolver[T]
11 | ): TypedSchemaDef[T] =
12 | SchemaDef
13 | .group(parquetSchemaResolver.resolveSchema(Cursor.simple)*)
14 | .withMetadata(SchemaDef.Meta.Generated)
15 | .typed[T]
16 |
17 | }
18 |
--------------------------------------------------------------------------------
/core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/compat/CursorCompat.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.compat
2 |
3 | import com.github.mjakubowski84.parquet4s.Cursor
4 | import shapeless.Witness
5 |
6 | trait CursorCompat {
7 |
8 | this: Cursor =>
9 |
10 | /** @tparam FieldName
11 | * symbol of the field to which the cursor shall advance
12 | * @return
13 | * a new cursor, or None if advancing to the given field is disallowed
14 | */
15 | def advance[FieldName <: Symbol: Witness.Aux]: Option[Cursor] =
16 | advanceByFieldName(implicitly[Witness.Aux[FieldName]].value.name)
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/compat/IteratorCompat.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.compat
2 |
3 | object IteratorCompat {
4 |
5 | @inline
6 | def unfold[A, S](init: S)(f: S => Option[(A, S)]): Iterator[A] = Iterator.unfold[A, S](init)(f)
7 |
8 | }
9 |
--------------------------------------------------------------------------------
/core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/compat/MapCompat.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.compat
2 |
3 | import com.github.mjakubowski84.parquet4s.{MapParquetRecord, Value}
4 |
5 | object MapCompat {
6 | @inline
7 | def remove[K, V](map: Map[K, V], key: K): Map[K, V] = map.removed(key)
8 | }
9 |
10 | trait MapCompat {
11 |
12 | this: MapParquetRecord =>
13 |
14 | /** Removes a single entry from this map.
15 | *
16 | * @param key
17 | * key of the element to remove
18 | * @return
19 | * map of inner entries with the element removed
20 | */
21 | override def removed(key: Value): MapParquetRecord =
22 | new MapParquetRecord(MapCompat.remove(entries, key))
23 |
24 | /** Adds a single entry to this map.
25 | *
26 | * @param key
27 | * key of the entry to add
28 | * @param value
29 | * value of the entry to add
30 | * @return
31 | * map of inner entries updated with the new entry added
32 | */
33 | override def updated[V1 >: Value](key: Value, value: V1): Map[Value, V1] =
34 | entries.updated(key, value)
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/core/src/main/scala-3/com/github/mjakubowski84/parquet4s/ProductDecoders.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import scala.util.NotGiven
4 |
5 | trait ProductDecoders:
6 |
7 | given productDecoder[T](using
8 | ev: NotGiven[ValueDecoder[T]],
9 | decoder: ParquetRecordDecoder[T]
10 | ): OptionalValueDecoder[T] =
11 | (value, configuration) =>
12 | value match
13 | case record: RowParquetRecord => decoder.decode(record, configuration)
14 |
15 | end ProductDecoders
16 |
--------------------------------------------------------------------------------
/core/src/main/scala-3/com/github/mjakubowski84/parquet4s/ProductEncoders.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import scala.util.NotGiven
4 |
5 | trait ProductEncoders:
6 |
7 | given productEncoder[T](using
8 | ev: NotGiven[ValueEncoder[T]],
9 | encoder: ParquetRecordEncoder[T]
10 | ): OptionalValueEncoder[T] =
11 | (data, configuration) => encoder.encode(data, null, configuration)
12 |
13 | end ProductEncoders
14 |
--------------------------------------------------------------------------------
/core/src/main/scala-3/com/github/mjakubowski84/parquet4s/ProductSchemaDefs.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import scala.util.NotGiven
4 |
5 | trait ProductSchemaDefs:
6 |
7 | given productSchema[T <: Product: ParquetSchemaResolver](using
8 | NotGiven[TypedSchemaDef[T]]
9 | ): TypedSchemaDef[T] =
10 | SchemaDef
11 | .group(summon[ParquetSchemaResolver[T]].resolveSchema(Cursor.simple)*)
12 | .withMetadata(SchemaDef.Meta.Generated)
13 | .typed[T]
14 |
15 | end ProductSchemaDefs
16 |
--------------------------------------------------------------------------------
/core/src/main/scala-3/com/github/mjakubowski84/parquet4s/compat/CursorCompat.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.compat
2 |
3 | import com.github.mjakubowski84.parquet4s.Cursor
4 |
5 | trait CursorCompat:
6 |
7 | this: Cursor =>
8 |
9 | /** @tparam FieldName
10 | * String & Singleton of the field that cursor shall advance
11 | * @return
12 | * a new cursor or None if advance to given field is disallowed
13 | */
14 | def advance[FieldName <: String & Singleton: ValueOf]: Option[Cursor] =
15 | advanceByFieldName(summon[ValueOf[FieldName]].value)
16 |
17 | end CursorCompat
18 |
--------------------------------------------------------------------------------
/core/src/main/scala-3/com/github/mjakubowski84/parquet4s/compat/IteratorCompat.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.compat
2 |
3 | object IteratorCompat:
4 | inline def unfold[A, S](init: S)(f: S => Option[(A, S)]): Iterator[A] = Iterator.unfold[A, S](init)(f)
5 |
--------------------------------------------------------------------------------
/core/src/main/scala-3/com/github/mjakubowski84/parquet4s/compat/MapCompat.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.compat
2 |
3 | import com.github.mjakubowski84.parquet4s.{MapParquetRecord, Value}
4 |
5 | object MapCompat:
6 | inline def remove[K, V](map: Map[K, V], key: K): Map[K, V] = map.removed(key)
7 |
8 | trait MapCompat:
9 |
10 | this: MapParquetRecord =>
11 |
12 | /** Removes a single entry from this map.
13 | *
14 | * @param key
15 | * key of the element to remove
16 | * @return
17 | * map of inner entries with the element removed
18 | */
19 | override def removed(key: Value): MapParquetRecord =
20 | new MapParquetRecord(MapCompat.remove(entries, key))
21 |
22 | /** Adds a single entry to this map.
23 | *
24 | * @param key
25 | * key of the entry to add
26 | * @param value
27 | * value of the entry to add
28 | * @return
29 | * map of inner entries updated with the new entry added
30 | */
31 | override def updated[V1 >: Value](key: Value, value: V1): Map[Value, V1] =
32 | entries.updated(key, value)
33 |
34 | end MapCompat
35 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/ColumnProjection.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | object ColumnProjection {
4 | def apply(typedColumnPath: TypedColumnPath[?], ordinal: Int): ColumnProjection =
5 | ColumnProjection(typedColumnPath, ordinal, typedColumnPath.alias)
6 | }
7 |
8 | /** Column projection extracts a value from a field at given path and sets it at given position, optionally with a new
9 | * name.
10 | * @param columnPath
11 | * path to the field
12 | * @param ordinal
13 | * position of a column in a schema defined by the projection
14 | * @param alias
15 | * optional new name of the field
16 | */
17 | case class ColumnProjection(columnPath: ColumnPath, ordinal: Int, alias: Option[String]) {
18 | val length: Int = columnPath.elements.length
19 | }
20 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/HadoopParquetReader.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.apache.parquet.filter2.compat.FilterCompat
4 | import org.apache.parquet.hadoop.api.ReadSupport
5 | import org.apache.parquet.io.InputFile
6 | import org.apache.parquet.schema.MessageType
7 |
8 | object HadoopParquetReader {
9 |
10 | private class Builder(
11 | inputFile: InputFile,
12 | projectedSchemaOpt: Option[MessageType],
13 | columnProjections: Seq[ColumnProjection],
14 | metadataReader: MetadataReader
15 | ) extends org.apache.parquet.hadoop.ParquetReader.Builder[RowParquetRecord](inputFile) {
16 | override lazy val getReadSupport: ReadSupport[RowParquetRecord] = new ParquetReadSupport(
17 | projectedSchemaOpt = projectedSchemaOpt,
18 | columnProjections = columnProjections,
19 | metadataReader = metadataReader
20 | )
21 | }
22 |
23 | def apply(
24 | inputFile: InputFile,
25 | projectedSchemaOpt: Option[MessageType],
26 | columnProjections: Seq[ColumnProjection],
27 | filter: FilterCompat.Filter,
28 | metadataReader: MetadataReader
29 | ): org.apache.parquet.hadoop.ParquetReader.Builder[RowParquetRecord] =
30 | new Builder(inputFile, projectedSchemaOpt, columnProjections, metadataReader).withFilter(filter)
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/InMemoryInputFile.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.apache.parquet.io.{InputFile, SeekableInputStream}
4 |
5 | import java.io.EOFException
6 | import java.nio.ByteBuffer
7 | import scala.util.control.NoStackTrace
8 |
9 | object InMemoryInputFile {
10 | def fromBytesUnsafe(bytes: Array[Byte]): InMemoryInputFile = new InMemoryInputFile(bytes)
11 |
12 | def fromBytes(bytes: Array[Byte]): InMemoryInputFile = new InMemoryInputFile(bytes.clone())
13 | }
14 |
15 | @experimental
16 | class InMemoryInputFile private (content: Array[Byte]) extends InputFile {
17 |
18 | override def getLength: Long = content.length.toLong
19 |
20 | override def newStream(): SeekableInputStream = new SeekableInputStream {
21 | private var pos: Int = 0
22 |
23 | override def getPos: Long = pos.toLong
24 |
25 | override def seek(newPos: Long): Unit = pos = newPos.toInt
26 |
27 | override def readFully(bytes: Array[Byte]): Unit = readFully(bytes, 0, bytes.length)
28 |
29 | override def readFully(bytes: Array[Byte], start: Int, len: Int): Unit = {
30 | if (content.length - pos < len) throw new EOFException with NoStackTrace
31 | System.arraycopy(content, pos, bytes, start, len)
32 | pos += len
33 | }
34 |
35 | override def read(buf: ByteBuffer): Int = {
36 | val avail = remaining
37 | if (avail == 0) -1
38 | else {
39 | val len = avail.min(buf.remaining())
40 | if (len > 0) {
41 | buf.put(content, pos, len)
42 | pos += len
43 | }
44 | len
45 | }
46 | }
47 |
48 | override def readFully(buf: ByteBuffer): Unit = {
49 | val availSpace = buf.remaining
50 | if (remaining < availSpace) throw new EOFException with NoStackTrace
51 | if (availSpace > 0) buf.put(content, pos, availSpace)
52 | pos += availSpace
53 | }
54 |
55 | override def read(): Int =
56 | if (remaining == 0) -1
57 | else {
58 | val next = content(pos) & 0xff
59 | pos += 1
60 | next
61 | }
62 |
63 | private def remaining: Int = content.length - pos
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/InMemoryOutputFile.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.apache.hadoop.fs.FileAlreadyExistsException
4 | import org.apache.parquet.io.{InputFile, OutputFile, PositionOutputStream}
5 |
6 | import java.io.ByteArrayOutputStream
7 |
8 | object InMemoryOutputFile {
9 | val DefaultBlockSize: Int = 64 << 10
10 |
11 | def apply(
12 | initBufferSize: Int,
13 | maxBufferSize: Option[Int] = None,
14 | blockSize: Int = DefaultBlockSize
15 | ): InMemoryOutputFile = new InMemoryOutputFile(initBufferSize, maxBufferSize.getOrElse(3 * initBufferSize), blockSize)
16 | }
17 |
18 | /** Reusable in-memory `OutputFile` based on `ByteArrayOutputStream`
19 | *
20 | * @param initBufferSize
21 | * size of the `ByteArrayOutputStream`'s internal buffer when it is created
22 | * @param maxBufferSize
23 | * a threshold beyond which the internal buffer will be recreated with the initBufferSize
24 | * @param blockSize
25 | * size of a row group being buffered in memory. This limits the memory usage when writing
26 | */
27 | class InMemoryOutputFile private (initBufferSize: Int, maxBufferSize: Int, blockSize: Int) extends OutputFile {
28 | private val os = new ReusableByteArrayOutputStream(initBufferSize, maxBufferSize)
29 |
30 | override def create(blockSizeHint: Long): PositionOutputStream = {
31 | if (os.size() > 0) throw new FileAlreadyExistsException(s"In-memory file already exists")
32 | new PositionOutputStream {
33 | override def getPos: Long = os.size().toLong
34 | override def write(b: Int): Unit = os.write(b)
35 | override def write(b: Array[Byte], off: Int, len: Int): Unit = os.write(b, off, len)
36 | }
37 | }
38 |
39 | override def createOrOverwrite(blockSizeHint: Long): PositionOutputStream = {
40 | os.reset()
41 | create(blockSizeHint)
42 | }
43 |
44 | override def supportsBlockSize(): Boolean = true
45 |
46 | override def defaultBlockSize(): Long = blockSize.toLong
47 |
48 | /** Return an Array[Byte] copied from the current content of the internal buffer, and reset the internal state. The
49 | * [[InMemoryOutputFile]] could then be reused without allocating the internal buffer.
50 | *
51 | * @return
52 | * bytes copied from the current content of internal buffer
53 | */
54 | def take(): Array[Byte] = os.take
55 |
56 | def contentLength: Int = os.size()
57 |
58 | /** Creates an [[org.apache.parquet.io.InputFile]] from the content of this [[org.apache.parquet.io.OutputFile]].
59 | */
60 | def toInputFile: InputFile = InMemoryInputFile.fromBytes(take())
61 | }
62 |
63 | class ReusableByteArrayOutputStream(initBufferSize: Int, maxBufferSize: Int)
64 | extends ByteArrayOutputStream(initBufferSize) {
65 | def take: Array[Byte] = {
66 | val content = toByteArray
67 | if (buf.length > maxBufferSize) {
68 | buf = new Array[Byte](initBufferSize)
69 | }
70 | count = 0
71 | content
72 | }
73 | }
74 |
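A typical write-and-read-back round trip, mirroring the usage already shown in RecordFilterItSpec above; Data is a hypothetical record type:

  import scala.util.Using

  case class Data(i: Int)

  val outFile = InMemoryOutputFile(initBufferSize = 1024)
  ParquetWriter.of[Data].writeAndClose(outFile, (0 to 10).map(Data(_)))

  // toInputFile copies the buffered bytes and resets the internal buffer,
  // so outFile can be reused for another write afterwards.
  Using.resource(ParquetReader.as[Data].read(outFile.toInputFile)) { iterable =>
    iterable.foreach(println)
  }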
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/MetadataWriter.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | private[parquet4s] object MetadataWriter {
4 | val NoOp: MetadataWriter = () => Map.empty
5 | }
6 |
7 | private[parquet4s] trait MetadataWriter {
8 | def getMetadata(): Map[String, String]
9 | }
10 |
11 | private[parquet4s] object MetadataReader {
12 | val NoOp: MetadataReader = _ => {}
13 | }
14 | private[parquet4s] trait MetadataReader {
15 | def setMetadata(metadata: collection.Map[String, String]): Unit
16 | }
17 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/Path.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.apache.hadoop.conf.Configuration
4 | import org.apache.hadoop.fs.Path as HadoopPath
5 | import org.apache.parquet.hadoop.util.{HadoopInputFile, HadoopOutputFile}
6 | import org.apache.parquet.io.{InputFile, OutputFile}
7 |
8 | import java.net.URI
9 | import java.nio.file.{Paths, Path as NioPath}
10 |
11 | object Path {
12 |
13 | def apply(hadoopPath: HadoopPath): Path = new Path(hadoopPath)
14 |
15 | def apply(pathString: String): Path = apply(new HadoopPath(pathString))
16 |
17 | def apply(nioPath: NioPath): Path = apply(new URI("file", null, nioPath.toAbsolutePath.toString, null, null))
18 |
19 | def apply(uri: URI): Path = apply(new HadoopPath(uri))
20 |
21 | def apply(parent: Path, child: String): Path = apply(new HadoopPath(parent.toHadoop, child))
22 |
23 | val Separator: String = HadoopPath.SEPARATOR
24 | }
25 |
26 | /** Represents path/URI to Parquet file or directory containing Parquet files.
27 | */
28 | class Path private (val hadoopPath: HadoopPath) extends AnyVal {
29 |
30 | def append(element: String): Path = new Path(new HadoopPath(hadoopPath, element))
31 |
32 | def parent: Option[Path] = Option(hadoopPath.getParent).map(Path.apply)
33 |
34 | def name: String = hadoopPath.getName
35 |
36 | def toUri: URI = hadoopPath.toUri
37 |
38 | def toNio: NioPath = Paths.get(toUri)
39 |
40 | def toHadoop: HadoopPath = hadoopPath
41 |
42 | def canEqual(other: Any): Boolean = other.isInstanceOf[Path]
43 |
44 | def toOutputFile(conf: Configuration): OutputFile = HadoopOutputFile.fromPath(hadoopPath, conf)
45 |
46 | def toOutputFile(options: ParquetWriter.Options): OutputFile = toOutputFile(options.hadoopConf)
47 |
48 | def toInputFile(conf: Configuration): InputFile = HadoopInputFile.fromPath(hadoopPath, conf)
49 |
50 | def toInputFile(options: ParquetReader.Options): InputFile = HadoopInputFile.fromPath(hadoopPath, options.hadoopConf)
51 |
52 | override def toString: String = hadoopPath.toString
53 |
54 | }
55 |
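A few ways of constructing and inspecting a Path using the factory methods above; the locations are made up:

  val dir   = Path("/data/parquet/users")
  val file  = Path(dir, "part-00000.parquet")
  val local = Path(java.nio.file.Paths.get("/tmp/output"))

  file.name   // "part-00000.parquet"
  file.parent // Some(...) pointing back at /data/parquet/users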
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/UDP.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.apache.parquet.filter2.predicate.{Statistics, UserDefinedPredicate}
4 |
5 | /** Extend this trait in order to build a non-standard filter. Please note! When defining V, use Java
6 | * types supported by Parquet such as:
7 | * 1. [[java.lang.Boolean]]
8 | * 1. [[java.lang.Integer]]
9 | * 1. [[java.lang.Long]]
10 | * 1. [[java.lang.Double]]
11 | * 1. [[java.lang.Float]]
12 | * 1. [[org.apache.parquet.io.api.Binary]]
13 | *
14 | * @tparam V
15 | * Type of column custom filter refers to
16 | */
17 | trait UDP[V] {
18 |
19 | /** Used to filter record by record.
20 | *
21 | * @param value
22 | * column value of record that is filtered
23 | * @return
24 | * `true` if record containing given value should be kept
25 | */
26 | def keep(value: V): Boolean
27 |
28 | /** Used to drop a whole row group if collected statistics do not match your requirements.
29 | *
30 | * @param statistics
31 | * of the row group
32 | * @return
33 | * `true` if the whole row group can be omitted
34 | */
35 | def canDrop(statistics: FilterStatistics[V]): Boolean
36 |
37 | /** It is the opposite of `canDrop`. There is a separate function for the inverse comparison as some types may require
38 | * quite different logic for it. This function will be called when processing `not` predicates.
39 | *
40 | * @param statistics
41 | * of the row group
42 | * @return
43 | * `true` if the whole row group can be omitted for inverse filter
44 | */
45 | def inverseCanDrop(statistics: FilterStatistics[V]): Boolean
46 |
47 | /** @return
48 | * name of the filter
49 | */
50 | def name: String
51 | }
52 |
53 | /** Row group statistics that can then be used in [[UDP]] to drop unwanted data.
54 | * @param min
55 | * minimum value of `V` in a row group
56 | * @param max
57 | * maximum value of `V` in a row group
58 | * @param ordering
59 | * [[scala.Ordering]] of `V`
60 | * @tparam V
61 | * user type of column
62 | */
63 | class FilterStatistics[V](val min: V, val max: V)(implicit val ordering: Ordering[V])
64 |
65 | private[parquet4s] class UDPAdapter[V <: Comparable[V]](udp: UDP[V])(implicit ordering: Ordering[V])
66 | extends UserDefinedPredicate[V]
67 | with Serializable {
68 |
69 | override def keep(value: V): Boolean = udp.keep(value)
70 |
71 | override def canDrop(statistics: Statistics[V]): Boolean =
72 | udp.canDrop(convert(statistics))
73 |
74 | override def inverseCanDrop(statistics: Statistics[V]): Boolean =
75 | udp.inverseCanDrop(convert(statistics))
76 |
77 | override def toString: String = udp.name
78 |
79 | private def convert(statistics: Statistics[V]) = new FilterStatistics[V](statistics.getMin, statistics.getMax)
80 |
81 | }
82 |
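A sketch of a custom predicate implementing the trait above; it keeps only even values of an integer column. How a UDP is attached to a column filter is defined elsewhere in the library, so only the predicate itself is shown:

  object EvenIntUDP extends UDP[java.lang.Integer] {

    override def keep(value: java.lang.Integer): Boolean = value % 2 == 0

    // The row group can only be dropped when it is known to hold nothing but a single odd value.
    override def canDrop(statistics: FilterStatistics[java.lang.Integer]): Boolean =
      statistics.min == statistics.max && statistics.min % 2 != 0

    // For the inverse filter (keep odd values) the group can be dropped when it holds a single even value.
    override def inverseCanDrop(statistics: FilterStatistics[java.lang.Integer]): Boolean =
      statistics.min == statistics.max && statistics.min % 2 == 0

    override val name: String = "EvenIntUDP"
  }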
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/Value.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.apache.parquet.io.api.{Binary, RecordConsumer}
4 | import org.apache.parquet.schema.Type
5 | import java.math.BigInteger
6 |
7 | /** Basic structure element which Parquet data is built from. Represents any data element that can be read from or can
8 | * be written to Parquet files.
9 | */
10 | trait Value extends Any {
11 |
12 | /** Writes the value content to Parquet
13 | * @param schema
14 | * schema of that value
15 | * @param recordConsumer
16 | * has to be used to write the data to the file
17 | */
18 | def write(schema: Type, recordConsumer: RecordConsumer): Unit
19 |
20 | }
21 |
22 | /** Primitive value like integer or long.
23 | * @tparam T
24 | * type of the value
25 | */
26 | trait PrimitiveValue[T] extends Any with Value {
27 |
28 | /** Content of the value
29 | */
30 | def value: T
31 |
32 | }
33 |
34 | case class LongValue(value: Long) extends AnyVal with PrimitiveValue[Long] {
35 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addLong(value)
36 | }
37 |
38 | case class IntValue(value: Int) extends AnyVal with PrimitiveValue[Int] {
39 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addInteger(value)
40 | }
41 |
42 | case class FloatValue(value: Float) extends AnyVal with PrimitiveValue[Float] {
43 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addFloat(value)
44 | }
45 |
46 | case class DoubleValue(value: Double) extends AnyVal with PrimitiveValue[Double] {
47 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addDouble(value)
48 | }
49 |
50 | object BinaryValue {
51 | def apply(bytes: Array[Byte]): BinaryValue = BinaryValue(Binary.fromReusedByteArray(bytes))
52 | def apply(string: String): BinaryValue = BinaryValue(Binary.fromString(string))
53 | }
54 |
55 | case class BinaryValue(value: Binary) extends PrimitiveValue[Binary] {
56 |
57 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addBinary(value)
58 |
59 | }
60 |
61 | case class BooleanValue(value: Boolean) extends AnyVal with PrimitiveValue[Boolean] {
62 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addBoolean(value)
63 | }
64 |
65 | case class DateTimeValue(value: Long, format: TimestampFormat.Format) extends Value {
66 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addLong(value)
67 | }
68 |
69 | case class DecimalValue(value: BigInteger, format: DecimalFormat.Format) extends Value {
70 |
71 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = format.write(recordConsumer, value)
72 |
73 | def toBigDecimal: BigDecimal = BigDecimal(value, format.scale, format.mathContext)
74 | }
75 |
76 | /** Special instance of [[Value]] that represents lack of the value. [[NullValue]] does not hold any data so it cannot
77 | * be written.
78 | */
79 | case object NullValue extends Value {
80 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit =
81 | throw new UnsupportedOperationException("Null values cannot be written.")
82 | }
83 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/ValueCodec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | /** Represents both [[ValueEncoder]] and [[ValueDecoder]]
4 | */
5 | trait ValueCodec[T] extends ValueEncoder[T] with ValueDecoder[T]
6 |
7 | /** Represents both [[RequiredValueEncoder]] and [[RequiredValueDecoder]]
8 | */
9 | trait RequiredValueCodec[T] extends ValueCodec[T] with RequiredValueEncoder[T] with RequiredValueDecoder[T]
10 |
11 | /** Represents both [[OptionalValueEncoder]] and [[OptionalValueDecoder]]
12 | */
13 | trait OptionalValueCodec[T] extends ValueCodec[T] with OptionalValueEncoder[T] with OptionalValueDecoder[T]
14 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/ValueCodecConfiguration.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import java.util.TimeZone
4 |
5 | /** Configuration necessary for some of the codecs
6 | *
7 | * @param timeZone
8 | * used when encoding and decoding time-based values
9 | */
10 | case class ValueCodecConfiguration(timeZone: TimeZone)
11 |
12 | object ValueCodecConfiguration {
13 | val Default: ValueCodecConfiguration = ValueCodecConfiguration(timeZone = TimeZone.getDefault)
14 |
15 | def apply(readerOptions: ParquetReader.Options): ValueCodecConfiguration =
16 | ValueCodecConfiguration(readerOptions.timeZone)
17 |
18 | def apply(writerOptions: ParquetWriter.Options): ValueCodecConfiguration =
19 | ValueCodecConfiguration(writerOptions.timeZone)
20 |
21 | }
22 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/ValueDecoder.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import scala.annotation.implicitNotFound
4 |
5 | trait AllValueDecoders extends PrimitiveValueDecoders with TimeValueDecoders with ComplexValueDecoders
6 |
7 | /** Contains implicit instances of all [[ValueDecoder]]
8 | */
9 | object ValueDecoder extends AllValueDecoders
10 |
11 | @implicitNotFound(
12 | "Missing ValueDecoder for value type ${T}. Implement your own decoder in order to deserialise your data."
13 | )
14 | trait ValueDecoder[T] {
15 |
16 | /** @param value
17 | * source Parquet [[Value]]
18 | * @param configuration
19 | * [[ValueCodecConfiguration]] used by some codecs
20 | * @return
21 | * data decoded from [[Value]]
22 | */
23 | def decode(value: Value, configuration: ValueCodecConfiguration): T
24 | }
25 |
26 | /** Decoder for non-null type of [[Value]]
27 | * @tparam T
28 | * data type to decode from
29 | */
30 | trait RequiredValueDecoder[T] extends ValueDecoder[T] {
31 |
32 | final override def decode(value: Value, configuration: ValueCodecConfiguration): T =
33 | value match {
34 | case NullValue =>
35 | throw new IllegalArgumentException("NullValue cannot be decoded to required type")
36 | case other =>
37 | decodeNonNull(other, configuration)
38 | }
39 |
40 | protected def decodeNonNull(value: Value, configuration: ValueCodecConfiguration): T
41 |
42 | }
43 |
44 | /** Decoder for [[Value]] that can be null.
45 | * @tparam T
46 | * data type to decode from
47 | */
48 | trait OptionalValueDecoder[T] extends ValueDecoder[T] {
49 |
50 | final override def decode(value: Value, configuration: ValueCodecConfiguration): T =
51 | value match {
52 | case NullValue => null.asInstanceOf[T]
53 | case other => decodeNonNull(other, configuration)
54 | }
55 |
56 | protected def decodeNonNull(value: Value, configuration: ValueCodecConfiguration): T
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/ValueEncoder.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import scala.annotation.implicitNotFound
4 |
5 | trait AllValueEncoders extends PrimitiveValueEncoders with TimeValueEncoders with ComplexValueEncoders
6 |
7 | /** Contains implicit instances of all [[ValueEncoder]]
8 | */
9 | object ValueEncoder extends AllValueEncoders
10 |
11 | @implicitNotFound(
12 | "Missing ValueEncoder for value type ${T}. Implement your own encoder in order to serialise your data."
13 | )
14 | trait ValueEncoder[T] {
15 |
16 | /** @param data
17 | * source data
18 | * @param configuration
19 | * [[ValueCodecConfiguration]] used by some codecs
20 | * @return
21 | * encoded Parquet [[Value]]
22 | */
23 | def encode(data: T, configuration: ValueCodecConfiguration): Value
24 | }
25 |
26 | /** Encoder for non-null type of [[Value]]
27 | * @tparam T
28 | * data type to encode to
29 | */
30 | trait RequiredValueEncoder[T] extends ValueEncoder[T] {
31 | override def encode(data: T, configuration: ValueCodecConfiguration): Value =
32 | Option(data) match {
33 | case None =>
34 | throw new IllegalArgumentException("Cannot encode null instance of required type")
35 | case Some(other) =>
36 | encodeNonNull(other, configuration)
37 | }
38 |
39 | protected def encodeNonNull(data: T, configuration: ValueCodecConfiguration): Value
40 | }
41 |
42 | /** Encoder for [[Value]] that can be null.
43 | * @tparam T
44 | * data type to encode to
45 | */
46 | trait OptionalValueEncoder[T] extends ValueEncoder[T] {
47 | override def encode(data: T, configuration: ValueCodecConfiguration): Value =
48 | Option(data).fold[Value](NullValue)(nonNullData => encodeNonNull(nonNullData, configuration))
49 |
50 | protected def encodeNonNull(data: T, configuration: ValueCodecConfiguration): Value
51 | }
52 |
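A sketch of a custom codec for a hypothetical wrapper type, combining the optional encoder and decoder traits above and storing the value as a UTF-8 string via BinaryValue. For writing, a matching schema definition would also be needed, which is outside the scope of these files:

  case class Isbn(value: String)

  implicit val isbnCodec: OptionalValueCodec[Isbn] = new OptionalValueCodec[Isbn] {

    override protected def encodeNonNull(data: Isbn, configuration: ValueCodecConfiguration): Value =
      BinaryValue(data.value)

    override protected def decodeNonNull(value: Value, configuration: ValueCodecConfiguration): Isbn =
      value match {
        case BinaryValue(binary) => Isbn(binary.toStringUsingUTF8)
      }
  }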
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/ValueImplicits.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import java.time.{LocalDate, LocalDateTime}
4 |
5 | /** Provides simple conversion methods for primitives.
6 | */
7 | object ValueImplicits {
8 |
9 | import ValueCodecConfiguration.*
10 |
11 | implicit class IntWrapper(v: Int)(implicit encoder: ValueEncoder[Int]) {
12 | def value: Value = encoder.encode(v, Default)
13 | }
14 | implicit class LongWrapper(v: Long)(implicit encoder: ValueEncoder[Long]) {
15 | def value: Value = encoder.encode(v, Default)
16 | }
17 | implicit class FloatWrapper(v: Float)(implicit encoder: ValueEncoder[Float]) {
18 | def value: Value = encoder.encode(v, Default)
19 | }
20 | implicit class DoubleWrapper(v: Double)(implicit encoder: ValueEncoder[Double]) {
21 | def value: Value = encoder.encode(v, Default)
22 | }
23 | implicit class ByteWrapper(v: Byte)(implicit encoder: ValueEncoder[Byte]) {
24 | def value: Value = encoder.encode(v, Default)
25 | }
26 | implicit class ShortWrapper(v: Short)(implicit encoder: ValueEncoder[Short]) {
27 | def value: Value = encoder.encode(v, Default)
28 | }
29 | implicit class BooleanWrapper(v: Boolean)(implicit encoder: ValueEncoder[Boolean]) {
30 | def value: Value = encoder.encode(v, Default)
31 | }
32 | implicit class StringWrapper(v: String)(implicit encoder: ValueEncoder[String]) {
33 | def value: Value = encoder.encode(v, Default)
34 | }
35 | implicit class CharWrapper(v: Char)(implicit encoder: ValueEncoder[Char]) {
36 | def value: Value = encoder.encode(v, Default)
37 | }
38 | implicit class BigDecimalWrapper(v: BigDecimal)(implicit encoder: ValueEncoder[BigDecimal]) {
39 | def value: Value = encoder.encode(v, Default)
40 | }
41 | implicit class LocalDateTimeWrapper(v: LocalDateTime)(implicit encoder: ValueEncoder[LocalDateTime]) {
42 | def value: Value = encoder.encode(v, Default)
43 | }
44 | implicit class LocalDateWrapper(v: LocalDate)(implicit encoder: ValueEncoder[LocalDate]) {
45 | def value: Value = encoder.encode(v, Default)
46 | }
47 | implicit class TimestampWrapper(v: java.sql.Timestamp)(implicit encoder: ValueEncoder[java.sql.Timestamp]) {
48 | def value: Value = encoder.encode(v, Default)
49 | }
50 | implicit class DateWrapper(v: java.sql.Date)(implicit encoder: ValueEncoder[java.sql.Date]) {
51 | def value: Value = encoder.encode(v, Default)
52 | }
53 |
54 | }
55 |
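A quick illustration of the wrappers above; each simply delegates to the implicit ValueEncoder for its type, using the default codec configuration:

  import com.github.mjakubowski84.parquet4s.ValueImplicits.*

  val answer: Value = 42.value        // encoded by the implicit ValueEncoder[Int]
  val label: Value  = "parquet".value // encoded by the implicit ValueEncoder[String]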
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/etl/CompoundParquetIterable.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.etl
2 |
3 | import com.github.mjakubowski84.parquet4s.*
4 | import com.github.mjakubowski84.parquet4s.stats.CompoundStats
5 |
6 | private[parquet4s] class CompoundParquetIterable[T](components: Iterable[ParquetIterable[T]])
7 | extends ParquetIterable[T] {
8 |
9 | override val stats: Stats = new CompoundStats(components.map(_.stats))
10 |
11 | override lazy val valueCodecConfiguration: ValueCodecConfiguration =
12 | components.headOption.map(_.valueCodecConfiguration).getOrElse(ValueCodecConfiguration.Default)
13 |
14 | override def iterator: Iterator[T] =
15 | components.foldLeft[Iterator[T]](Iterator.empty)(_ ++ _.iterator)
16 |
17 | override def close(): Unit = components.foreach(_.close())
18 |
19 | override private[parquet4s] def appendTransformation(
20 | transformation: RowParquetRecord => Iterable[RowParquetRecord]
21 | ): ParquetIterable[T] =
22 | new CompoundParquetIterable[T](components.map(_.appendTransformation(transformation)))
23 |
24 | override private[parquet4s] def changeDecoder[U: ParquetRecordDecoder]: ParquetIterable[U] =
25 | new CompoundParquetIterable[U](components.map(_.changeDecoder[U]))
26 |
27 | override def concat(other: ParquetIterable[T]): ParquetIterable[T] =
28 | new CompoundParquetIterable(components ++ Iterable(other))
29 | }
30 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/etl/InMemoryParquetIterable.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.etl
2 |
3 | import com.github.mjakubowski84.parquet4s.*
4 | import com.github.mjakubowski84.parquet4s.stats.InMemoryStats
5 |
6 | private[parquet4s] class InMemoryParquetIterable[T](
7 | data: => Iterable[RowParquetRecord],
8 | override val valueCodecConfiguration: ValueCodecConfiguration = ValueCodecConfiguration.Default,
9 | transformations: Seq[RowParquetRecord => Iterable[RowParquetRecord]] = Seq.empty,
10 | decode: RowParquetRecord => T = identity[RowParquetRecord] _
11 | ) extends ParquetIterable[T] {
12 |
13 | override private[parquet4s] def appendTransformation(
14 | transformation: RowParquetRecord => Iterable[RowParquetRecord]
15 | ): ParquetIterable[T] =
16 | new InMemoryParquetIterable[T](
17 | data = data,
18 | valueCodecConfiguration = valueCodecConfiguration,
19 | transformations = transformations :+ transformation,
20 | decode = decode
21 | )
22 |
23 | override private[parquet4s] def changeDecoder[U: ParquetRecordDecoder]: ParquetIterable[U] =
24 | new InMemoryParquetIterable[U](
25 | data = data,
26 | valueCodecConfiguration = valueCodecConfiguration,
27 | transformations = transformations,
28 | decode = record => ParquetRecordDecoder.decode[U](record, valueCodecConfiguration)
29 | )
30 |
31 | override private[parquet4s] lazy val stats: Stats = new InMemoryStats(data, valueCodecConfiguration)
32 |
33 | override def close(): Unit = ()
34 |
35 | override def iterator: Iterator[T] =
36 | if (transformations.isEmpty) data.iterator.map(decode)
37 | else
38 | data.iterator.flatMap(record =>
39 | transformations
40 | .foldLeft(Iterator(record)) { case (iterator, transformation) =>
41 | iterator.flatMap(transformation)
42 | }
43 | .map(decode)
44 | )
45 | }
46 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/experimental.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | /** Experimental feature. API and functionality may change or be removed completely.
4 | */
5 | class experimental extends scala.annotation.Annotation
6 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/stats/CompoundStats.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.stats
2 |
3 | import com.github.mjakubowski84.parquet4s.{ColumnPath, Stats, ValueDecoder}
4 |
5 | /** Calculates [[Stats]] from multiple files.
6 | */
7 | private[parquet4s] class CompoundStats(statsSeq: Iterable[Stats]) extends Stats {
8 |
9 | override lazy val recordCount: Long = statsSeq.map(_.recordCount).sum
10 |
11 | override def min[V](columnPath: ColumnPath, currentMin: Option[V])(implicit
12 | decoder: ValueDecoder[V],
13 | ordering: Ordering[V]
14 | ): Option[V] =
15 | statsSeq.foldLeft(currentMin) { case (acc, stats) =>
16 | stats.min(columnPath, acc)
17 | }
18 |
19 | override def max[V](columnPath: ColumnPath, currentMax: Option[V])(implicit
20 | decoder: ValueDecoder[V],
21 | ordering: Ordering[V]
22 | ): Option[V] =
23 | statsSeq.foldLeft(currentMax) { case (acc, stats) =>
24 | stats.max(columnPath, acc)
25 | }
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/stats/FileStats.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.stats
2 |
3 | import com.github.mjakubowski84.parquet4s.*
4 | import org.apache.parquet.ParquetReadOptions
5 | import org.apache.parquet.column.statistics.Statistics
6 | import org.apache.parquet.hadoop.ParquetFileReader
7 | import org.apache.parquet.io.InputFile
8 | import org.apache.parquet.schema.MessageType
9 |
10 | import scala.jdk.CollectionConverters.*
11 | import scala.util.Using
12 |
13 | /** Calculates statistics from unfiltered Parquet files.
14 | */
15 | private[parquet4s] class FileStats(
16 | inputFile: InputFile,
17 | vcc: ValueCodecConfiguration,
18 | projectionSchemaOpt: Option[MessageType]
19 | ) extends Stats {
20 |
21 | private val readerOptions = ParquetReadOptions.builder().build()
22 |
23 | abstract private class StatsReader extends AutoCloseable {
24 | protected val reader: ParquetFileReader = ParquetFileReader.open(inputFile, readerOptions)
25 | projectionSchemaOpt.foreach(reader.setRequestedSchema)
26 | override def close(): Unit = reader.close()
27 | }
28 |
29 | private class RecordCountReader extends StatsReader {
30 | def recordCount: Long = reader.getRecordCount
31 | }
32 |
33 | private class MinMaxReader[V](columnPath: ColumnPath, currentExtreme: Option[V])(implicit
34 | decoder: ValueDecoder[V],
35 | ordering: Ordering[V]
36 | ) extends StatsReader {
37 | private val dotString = columnPath.toString
38 |
39 | private def extreme(statsValue: Statistics[?] => Option[Value], choose: (V, V) => V) =
40 | reader.getRowGroups.asScala.iterator
41 | .map(block => block.getColumns.asScala.find(_.getPath.toDotString == dotString))
42 | .flatMap {
43 | case Some(column) => statsValue(column.getStatistics).map(value => decoder.decode(value, vcc))
44 | case None => None
45 | }
46 | .foldLeft(currentExtreme) {
47 | case (None, v) => Option(v)
48 | case (Some(a), b) => Option(choose(a, b))
49 | }
50 |
51 | def min: Option[V] = extreme(statsMinValue, ordering.min)
52 | def max: Option[V] = extreme(statsMaxValue, ordering.max)
53 |
54 | }
55 |
56 | override def recordCount: Long =
57 | Using.resource(new RecordCountReader)(_.recordCount)
58 |
59 | override def min[V](columnPath: ColumnPath, currentMin: Option[V])(implicit
60 | decoder: ValueDecoder[V],
61 | ordering: Ordering[V]
62 | ): Option[V] =
63 | Using.resource(new MinMaxReader[V](columnPath, currentMin))(_.min)
64 |
65 | override def max[V](columnPath: ColumnPath, currentMax: Option[V])(implicit
66 | decoder: ValueDecoder[V],
67 | ordering: Ordering[V]
68 | ): Option[V] =
69 | Using.resource(new MinMaxReader[V](columnPath, currentMax))(_.max)
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/stats/InMemoryStats.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.stats
2 |
3 | import com.github.mjakubowski84.parquet4s.*
4 |
5 | private[parquet4s] class InMemoryStats(iterable: Iterable[RowParquetRecord], vcc: ValueCodecConfiguration)
6 | extends Stats {
7 |
8 | override lazy val recordCount: Long = iterable.size.toLong
9 |
10 | override protected[parquet4s] def min[V](columnPath: ColumnPath, currentMin: Option[V])(implicit
11 | decoder: ValueDecoder[V],
12 | ordering: Ordering[V]
13 | ): Option[V] = iterable.foldLeft(currentMin) { case (currOpt, record) =>
14 | (record.get(columnPath).map(decoder.decode(_, vcc)), currOpt) match {
15 | case (Some(v), Some(curr)) => Some(ordering.min(curr, v))
16 | case (Some(v), None) => Some(v)
17 | case (None, _) => currOpt
18 | }
19 | }
20 |
21 | override protected[parquet4s] def max[V](columnPath: ColumnPath, currentMax: Option[V])(implicit
22 | decoder: ValueDecoder[V],
23 | ordering: Ordering[V]
24 | ): Option[V] = iterable.foldLeft(currentMax) { case (currOpt, record) =>
25 | (record.get(columnPath).map(decoder.decode(_, vcc)), currOpt) match {
26 | case (Some(v), Some(curr)) => Some(ordering.max(curr, v))
27 | case (Some(v), None) => Some(v)
28 | case (None, _) => currOpt
29 | }
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/stats/LazyDelegateStats.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.stats
2 |
3 | import com.github.mjakubowski84.parquet4s.*
4 | import org.apache.parquet.filter2.compat.FilterCompat
5 | import org.apache.parquet.filter2.compat.FilterCompat.NoOpFilter
6 | import org.apache.parquet.io.InputFile
7 | import org.apache.parquet.schema.MessageType
8 |
9 | private[parquet4s] class LazyDelegateStats(
10 | inputFile: InputFile,
11 | vcc: ValueCodecConfiguration,
12 | projectionSchemaOpt: Option[MessageType],
13 | filter: FilterCompat.Filter,
14 | partitionViewOpt: Option[PartitionView]
15 | ) extends Stats {
16 | private lazy val delegate: Stats = {
17 | val fileStats =
18 | if (filter.isInstanceOf[NoOpFilter])
19 | new FileStats(inputFile, vcc, projectionSchemaOpt)
20 | else
21 | new FilteredFileStats(inputFile, vcc, projectionSchemaOpt, filter)
22 | partitionViewOpt match {
23 | case Some(partitionView) if partitionView.nonEmpty =>
24 | new PartitionedFileStats(fileStats, partitionView)
25 | case _ =>
26 | fileStats
27 | }
28 | }
29 |
30 | override def recordCount: Long = delegate.recordCount
31 |
32 | override protected[parquet4s] def min[V](columnPath: ColumnPath, currentMin: Option[V])(implicit
33 | decoder: ValueDecoder[V],
34 | ordering: Ordering[V]
35 | ): Option[V] =
36 | delegate.min(columnPath, currentMin)
37 |
38 | override protected[parquet4s] def max[V](columnPath: ColumnPath, currentMax: Option[V])(implicit
39 | decoder: ValueDecoder[V],
40 | ordering: Ordering[V]
41 | ): Option[V] =
42 | delegate.max(columnPath, currentMax)
43 | }
44 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/mjakubowski84/parquet4s/stats/PartitionedFileStats.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.stats
2 |
3 | import com.github.mjakubowski84.parquet4s._
4 |
5 | private[parquet4s] class PartitionedFileStats(wrapped: Stats, partitionView: PartitionView) extends Stats {
6 | override def recordCount = wrapped.recordCount
7 |
8 | override protected[parquet4s] def min[V](columnPath: ColumnPath, currentMin: Option[V])(implicit
9 | decoder: ValueDecoder[V],
10 | ordering: Ordering[V]
11 | ): Option[V] =
12 | (partitionView.value(columnPath).map(_.toStringUsingUTF8.asInstanceOf[V]), currentMin) match {
13 | case (Some(partitionValue), Some(cm)) => Some(ordering.min(partitionValue, cm))
14 | case (Some(partitionValue), None) => Some(partitionValue)
15 | case _ => wrapped.min[V](columnPath, currentMin)
16 | }
17 |
18 | override protected[parquet4s] def max[V](columnPath: ColumnPath, currentMax: Option[V])(implicit
19 | decoder: ValueDecoder[V],
20 | ordering: Ordering[V]
21 | ): Option[V] =
22 | (partitionView.value(columnPath).map(_.toStringUsingUTF8.asInstanceOf[V]), currentMax) match {
23 | case (Some(partitionValue), Some(cm)) => Some(ordering.max(partitionValue, cm))
24 | case (Some(partitionValue), None) => Some(partitionValue)
25 | case _ => wrapped.max[V](columnPath, currentMax)
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/core/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 |   <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 3 |     <encoder>
 4 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 5 |     </encoder>
 6 |   </appender>
 7 |   <root level="INFO">
 8 |     <appender-ref ref="STDOUT"/>
 9 |   </root>
10 | </configuration>
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/mjakubowski84/parquet4s/ColumnPathSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.LogicalTypes.StringType
4 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY
5 | import org.apache.parquet.schema.Type.Repetition.OPTIONAL
6 | import org.apache.parquet.schema.Types
7 | import org.scalatest.flatspec.AnyFlatSpec
8 | import org.scalatest.matchers.should.Matchers
9 |
10 | class ColumnPathSpec extends AnyFlatSpec with Matchers {
11 |
12 | "ColumnPath" should "be created with proper elements" in {
13 | Col("path").elements should be(Seq("path"))
14 | Col("path.subPath").elements should be(Seq("path", "subPath"))
15 | }
16 |
17 | it should "be appendable" in {
18 | Col("path").appendElement("subPath").elements should be(Seq("path", "subPath"))
19 | }
20 |
21 | it should "turn to dot path" in {
22 | Col("path").toString should be("path")
23 | Col("path.subPath").toString should be("path.subPath")
24 | }
25 |
26 | it should "be able to turn to typed" in {
27 | Col("path").as[String].toType should be(Types.primitive(BINARY, OPTIONAL).as(StringType).named("path"))
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/mjakubowski84/parquet4s/IOOpsSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.scalatest.Inside
4 | import org.scalatest.flatspec.AnyFlatSpec
5 | import org.scalatest.matchers.should.Matchers
6 |
7 | class IOOpsSpec extends AnyFlatSpec with Matchers with Inside with PartitionTestUtils {
8 |
9 | "PartitionRegexp" should "match valid partition names and values" in
10 | forAll(ValidPartitionsTable) { case (name, value) =>
11 | inside(s"$name=$value") { case IOOps.PartitionRegexp(`name`, `value`) =>
12 | succeed
13 | }
14 | }
15 |
16 | it should "not match invalid partition names and values" in
17 | forAll(InvalidPartitionsTable) { case (name, value) =>
18 | s"$name=$value" match {
19 | case IOOps.PartitionRegexp(capturedName, capturedValue) =>
20 | fail(
21 | s"Expected no match for name [$name] and value [$value] " +
22 | s"but one was found: [$capturedName, $capturedValue]"
23 | )
24 | case _ =>
25 | succeed
26 | }
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/mjakubowski84/parquet4s/InMemoryFileSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.apache.commons.lang3.RandomStringUtils
4 | import org.scalatest.flatspec.AnyFlatSpec
5 | import org.scalatest.matchers.should.Matchers
6 |
7 | import java.nio.file.Files
8 |
9 | class InMemoryFileSpec extends AnyFlatSpec with Matchers {
10 | it should "write to in-memory output file" in {
11 | case class Data(id: Int, text: String)
12 |
13 | val count = 100
14 | val data = (1 to count).map(i => Data(id = i, text = RandomStringUtils.randomPrint(4)))
15 | val file = InMemoryOutputFile(initBufferSize = 1024)
16 |
17 | // write
18 | ParquetWriter.of[Data].writeAndClose(file, data)
19 |
20 | val inputFile = Files.createTempFile("in-memory-output-file-test", ".parquet")
21 | Files.write(inputFile, file.take())
22 |
23 | // read
24 | val readData = ParquetReader.as[Data].read(Path(inputFile))
25 | try readData.toSeq shouldBe data
26 | finally readData.close()
27 | }
28 |
29 | it should "read from in-memory input file" in {
30 | case class Data(id: Int, text: String)
31 |
32 | val count = 100
33 | val data = (1 to count).map(i => Data(id = i, text = RandomStringUtils.randomPrint(4)))
34 | val outputFile = InMemoryOutputFile(initBufferSize = 1024)
35 |
36 | // write
37 | ParquetWriter.of[Data].writeAndClose(outputFile, data)
38 |
39 | val inputFile = InMemoryInputFile.fromBytesUnsafe(outputFile.take())
40 |
41 | // read
42 | val readData = ParquetReader.as[Data].read(inputFile)
43 | try readData.toSeq shouldBe data
44 | finally readData.close()
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/mjakubowski84/parquet4s/PartitionTestUtils.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.scalatest.prop.{TableDrivenPropertyChecks, TableFor2}
4 |
5 | trait PartitionTestUtils extends TableDrivenPropertyChecks {
6 | private val allChars: Seq[Char] = (Byte.MinValue to Byte.MaxValue).map(_.toChar)
7 | private val alphaNumericChars: Seq[Char] = ('a' to 'z') ++ ('A' to 'Z') ++ ('0' to '9')
8 |
9 | private val allowedPartitionNameChars: Seq[Char] = alphaNumericChars ++ Seq('.', '_')
10 | private val allowedPartitionValueChars: Seq[Char] = alphaNumericChars ++ Seq(
11 | '!', '?', '-', '+', '_', '.', ',', '*', '\'', '(', ')', '&', '@', ':', ';', '/', ' '
12 | )
13 |
14 | private val disallowedPartitionNameChars: Seq[Char] = allChars.filterNot(allowedPartitionNameChars.contains)
15 | private val disallowedPartitionValueChars: Seq[Char] = allChars.filterNot(allowedPartitionValueChars.contains)
16 |
17 | private val validNames = generatePartitionStrings(prefix = "testName", withChars = allowedPartitionNameChars)
18 | private val validValues = generatePartitionStrings(prefix = "testValue", withChars = allowedPartitionValueChars)
19 |
20 | private val invalidNames = generatePartitionStrings(prefix = "testName", withChars = disallowedPartitionNameChars)
21 | private val invalidValues = generatePartitionStrings(prefix = "testValue", withChars = disallowedPartitionValueChars)
22 |
23 | private def generatePartitionStrings(prefix: String, withChars: Seq[Char]) = withChars.map(char => s"$prefix$char")
24 |
25 | val ValidPartitionsTable: TableFor2[String, String] = Table(
26 | ("name", "value"),
27 | validNames.flatMap(name => validValues.map(value => name -> value))*
28 | )
29 | val InvalidPartitionsTable: TableFor2[String, String] = Table(
30 | ("name", "value"),
31 | invalidNames.flatMap(name => invalidValues.map(value => name -> value))*
32 | )
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/mjakubowski84/parquet4s/TestCases.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | object TestCases {
4 |
5 | case class Empty()
6 |
7 | // Primitives
8 | case class Primitives(
9 | boolean: Boolean,
10 | int: Int,
11 | long: Long,
12 | float: Float,
13 | double: Double,
14 | string: String,
15 | short: Short,
16 | byte: Byte,
17 | char: Char,
18 | bigDecimal: BigDecimal
19 | )
20 | case class TimePrimitives(
21 | localDateTime: java.time.LocalDateTime,
22 | sqlTimestamp: java.sql.Timestamp,
23 | localDate: java.time.LocalDate,
24 | sqlDate: java.sql.Date
25 | )
26 | case class ContainsOption(optional: Option[Int])
27 |
28 | // Collections of primitives
29 | case class Collections(
30 | list: List[Int],
31 | seq: Seq[Int],
32 | vector: Vector[Int],
33 | set: Set[Int],
34 | array: Array[Int]
35 | ) {
36 | override def equals(obj: Any): Boolean =
37 | obj match {
38 | case other @ Collections(otherList, otherSeq, otherVector, otherSet, otherArray) =>
39 | (other canEqual this) &&
40 | list == otherList &&
41 | seq == otherSeq &&
42 | vector == otherVector &&
43 | set == otherSet &&
44 | array.sameElements(otherArray)
45 | case _ => false
46 | }
47 | }
48 | case class ArrayOfBytes(bytes: Array[Byte]) {
49 | override def equals(obj: Any): Boolean =
50 | obj match {
51 | case other @ ArrayOfBytes(bytes) =>
52 | (other canEqual this) && this.bytes.sameElements(bytes)
53 | case _ => false
54 | }
55 | }
56 | case class ContainsCollectionOfOptionalPrimitives(list: List[Option[Int]])
57 | case class ContainsCollectionOfCollections(listOfSets: List[Set[Int]])
58 | case class ContainsMapOfPrimitives(map: Map[String, Int])
59 | case class ContainsMapOfOptionalPrimitives(map: Map[String, Option[Int]])
60 | case class ContainsMapOfCollectionsOfPrimitives(map: Map[String, List[Int]])
61 |
62 | // Nested class
63 | case class Nested(int: Int)
64 | case class ContainsNestedClass(nested: Nested)
65 | case class ContainsOptionalNestedClass(nestedOptional: Option[Nested])
66 |
67 | // Collections of nested class
68 | case class CollectionsOfNestedClass(
69 | list: List[Nested],
70 | seq: Seq[Nested],
71 | vector: Vector[Nested],
72 | set: Set[Nested],
73 | array: Array[Nested]
74 | ) {
75 | override def equals(obj: Any): Boolean =
76 | obj match {
77 | case other @ CollectionsOfNestedClass(otherList, otherSeq, otherVector, otherSet, otherArray) =>
78 | (other canEqual this) &&
79 | list == otherList &&
80 | seq == otherSeq &&
81 | vector == otherVector &&
82 | set == otherSet &&
83 | array.sameElements(otherArray)
84 | case _ => false
85 | }
86 | }
87 | case class ContainsMapOfNestedClassAsValue(nested: Map[String, Nested])
88 | case class ContainsMapOfNestedClassAsKey(nested: Map[Nested, String])
89 | case class ContainsMapOfOptionalNestedClassAsValue(nested: Map[String, Option[Nested]])
90 | case class ContainsMapOfCollectionsOfNestedClassAsValue(nested: Map[String, List[Nested]])
91 |
92 | }
93 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/mjakubowski84/parquet4s/ValueCodecsSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.scalatest.flatspec.AnyFlatSpec
4 | import org.scalatest.matchers.should.Matchers
5 |
6 | import java.time.Instant
7 | import java.util.TimeZone
8 |
9 | class ValueCodecsSpec extends AnyFlatSpec with Matchers {
10 | private val defaultConfiguration = ValueCodecConfiguration(TimeZone.getTimeZone("Africa/Nairobi"))
11 |
12 | behavior of "Default timestamp format (INT96)"
13 |
14 | it should "be able to encode Instant and decode it back" in {
15 | val instant = Instant.ofEpochMilli(1234567L)
16 | val decodedInstant = codec(instant)
17 | decodedInstant should be(instant)
18 | }
19 |
20 | private def codec[A](a: A)(implicit encoder: ValueEncoder[A], decoder: ValueDecoder[A]): A = {
21 | val value = encoder.encode(a, defaultConfiguration)
22 | decoder.decode(value, defaultConfiguration)
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/mjakubowski84/parquet4s/ValueEncodingAndDecodingSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.scalatest.flatspec.AnyFlatSpec
4 | import org.scalatest.matchers.should.Matchers
5 |
6 | class ValueEncodingAndDecodingSpec extends AnyFlatSpec with Matchers {
7 |
8 | case class TestType(i: Int)
9 |
10 | val requiredValueEncoder: RequiredValueEncoder[TestType] = (data, _) => IntValue(data.i)
11 | val requiredValueDecoder: RequiredValueDecoder[TestType] = (value, _) =>
12 | value match {
13 | case IntValue(i) => TestType(i)
14 | }
15 | val optionalValueEncoder: OptionalValueEncoder[TestType] = (data, _) => IntValue(data.i)
16 | val optionalValueDecoder: OptionalValueDecoder[TestType] = (value, _) =>
17 | value match {
18 | case IntValue(i) => TestType(i)
19 | }
20 |
21 | val testType: TestType = TestType(42)
22 | val testValue: IntValue = IntValue(testType.i)
23 | val configuration: ValueCodecConfiguration = ValueCodecConfiguration.Default
24 |
25 | "Required value encoder" should "encode non-null value" in {
26 | requiredValueEncoder.encode(testType, configuration) should be(testValue)
27 | }
28 |
29 | it should "throw an exception when encoding null" in {
30 | an[IllegalArgumentException] should be thrownBy requiredValueEncoder.encode(
31 | null.asInstanceOf[TestType],
32 | configuration
33 | )
34 | }
35 |
36 | "Required value decoder" should "decode non-null value" in {
37 | requiredValueDecoder.decode(testValue, configuration) should be(testType)
38 | }
39 |
40 | it should "throw an exception when decoding null-value" in {
41 | an[IllegalArgumentException] should be thrownBy requiredValueDecoder.decode(NullValue, configuration)
42 | }
43 |
44 | "Optional value encoder" should "encode non-null value" in {
45 | optionalValueEncoder.encode(testType, configuration) should be(testValue)
46 | }
47 |
48 | it should "encode null as NullValue" in {
49 | optionalValueEncoder.encode(null.asInstanceOf[TestType], configuration) should be(NullValue)
50 | }
51 |
52 | "Optional value decoder" should "throw an exception when decoding null-value" in {
53 | optionalValueDecoder.decode(NullValue, configuration) should be(null)
54 | }
55 |
56 | it should "decode non-null value" in {
57 | optionalValueDecoder.decode(testValue, configuration) should be(testType)
58 | }
59 |
60 | }
61 |
--------------------------------------------------------------------------------
/examples/src/main/protobuf/data.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto3";
2 |
3 | option java_package = "com.github.mjakubowski84.parquet4s.protobuf";
4 |
5 | message Data {
6 | int32 id = 1;
7 | string text = 2;
8 | }
9 |
--------------------------------------------------------------------------------
/examples/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 |   <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 3 |     <encoder>
 4 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 5 |     </encoder>
 6 |   </appender>
 7 |   <root level="INFO">
 8 |     <appender-ref ref="STDOUT"/>
 9 |   </root>
10 | </configuration>
--------------------------------------------------------------------------------
/examples/src/main/scala-akka-jvm/com/github/mjakubowski84/parquet4s/ScalaKafkaCompat.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | object ScalaKafkaCompat {
4 | object kafka {
5 |
6 | type CommitterSettings = akka.kafka.CommitterSettings
7 | def CommitterSettings = akka.kafka.CommitterSettings
8 |
9 | val ConsumerMessage = akka.kafka.ConsumerMessage
10 |
11 | type ConsumerSettings[K, V] = akka.kafka.ConsumerSettings[K, V]
12 | def ConsumerSettings = akka.kafka.ConsumerSettings
13 |
14 | def Subscriptions = akka.kafka.Subscriptions
15 |
16 | type Subscription = akka.kafka.Subscription
17 |
18 | object scaladsl {
19 |
20 | val Consumer = akka.kafka.scaladsl.Consumer
21 |
22 | def Committer = akka.kafka.scaladsl.Committer
23 |
24 | }
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/examples/src/main/scala-pekko-jvm/com/github/mjakubowski84/parquet4s/ScalaKafkaCompat.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | object ScalaKafkaCompat {
4 | object kafka {
5 |
6 | type CommitterSettings = org.apache.pekko.kafka.CommitterSettings
7 | def CommitterSettings = org.apache.pekko.kafka.CommitterSettings
8 |
9 | val ConsumerMessage = org.apache.pekko.kafka.ConsumerMessage
10 |
11 | type ConsumerSettings[K, V] = org.apache.pekko.kafka.ConsumerSettings[K, V]
12 | def ConsumerSettings = org.apache.pekko.kafka.ConsumerSettings
13 |
14 | def Subscriptions = org.apache.pekko.kafka.Subscriptions
15 |
16 | type Subscription = org.apache.pekko.kafka.Subscription
17 |
18 | object scaladsl {
19 |
20 | val Consumer = org.apache.pekko.kafka.scaladsl.Consumer
21 |
22 | def Committer = org.apache.pekko.kafka.scaladsl.Committer
23 |
24 | }
25 | }
26 | }
27 |
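Both ScalaKafkaCompat variants above expose identical names, so the Kafka example further below (MessageSource, MessageSink) imports only these aliases and compiles unchanged against Akka or Pekko; the backend is decided by whichever scala-akka-jvm/scala-pekko-jvm source directory the build selects. A minimal sketch of such backend-agnostic code (the object name is hypothetical):

import com.github.mjakubowski84.parquet4s.ScalaKafkaCompat.kafka.{Subscription, Subscriptions}

object SubscriptionSketch {
  // No direct akka.kafka or org.apache.pekko.kafka import appears here;
  // the concrete types come from whichever compat object is on the source path.
  val subscription: Subscription = Subscriptions.topics("exampleTopic")
}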
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/CustomType.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType}
4 |
5 | import scala.util.Random
6 |
7 | object CustomType {
8 |
9 | object Dict {
10 |
11 | sealed trait Type
12 | case object A extends Type
13 | case object B extends Type
14 | case object C extends Type
15 | case object D extends Type
16 |
17 | val values: List[Type] = List(A, B, C, D)
18 | def valueOf(name: String): Type = values
19 | .find(_.toString == name)
20 | .getOrElse(throw new IllegalArgumentException(s"Invalid dict name: $name"))
21 |
22 | def random: Type = values(Random.nextInt(values.length))
23 |
24 | // required for reading
25 | implicit val decoder: OptionalValueDecoder[Type] =
26 | (value: Value, _: ValueCodecConfiguration) =>
27 | value match {
28 | case BinaryValue(binary) => valueOf(binary.toStringUsingUTF8)
29 | }
30 | // required for writing
31 | implicit val encoder: OptionalValueEncoder[Type] =
32 | (data: Type, _: ValueCodecConfiguration) => BinaryValue(data.toString)
33 | // required for writing
34 | implicit val schema: TypedSchemaDef[Type] =
35 | SchemaDef
36 | .primitive(
37 | primitiveType = PrimitiveType.PrimitiveTypeName.BINARY,
38 | required = false,
39 | logicalTypeAnnotation = Option(LogicalTypeAnnotation.stringType())
40 | )
41 | .typed[Type]
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/CustomAvroWriteAndReadAkkaPekkoApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.akkaPekko
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem
4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Source
5 | import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path}
6 | import org.apache.parquet.avro.AvroParquetWriter
7 |
8 | import java.nio.file.Files
9 | import scala.util.Random
10 | import org.apache.avro.generic.GenericRecordBuilder
11 | import com.github.mjakubowski84.parquet4s.ParquetWriter
12 | import org.apache.parquet.avro.AvroParquetReader
13 | import com.github.mjakubowski84.parquet4s.ParquetReader
14 | import org.apache.avro.SchemaBuilder
15 | import org.apache.avro.generic.GenericRecord
16 |
17 | object CustomAvroWriteAndReadAkkaPekkoApp extends App {
18 | val avroSchema = SchemaBuilder
19 | .record("data")
20 | .namespace("example")
21 | .fields()
22 | .requiredInt("i")
23 | .requiredString("text")
24 | .endRecord()
25 | val count = 100
26 | val data = (1 to count).map { i =>
27 | new GenericRecordBuilder(avroSchema)
28 | .set("i", i)
29 | .set("text", Random.nextString(4))
30 | .build()
31 | }
32 | val path = Path(Files.createTempDirectory("example")).append("data.parquet")
33 |
34 | implicit val system: ActorSystem = ActorSystem()
35 | import system.dispatcher
36 |
37 | lazy val writerBuilder = AvroParquetWriter
38 | .builder[GenericRecord](path.toOutputFile(ParquetWriter.Options()))
39 | .withSchema(avroSchema)
40 |
41 | lazy val writerSink = ParquetStreams.toParquetSingleFile
42 | .custom[GenericRecord, AvroParquetWriter.Builder[GenericRecord]](writerBuilder)
43 | .write
44 |
45 | lazy val readerBuilder = AvroParquetReader
46 | .builder[GenericRecord](path.toInputFile(ParquetReader.Options()))
47 |
48 | lazy val readerSource = ParquetStreams.fromParquet
49 | .custom[GenericRecord](readerBuilder)
50 | .read()
51 |
52 | val stream = for {
53 | _ <- Source(data).runWith(writerSink)
54 | _ <- readerSource.runForeach(println)
55 | } yield ()
56 |
57 | stream.andThen { case _ =>
58 | system.terminate()
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/CustomPartitioningAvroWriteAkkaPekkoApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.akkaPekko
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem
4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Sink
5 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Source
6 | import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path}
7 | import org.apache.parquet.avro.AvroParquetWriter
8 |
9 | import java.nio.file.Files
10 | import scala.util.Random
11 | import org.apache.avro.generic.GenericRecordBuilder
12 | import com.github.mjakubowski84.parquet4s.ParquetWriter
13 | import org.apache.avro.SchemaBuilder
14 | import org.apache.avro.generic.GenericRecord
15 | import com.github.mjakubowski84.parquet4s.ValueCodecConfiguration
16 |
17 | object CustomPartitioningAvroWriteAkkaPekkoApp extends App {
18 | val inputDataAvroSchema = SchemaBuilder
19 | .record("data")
20 | .namespace("example")
21 | .fields()
22 | .requiredInt("i")
23 | .requiredString("text")
24 | .requiredString("partition")
25 | .endRecord()
26 | val partitionedDataAvroSchema = SchemaBuilder
27 | .record("data")
28 | .namespace("example")
29 | .fields()
30 | .requiredInt("i")
31 | .requiredString("text")
32 | .endRecord()
33 |
34 | val count = 100
35 | val data = (1 to count).map { i =>
36 | new GenericRecordBuilder(inputDataAvroSchema)
37 | .set("i", i)
38 | .set("text", Random.nextString(4))
39 | .set("partition", (i % 4).toString())
40 | .build()
41 | }
42 | val basePath = Path(Files.createTempDirectory("example"))
43 | val vcc = ValueCodecConfiguration.Default
44 |
45 | implicit val system: ActorSystem = ActorSystem()
46 | import system.dispatcher
47 |
48 | lazy val writerFlow = ParquetStreams.viaParquet
49 | .custom[GenericRecord, AvroParquetWriter.Builder[GenericRecord]](path =>
50 | AvroParquetWriter
51 | .builder[GenericRecord](path.toOutputFile(ParquetWriter.Options()))
52 | .withSchema(partitionedDataAvroSchema)
53 | )
54 | .partitionUsing { case (path, record) =>
55 | val partitionValue = record.get("partition")
56 | val partitionedRecord = new GenericRecordBuilder(partitionedDataAvroSchema)
57 | .set("i", record.get("i"))
58 | .set("text", record.get("text"))
59 | .build()
60 | (path.append(s"partition=$partitionValue"), partitionedRecord)
61 | }
62 | .write(basePath)
63 |
64 | val stream = for {
65 | _ <- Source(data).via(writerFlow).runWith(Sink.ignore)
66 | _ <- ParquetStreams.fromParquet.generic
67 | .read(basePath)
68 | .runForeach(r =>
69 | println(
70 | s"i=${r.get[Int]("i", vcc)}, text=${r.get[String]("text", vcc)}, partition=${r.get[String]("partition", vcc)}"
71 | )
72 | )
73 | } yield ()
74 |
75 | stream.andThen { case _ =>
76 | system.terminate()
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/CustomProtobufWriteAndReadAkkaPekkoApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.akkaPekko
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem
4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Source
5 | import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path}
6 | import com.github.mjakubowski84.parquet4s.protobuf.DataOuterClass.Data
7 | import org.apache.parquet.proto.ProtoParquetWriter
8 |
9 | import java.nio.file.Files
10 | import scala.util.Random
11 | import org.apache.parquet.proto.ProtoParquetReader
12 | import com.google.protobuf.TextFormat
13 |
14 | /** Please note! This is an example of Java Protobuf + Parquet4s using custom readers and writers. You can also use
15 |   * Scala Protobuf with regular Parquet4s functions thanks to the ScalaPB module of Parquet4s.
16 | */
17 | object CustomProtobufWriteAndReadAkkaPekkoApp extends App {
18 | val count = 100
19 | val data = (1 to count).map(i => Data.newBuilder.setId(i).setText(Random.nextString(4)).build)
20 | val path = Path(Files.createTempDirectory("example"))
21 |
22 | implicit val system: ActorSystem = ActorSystem()
23 |
24 | import system.dispatcher
25 |
26 | lazy val writerBuilder =
27 | ProtoParquetWriter.builder[Data](path.append("data.parquet").hadoopPath).withMessage(classOf[Data])
28 |
29 | lazy val writerSink = ParquetStreams.toParquetSingleFile
30 | .custom[Data, ProtoParquetWriter.Builder[Data]](writerBuilder)
31 | .write
32 |
33 | lazy val readerBuilder = ProtoParquetReader.builder[Data.Builder](path.hadoopPath)
34 |
35 | lazy val readerSource = ParquetStreams.fromParquet
36 | .custom[Data.Builder](readerBuilder)
37 | .read[Data](_.build())
38 |
39 | val stream = for {
40 | _ <- Source(data).runWith(writerSink)
41 | _ <- readerSource.runForeach(data => println(TextFormat.printer().escapingNonAscii(false).printToString(data)))
42 | } yield ()
43 |
44 | stream.andThen {
45 | // finish
46 | case _ => system.terminate()
47 | }
48 | }
49 |
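As the note at the top of this file says, Java Protobuf needs the custom ProtoParquetWriter/ProtoParquetReader builders above, whereas ScalaPB-generated messages can go through the regular typed API. A minimal sketch of that alternative, assuming the parquet4s ScalaPB module is on the classpath, that it exposes ScalaPBImplicits, and that `Data` is the ScalaPB-generated case class for the same message:

import com.github.mjakubowski84.parquet4s.ScalaPBImplicits.*
import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path}

object ScalaPBSketch {
  // The import of the generated Data case class is omitted; its package depends on the ScalaPB setup.
  // With the ScalaPB implicits in scope, the generated case class gets its schema, encoder and
  // decoder derived automatically, so no custom writer builder is required.
  def sink(path: Path) =
    ParquetStreams.toParquetSingleFile.of[Data].write(path.append("data.parquet"))
}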
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/WriteAndReadAkkaPekkoApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.akkaPekko
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem
4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Source
5 | import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path}
6 |
7 | import java.nio.file.Files
8 | import scala.util.Random
9 |
10 | object WriteAndReadAkkaPekkoApp extends App {
11 |
12 | case class Data(id: Int, text: String)
13 |
14 | val count = 100
15 | val data = (1 to count).map(i => Data(id = i, text = Random.nextString(4)))
16 | val path = Path(Files.createTempDirectory("example"))
17 |
18 | implicit val system: ActorSystem = ActorSystem()
19 | import system.dispatcher
20 |
21 | val stream = for {
22 | // write
23 | _ <- Source(data).runWith(ParquetStreams.toParquetSingleFile.of[Data].write(path.append("data.parquet")))
24 | // read
25 | _ <- ParquetStreams.fromParquet.as[Data].read(path).runForeach(println)
26 | } yield ()
27 |
28 | stream.andThen {
29 | // finish
30 | case _ => system.terminate()
31 | }
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/WriteAndReadCustomTypeAkkaPekkoApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.akkaPekko
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem
4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.{Sink, Source}
5 | import com.github.mjakubowski84.parquet4s.CustomType.*
6 | import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path}
7 |
8 | import java.nio.file.Files
9 |
10 | object WriteAndReadCustomTypeAkkaPekkoApp extends App {
11 |
12 | object Data {
13 | def generate(count: Int): Iterator[Data] = Iterator.range(1, count + 1).map(i => Data(id = i, dict = Dict.random))
14 | }
15 | case class Data(id: Int, dict: Dict.Type)
16 |
17 | val data = () => Data.generate(count = 100)
18 | val path = Path(Files.createTempDirectory("example"))
19 |
20 | implicit val system: ActorSystem = ActorSystem()
21 | import system.dispatcher
22 |
23 | val stream = for {
24 | // write
25 | _ <- Source
26 | .fromIterator(data)
27 | .runWith(ParquetStreams.toParquetSingleFile.of[Data].write(path.append("data.parquet")))
28 | // read
29 | // hint: you can filter by dict using string value, for example: filter = Col("dict") === "A"
30 | _ <- ParquetStreams.fromParquet.as[Data].read(path).runWith(Sink.foreach(println))
31 | } yield ()
32 |
33 | stream.andThen {
34 | // finish
35 | case _ => system.terminate()
36 | }
37 |
38 | }
39 |
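Expanding the hint in the read step above: `Dict.Type` is stored as an optional BINARY/string column (see CustomType.schema), so it can be filtered with a plain string value using the same filter API as WriteAndReadFilteredAkkaPekkoApp below. A minimal sketch reusing `Data`, `path` and the actor system defined in this app:

import com.github.mjakubowski84.parquet4s.Col

// read back only the records whose dict column equals "A"
ParquetStreams.fromParquet
  .as[Data]
  .filter(Col("dict") === "A")
  .read(path)
  .runWith(Sink.foreach(println))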
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/WriteAndReadFilteredAkkaPekkoApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.akkaPekko
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem
4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.{Sink, Source}
5 | import com.github.mjakubowski84.parquet4s.{Col, ParquetStreams, Path}
6 |
7 | import java.nio.file.Files
8 | import scala.concurrent.Future
9 | import scala.util.Random
10 |
11 | object WriteAndReadFilteredAkkaPekkoApp extends App {
12 |
13 | object Dict {
14 | val A = "A"
15 | val B = "B"
16 | val C = "C"
17 | val D = "D"
18 |
19 | val values: List[String] = List(A, B, C, D)
20 | def random: String = values(Random.nextInt(values.length))
21 | }
22 |
23 | case class Data(id: Int, dict: String)
24 |
25 | val count = 100
26 | val data = (1 to count).map(i => Data(id = i, dict = Dict.random))
27 | val path = Path(Files.createTempDirectory("example"))
28 |
29 | implicit val system: ActorSystem = ActorSystem()
30 | import system.dispatcher
31 |
32 | val printingSink = Sink.foreach(println)
33 |
34 | val stream = for {
35 | // write
36 | _ <- Source(data).runWith(ParquetStreams.toParquetSingleFile.of[Data].write(path.append("data.parquet")))
37 | // read filtered
38 | _ <- Future(println("""dict == "A""""))
39 | _ <- ParquetStreams.fromParquet.as[Data].filter(Col("dict") === Dict.A).read(path).runWith(printingSink)
40 | _ <- Future(println("""id >= 20 && id < 40"""))
41 | _ <- ParquetStreams.fromParquet.as[Data].filter(Col("id") >= 20 && Col("id") < 40).read(path).runWith(printingSink)
42 | } yield ()
43 |
44 | stream.andThen {
45 | // finish
46 | case _ => system.terminate()
47 | }
48 |
49 | }
50 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/WriteAndReadGenericAkkaPekkoApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.akkaPekko
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem
4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.{Sink, Source}
5 | import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path, RowParquetRecord, ValueCodecConfiguration}
6 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, INT32, INT64}
7 | import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED}
8 | import org.apache.parquet.schema.{LogicalTypeAnnotation, MessageType, Types}
9 |
10 | import java.nio.file.Files
11 | import java.time.LocalDate
12 |
13 | object WriteAndReadGenericAkkaPekkoApp extends App {
14 |
15 | val ID = "id"
16 | val Name = "name"
17 | val Birthday = "birthday"
18 | val SchemaName = "user_schema"
19 |
20 | val Schema: MessageType = Types
21 | .buildMessage()
22 | .addField(Types.primitive(INT64, REQUIRED).as(LogicalTypeAnnotation.intType(64, true)).named(ID))
23 | .addField(Types.primitive(BINARY, OPTIONAL).as(LogicalTypeAnnotation.stringType()).named(Name))
24 | .addField(Types.primitive(INT32, OPTIONAL).as(LogicalTypeAnnotation.dateType()).named(Birthday))
25 | .named(SchemaName)
26 |
27 | val Vcc = ValueCodecConfiguration.Default
28 |
29 | val users = List(
30 | (1L, "Alice", LocalDate.of(2000, 1, 1)),
31 | (2L, "Bob", LocalDate.of(1980, 2, 28)),
32 | (3L, "Cecilia", LocalDate.of(1977, 3, 15))
33 | ).map { case (id, name, birthday) =>
34 | RowParquetRecord
35 | .emptyWithSchema(ID, Name, Birthday)
36 | .updated(ID, id, Vcc)
37 | .updated(Name, name, Vcc)
38 | .updated(Birthday, birthday, Vcc)
39 | }
40 |
41 | val path = Path(Files.createTempDirectory("example"))
42 |
43 | implicit val system: ActorSystem = ActorSystem()
44 | import system.dispatcher
45 |
46 | val stream = for {
47 | // write
48 | _ <- Source(users).runWith(ParquetStreams.toParquetSingleFile.generic(Schema).write(path.append("data.parquet")))
49 | // read
50 | _ <- ParquetStreams.fromParquet.generic.read(path).runWith(Sink.foreach(println))
51 | } yield ()
52 |
53 | stream.andThen {
54 | // finish
55 | case _ => system.terminate()
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/indefinite/AkkaPekko.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.akkaPekko.indefinite
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.{ActorSystem, CoordinatedShutdown}
4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.pattern.AskSupport
5 |
6 | import scala.concurrent.ExecutionContext
7 |
8 | trait AkkaPekko extends AskSupport {
9 |
10 | this: Logger =>
11 |
12 | implicit lazy val system: ActorSystem = ActorSystem()
13 | implicit def executionContext: ExecutionContext = system.dispatcher
14 | val coordinatedShutdown: CoordinatedShutdown = CoordinatedShutdown(system)
15 |
16 | }
17 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/indefinite/ExampleApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.akkaPekko.indefinite
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.Done
4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.CoordinatedShutdown
5 | import com.github.mjakubowski84.parquet4s.ScalaKafkaCompat.kafka.scaladsl.Consumer.DrainingControl
6 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Keep
7 |
8 | object ExampleApp
9 | extends App
10 | with Logger
11 | with AkkaPekko
12 | with Kafka
13 | with RandomDataProducer
14 | with MessageSource
15 | with MessageSink {
16 |
17 | startKafka()
18 | startDataProducer()
19 |
20 | logger.info(s"Starting stream that reads messages from Kafka and writes them to $baseWritePath...")
21 | val streamControl: DrainingControl[Done] = messageSource
22 | .toMat(messageSink)(Keep.both)
23 | .mapMaterializedValue(DrainingControl.apply[Done])
24 | .run()
25 |
26 | coordinatedShutdown.addTask(CoordinatedShutdown.PhaseServiceStop, "Stopping stream") { () =>
27 | logger.info("Stopping stream...")
28 | streamControl.drainAndShutdown()
29 | }
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/indefinite/Kafka.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.akkaPekko.indefinite
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.Done
4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.CoordinatedShutdown
5 | import io.github.embeddedkafka.EmbeddedKafka
6 |
7 | import scala.concurrent.Future
8 |
9 | trait Kafka {
10 |
11 | this: Logger & AkkaPekko =>
12 |
13 | private lazy val broker = {
14 | logger.info("Starting Kafka...")
15 | EmbeddedKafka.start()
16 | }
17 |
18 | lazy val kafkaAddress = s"localhost:${broker.config.kafkaPort}"
19 | val topic = "exampleTopic"
20 | val groupId = "exampleGroupId"
21 |
22 | def sendKafkaMessage(message: String): Unit = EmbeddedKafka.publishStringMessageToKafka(topic, message)
23 |
24 | def startKafka(): Unit = {
25 | broker
26 | coordinatedShutdown.addTask(CoordinatedShutdown.PhaseBeforeActorSystemTerminate, "Stop kafka") { () =>
27 | Future {
28 | logger.info("Stopping Kafka...")
29 | EmbeddedKafka.stop()
30 | Done
31 | }
32 | }
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/indefinite/Logger.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.akkaPekko.indefinite
2 |
3 | trait Logger {
4 | lazy val logger: org.slf4j.Logger = org.slf4j.LoggerFactory.getLogger(this.getClass)
5 | }
6 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/indefinite/MessageSink.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.akkaPekko.indefinite
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.Done
4 | import com.github.mjakubowski84.parquet4s.ScalaKafkaCompat.kafka.CommitterSettings
5 | import com.github.mjakubowski84.parquet4s.ScalaKafkaCompat.kafka.ConsumerMessage.CommittableOffsetBatch
6 | import com.github.mjakubowski84.parquet4s.ScalaKafkaCompat.kafka.scaladsl.Committer
7 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.FlowShape
8 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.{Flow, Keep, Sink}
9 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.stage.GraphStage
10 | import com.github.mjakubowski84.parquet4s.{Col, ParquetStreams, ParquetWriter, Path}
11 | import org.apache.parquet.hadoop.metadata.CompressionCodecName
12 |
13 | import java.nio.file.Files
14 | import java.sql.Timestamp
15 | import scala.concurrent.Future
16 | import scala.concurrent.duration.*
17 |
18 | object MessageSink {
19 |
20 | case class Data(
21 | year: String,
22 | month: String,
23 | day: String,
24 | timestamp: Timestamp,
25 | word: String
26 | )
27 |
28 | val MaxChunkSize: Int = 128
29 | val ChunkWriteTimeWindow: FiniteDuration = 10.seconds
30 | val WriteDirectoryName: String = "messages"
31 |
32 | }
33 |
34 | trait MessageSink {
35 |
36 | this: AkkaPekko & Logger =>
37 |
38 | import MessageSink.*
39 | import MessageSource.*
40 |
41 | protected val baseWritePath: Path = Path(Files.createTempDirectory("example")).append(WriteDirectoryName)
42 |
43 | private val writerOptions = ParquetWriter.Options(compressionCodecName = CompressionCodecName.SNAPPY)
44 |
45 | lazy val messageSink: Sink[Message, Future[Done]] =
46 | Flow[Message]
47 | .via(saveDataToParquetFlow)
48 | .map(_.committableOffset)
49 | .grouped(MaxChunkSize)
50 | .map(CommittableOffsetBatch.apply)
51 | .toMat(Committer.sink(CommitterSettings(system)))(Keep.right)
52 |
53 | private lazy val saveDataToParquetFlow: GraphStage[FlowShape[Message, Message]] =
54 | ParquetStreams.viaParquet
55 | .of[Message]
56 | .preWriteTransformation { message =>
57 | val timestamp = new Timestamp(message.record.timestamp())
58 | val localDateTime = timestamp.toLocalDateTime
59 | Some(
60 | Data(
61 | year = localDateTime.getYear.toString,
62 | month = localDateTime.getMonthValue.toString,
63 | day = localDateTime.getDayOfMonth.toString,
64 | timestamp = timestamp,
65 | word = message.record.value()
66 | )
67 | )
68 | }
69 | .partitionBy(Col("year"), Col("month"), Col("day"))
70 | .maxCount(MaxChunkSize.toLong)
71 | .maxDuration(ChunkWriteTimeWindow)
72 | .options(writerOptions)
73 | .postWriteHandler { state =>
74 | logger.info(s"Just wrote to ${state.modifiedPartitions}")
75 | }
76 | .write(baseWritePath)
77 |
78 | }
79 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/indefinite/MessageSource.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.akkaPekko.indefinite
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaKafkaCompat.kafka.scaladsl.Consumer
4 | import com.github.mjakubowski84.parquet4s.ScalaKafkaCompat.kafka.{ConsumerMessage, ConsumerSettings, Subscriptions}
5 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Source
6 | import org.apache.kafka.common.serialization.StringDeserializer
7 |
8 | import scala.concurrent.duration.Duration
9 |
10 | object MessageSource {
11 |
12 | type Message = ConsumerMessage.CommittableMessage[String, String]
13 |
14 | }
15 |
16 | trait MessageSource {
17 |
18 | this: AkkaPekko & Kafka =>
19 |
20 | import MessageSource.*
21 |
22 | private val consumerSettings = ConsumerSettings(system, new StringDeserializer(), new StringDeserializer())
23 | .withBootstrapServers(kafkaAddress)
24 | .withGroupId(groupId)
25 | .withStopTimeout(Duration.Zero)
26 | private val subscription = Subscriptions.topics(topic)
27 |
28 | lazy val messageSource: Source[Message, Consumer.Control] = Consumer.committableSource(consumerSettings, subscription)
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/ColumnProjectionAndDataConcatenationApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.core
2 |
3 | import com.github.mjakubowski84.parquet4s.*
4 |
5 | import java.nio.file.Files
6 | import java.time.LocalDate
7 | import scala.util.Using
8 |
9 | object ColumnProjectionAndDataConcatenationApp extends App {
10 |
11 | val ID = "id"
12 | val Name = "name"
13 | val FirstName = "firstName"
14 | val Birthday = "birthday"
15 |
16 | case class User1(id: Long, name: String, birthday: LocalDate)
17 | case class User2(id: Int, firstName: String, lastName: String)
18 | case class UserName(id: Long, name: String)
19 |
20 | val path = Path(Files.createTempDirectory("example"))
21 | val path1 = path.append("users1.parquet")
22 | val path2 = path.append("users2.parquet")
23 |
24 | val vcc = ValueCodecConfiguration.Default
25 |
26 | val users1 = List(
27 | User1(1L, "Alice", LocalDate.of(2000, 1, 1)),
28 | User1(2L, "Bob", LocalDate.of(1980, 2, 28)),
29 | User1(3L, "Cecilia", LocalDate.of(1977, 3, 15))
30 | )
31 | val users2 = List(
32 | User2(4, "Derek", "Smith"),
33 | User2(5, "Emilia", "Doe"),
34 | User2(6, "Fred", "Johnson")
35 | )
36 |
37 | // write
38 | ParquetWriter.of[User1].writeAndClose(path1, users1)
39 | ParquetWriter.of[User2].writeAndClose(path2, users2)
40 |
41 | // define 1st dataset
42 | val readUsers1 = ParquetReader
43 | .projectedGeneric(
44 | Col(ID).as[Long],
45 | Col(Name).as[String]
46 | )
47 | .read(path1)
48 | .as[UserName]
49 |
50 | // define 2nd dataset
51 | val readUsers2 = ParquetReader
52 | .projectedGeneric(
53 | Col(ID).as[Int],
54 | Col(FirstName).as[String].alias(Name)
55 | )
56 | .read(path2)
57 | .as[UserName]
58 |
59 | // define concatenation of datasets
60 | val readAllUserNames = readUsers1.concat(readUsers2)
61 |
62 | // execute
63 | Using.resource(readAllUserNames)(_.foreach(println))
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/ETLApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.core
2 |
3 | import com.github.mjakubowski84.parquet4s.*
4 |
5 | import java.nio.file.Files
6 | import scala.util.Using
7 |
8 | object ETLApp extends App {
9 |
10 | case class Owner(id: Long, name: String)
11 | case class Pet(id: Long, name: String, ownerId: Long)
12 | case class PetOwner(id: Long, name: String, petId: Long, petName: String)
13 |
14 | val path = Path(Files.createTempDirectory("example"))
15 | val ownerPath = path.append("owners.parquet")
16 | val petsPath = path.append("pets.parquet")
17 | val outputPath = path.append("output.parquet")
18 |
19 | val owners = List(
20 | Owner(1L, "Alice"),
21 | Owner(2L, "Bob"),
22 | Owner(3L, "Cecilia")
23 | )
24 | val pets = List(
25 | Pet(1L, "Rex", 2L),
26 | Pet(2L, "Felix", 3L),
27 | Pet(3L, "Molly", 3L),
28 | Pet(4L, "Sunshine", 4L)
29 | )
30 |
31 | // prepare input data
32 | ParquetWriter.of[Owner].writeAndClose(ownerPath, owners)
33 | ParquetWriter.of[Pet].writeAndClose(petsPath, pets)
34 |
35 | // define 1st dataset
36 | val readOwners = ParquetReader
37 | .projectedGeneric(
38 | Col("id").as[Long],
39 | Col("name").as[String]
40 | )
41 | .read(ownerPath)
42 |
43 | // define 2nd dataset
44 | val readPets = ParquetReader
45 | .projectedGeneric(
46 | Col("id").as[Long].alias("petId"),
47 | Col("name").as[String].alias("petName"),
48 | Col("ownerId").as[Long]
49 | )
50 | .read(petsPath)
51 |
52 | // perform ETL
53 | Using.resources(readOwners, readPets) { case (owners, pets) =>
54 | owners
55 | .innerJoin(right = pets, onLeft = Col("id"), onRight = Col("ownerId")) // define join operation
56 | .as[PetOwner] // set typed schema and codecs
57 | .writeAndClose(outputPath) // execute all operations defined above and write results to disk
58 | }
59 |
60 | // take note that all operations defined above writeAndClose are lazy and are not executed
61 | // before writeAndClose is called
62 |
63 | // read ETL results
64 | Using.resource(ParquetReader.as[PetOwner].read(outputPath))(_.foreach(println))
65 | }
66 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/WriteAndReadApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.core
2 |
3 | import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path}
4 |
5 | import java.nio.file.Files
6 | import scala.util.Random
7 | import scala.util.Using
8 |
9 | object WriteAndReadApp extends App {
10 |
11 | case class Data(id: Int, text: String)
12 |
13 | val count = 100
14 | val data = (1 to count).map(i => Data(id = i, text = Random.nextString(4)))
15 | val path = Path(Files.createTempDirectory("example"))
16 |
17 | // write
18 | ParquetWriter.of[Data].writeAndClose(path.append("data.parquet"), data)
19 |
20 | // read
21 | Using.resource(ParquetReader.as[Data].read(path))(_.foreach(println))
22 | }
23 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/WriteAndReadCustomTypeApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.core
2 |
3 | import com.github.mjakubowski84.parquet4s.CustomType.*
4 | import com.github.mjakubowski84.parquet4s.ParquetSchemaResolver.*
5 | import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path}
6 |
7 | import java.nio.file.Files
8 | import scala.util.Using
9 |
10 | object WriteAndReadCustomTypeApp extends App {
11 |
12 | object Data {
13 | def generate(count: Int): Iterable[Data] = (1 to count).map(i => Data(id = i, dict = Dict.random))
14 | }
15 | case class Data(id: Int, dict: Dict.Type)
16 |
17 | val data = Data.generate(count = 100)
18 | val path = Path(Files.createTempDirectory("example"))
19 |
20 | // write
21 | ParquetWriter.of[Data].writeAndClose(path.append("data.parquet"), data)
22 |
23 | // read
24 | // hint: you can filter by dict using string value, for example: filter = Col("dict") === "A"
25 | Using.resource(ParquetReader.as[Data].read(path))(_.foreach(println))
26 |
27 | }
28 |
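The same hint in core-API form, using the filter API demonstrated in WriteAndReadFilteredApp below; `Data` and `path` are the values defined in this app:

import com.github.mjakubowski84.parquet4s.Col

// read back only the records whose dict column equals "A"
Using.resource(ParquetReader.as[Data].filter(Col("dict") === "A").read(path))(_.foreach(println))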
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/WriteAndReadFilteredApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.core
2 |
3 | import com.github.mjakubowski84.parquet4s.{Col, ParquetReader, ParquetWriter, Path}
4 |
5 | import java.nio.file.Files
6 | import scala.util.Random
7 | import scala.util.Using
8 |
9 | object WriteAndReadFilteredApp extends App {
10 |
11 | object Dict {
12 | val A = "A"
13 | val B = "B"
14 | val C = "C"
15 | val D = "D"
16 |
17 | val values: List[String] = List(A, B, C, D)
18 | def random: String = values(Random.nextInt(values.length))
19 | }
20 |
21 | case class Data(id: Int, dict: String)
22 |
23 | val count = 100
24 | val data = (1 to count).map(i => Data(id = i, dict = Dict.random))
25 | val path = Path(Files.createTempDirectory("example"))
26 |
27 | // write
28 | ParquetWriter.of[Data].writeAndClose(path.append("data.parquet"), data)
29 |
30 | // read filtered
31 | println("""dict == "A"""")
32 | val dictIsOnlyA = ParquetReader.as[Data].filter(Col("dict") === Dict.A).read(path)
33 | Using.resource(dictIsOnlyA)(_.foreach(println))
34 |
35 | println("""id >= 20 && id < 40""")
36 | val idIsBetween20And40 = ParquetReader.as[Data].filter(Col("id") >= 20 && Col("id") < 40).read(path)
37 | Using.resource(idIsBetween20And40)(_.foreach(println))
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/WriteAndReadGenericApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.core
2 |
3 | import com.github.mjakubowski84.parquet4s.*
4 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, INT32, INT64}
5 | import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED}
6 | import org.apache.parquet.schema.{LogicalTypeAnnotation, MessageType, Types}
7 |
8 | import java.nio.file.Files
9 | import java.time.LocalDate
10 | import scala.util.Using
11 |
12 | object WriteAndReadGenericApp extends App {
13 |
14 | val ID = "id"
15 | val Name = "name"
16 | val Birthday = "birthday"
17 | val SchemaName = "user_schema"
18 |
19 | val path = Path(Files.createTempDirectory("example"))
20 | val vcc = ValueCodecConfiguration.Default
21 |
22 | val users = List(
23 | (1L, "Alice", LocalDate.of(2000, 1, 1)),
24 | (2L, "Bob", LocalDate.of(1980, 2, 28)),
25 | (3L, "Cecilia", LocalDate.of(1977, 3, 15))
26 | ).map { case (id, name, birthday) =>
27 | RowParquetRecord
28 | .emptyWithSchema(ID, Name, Birthday)
29 | .updated(ID, id, vcc)
30 | .updated(Name, name, vcc)
31 | .updated(Birthday, birthday, vcc)
32 | }
33 |
34 | // write
35 | val schema: MessageType = Types
36 | .buildMessage()
37 | .addField(Types.primitive(INT64, REQUIRED).as(LogicalTypeAnnotation.intType(64, true)).named(ID))
38 | .addField(Types.primitive(BINARY, OPTIONAL).as(LogicalTypeAnnotation.stringType()).named(Name))
39 | .addField(Types.primitive(INT32, OPTIONAL).as(LogicalTypeAnnotation.dateType()).named(Birthday))
40 | .named(SchemaName)
41 |
42 | ParquetWriter.generic(schema).writeAndClose(path.append("users.parquet"), users)
43 |
44 | // read
45 | Using.resource(ParquetReader.generic.read(path)) { readData =>
46 | readData.foreach { record =>
47 | val id = record.get[Long](ID, vcc)
48 | val name = record.get[String](Name, vcc)
49 | val birthday = record.get[LocalDate](Birthday, vcc)
50 | println(s"User[$ID=$id,$Name=$name,$Birthday=$birthday]")
51 | }
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/WriteAndReadUsingRecordFilterApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.core
2 |
3 | import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path}
4 |
5 | import java.nio.file.Files
6 | import scala.util.Random
7 | import scala.util.Using
8 | import com.github.mjakubowski84.parquet4s.RecordFilter
9 |
10 | object WriteAndReadUsingRecordFilterApp extends App {
11 |
12 | case class Data(id: Int, text: String)
13 |
14 | val count = 100
15 | val data = (1 to count).map(i => Data(id = i, text = Random.nextString(4)))
16 | val path = Path(Files.createTempDirectory("example"))
17 |
18 | // write
19 | ParquetWriter.of[Data].writeAndClose(path.append("data.parquet"), data)
20 |
21 | // skips all but last 3 records (out of 100)
22 | Using.resource(ParquetReader.as[Data].filter(RecordFilter(_ >= 97)).read(path))(_.foreach(println))
23 | }
24 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/WriteIncrementallyAndReadApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.core
2 |
3 | import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path}
4 |
5 | import java.nio.file.Files
6 | import scala.util.Random
7 | import scala.util.Using
8 |
9 | object WriteIncrementallyAndReadApp extends App {
10 |
11 | case class Data(id: Int, text: String)
12 |
13 | val count = 100
14 | val data = (1 to count).map(i => Data(id = i, text = Random.nextString(4)))
15 | val path = Path(Files.createTempDirectory("example"))
16 |
17 | // write
18 | val writer = ParquetWriter.of[Data].build(path.append("data.parquet"))
19 | try data.foreach(entity => writer.write(entity))
20 | finally writer.close()
21 |
22 | // read
23 | Using.resource(ParquetReader.as[Data].read(path))(_.foreach(println))
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/fs2/CustomAvroPartitioningWriteFS2App.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.fs2
2 |
3 | import cats.effect.{IO, IOApp}
4 | import com.github.mjakubowski84.parquet4s.Path
5 | import com.github.mjakubowski84.parquet4s.parquet.*
6 | import fs2.io.file.Files
7 | import fs2.Stream
8 |
9 | import scala.util.Random
10 | import org.apache.avro.SchemaBuilder
11 | import org.apache.avro.generic.GenericRecordBuilder
12 | import org.apache.parquet.avro.AvroParquetWriter
13 | import org.apache.avro.generic.GenericRecord
14 | import com.github.mjakubowski84.parquet4s.ParquetWriter
15 | import com.github.mjakubowski84.parquet4s.ValueCodecConfiguration
16 |
17 | object CustomAvroPartitioningWriteFS2App extends IOApp.Simple {
18 | private val Count = 100
19 | private val InputAvroSchema = SchemaBuilder
20 | .record("data")
21 | .namespace("example")
22 | .fields()
23 | .requiredInt("i")
24 | .requiredString("text")
25 | .requiredString("partition")
26 | .endRecord()
27 | private val PartitionedDataAvroSchema = SchemaBuilder
28 | .record("data")
29 | .namespace("example")
30 | .fields()
31 | .requiredInt("i")
32 | .requiredString("text")
33 | .endRecord()
34 |
35 | val data = (1 to Count).map { i =>
36 | new GenericRecordBuilder(InputAvroSchema)
37 | .set("i", i)
38 | .set("text", Random.nextString(4))
39 | .set("partition", (i % 4).toString())
40 | .build()
41 | }
42 |
43 | val vcc = ValueCodecConfiguration.Default
44 |
45 | def write(basePath: Path) =
46 | viaParquet[IO]
47 | .custom[GenericRecord, AvroParquetWriter.Builder[GenericRecord]](path =>
48 | AvroParquetWriter
49 | .builder[GenericRecord](path.toOutputFile(ParquetWriter.Options()))
50 | .withSchema(PartitionedDataAvroSchema)
51 | )
52 | .partitionUsing { case (path, record) =>
53 | val partitionValue = record.get("partition")
54 | val partitionedRecord = new GenericRecordBuilder(PartitionedDataAvroSchema)
55 | .set("i", record.get("i"))
56 | .set("text", record.get("text"))
57 | .build()
58 | (path.append(s"partition=$partitionValue"), partitionedRecord)
59 | }
60 | .write(basePath)
61 |
62 | def read(basePath: Path) =
63 | fromParquet[IO].generic
64 | .read(basePath)
65 |
66 | override def run: IO[Unit] = {
67 | val stream = for {
68 | path <- Stream
69 | .resource(Files[IO].tempDirectory(None, "", None))
70 | .map(fs2Path => Path(fs2Path.toNioPath).append("data.parquet"))
71 | _ <- Stream
72 | .iterable(data)
73 | .through(write(path))
74 | .append(
75 | read(path).evalTap(r =>
76 | IO.println(
77 | s"i=${r.get[Int]("i", vcc)}, text=${r.get[String]("text", vcc)}, partition=${r.get[String]("partition", vcc)}"
78 | )
79 | )
80 | )
81 | } yield ()
82 |
83 | stream.compile.drain
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/fs2/CustomAvroWriteAndReadFS2App.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.fs2
2 |
3 | import cats.effect.{IO, IOApp}
4 | import com.github.mjakubowski84.parquet4s.Path
5 | import com.github.mjakubowski84.parquet4s.parquet.*
6 | import fs2.io.file.Files
7 | import fs2.{Pipe, Stream}
8 |
9 | import scala.util.Random
10 | import org.apache.avro.SchemaBuilder
11 | import org.apache.avro.generic.GenericRecordBuilder
12 | import org.apache.parquet.avro.AvroParquetWriter
13 | import org.apache.avro.generic.GenericRecord
14 | import com.github.mjakubowski84.parquet4s.ParquetWriter
15 | import com.github.mjakubowski84.parquet4s.ParquetReader
16 | import org.apache.parquet.avro.AvroParquetReader
17 |
18 | object CustomAvroWriteAndReadFS2App extends IOApp.Simple {
19 | val Count = 100
20 | val AvroSchema = SchemaBuilder
21 | .record("data")
22 | .namespace("example")
23 | .fields()
24 | .requiredInt("i")
25 | .requiredString("text")
26 | .endRecord()
27 |
28 | val data = (1 to Count).map { i =>
29 | new GenericRecordBuilder(AvroSchema)
30 | .set("i", i)
31 | .set("text", Random.nextString(4))
32 | .build()
33 | }
34 |
35 | def write(path: Path): Pipe[IO, GenericRecord, Nothing] = {
36 | val builder =
37 | AvroParquetWriter.builder[GenericRecord](path.toOutputFile(ParquetWriter.Options())).withSchema(AvroSchema)
38 |
39 | writeSingleFile[IO].custom[GenericRecord, AvroParquetWriter.Builder[GenericRecord]](builder).write
40 | }
41 |
42 | def read(path: Path) =
43 | fromParquet[IO]
44 | .custom(AvroParquetReader.builder[GenericRecord](path.toInputFile(ParquetReader.Options())))
45 | .read()
46 |
47 | override def run: IO[Unit] = {
48 | val stream = for {
49 | path <- Stream
50 | .resource(Files[IO].tempDirectory(None, "", None))
51 | .map(fs2Path => Path(fs2Path.toNioPath).append("data.parquet"))
52 | _ <- Stream
53 | .iterable(data)
54 | .through(write(path))
55 | .append(read(path).printlns)
56 | } yield ()
57 |
58 | stream.compile.drain
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/fs2/CustomProtobufWriteAndReadFS2App.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.fs2
2 |
3 | import cats.effect.{IO, IOApp}
4 | import com.github.mjakubowski84.parquet4s.Path
5 | import com.github.mjakubowski84.parquet4s.parquet.*
6 | import com.github.mjakubowski84.parquet4s.protobuf.DataOuterClass.Data
7 | import fs2.io.file.Files
8 | import fs2.{Pipe, Stream}
9 | import org.apache.parquet.proto._
10 |
11 | import scala.util.Random
12 | import com.google.protobuf.TextFormat
13 |
14 | /** Please note! This is an example of Java Protobuf + Parquet4s using custom readers and writers. You can also use
 15 |   * Scala Protobuf with regular Parquet4s functions thanks to the ScalaPB module of Parquet4s.
16 | */
17 | object CustomProtobufWriteAndReadFS2App extends IOApp.Simple {
18 | private val Count = 100
19 |
20 | def write(path: Path): Pipe[IO, Data, Nothing] = {
21 | val builder = ProtoParquetWriter.builder[Data](path.hadoopPath).withMessage(classOf[Data])
22 | writeSingleFile[IO]
23 | .custom[Data, ProtoParquetWriter.Builder[Data]](builder)
24 | .write
25 | }
26 |
27 | def read(path: Path) =
28 | fromParquet[IO]
29 | .custom(ProtoParquetReader.builder[Data.Builder](path.hadoopPath))
30 | .read(_.build)
31 |
32 | override def run: IO[Unit] = {
33 |
34 | val stream = for {
35 | path <- Stream
36 | .resource(Files[IO].tempDirectory(None, "", None))
37 | .map(fs2Path => Path(fs2Path.toNioPath).append("data.parquet"))
38 | _ <- Stream
39 | .range[IO, Int](start = 0, stopExclusive = Count)
40 | .map(i => Data.newBuilder.setId(i).setText(Random.nextString(4)).build)
41 | .through(write(path))
42 | .append(
43 | read(path).evalMapChunk(data => IO.println(TextFormat.printer().escapingNonAscii(false).printToString(data)))
44 | )
45 | } yield ()
46 |
47 | stream.compile.drain
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/fs2/WriteAndReadFS2App.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.fs2
2 |
3 | import cats.Show
4 | import cats.effect.{IO, IOApp}
5 | import com.github.mjakubowski84.parquet4s.Path
6 | import com.github.mjakubowski84.parquet4s.parquet.*
7 | import fs2.Stream
8 | import fs2.io.file.Files
9 |
10 | import scala.util.Random
11 |
12 | object WriteAndReadFS2App extends IOApp.Simple {
13 |
14 | case class Data(id: Int, text: String)
15 |
16 | implicit private val showData: Show[Data] = Show.fromToString
17 | private val Count = 100
18 |
19 | override def run: IO[Unit] = {
20 | val stream = for {
21 | path <- Stream.resource(Files[IO].tempDirectory(None, "", None)).map(fs2Path => Path(fs2Path.toNioPath))
22 | _ <- Stream
23 | .range[IO, Int](start = 0, stopExclusive = Count)
24 | .map(i => Data(id = i, text = Random.nextString(4)))
25 | .through(writeSingleFile[IO].of[Data].write(path.append("data.parquet")))
26 | .append(fromParquet[IO].as[Data].read(path).printlns.drain)
27 | } yield ()
28 |
29 | stream.compile.drain
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/fs2/WriteAndReadFilteredFS2App.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.fs2
2 |
3 | import cats.Show
4 | import cats.effect.{IO, IOApp}
5 | import com.github.mjakubowski84.parquet4s.{Col, Path}
6 | import com.github.mjakubowski84.parquet4s.parquet.*
7 | import fs2.Stream
8 | import fs2.io.file.Files
9 |
10 | import scala.util.Random
11 |
12 | object WriteAndReadFilteredFS2App extends IOApp.Simple {
13 |
14 | object Dict {
15 | val A = "A"
16 | val B = "B"
17 | val C = "C"
18 | val D = "D"
19 |
20 | val values: List[String] = List(A, B, C, D)
21 | def random: String = values(Random.nextInt(values.length))
22 | }
23 |
24 | case class Data(id: Int, dict: String)
25 |
26 | implicit private val showData: Show[Data] = Show.fromToString
27 | private val Count = 100
28 |
29 | override def run: IO[Unit] = {
30 | val stream = for {
31 | path <- Stream.resource(Files[IO].tempDirectory(None, "", None)).map(fs2Path => Path(fs2Path.toNioPath))
32 | _ <- Stream
33 | .range[IO, Int](start = 0, stopExclusive = Count)
34 | .map(i => Data(id = i, dict = Dict.random))
35 | .through(writeSingleFile[IO].of[Data].write(path.append("data.parquet")))
36 | .append(Stream.exec(IO.println("""dict == "A"""")))
37 | .append(fromParquet[IO].as[Data].filter(Col("dict") === Dict.A).read(path).printlns.drain)
38 | .append(Stream.exec(IO.println("""id >= 20 && id < 40""")))
39 | .append(
40 | fromParquet[IO]
41 | .as[Data]
42 | .filter(Col("id") >= 20 && Col("id") < 40)
43 | .read(path)
44 | .printlns
45 | .drain
46 | )
47 | } yield ()
48 |
49 | stream.compile.drain
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/fs2/WriteAndReadGenericFS2App.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.fs2
2 |
3 | import cats.Show
4 | import cats.effect.{IO, IOApp}
5 | import com.github.mjakubowski84.parquet4s.parquet.*
6 | import com.github.mjakubowski84.parquet4s.{Path, RowParquetRecord, ValueCodecConfiguration}
7 | import fs2.Stream
8 | import fs2.io.file.Files
9 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, INT32, INT64}
10 | import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED}
11 | import org.apache.parquet.schema.{LogicalTypeAnnotation, MessageType, Types}
12 |
13 | import java.time.LocalDate
14 |
15 | object WriteAndReadGenericFS2App extends IOApp.Simple {
16 |
17 | private val ID = "id"
18 | private val Name = "name"
19 | private val Birthday = "birthday"
20 | private val SchemaName = "user_schema"
21 |
22 | val schema: MessageType = Types
23 | .buildMessage()
24 | .addField(Types.primitive(INT64, REQUIRED).as(LogicalTypeAnnotation.intType(64, true)).named(ID))
25 | .addField(Types.primitive(BINARY, OPTIONAL).as(LogicalTypeAnnotation.stringType()).named(Name))
26 | .addField(Types.primitive(INT32, OPTIONAL).as(LogicalTypeAnnotation.dateType()).named(Birthday))
27 | .named(SchemaName)
28 |
29 | implicit private val showRecords: Show[RowParquetRecord] = Show.fromToString
30 |
31 | private val vcc = ValueCodecConfiguration.Default
32 |
33 | private val users = List(
34 | (1L, "Alice", LocalDate.of(2000, 1, 1)),
35 | (2L, "Bob", LocalDate.of(1980, 2, 28)),
36 | (3L, "Cecilia", LocalDate.of(1977, 3, 15))
37 | ).map { case (id, name, birthday) =>
38 | RowParquetRecord
39 | .emptyWithSchema(ID, Name, Birthday)
40 | .updated(ID, id, vcc)
41 | .updated(Name, name, vcc)
42 | .updated(Birthday, birthday, vcc)
43 | }
44 |
45 | override def run: IO[Unit] = {
46 | val stream = for {
47 | path <- Stream.resource(Files[IO].tempDirectory(None, "", None)).map(fs2Path => Path(fs2Path.toNioPath))
48 | _ <- Stream
49 | .iterable[IO, RowParquetRecord](users)
50 | .through(writeSingleFile[IO].generic(schema).write(path.append("data.parquet")))
51 | .append(fromParquet[IO].generic.read(path).printlns.drain)
52 | } yield ()
53 |
54 | stream.compile.drain
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/scalapb/WriteAndReadApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.scalapb
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits.*
4 | import com.github.mjakubowski84.parquet4s.protobuf.Data
5 | import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path}
6 |
7 | import java.nio.file.Files
8 | import scala.util.Using
9 |
10 | object WriteAndReadApp extends App {
11 | val data = (1 to 100).map(id => Data(id = id, text = id.toString))
12 | val path = Path(Files.createTempDirectory("example"))
13 |
14 | // write
15 | ParquetWriter.of[Data].writeAndClose(path.append("data.parquet"), data)
16 |
17 | // read
18 | Using.resource(ParquetReader.as[Data].read(path))(_.foreach(println))
19 | }
20 |
--------------------------------------------------------------------------------
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/scalapb/WriteIncrementallyAndReadApp.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.scalapb
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits.*
4 | import com.github.mjakubowski84.parquet4s.protobuf.Data
5 | import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path}
6 |
7 | import java.nio.file.Files
8 |
9 | object WriteIncrementallyAndReadApp extends App {
10 | val count = 100
11 | val data = (1 to count).map(id => Data(id = id, text = id.toString))
12 | val path = Path(Files.createTempDirectory("example"))
13 |
14 | // write
15 | val writer = ParquetWriter.of[Data].build(path.append("data.parquet"))
16 | try data.foreach(entity => writer.write(entity))
17 | finally writer.close()
18 |
19 | // read
20 | val readData = ParquetReader.as[Data].read(path)
21 | try readData.foreach(println)
22 | finally readData.close()
23 | }
24 |
--------------------------------------------------------------------------------
/fs2/src/it/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 3 |         <encoder>
 4 |             <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 5 |         </encoder>
 6 |     </appender>
 7 |
 8 |     <root level="INFO">
 9 |         <appender-ref ref="STDOUT"/>
 10 |     </root>
 11 | </configuration>
 12 |
--------------------------------------------------------------------------------
/fs2/src/main/scala/com/github/mjakubowski84/parquet4s/parquet/logger.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.parquet
2 |
3 | import cats.effect.Sync
4 | import cats.implicits.*
5 | import org.slf4j.LoggerFactory
6 |
7 | private[parquet] object logger {
8 |
9 | class Logger[F[_]](wrapped: org.slf4j.Logger)(implicit F: Sync[F]) {
10 |
11 | // FIXME replace with debug with format and params
12 | def debug(msg: => String): F[Unit] =
13 | F.catchNonFatal(wrapped.isDebugEnabled).flatMap {
14 | case true =>
15 | F.delay(wrapped.debug(msg))
16 | case false =>
17 | F.unit
18 | }
19 |
20 | }
21 |
22 | def apply[F[_]](name: String)(implicit F: Sync[F]): F[Logger[F]] =
23 | F.delay(LoggerFactory.getLogger(name)).map(new Logger(_))
24 |
25 | def apply[F[_]: Sync](clazz: Class[?]): F[Logger[F]] =
26 | apply(clazz.getCanonicalName)
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/fs2/src/main/scala/com/github/mjakubowski84/parquet4s/parquet/package.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import cats.effect.{Async, Sync}
4 |
5 | package object parquet {
6 |
 7 |   /** Creates a [[fs2.Stream]] that reads Parquet data from the specified path. If there are multiple files at the path
 8 |     * then the order in which the files are loaded is determined by the underlying filesystem. The path can refer to a
 9 |     * local file, HDFS, AWS S3, Google Storage, Azure, etc. Please refer to the Hadoop client documentation or your
 10 |     * data provider in order to know how to configure the connection. Partitioned directories can be read as well, and
 11 |     * the filter also applies to partition values. Partition values are set as fields in the read entities at the path
 12 |     * defined by the partition name. That path can be a simple column name or a dot-separated path to a nested field;
 13 |     * missing intermediate fields are automatically created for each read record. A projection over the original file
 14 |     * schema can be turned on in order to boost read performance when not all columns are required to be read. The
 15 |     * builder allows creating a stream of data of a given type or of generic records.
 16 |     * @tparam F
 17 |     *   effect type
 18 |     * @return
 19 |     *   Builder of the [[fs2.Stream]]
 20 |     */
21 | def fromParquet[F[_]: Sync]: reader.FromParquet[F] = new reader.FromParquetImpl[F]
22 |
 23 |   /** Builds a [[fs2.Pipe]] that writes Parquet data to a single file at the specified path (including the file name).
 24 |     * The resulting stream returns nothing, that is, it doesn't emit any element. The path can refer to a local file,
 25 |     * HDFS, AWS S3, Google Storage, Azure, etc. Please refer to the Hadoop client documentation or your data provider
 26 |     * in order to know how to configure the connection. The builder allows creating a pipe for a given data type or
 27 |     * for generic records.
 28 |     * @tparam F
 29 |     *   effect type
 30 |     * @return
 31 |     *   [[fs2.Pipe]] builder
 32 |     */
33 | def writeSingleFile[F[_]: Sync] = new writer.ToParquetImpl[F]
34 |
 35 |   /** Builds a [[fs2.Pipe]] that: - Is designed to write Parquet files indefinitely - Is able to (optionally)
 36 |     * partition data by a list of provided fields - Flushes and rotates files after a given number of rows is written
 37 |     * to the partition or a given time period elapses - Outputs each incoming message after it is written but can
 38 |     * write an effect of a provided message transformation. The builder allows creating a pipe for a given data type
 39 |     * or for generic records.
 40 |     * @tparam F
 41 |     *   effect type
 42 |     * @return
 43 |     *   [[fs2.Pipe]] builder
 44 |     */
45 | def viaParquet[F[_]: Async]: rotatingWriter.ViaParquet[F] = new rotatingWriter.ViaParquetImpl[F]
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/fs2/src/test/scala/com/github/mjakubowski84/parquet4s/parquet/IoSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.parquet
2 |
3 | import com.github.mjakubowski84.parquet4s.PartitionTestUtils
4 | import org.scalatest.Inside
5 | import org.scalatest.flatspec.AnyFlatSpec
6 | import org.scalatest.matchers.should.Matchers
7 |
8 | class IoSpec extends AnyFlatSpec with Matchers with Inside with PartitionTestUtils {
9 |
10 | "PartitionRegexp" should "match valid partition names and values" in
11 | forAll(ValidPartitionsTable) { case (name, value) =>
12 | inside(s"$name=$value") { case io.PartitionRegexp(`name`, `value`) =>
13 | succeed
14 | }
15 | }
16 |
17 | it should "not match invalid partition names and values" in
18 | forAll(InvalidPartitionsTable) { case (name, value) =>
19 | s"$name=$value" match {
20 | case io.PartitionRegexp(capturedName, capturedValue) =>
21 | fail(
22 | s"Expected no match for name [$name] and value [$value] " +
23 | s"but one was found: [$capturedName, $capturedValue]"
24 | )
25 | case _ =>
26 | succeed
27 | }
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/project/ActorLibCross.scala:
--------------------------------------------------------------------------------
1 | case class ActorLibCross(
2 | override val idSuffix: String,
3 | override val directorySuffix: String
4 | ) extends sbt.VirtualAxis.WeakAxis
5 |
--------------------------------------------------------------------------------
/project/Compilation.scala:
--------------------------------------------------------------------------------
1 | import sbt.Keys._
2 | import sbt.CrossVersion
3 |
4 | object Compilation {
5 |
6 | lazy val compilationSettings = Seq(
7 | scalacOptions ++=
8 | Seq(
9 | "-encoding",
10 | "UTF-8",
11 | "-release",
12 | "8",
13 | "-feature",
14 | "-language:implicitConversions",
15 | "-language:higherKinds",
16 | "-Xfatal-warnings",
17 | "-Wconf:src=src_managed/.*:silent"
18 | ) ++ {
19 | CrossVersion.partialVersion(scalaVersion.value) match {
20 | case Some((3, _)) =>
21 | Seq(
22 | "-unchecked",
23 | "-explain-types",
24 | "-Wunused:implicits", // Warn if an implicit parameter is unused.
25 | "-Wunused:explicits", // Warn if an explicit parameter is unused.
26 | "-Wunused:imports", // Warn if an import selector is not referenced.
27 | "-Wunused:locals", // Warn if a local definition is unused.
28 | "-Wunused:params", // Warn if a value parameter is unused.
29 | "-Wunused:privates" // Warn if a private member is unused.
30 | )
31 | case Some((2, 13)) =>
32 | Seq(
33 | "-deprecation",
34 | "-Xsource:3",
35 | "-explaintypes",
36 | "-Wextra-implicit", // Warn when more than one implicit parameter section is defined.
37 | "-Wnumeric-widen", // Warn when numerics are widened.
38 | "-Wunused:implicits", // Warn if an implicit parameter is unused.
39 | "-Wunused:explicits", // Warn if an explicit parameter is unused.
40 | "-Wunused:imports", // Warn if an import selector is not referenced.
41 | "-Wunused:locals", // Warn if a local definition is unused.
42 | "-Wunused:params", // Warn if a value parameter is unused.
43 | "-Wunused:patvars", // Warn if a variable bound in a pattern is unused.
44 | "-Wunused:privates", // Warn if a private member is unused.
45 | "-Wunnamed-boolean-literal" // Warn if boolean literal is unnamed.
46 | )
47 | case _ =>
48 | Seq(
49 | "-deprecation",
50 | "-Xsource:3",
51 | "-explaintypes",
52 | "-Ywarn-extra-implicit", // Warn when more than one implicit parameter section is defined.
53 | "-Ywarn-inaccessible", // Warn about inaccessible types in method signatures.
54 | "-Ywarn-numeric-widen", // Warn when numerics are widened.
55 | "-Ywarn-unused:implicits", // Warn if an implicit parameter is unused.
56 | "-Ywarn-unused:imports", // Warn if an import selector is not referenced.
57 | "-Ywarn-unused:locals", // Warn if a local definition is unused.
58 | "-Ywarn-unused:params", // Warn if a value parameter is unused.
59 | "-Ywarn-unused:patvars", // Warn if a variable bound in a pattern is unused.
60 | "-Ywarn-unused:privates" // Warn if a private member is unused.
61 | )
62 | }
63 | }
64 | )
65 |
66 | }
67 |
--------------------------------------------------------------------------------
/project/DependecyVersions.scala:
--------------------------------------------------------------------------------
1 | object DependecyVersions {
2 | val parquetVersion = "1.15.2"
3 | val shapelessVersion = "2.3.13"
4 | val sparkVersion = "3.5.5"
5 | val hadoopVersion = "3.4.1"
6 | val slf4jVersion = "2.0.17"
7 | val logbackVersion = "1.3.15" // stick to 1.3.x for JDK-8 compatibility
8 | val akkaVersion = "2.6.21" // non-licensed version
9 | val fs2Version = "3.12.0"
10 | val catsEffectVersion = "3.6.1"
11 | val scalaCollectionCompatVersion = "2.13.0"
12 | val scalatestVersion = "3.2.19"
13 | val mockitoVersion = "4.11.0" // stick to 4.x for JDK-8 compatibility
14 | val pekkoVersion = "1.1.3"
15 | val jacksonVersion = "2.19.0"
16 | val testcontainersVersion = "0.43.0"
17 | }
18 |
--------------------------------------------------------------------------------
/project/Documentation.scala:
--------------------------------------------------------------------------------
1 | import com.typesafe.sbt.site.SitePlugin.autoImport.makeSite
2 | import mdoc.MdocPlugin.autoImport._
3 | import microsites.MicrositeFavicon
4 | import microsites.MicrositeKeys._
5 | import sbt.Keys._
6 | import sbt.Compile
7 | import sbt.{Def, url}
8 | import sbt.io.FileFilter._
9 | import sbt.io.syntax._
10 |
11 | object Documentation {
12 |
13 | lazy val documentationSettings: Seq[Def.Setting[_]] =
14 | Seq(
15 | name := "Parquet4s",
16 | description := "Read and write Parquet files using Scala",
17 | organizationName := "Marcin Jakubowski",
18 | organizationHomepage := Some(url("https://github.com/mjakubowski84")),
19 | micrositeDocumentationUrl := "docs",
20 | micrositeFooterText := None,
21 | micrositeBaseUrl := "parquet4s",
22 | micrositeGitterChannel := false,
23 | micrositeGithubOwner := "mjakubowski84",
24 | micrositeGithubRepo := "parquet4s",
25 | micrositeGithubToken := sys.env.get("PARQUET4S_DOCS_GITHUB_TOKEN"),
26 | micrositePushSiteWith := GitHub4s,
27 | makeSite / includeFilter := "*.html" || "*.css" || "*.png" || "*.jpg" || "*.gif" || "*.js" || "*.md" || "*.svg",
28 | micrositeDataDirectory := (Compile / resourceDirectory).value / "docs" / "data",
29 | micrositeImgDirectory := (Compile / resourceDirectory).value / "docs" / "images",
30 | micrositePalette := Map(
31 | "brand-primary" -> "#F1606A",
32 | "brand-secondary" -> "#F1606A",
33 | "white-color" -> "#FFFFFF"
34 | ),
35 | micrositeFavicons := Seq(
36 | MicrositeFavicon("favicon-16x16.png", "16x16"),
37 | MicrositeFavicon("favicon-32x32.png", "32x32")
38 | ),
39 | mdocVariables := Map(
40 | "VERSION" -> version.value
41 | ),
42 | mdocIn := (Compile / resourceDirectory).value / "docs"
43 | )
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/project/Releasing.scala:
--------------------------------------------------------------------------------
1 | import sbt.Keys._
2 | import sbt.{Credentials, Def, Developer, IntegrationTest, Opts, ScmInfo, Test, url}
3 | import xerial.sbt.Sonatype._
4 | import xerial.sbt.Sonatype.autoImport.{sonatypeProfileName, sonatypeProjectHosting}
5 |
6 | object Releasing {
7 |
8 | lazy val publishSettings: Seq[Def.Setting[_]] =
9 | Seq(
10 | credentials ++= Seq(
11 | Credentials(
12 | realm = "Sonatype Nexus Repository Manager",
13 | host = "oss.sonatype.org",
14 | userName = sys.env.getOrElse(
15 | "SONATYPE_USERNAME", {
16 | streams.value.log.warn("Undefined environment variable: SONATYPE_USERNAME")
17 | "UNDEFINED"
18 | }
19 | ),
20 | passwd = sys.env.getOrElse(
21 | "SONATYPE_PASSWORD", {
22 | streams.value.log.warn("Undefined environment variable: SONATYPE_PASSWORD")
23 | "UNDEFINED"
24 | }
25 | )
26 | )
27 | ),
28 | licenses := Seq("MIT" -> url("https://opensource.org/licenses/MIT")),
29 | homepage := Some(url("https://github.com/mjakubowski84/parquet4s")),
30 | scmInfo := Some(
31 | ScmInfo(
32 | browseUrl = url("https://github.com/mjakubowski84/parquet4s"),
33 | connection = "scm:git@github.com:mjakubowski84/parquet4s.git"
34 | )
35 | ),
36 | sonatypeProjectHosting := Some(
37 | GitHubHosting(user = "mjakubowski84", repository = "parquet4s", email = "mjakubowski84@gmail.com")
38 | ),
39 | sonatypeProfileName := "com.github.mjakubowski84",
40 | developers := List(
41 | Developer(
42 | id = "mjakubowski84",
43 | name = "Marcin Jakubowski",
44 | email = "mjakubowski84@gmail.com",
45 | url = url("https://github.com/mjakubowski84")
46 | )
47 | ),
48 | publishMavenStyle := true,
49 | publishTo := Some(
50 | if (isSnapshot.value)
51 | Opts.resolver.mavenLocalFile
52 | else
53 | Opts.resolver.sonatypeStaging
54 | ),
55 | Test / publishArtifact := false,
56 | IntegrationTest / publishArtifact := false
57 | ) ++ (if (sys.env contains "SONATYPE_USERNAME") Signing.signingSettings else Seq.empty)
58 |
59 | }
60 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=1.10.7
2 |
--------------------------------------------------------------------------------
/project/metals.sbt:
--------------------------------------------------------------------------------
1 | // format: off
2 | // DO NOT EDIT! This file is auto-generated.
3 |
4 | // This file enables sbt-bloop to create bloop config files.
5 |
6 | addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "2.0.10")
7 |
8 | // format: on
9 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-projectmatrix" % "0.11.0")
2 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.21")
3 | addSbtPlugin("com.thoughtworks.sbt-api-mappings" % "sbt-api-mappings" % "3.0.2")
4 | addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.7")
5 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.4")
6 | addSbtPlugin("com.47deg" % "sbt-microsites" % "1.4.3") // 1.4.4 causes problems with JDK8
7 | addSbtPlugin("com.thesamet" % "sbt-protoc" % "1.0.6")
8 |
9 | libraryDependencies += "com.thesamet.scalapb" %% "compilerplugin" % "0.11.17"
10 |
--------------------------------------------------------------------------------
/s3Test/src/it/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 3 |         <encoder>
 4 |             <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 5 |         </encoder>
 6 |     </appender>
 7 |
 8 |     <root level="INFO">
 9 |         <appender-ref ref="STDOUT"/>
 10 |     </root>
 11 | </configuration>
 12 |
--------------------------------------------------------------------------------
/s3Test/src/it/scala/com/github/mjakubowski84/parquet4s/s3/S3ItSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s.s3
2 |
3 | import org.scalatest.flatspec.AnyFlatSpec
4 | import org.scalatest.matchers.should.Matchers
5 | import com.dimafeng.testcontainers.scalatest.TestContainerForAll
6 | import com.dimafeng.testcontainers.LocalStackV2Container
7 | import org.testcontainers.containers.localstack.LocalStackContainer.Service
8 | import org.apache.hadoop.conf.Configuration
9 | import scala.util.Using
10 | import com.github.mjakubowski84.parquet4s.Path
11 | import com.github.mjakubowski84.parquet4s.ParquetWriter
12 | import com.github.mjakubowski84.parquet4s.ParquetReader
13 |
14 | class S3ItSpec extends AnyFlatSpec with Matchers with TestContainerForAll {
15 |
16 | case class Data(i: Int, text: String)
17 |
18 | val bucket = "data"
19 | val data = Seq(Data(1, "a"), Data(2, "b"))
20 | val path = Path(s"s3a://$bucket/file.parquet")
21 |
22 | override val containerDef: LocalStackV2Container.Def =
23 | LocalStackV2Container.Def(
24 | tag = "latest",
25 | services = Seq(Service.S3)
26 | )
27 |
28 | override def afterContainersStart(containers: LocalStackV2Container): Unit =
29 | containers.execInContainer("awslocal", "s3api", "create-bucket", "--bucket", bucket)
30 |
31 | "Parquet4s" should "write and read data to/from S3" in
32 | withContainers { s3Container =>
33 | val configuration = new Configuration()
34 |
35 | configuration.set("fs.s3a.access.key", s3Container.container.getAccessKey())
36 | configuration.set("fs.s3a.secret.key", s3Container.container.getSecretKey())
37 | configuration.set("fs.s3a.endpoint", s3Container.container.getEndpoint().toString())
38 | configuration.set("fs.s3a.endpoint.region", s3Container.container.getRegion())
39 |
40 | ParquetWriter.of[Data].options(ParquetWriter.Options(hadoopConf = configuration)).writeAndClose(path, data)
41 |
42 | Using.resource(ParquetReader.as[Data].options(ParquetReader.Options(hadoopConf = configuration)).read(path)) {
43 | _.toSeq should be(data)
44 | }
45 | }
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/scalapb/src/main/scala/com/github/mjakubowski84/parquet4s/ScalaPBParquetSchemaResolver.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits.*
4 | import org.apache.parquet.schema.{Type, Types}
5 | import scalapb.{GeneratedMessage, GeneratedMessageCompanion}
6 |
7 | import scala.jdk.CollectionConverters.*
8 |
9 | class ScalaPBParquetSchemaResolver[T <: GeneratedMessage: GeneratedMessageCompanion] extends ParquetSchemaResolver[T] {
10 | private val cmp = implicitly[GeneratedMessageCompanion[T]]
11 |
12 | override def schemaName: Option[String] = Option(cmp.scalaDescriptor.name)
13 |
14 | override def resolveSchema(cursor: Cursor): List[Type] = {
15 | val md = cmp.scalaDescriptor
16 | Types
17 | .buildMessage()
18 | .addFields(md.fields)
19 | .named(md.fullName)
20 | .getFields
21 | .iterator()
22 | .asScala
23 | .toList
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/scalapb/src/test/protobuf/data.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto3";
2 |
3 | option java_package = "com.github.mjakubowski84.parquet4s";
4 |
5 | message Data {
6 | enum ABC {
7 | A = 0;
8 | B = 1;
9 | C = 2;
10 | }
11 |
12 | message Inner {
13 | string text = 1;
14 | }
15 |
16 | // primitive types
17 | bool bool = 1;
18 | int32 int = 2;
19 | int64 long = 3;
20 | float float = 4;
21 | double double = 5;
22 | string text = 6;
23 | ABC abc = 7;
24 |
25 | // message type
26 | Inner inner = 8;
27 |
28 | // map types
 29 |   map<string, int32> map = 9;
 30 |   map<int32, ABC> enum_map = 10;
 31 |   map<int64, Inner> msg_map = 11;
32 |
33 | // list types
34 | repeated bool bool_list = 101;
35 | repeated int32 int_list = 102;
36 | repeated int64 long_list = 103;
37 | repeated float float_list = 104;
38 | repeated double double_list = 105;
39 | repeated string text_list = 106;
40 | repeated ABC enum_list = 107;
41 | repeated Inner msg_list = 108;
42 | }
43 |
--------------------------------------------------------------------------------
/scalapb/src/test/scala/com/github/mjakubowski84/parquet4s/Parquet4sScalaPBCoreSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits.*
4 | import org.apache.hadoop.conf.Configuration
5 | import org.apache.parquet.proto.{ProtoParquetReader, ProtoParquetWriter, ProtoReadSupport, ProtoWriteSupport}
6 | import org.scalatest.flatspec.AnyFlatSpec
7 | import org.scalatest.matchers.should.Matchers
8 | import com.github.mjakubowski84.parquet4s.DataOuterClass.Data as JData
9 |
10 | import TestData.*
11 |
12 | class Parquet4sScalaPBCoreSpec extends AnyFlatSpec with Matchers {
13 |
14 | "core module" should "be able to read data written with parquet-protobuf" in {
15 | val outFile = InMemoryOutputFile(initBufferSize = 4800)
16 | val hadoopConf = new Configuration()
17 | hadoopConf.setBoolean(ProtoWriteSupport.PB_SPECS_COMPLIANT_WRITE, true)
18 |
19 | ParquetWriter
20 | .custom[JData, ProtoParquetWriter.Builder[JData]](
21 | ProtoParquetWriter.builder[JData](outFile).withMessage(classOf[JData])
22 | )
23 | .options(ParquetWriter.Options(hadoopConf = hadoopConf))
24 | .writeAndClose(javaData)
25 |
26 | ParquetReader.as[Data].read(outFile.toInputFile).toSeq should be(scalaData)
27 | }
28 |
29 | it should "write data compliant with parquet-protobuf" in {
30 | val outFile = InMemoryOutputFile(initBufferSize = 4800)
31 | val hadoopConf = new Configuration()
32 | hadoopConf.setClass(ProtoReadSupport.PB_CLASS, classOf[JData], classOf[com.google.protobuf.GeneratedMessageV3])
33 |
34 | ParquetWriter.of[Data].writeAndClose(outFile, scalaData)
35 |
36 | ParquetReader
37 | .custom[JData.Builder](ProtoParquetReader.builder[JData.Builder](outFile.toInputFile))
38 | .options(ParquetReader.Options(hadoopConf = hadoopConf))
39 | .read
40 | .map(_.build()) should be(javaData)
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/scalapb/src/test/scala/com/github/mjakubowski84/parquet4s/Parquet4sScalaPBFS2Spec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import cats.effect.IO
4 | import cats.effect.testing.scalatest.AsyncIOSpec
5 | import com.github.mjakubowski84.parquet4s.DataOuterClass.Data as JData
6 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits.*
7 | import fs2.Stream
8 | import org.apache.hadoop.conf.Configuration
9 | import org.apache.parquet.proto.{ProtoParquetReader, ProtoParquetWriter, ProtoReadSupport, ProtoWriteSupport}
10 | import org.scalatest.flatspec.AsyncFlatSpec
11 | import org.scalatest.matchers.should.Matchers
12 |
13 | import TestData.*
14 |
15 | class Parquet4sScalaPBFS2Spec extends AsyncFlatSpec with AsyncIOSpec with Matchers {
16 |
17 | "fs2 module" should "be compatible with parquet-protobuf" in {
18 | val outFile = InMemoryOutputFile(initBufferSize = 4800)
19 | val hadoopConf = new Configuration()
20 | hadoopConf.setBoolean(ProtoWriteSupport.PB_SPECS_COMPLIANT_WRITE, true)
21 |
22 | def write: Stream[IO, Nothing] =
23 | Stream
24 | .iterable(javaData)
25 | .through(
26 | parquet
27 | .writeSingleFile[IO]
28 | .custom[JData, ProtoParquetWriter.Builder[JData]](
29 | ProtoParquetWriter.builder[JData](outFile).withMessage(classOf[JData])
30 | )
31 | .options(ParquetWriter.Options(hadoopConf = hadoopConf))
32 | .write
33 | )
34 |
35 | def read: Stream[IO, Vector[Data]] =
36 | parquet.fromParquet[IO].as[Data].read(outFile.toInputFile).fold(Vector.empty[Data])(_ :+ _)
37 |
38 | (write ++ read).map(_ should be(scalaData)).compile.lastOrError
39 | }
40 |
41 | it should "write data compliant with parquet-protobuf" in {
42 | val outFile = InMemoryOutputFile(initBufferSize = 4800)
43 | val hadoopConf = new Configuration()
44 | hadoopConf.setClass(ProtoReadSupport.PB_CLASS, classOf[JData], classOf[com.google.protobuf.GeneratedMessageV3])
45 |
46 | def write: Stream[IO, Nothing] =
47 | Stream
48 | .iterable(scalaData)
49 | .through(
50 | parquet
51 | .writeSingleFile[IO]
52 | .of[Data]
53 | .write(outFile)
54 | )
55 |
56 | def read: Stream[IO, Vector[JData]] =
57 | parquet
58 | .fromParquet[IO]
59 | .custom[JData.Builder](ProtoParquetReader.builder[JData.Builder](outFile.toInputFile))
60 | .options(ParquetReader.Options(hadoopConf = hadoopConf))
61 | .read(_.build)
62 | .fold(Vector.empty[JData])(_ :+ _)
63 |
64 | (write ++ read)
65 | .map(_ should be(javaData))
66 | .compile
67 | .lastOrError
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/scalapb/src/test/scala/com/github/mjakubowski84/parquet4s/Parquet4sScalaPBSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import org.scalatest.flatspec.AnyFlatSpec
4 | import org.scalatest.matchers.should.Matchers
5 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits.*
6 |
7 | class Parquet4sScalaPBSpec extends AnyFlatSpec with Matchers {
8 |
9 | def testWithData(newData: Int => Data): Unit = {
10 | val data = (1 to 100).map(newData)
11 |
12 | val outFile = InMemoryOutputFile(initBufferSize = 4800)
13 | ParquetWriter.of[Data].writeAndClose(outFile, data)
14 |
15 | val inFile = InMemoryInputFile.fromBytes(outFile.take())
16 | ParquetReader.as[Data].read(inFile).toSeq shouldBe data
17 | }
18 |
19 | "parquet4s-scalapb" should "work with primitive types" in {
20 | testWithData(i => Data(bool = i % 2 == 0))
21 | testWithData(i => Data(int = i))
22 | testWithData(i => Data(long = i.toLong))
23 | testWithData(i => Data(float = i.toFloat))
24 | testWithData(i => Data(double = i.toDouble))
25 | testWithData(i => Data(text = i.toString))
26 | testWithData(i => Data(abc = Data.ABC.fromValue(i % 3)))
27 | }
28 |
29 | it should "work with message types" in
30 | testWithData(i => Data(inner = Some(Data.Inner(i.toString))))
31 |
32 | it should "work with unrecognized enum values" in
33 | testWithData(i => Data(abc = Data.ABC.fromValue(i % 5)))
34 |
35 | it should "work with map types" in {
36 | testWithData(i => Data(map = Map("original" -> i, "doubled" -> 2 * i)))
37 | testWithData(i => Data(enumMap = Map(i -> Data.ABC.fromValue(i % 5))))
38 | testWithData(i => Data(msgMap = Map(i.toLong -> Data.Inner(text = "level1"))))
39 | }
40 |
41 | it should "work with list types" in {
42 | testWithData(i => Data(boolList = (i to i + 100).map(_ % 2 == 0)))
43 | testWithData(i => Data(intList = i to i + 100))
44 | testWithData(i => Data(longList = (i to i + 100).map(_.toLong)))
45 | testWithData(i => Data(floatList = (i to i + 100).map(_.toFloat)))
46 | testWithData(i => Data(doubleList = (i to i + 100).map(_.toDouble)))
47 | testWithData(i => Data(textList = (i to i + 100).map(_.toString)))
48 | testWithData(i => Data(enumList = (i to i + 100).map(Data.ABC.fromValue)))
49 | testWithData(i => Data(msgList = (i to i + 100).map(i => Data.Inner(i.toString))))
50 | }
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/scalapb/src/test/scala/com/github/mjakubowski84/parquet4s/TestData.scala:
--------------------------------------------------------------------------------
1 | package com.github.mjakubowski84.parquet4s
2 |
3 | import com.github.mjakubowski84.parquet4s.DataOuterClass.Data as JData
4 | import scala.jdk.CollectionConverters.*
5 |
6 | object TestData {
7 |
8 | val javaData: Seq[JData] = (1 to 100)
9 | .map(i =>
10 | JData
11 | .newBuilder()
12 | .setBool(i % 2 == 0)
13 | .setInt(i)
14 | .setLong(i.toLong)
15 | .setFloat(i.toFloat)
16 | .setDouble(i.toDouble)
17 | .setText(i.toString)
18 | .setAbcValue(i % JData.ABC.values().length)
19 | .setInner(JData.Inner.newBuilder().setText(i.toString).build())
20 | .addAllBoolList((i to i + 100).map(_ % 2 == 0).map(java.lang.Boolean.valueOf).asJava)
21 | .addAllIntList((i to i + 100).map(Integer.valueOf).asJava)
22 | .addAllLongList((i to i + 100).map(_.toLong).map(java.lang.Long.valueOf).asJava)
23 | .addAllFloatList((i to i + 100).map(_.toFloat).map(java.lang.Float.valueOf).asJava)
24 | .addAllDoubleList((i to i + 100).map(_.toDouble).map(java.lang.Double.valueOf).asJava)
25 | .addAllTextList((i to i + 100).map(_.toString).asJava)
26 | .addAllEnumListValue((i to i + 100).map(_ % JData.ABC.values().length).map(Integer.valueOf).asJava)
27 | .addAllMsgList((i to i + 100).map(i => JData.Inner.newBuilder().setText(i.toString).build()).asJava)
28 | .build()
29 | )
30 | val scalaData: Seq[Data] = javaData.map(d => Data.parseFrom(d.toByteArray))
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/site/src/main/resources/docs/data/menu.yml:
--------------------------------------------------------------------------------
1 | options:
2 | - title: Introduction
3 | url: docs
4 |
5 | - title: Quick Start
6 | url: docs/quick_start
7 |
8 | - title: Integration with Akka Streams
9 | url: docs/akka
10 |
11 | - title: Integration with Pekko Streams
12 | url: docs/pekko
13 |
14 | - title: Integration with FS2
15 | url: docs/fs2
16 |
17 | - title: Supported storage types
18 | url: docs/storage_types
19 |
20 | - title: Records, types and schema
21 | url: docs/records_and_schema
22 |
23 | - title: Projection
24 | url: docs/projection
25 |
26 | - title: Filtering
27 | url: docs/filtering
28 |
29 | - title: Partitioning
30 | url: docs/partitioning
31 |
32 | - title: Statistics
33 | url: docs/statistics
34 |
35 | - title: Examples
36 | url: docs/examples
37 |
38 | - title: Migration from 1.x
39 | url: docs/migration
40 |
41 | - title: (Experimental) ETL
42 | url: docs/etl
43 |
44 | - title: (Experimental) Protobuf with ScalaPB
45 | url: docs/protobuf
46 |
47 | - title: Distinguished Sponsors
48 | url: docs/sponsors
49 |
--------------------------------------------------------------------------------
/site/src/main/resources/docs/docs/etl.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: docs
3 | title: (Experimental) ETL
4 | permalink: docs/etl/
5 | ---
6 |
7 | # (Experimental) ETL
8 |
 9 | Version 2.1.0 of Parquet4s introduces advanced operations on generic datasets, that is, on `ParquetIterable[RowParquetRecord]`, to the core module. Users can now join and concatenate two or more datasets, which can simplify many ETL jobs considerably.
10 |
11 | Available operations:
12 |
13 | - Left join
14 | - Right join
15 | - Inner join
16 | - Full join
17 | - Concat (appending one dataset to another)
18 | - Write called directly on a dataset.
19 |
 20 | Mind that joins require loading the right-side dataset into memory, so those operations are not suitable for very large right-side datasets. Consider swapping the positions of the datasets in your join operation (the left dataset is only iterated over), or use a tool such as Apache Spark, which distributes data across multiple machines to perform join operations.
21 |
22 | Please note that this is an experimental feature. API may change in the future, and some functionalities may be added or removed.
23 |
24 | ```scala mdoc:compile-only
25 | import com.github.mjakubowski84.parquet4s.{Col, ParquetReader, Path}
26 | import scala.util.Using
27 |
28 | case class PetOwner(id: Long, name: String, petId: Long, petName: String)
29 |
30 | // define 1st dataset
31 | val readOwners = ParquetReader
32 | .projectedGeneric(
33 | Col("id").as[Long],
34 | Col("name").as[String]
35 | )
36 | .read(Path("/owners"))
37 |
38 | // define 2nd dataset
39 | val readPets = ParquetReader
40 | .projectedGeneric(
41 | Col("id").as[Long].alias("petId"),
42 | Col("name").as[String].alias("petName"),
43 | Col("ownerId").as[Long]
44 | )
45 | .read(Path("/pets"))
46 |
47 | // join and write output dataset
48 | Using.resources(readOwners, readPets) { case (owners, pets) =>
49 | owners
50 | .innerJoin(right = pets, onLeft = Col("id"), onRight = Col("ownerId")) // define join operation
51 | .as[PetOwner] // set typed schema and codecs
52 | .writeAndClose(Path("/pet_owners/file.parquet")) // execute all including write to the disk
53 | }
54 |
55 | // take note that all operations defined above writeAndClose are lazy and are not executed before
56 | // writeAndClose is called
57 | ```
58 |
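 59 | Concatenation follows a similar pattern. Below is a minimal sketch, assuming that both datasets share the same schema and that `concat` accepts another generic dataset; the paths and the `User` class are illustrative:
 60 |
 61 | ```scala mdoc:compile-only
 62 | import com.github.mjakubowski84.parquet4s.{ParquetReader, Path}
 63 | import scala.util.Using
 64 |
 65 | case class User(id: Long, name: String)
 66 |
 67 | // define two generic datasets that are assumed to share the same schema
 68 | val readActive = ParquetReader.generic.read(Path("/users/active"))
 69 | val readArchived = ParquetReader.generic.read(Path("/users/archived"))
 70 |
 71 | Using.resources(readActive, readArchived) { case (active, archived) =>
 72 |   active
 73 |     .concat(archived) // append the archived dataset to the active one (lazy)
 74 |     .as[User] // set typed schema and codecs
 75 |     .writeAndClose(Path("/users/all/file.parquet")) // execute all including write to the disk
 76 | }
 77 | ```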
--------------------------------------------------------------------------------
/site/src/main/resources/docs/docs/examples.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: docs
3 | title: Examples
4 | permalink: docs/examples/
5 | ---
6 |
7 | # Examples
8 |
 9 | Please check the [examples](https://github.com/mjakubowski84/parquet4s/blob/master/examples) where you can find simple code covering the basics of the `core`, `akkaPekko` and `fs2` modules.
 10 |
 11 | Moreover, the examples contain two simple applications that combine Akka Streams / Pekko Streams or FS2 with Kafka. They show how you can write partitioned Parquet files with data coming from an indefinite stream.
12 |
--------------------------------------------------------------------------------
/site/src/main/resources/docs/docs/introduction.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: docs
3 | title: Introduction
4 | permalink: docs/
5 | ---
6 |
7 | # Introduction
8 |
 9 | Parquet4s is a simple I/O library for [Parquet](https://parquet.apache.org/). It allows you to easily read and write Parquet files in [Scala](https://www.scala-lang.org/).
 10 |
 11 | Use just a Scala case class to define the schema of your data. There is no need to use Avro, Protobuf, Thrift or other data serialisation systems. You can also use generic records if you don't want to use a case class.
 12 |
 13 | Parquet4s is compatible with files generated with [Apache Spark](https://spark.apache.org/). However, unlike in Spark, you do not have to start a cluster to perform I/O operations.
 14 |
 15 | It is based on the official [Parquet library](https://github.com/apache/parquet-mr), [Hadoop Client](https://github.com/apache/hadoop) and [Shapeless](https://github.com/milessabin/shapeless) (Shapeless is not used in the Scala 3 version).
 16 |
 17 | As it is based on Hadoop Client, you can connect to any Hadoop-compatible storage like AWS S3 or Google Cloud Storage.
 18 |
 19 | Integrations for [Akka Streams](https://doc.akka.io/docs/akka/current/stream/index.html), [Pekko Streams](https://pekko.apache.org/docs/pekko/current/stream/index.html) and [FS2](https://fs2.io/) are available.
 20 |
 21 | Released for Scala 2.12.x, 2.13.x and 3.3.x.
22 |
--------------------------------------------------------------------------------
/site/src/main/resources/docs/docs/projection.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: docs
3 | title: Projection
4 | permalink: docs/projection/
5 | ---
6 |
7 | # Projection
8 |
 9 | Schema projection is a way of optimizing reads. When calling `ParquetReader.as[MyData]`, Parquet4s reads the whole content of each Parquet record, even when you provide a case class that maps only a subset of the stored columns. The same happens when you use generic records by calling `ParquetReader.generic`. However, you can explicitly tell Parquet4s to use a different schema. In effect, all columns not matching your schema will be skipped and not read. You can define the projection schema in several ways:
10 |
11 | 1. by defining case class for typed read using `projectedAs`,
12 | 2. by defining generic column projection (allows reference to nested fields and aliases) using `projectedGeneric`,
13 | 3. by providing your own instance of Parquet's `MessageType` for generic read using `projectedGeneric`.
14 |
15 | ```scala mdoc:compile-only
16 | import com.github.mjakubowski84.parquet4s.{Col, ParquetIterable, ParquetReader, Path, RowParquetRecord}
17 | import org.apache.parquet.schema.MessageType
18 |
19 | // typed read
20 | case class MyData(column1: Int, columnX: String)
21 | val myData: ParquetIterable[MyData] =
22 | ParquetReader
23 | .projectedAs[MyData]
24 | .read(Path("file.parquet"))
25 |
26 | // generic read with column projection
27 | val records1: ParquetIterable[RowParquetRecord] =
28 | ParquetReader
29 | .projectedGeneric(
30 | Col("column1").as[Int],
31 | Col("columnX").as[String].alias("my_column"),
32 | )
33 | .read(Path("file.parquet"))
34 |
35 | // generic read with own instance of Parquet schema
36 | val schemaOverride: MessageType = ???
37 | val records2: ParquetIterable[RowParquetRecord] =
38 | ParquetReader
39 | .projectedGeneric(schemaOverride)
40 | .read(Path("file.parquet"))
41 | ```
42 |
43 |
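 44 | If you need to provide your own `MessageType` (the `schemaOverride` above), it can be built with Parquet's `Types` builder. A minimal sketch, with hypothetical column names matching the earlier snippets:
 45 |
 46 | ```scala mdoc:compile-only
 47 | import com.github.mjakubowski84.parquet4s.{ParquetIterable, ParquetReader, Path, RowParquetRecord}
 48 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, INT32}
 49 | import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED}
 50 | import org.apache.parquet.schema.{LogicalTypeAnnotation, MessageType, Types}
 51 |
 52 | // hand-crafted projection schema: only column1 and columnX will be read
 53 | val schemaOverride: MessageType = Types
 54 |   .buildMessage()
 55 |   .addField(Types.primitive(INT32, REQUIRED).named("column1"))
 56 |   .addField(Types.primitive(BINARY, OPTIONAL).as(LogicalTypeAnnotation.stringType()).named("columnX"))
 57 |   .named("projection_schema")
 58 |
 59 | val records: ParquetIterable[RowParquetRecord] =
 60 |   ParquetReader
 61 |     .projectedGeneric(schemaOverride)
 62 |     .read(Path("file.parquet"))
 63 | ```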
--------------------------------------------------------------------------------
/site/src/main/resources/docs/docs/protobuf.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: docs
3 | title: Read and write Parquet from and to Protobuf
4 | permalink: docs/protobuf/
5 | ---
6 |
7 | # Read and write Parquet from and to Protobuf
8 |
 9 | Using the original Java Parquet library, you can read and write Parquet to and from Protobuf. Parquet4s has `custom` functions in its API which can be leveraged for that. However, parquet-protobuf can only be used with Java models, not to mention other issues that make it hard to use, especially in Scala. You would prefer to use [ScalaPB](https://scalapb.github.io/) in Scala projects, right? Thanks to Parquet4s, you can! Import the ScalaPB extension into any Parquet4s project, whether it is Akka / Pekko, FS2 or plain Scala:
10 |
11 | ```scala
12 | "com.github.mjakubowski84" %% "parquet4s-scalapb" % "@VERSION@"
13 | ```
14 |
15 | Follow the ScalaPB [documentation](https://scalapb.github.io/docs/installation) to generate your Scala model from `.proto` files.
16 |
 17 | Then, import the Parquet4s type classes tailored for Protobuf. The rest of the code stays the same as in regular Parquet4s - no matter whether that is Akka / Pekko, FS2 or core!
18 |
19 | ```scala mdoc:compile-only
20 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits._
21 | import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path}
22 |
23 | import scala.util.Using
24 |
25 | case class GeneratedProtobufData(someField: Int)
26 |
27 | val data: Iterable[GeneratedProtobufData] = ??? // your data
28 | val path: Path = ??? // path to write to / to read from
29 |
30 | // write
31 | ParquetWriter.of[GeneratedProtobufData].writeAndClose(path.append("data.parquet"), data)
32 |
33 | // read
34 | Using.resource(ParquetReader.as[GeneratedProtobufData].read(path))(_.foreach(println))
35 | ```
36 |
37 | Please follow the [examples](https://github.com/mjakubowski84/parquet4s/tree/master/examples/src/main/scala/com/github/mjakubowski84/parquet4s/scalapb) to learn more.
38 |
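 39 | For reference, the code generation step mentioned above typically boils down to an sbt setup along these lines (a sketch based on the ScalaPB installation guide; the plugin versions are examples and may need adjusting):
 40 |
 41 | ```scala
 42 | // project/plugins.sbt
 43 | addSbtPlugin("com.thesamet" % "sbt-protoc" % "1.0.6")
 44 | libraryDependencies += "com.thesamet.scalapb" %% "compilerplugin" % "0.11.17"
 45 |
 46 | // build.sbt - generate Scala classes from files under src/main/protobuf
 47 | Compile / PB.targets := Seq(
 48 |   scalapb.gen() -> (Compile / sourceManaged).value / "scalapb"
 49 | )
 50 | ```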
--------------------------------------------------------------------------------
/site/src/main/resources/docs/docs/quick_start.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: docs
3 | title: Quick start
4 | permalink: docs/quick_start/
5 | ---
6 |
7 | # Quick start
8 |
9 | ## SBT
10 |
11 | ```scala
12 | libraryDependencies ++= Seq(
13 | "com.github.mjakubowski84" %% "parquet4s-core" % "@VERSION@",
14 | "org.apache.hadoop" % "hadoop-client" % yourHadoopVersion
15 | )
16 | ```
17 |
18 | ## Mill
19 |
20 | ```scala
21 | def ivyDeps = Agg(
22 | ivy"com.github.mjakubowski84::parquet4s-core:@VERSION@",
23 | ivy"org.apache.hadoop:hadoop-client:$yourHadoopVersion"
24 | )
25 | ```
26 |
27 | ```scala mdoc:compile-only
28 | import com.github.mjakubowski84.parquet4s.{ ParquetReader, ParquetWriter, Path }
29 |
30 | case class User(userId: String, name: String, created: java.sql.Timestamp)
31 |
32 | val users: Iterable[User] = Seq(
33 | User("1", "parquet", new java.sql.Timestamp(1L))
34 | )
35 | val path = Path("path/to/local/file.parquet")
36 |
37 | // writing
38 | ParquetWriter.of[User].writeAndClose(path, users)
39 |
40 | // reading
41 | val parquetIterable = ParquetReader.as[User].read(path)
42 | try {
43 | parquetIterable.foreach(println)
44 | } finally parquetIterable.close()
45 | ```
46 |
47 | ## AWS S3
48 |
49 | Parquet4s works with AWS S3 and [many other distributed storage types]({% link docs/storage_types.md %}).
50 |
51 | In order to connect to AWS S3 you need to define one more dependency:
52 |
53 | ```scala
54 | "org.apache.hadoop" % "hadoop-aws" % yourHadoopVersion
55 | ```
56 |
 57 | Next, the most common way to authenticate is to define the following environment variables:
58 |
59 | ```bash
60 | export AWS_ACCESS_KEY_ID=my.aws.key
61 | export AWS_SECRET_ACCESS_KEY=my.secret.key
62 | ```
63 |
64 | Please refer to [documentation of Hadoop AWS](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/#Authenticating_with_S3) for more information on how to authenticate with S3.
65 |
66 | You may need to set some configuration properties to access your storage, e.g. `fs.s3a.path.style.access`.
67 | Please follow [documentation of Hadoop AWS](https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html) for more details and troubleshooting.
68 |
 69 | Moreover, you can refer to Parquet4s' [integration test](https://github.com/mjakubowski84/parquet4s/tree/master/s3Test/src/it) which proves that the integration with S3 works.
70 |
71 | ## Passing Hadoop Configs Programmatically
72 |
 73 | File system configs for S3, GCS, Hadoop, etc. can also be set programmatically on the `ParquetReader` and `ParquetWriter` by passing the `Configuration` to the `ParquetReader.Options` and `ParquetWriter.Options` case classes.
74 |
75 | ```scala mdoc:compile-only
76 | import com.github.mjakubowski84.parquet4s.{ ParquetReader, ParquetWriter, Path }
77 | import org.apache.parquet.hadoop.metadata.CompressionCodecName
78 | import org.apache.hadoop.conf.Configuration
79 |
80 | case class User(userId: String, name: String, created: java.sql.Timestamp)
81 |
82 | val users: Iterable[User] = Seq(
83 | User("1", "parquet", new java.sql.Timestamp(1L))
84 | )
85 |
86 | val writerOptions = ParquetWriter.Options(
87 | compressionCodecName = CompressionCodecName.SNAPPY,
88 | hadoopConf = new Configuration()
89 | )
90 |
91 | ParquetWriter
92 | .of[User]
93 | .options(writerOptions)
94 | .writeAndClose(Path("path/to/local/file.parquet"), users)
95 | ```
96 |
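 97 | The same mechanism can be used to configure S3 access programmatically instead of via environment variables. Below is a minimal sketch mirroring the project's S3 integration test, with placeholder credentials and bucket name:
 98 |
 99 | ```scala mdoc:compile-only
 100 | import com.github.mjakubowski84.parquet4s.{ ParquetReader, ParquetWriter, Path }
 101 | import org.apache.hadoop.conf.Configuration
 102 |
 103 | case class User(userId: String, name: String, created: java.sql.Timestamp)
 104 |
 105 | val users: Iterable[User] = Seq(
 106 |   User("1", "parquet", new java.sql.Timestamp(1L))
 107 | )
 108 |
 109 | // placeholder values - substitute the credentials of your S3-compatible storage
 110 | val hadoopConf = new Configuration()
 111 | hadoopConf.set("fs.s3a.access.key", "my.aws.key")
 112 | hadoopConf.set("fs.s3a.secret.key", "my.secret.key")
 113 |
 114 | val path = Path("s3a://my-bucket/file.parquet")
 115 |
 116 | // writing
 117 | ParquetWriter.of[User].options(ParquetWriter.Options(hadoopConf = hadoopConf)).writeAndClose(path, users)
 118 |
 119 | // reading
 120 | val parquetIterable = ParquetReader.as[User].options(ParquetReader.Options(hadoopConf = hadoopConf)).read(path)
 121 | try parquetIterable.foreach(println)
 122 | finally parquetIterable.close()
 123 | ```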
--------------------------------------------------------------------------------
/site/src/main/resources/docs/docs/sponsors.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: docs
3 | title: Distinguished Sponsors
4 | permalink: docs/sponsors/
5 | ---
6 |
7 | # Distinguished Sponsors
8 |
9 | - [calvinlfer](https://github.com/calvinlfer)
10 |
--------------------------------------------------------------------------------
/site/src/main/resources/docs/docs/statistics.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: docs
3 | title: Statistics
4 | permalink: docs/statistics/
5 | ---
6 |
7 | # Statistics
8 |
9 | Parquet files contain metadata that is used to optimize [filtering]({% link docs/filtering.md %}). Additionally, Parquet4s leverages that metadata to provide insight into datasets in an efficient way:
10 |
11 | - Number of records
12 | - Min value of a column
13 | - Max value of a column
14 |
15 | Parquet4s tries to resolve those statistics without iterating over each record when possible. Statistics can also be queried using a filter, but please mind that the query might be slower because, due to filtering, the algorithm may need to iterate over the content of a row group to resolve min/max values. Query performance is best for sorted datasets.
16 |
17 | Parquet4s provides a separate API for statistics. It is also leveraged by `ParquetIterable`, e.g. to calculate `size` efficiently.
18 |
19 | ```scala mdoc:compile-only
20 | import com.github.mjakubowski84.parquet4s.{Col, Path, Stats}
21 |
22 | import java.time.LocalDate
23 | case class User(id: Long, age: Int, registered: LocalDate)
24 |
25 | // stats of users that registered in year 2020
26 | val userStats = Stats
27 | .builder
28 | .filter(Col("registered") >= LocalDate.of(2020, 1, 1) && Col("registered") < LocalDate.of(2021, 1, 1))
29 | .projection[User]
30 | .stats(Path("users"))
31 |
32 | val numberOfUsers = userStats.recordCount
33 | val minAge = userStats.min[Int](Col("age"))
34 | val maxAge = userStats.max[Int](Col("age"))
35 | ```
36 |
37 | ```scala mdoc:compile-only
38 | import com.github.mjakubowski84.parquet4s.{Col, ParquetReader, Path, Stats}
39 |
40 | import java.time.LocalDate
41 | case class User(id: Long, age: Int, registered: LocalDate)
42 |
43 | // users that registered in year 2020
44 | val users = ParquetReader
45 | .projectedAs[User]
46 | .filter(Col("registered") >= LocalDate.of(2020, 1, 1) && Col("registered") < LocalDate.of(2021, 1, 1))
47 | .read(Path("users"))
48 |
49 | try {
50 | val numberOfUsers = users.size
51 | val minAge = users.min[Int](Col("age"))
52 | val maxAge = users.max[Int](Col("age"))
53 | } finally {
54 | users.close()
55 | }
56 | ```
57 |
--------------------------------------------------------------------------------
/site/src/main/resources/docs/docs/storage_types.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: docs
3 | title: Supported storage types
4 | permalink: docs/storage_types/
5 | ---
6 |
7 | # Supported storage types
8 |
9 | As it is based on the Hadoop Client, Parquet4s can read from and write to a variety of file systems:
10 |
11 | - Local files
12 | - HDFS
13 | - [Amazon S3](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)
14 | - [Google Storage](https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage)
15 | - [Azure Blob Storage](https://hadoop.apache.org/docs/stable/hadoop-azure/index.html)
16 | - [Azure Data Lake Storage](https://hadoop.apache.org/docs/stable/hadoop-azure-datalake/index.html)
17 | - and any other storage compatible with Hadoop...
18 |
19 | Please refer to the Hadoop Client documentation or to your storage provider's documentation to check how to connect to your storage.
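
For illustration, the storage is selected by the URI scheme of the `Path` while the Parquet4s API stays the same. The bucket, container, and host names below are hypothetical:

```scala
import com.github.mjakubowski84.parquet4s.Path

// Hypothetical locations; replace them with your own buckets, containers and clusters.
val localPath = Path("file:///data/users.parquet")
val hdfsPath  = Path("hdfs://namenode:8020/data/users.parquet")
val s3Path    = Path("s3a://my-bucket/data/users.parquet")
val gcsPath   = Path("gs://my-bucket/data/users.parquet")
val azurePath = Path("abfss://container@account.dfs.core.windows.net/data/users.parquet")
```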
20 |
--------------------------------------------------------------------------------
/site/src/main/resources/docs/images/favicon-16x16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjakubowski84/parquet4s/036d8a03c1febb087813309f797414ed860e7992/site/src/main/resources/docs/images/favicon-16x16.png
--------------------------------------------------------------------------------
/site/src/main/resources/docs/images/favicon-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mjakubowski84/parquet4s/036d8a03c1febb087813309f797414ed860e7992/site/src/main/resources/docs/images/favicon-32x32.png
--------------------------------------------------------------------------------
/site/src/main/resources/docs/images/features-header.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/site/src/main/resources/docs/images/light-navbar-brand.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/site/src/main/resources/docs/images/light-sidebar-brand.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/site/src/main/resources/docs/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: homeFeatures
3 | features:
4 | - first: ["Quick start", "How to use Parquet4s in just a few steps", "quick_start"]
5 | - third: ["Documentation", "All you need to know about Parquet4s"]
6 | ---
7 |
--------------------------------------------------------------------------------