├── .circleci └── config.yml ├── .github └── FUNDING.yml ├── .gitignore ├── .java-version ├── .jvmopts ├── .scalafmt.conf ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── akkaPekko └── src │ ├── it │ └── scala │ │ └── com │ │ └── github │ │ └── mjakubowski84 │ │ └── parquet4s │ │ └── ParquetStreamsITSpec.scala │ └── main │ ├── scala-akka-jvm │ └── com │ │ └── github │ │ └── mjakubowski84 │ │ └── parquet4s │ │ └── ScalaCompat.scala │ ├── scala-pekko-jvm │ └── com │ │ └── github │ │ └── mjakubowski84 │ │ └── parquet4s │ │ └── ScalaCompat.scala │ └── scala │ └── com │ └── github │ └── mjakubowski84 │ └── parquet4s │ ├── ParquetPartitioningFlow.scala │ ├── ParquetSource.scala │ ├── ParquetStreams.scala │ └── SingleFileParquetSink.scala ├── akkaPekkoBenchmarks └── src │ └── main │ └── scala │ └── com │ └── github │ └── mjakubowski84 │ └── parquet4s │ └── AkkaPekkoBenchmark.scala ├── build.sbt ├── core └── src │ ├── it │ ├── resources │ │ └── logback-test.xml │ ├── scala-2.12 │ │ └── com │ │ │ └── github │ │ │ └── mjakubowski84 │ │ │ └── parquet4s │ │ │ ├── CustomTypeITSpec.scala │ │ │ ├── ParquetWriterAndSparkCompatibilityItSpec.scala │ │ │ ├── SparkAndParquetReaderCompatibilityItSpec.scala │ │ │ ├── SparkHelper.scala │ │ │ ├── TestCaseSupport.scala │ │ │ ├── TimeEncodingCompatibilityItSpec.scala │ │ │ ├── TimeEncodingInt64MicrosCompatibilityItSpec.scala │ │ │ ├── TimeEncodingInt64MillisCompatibilityItSpec.scala │ │ │ ├── TimeEncodingInt64NanosCompatibilityItSpec.scala │ │ │ └── TimeEncodingInt96CompatibilityItSpec.scala │ ├── scala-2.13 │ │ └── com │ │ │ └── github │ │ │ └── mjakubowski84 │ │ │ └── parquet4s │ │ │ ├── CustomTypeITSpec.scala │ │ │ ├── ParquetWriterAndSparkCompatibilityItSpec.scala │ │ │ ├── SparkAndParquetReaderCompatibilityItSpec.scala │ │ │ ├── SparkHelper.scala │ │ │ ├── TestCaseSupport.scala │ │ │ ├── TimeEncodingCompatibilityItSpec.scala │ │ │ ├── TimeEncodingInt64MicrosCompatibilityItSpec.scala │ │ │ ├── TimeEncodingInt64MillisCompatibilityItSpec.scala │ │ │ ├── TimeEncodingInt64NanosCompatibilityItSpec.scala │ │ │ └── TimeEncodingInt96CompatibilityItSpec.scala │ ├── scala-3 │ │ └── com │ │ │ └── github │ │ │ └── mjakubowski84 │ │ │ └── parquet4s │ │ │ ├── CustomTypeITSpec.scala │ │ │ └── TestCaseSupport.scala │ └── scala │ │ └── com │ │ └── github │ │ └── mjakubowski84 │ │ └── parquet4s │ │ ├── CompatibilityTestCases.scala │ │ ├── FilteringByListItSpec.scala │ │ ├── FilteringITSpec.scala │ │ ├── IOOpsITSpec.scala │ │ ├── ParquetReaderItSpec.scala │ │ ├── ParquetWriterAndParquetReaderCompatibilityItSpec.scala │ │ ├── ParquetWriterItSpec.scala │ │ ├── ProjectionItSpec.scala │ │ ├── RecordFilterItSpec.scala │ │ ├── TestUtils.scala │ │ └── stats │ │ ├── CompoundAndPartitionedStatsITSpec.scala │ │ ├── FileStatsItSpec.scala │ │ └── FilteredFileStatsItSpec.scala │ ├── main │ ├── scala-2.12 │ │ └── com │ │ │ └── github │ │ │ └── mjakubowski84 │ │ │ └── parquet4s │ │ │ ├── ParquetRecordDecoder.scala │ │ │ ├── ParquetRecordEncoder.scala │ │ │ ├── ParquetSchemaResolver.scala │ │ │ ├── ProductDecoders.scala │ │ │ ├── ProductEncoders.scala │ │ │ ├── ProductSchemaDefs.scala │ │ │ └── compat │ │ │ ├── CursorCompat.scala │ │ │ ├── IteratorCompat.scala │ │ │ └── MapCompat.scala │ ├── scala-2.13 │ │ └── com │ │ │ └── github │ │ │ └── mjakubowski84 │ │ │ └── parquet4s │ │ │ ├── ParquetRecordDecoder.scala │ │ │ ├── ParquetRecordEncoder.scala │ │ │ ├── ParquetSchemaResolver.scala │ │ │ ├── ProductDecoders.scala │ │ │ ├── ProductEncoders.scala │ │ │ ├── ProductSchemaDefs.scala │ │ │ └── 
compat │ │ │ ├── CursorCompat.scala │ │ │ ├── IteratorCompat.scala │ │ │ └── MapCompat.scala │ ├── scala-3 │ │ └── com │ │ │ └── github │ │ │ └── mjakubowski84 │ │ │ └── parquet4s │ │ │ ├── ParquetRecordDecoder.scala │ │ │ ├── ParquetRecordEncoder.scala │ │ │ ├── ParquetSchemaResolver.scala │ │ │ ├── ProductDecoders.scala │ │ │ ├── ProductEncoders.scala │ │ │ ├── ProductSchemaDefs.scala │ │ │ └── compat │ │ │ ├── CursorCompat.scala │ │ │ ├── IteratorCompat.scala │ │ │ └── MapCompat.scala │ └── scala │ │ └── com │ │ └── github │ │ └── mjakubowski84 │ │ └── parquet4s │ │ ├── ColumnPath.scala │ │ ├── ColumnProjection.scala │ │ ├── Cursor.scala │ │ ├── DecimalFormat.scala │ │ ├── Filter.scala │ │ ├── HadoopParquetReader.scala │ │ ├── IOOps.scala │ │ ├── InMemoryInputFile.scala │ │ ├── InMemoryOutputFile.scala │ │ ├── MetadataWriter.scala │ │ ├── ParquetIterable.scala │ │ ├── ParquetIterator.scala │ │ ├── ParquetReadSupport.scala │ │ ├── ParquetReader.scala │ │ ├── ParquetRecord.scala │ │ ├── ParquetWriter.scala │ │ ├── PartitionFilter.scala │ │ ├── Path.scala │ │ ├── Schema.scala │ │ ├── Stats.scala │ │ ├── TimestampFormat.scala │ │ ├── UDP.scala │ │ ├── Value.scala │ │ ├── ValueCodec.scala │ │ ├── ValueCodecConfiguration.scala │ │ ├── ValueCodecs.scala │ │ ├── ValueDecoder.scala │ │ ├── ValueEncoder.scala │ │ ├── ValueImplicits.scala │ │ ├── etl │ │ ├── CompoundParquetIterable.scala │ │ ├── InMemoryParquetIterable.scala │ │ └── Join.scala │ │ ├── experimental.scala │ │ └── stats │ │ ├── CompoundStats.scala │ │ ├── FileStats.scala │ │ ├── FilteredFileStats.scala │ │ ├── InMemoryStats.scala │ │ ├── LazyDelegateStats.scala │ │ └── PartitionedFileStats.scala │ └── test │ ├── resources │ └── logback-test.xml │ └── scala │ └── com │ └── github │ └── mjakubowski84 │ └── parquet4s │ ├── ColumnPathSpec.scala │ ├── CursorSpec.scala │ ├── DecimalFormatSpec.scala │ ├── DecimalValueSpec.scala │ ├── FilterSpec.scala │ ├── IOOpsSpec.scala │ ├── InMemoryFileSpec.scala │ ├── ParquetIterableSpec.scala │ ├── ParquetIteratorSpec.scala │ ├── ParquetRecordDecoderSpec.scala │ ├── ParquetRecordEncoderSpec.scala │ ├── ParquetRecordSpec.scala │ ├── ParquetSchemaResolverSpec.scala │ ├── PartitionFilterSpec.scala │ ├── PartitionTestUtils.scala │ ├── SkippingParquetSchemaResolverSpec.scala │ ├── TestCases.scala │ ├── TimestampFormatSpec.scala │ ├── ValueCodecsSpec.scala │ ├── ValueEncodingAndDecodingSpec.scala │ └── etl │ └── JoinSpec.scala ├── coreBenchmarks └── src │ └── main │ └── scala │ └── com │ └── github │ └── mjakubowski84 │ └── parquet4s │ └── CoreBenchmark.scala ├── examples └── src │ └── main │ ├── protobuf │ └── data.proto │ ├── resources │ └── logback.xml │ ├── scala-akka-jvm │ └── com │ │ └── github │ │ └── mjakubowski84 │ │ └── parquet4s │ │ └── ScalaKafkaCompat.scala │ ├── scala-pekko-jvm │ └── com │ │ └── github │ │ └── mjakubowski84 │ │ └── parquet4s │ │ └── ScalaKafkaCompat.scala │ └── scala │ └── com │ └── github │ └── mjakubowski84 │ └── parquet4s │ ├── CustomType.scala │ ├── akkaPekko │ ├── CustomAvroWriteAndReadAkkaPekkoApp.scala │ ├── CustomPartitioningAvroWriteAkkaPekkoApp.scala │ ├── CustomProtobufWriteAndReadAkkaPekkoApp.scala │ ├── WriteAndReadAkkaPekkoApp.scala │ ├── WriteAndReadCustomTypeAkkaPekkoApp.scala │ ├── WriteAndReadFilteredAkkaPekkoApp.scala │ ├── WriteAndReadGenericAkkaPekkoApp.scala │ └── indefinite │ │ ├── AkkaPekko.scala │ │ ├── ExampleApp.scala │ │ ├── Kafka.scala │ │ ├── Logger.scala │ │ ├── MessageSink.scala │ │ ├── MessageSource.scala │ │ └── RandomDataProducer.scala │ ├── 
core │ ├── ColumnProjectionAndDataConcatenationApp.scala │ ├── ETLApp.scala │ ├── WriteAndReadApp.scala │ ├── WriteAndReadCustomTypeApp.scala │ ├── WriteAndReadFilteredApp.scala │ ├── WriteAndReadGenericApp.scala │ ├── WriteAndReadUsingRecordFilterApp.scala │ └── WriteIncrementallyAndReadApp.scala │ ├── fs2 │ ├── CustomAvroPartitioningWriteFS2App.scala │ ├── CustomAvroWriteAndReadFS2App.scala │ ├── CustomProtobufWriteAndReadFS2App.scala │ ├── IndefiniteFS2App.scala │ ├── WriteAndReadFS2App.scala │ ├── WriteAndReadFilteredFS2App.scala │ └── WriteAndReadGenericFS2App.scala │ └── scalapb │ ├── WriteAndReadApp.scala │ └── WriteIncrementallyAndReadApp.scala ├── fs2 └── src │ ├── it │ ├── resources │ │ └── logback-test.xml │ └── scala │ │ └── com │ │ └── github │ │ └── mjakubowski84 │ │ └── parquet4s │ │ ├── Fs2ParquetItSpec.scala │ │ └── parquet │ │ └── IoITSpec.scala │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── mjakubowski84 │ │ └── parquet4s │ │ └── parquet │ │ ├── io.scala │ │ ├── logger.scala │ │ ├── package.scala │ │ ├── reader.scala │ │ ├── rotatingWriter.scala │ │ └── writer.scala │ └── test │ └── scala │ └── com │ └── github │ └── mjakubowski84 │ └── parquet4s │ └── parquet │ └── IoSpec.scala ├── fs2Benchmarks └── src │ └── main │ └── scala │ └── com │ └── github │ └── mjakubowski84 │ └── parquet4s │ └── Fs2Benchmark.scala ├── project ├── ActorLibCross.scala ├── Compilation.scala ├── DependecyVersions.scala ├── Documentation.scala ├── Releasing.scala ├── Signing.scala ├── build.properties ├── metals.sbt └── plugins.sbt ├── s3Test └── src │ └── it │ ├── resources │ └── logback-test.xml │ └── scala │ └── com │ └── github │ └── mjakubowski84 │ └── parquet4s │ └── s3 │ └── S3ItSpec.scala ├── scalapb └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── mjakubowski84 │ │ └── parquet4s │ │ ├── ScalaPBImplicits.scala │ │ ├── ScalaPBParquetRecordDecoder.scala │ │ ├── ScalaPBParquetRecordEncoder.scala │ │ └── ScalaPBParquetSchemaResolver.scala │ └── test │ ├── protobuf │ └── data.proto │ └── scala │ └── com │ └── github │ └── mjakubowski84 │ └── parquet4s │ ├── Parquet4sScalaPBAkkaPekkoSpec.scala │ ├── Parquet4sScalaPBCoreSpec.scala │ ├── Parquet4sScalaPBFS2Spec.scala │ ├── Parquet4sScalaPBSpec.scala │ └── TestData.scala └── site └── src └── main └── resources └── docs ├── data └── menu.yml ├── docs ├── akka.md ├── etl.md ├── examples.md ├── filtering.md ├── fs2.md ├── introduction.md ├── migration.md ├── partitioning.md ├── pekko.md ├── projection.md ├── protobuf.md ├── quick_start.md ├── records_and_schema.md ├── sponsors.md ├── statistics.md └── storage_types.md ├── images ├── favicon-16x16.png ├── favicon-32x32.png ├── features-header.svg ├── light-navbar-brand.svg └── light-sidebar-brand.svg └── index.md /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: mjakubowski84 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | *.iml 4 | 5 | .vscode 6 | .metals 7 | .bloop 8 | .idea 9 | .bsp 10 | .sbt 11 | 12 | target 13 | spark-warehouse 14 | project/.plugins.sbt.swp 15 | project/project 16 | -------------------------------------------------------------------------------- /.java-version: -------------------------------------------------------------------------------- 1 | 17 2 | -------------------------------------------------------------------------------- 
/.jvmopts: -------------------------------------------------------------------------------- 1 | -Xmx4G 2 | -Xss2M 3 | -Xms512m 4 | -XX:+UseG1GC 5 | -XX:MaxInlineLevel=20 6 | # --add-opens=java.base/java.lang=ALL-UNNAMED 7 | # --add-opens=java.base/java.lang.invoke=ALL-UNNAMED 8 | # --add-opens=java.base/java.lang.reflect=ALL-UNNAMED 9 | # --add-opens=java.base/java.io=ALL-UNNAMED 10 | # --add-opens=java.base/java.net=ALL-UNNAMED 11 | # --add-opens=java.base/java.nio=ALL-UNNAMED 12 | # --add-opens=java.base/java.util=ALL-UNNAMED 13 | # --add-opens=java.base/java.util.concurrent=ALL-UNNAMED 14 | # --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED 15 | # --add-opens=java.base/sun.nio.ch=ALL-UNNAMED 16 | # --add-opens=java.base/sun.nio.cs=ALL-UNNAMED 17 | # --add-opens=java.base/sun.security.action=ALL-UNNAMED 18 | # --add-opens=java.base/sun.util.calendar=ALL-UNNAMED 19 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 3.9.1 2 | preset = default 3 | maxColumn = 120 4 | 5 | rewrite.rules = [ 6 | RedundantBraces, 7 | RedundantParens, 8 | SortModifiers, 9 | PreferCurlyFors, 10 | Imports 11 | ] 12 | 13 | align.preset = more 14 | align.tokens = [ 15 | {code = "="}, 16 | {code = "=>"}, 17 | {code = "<-"} 18 | ] 19 | 20 | runner.dialect = scala213source3 21 | fileOverride { 22 | "glob:**/scala-2.12/**" { 23 | runner.dialect = scala212source3 24 | } 25 | "glob:**/scala-2.13/**" { 26 | runner.dialect = scala213source3 27 | } 28 | "glob:**/scala-3/**" { 29 | runner.dialect = scala3 30 | } 31 | } 32 | project.excludePaths = [ 33 | "glob:**/metals.sbt" 34 | ] 35 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | I am happy that you are reading this! Your contribution is warmly welcome! Here you can read the guidelines that will make contributing easier. 4 | 5 | 1. Before contributing, please create a GitHub issue (if one does not exist already) and discuss your case and the solution with the authors. 6 | 7 | 2. Propose your changes via pull request. 8 | 9 | 3. Please write descriptive messages in your commits and pull requests so that everything is easy to understand. 10 | 11 | 4. Should your change be covered by a missing test? Write it! 12 | 13 | 5. Remember to update README.md if necessary. 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Marcin Jakubowski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Parquet4S 2 | 3 | 4 | 5 | Parquet4s is a simple I/O library for [Parquet](https://parquet.apache.org/) that allows you to easily read and write Parquet files in [Scala](https://www.scala-lang.org/). 6 | 7 | Use just a Scala case class to define the schema of your data. There is no need to use Avro, Protobuf, Thrift, or another data serialisation system. You can also use generic records if you prefer not to define a case class. 8 | 9 | Compatible with files generated with [Apache Spark](https://spark.apache.org/). However, unlike in Spark, you do not have to start a cluster to perform I/O operations. 10 | 11 | Based on the official [Parquet library](https://github.com/apache/parquet-mr), [Hadoop Client](https://github.com/apache/hadoop) and [Shapeless](https://github.com/milessabin/shapeless) (Shapeless is not used in the Scala 3 version). 12 | 13 | As it is based on Hadoop Client, you can connect to any Hadoop-compatible storage like AWS S3 or Google Cloud Storage. 14 | 15 | Integrations are available for [Akka Streams](https://doc.akka.io/docs/akka/current/stream/index.html), [Pekko Streams](https://pekko.apache.org/docs/pekko/current/stream/index.html), and [FS2](https://fs2.io/). 16 | 17 | Released for Scala 2.12.x, 2.13.x and 3.3.x. 18 | 19 | ## Documentation 20 | 21 | Documentation is available [here](https://mjakubowski84.github.io/parquet4s/). 22 | 23 | ## Contributing 24 | 25 | Do you want to contribute? Please read the [contribution guidelines](CONTRIBUTING.md).
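For a quick taste of the core API, here is a minimal sketch; the `User` case class and the file path are made up for the example, while the calls mirror those used in this repository's tests and documentation:

```scala
import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path}

case class User(id: Long, name: String)

object QuickExample extends App {
  val users = Seq(User(1L, "Ana"), User(2L, "Bob"))
  val path  = Path("/tmp/users.parquet")

  // Write a sequence of case class instances to a single Parquet file and close the writer.
  ParquetWriter.of[User].writeAndClose(path, users)

  // Read the file back; the returned iterable has to be closed after it is consumed.
  val iterable = ParquetReader.as[User].read(path)
  try iterable.foreach(println)
  finally iterable.close()
}
```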
26 | 27 | ## Sponsors 28 | 29 | - [calvinlfer](https://github.com/calvinlfer) 30 | -------------------------------------------------------------------------------- /akkaPekko/src/main/scala-akka-jvm/com/github/mjakubowski84/parquet4s/ScalaCompat.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | object ScalaCompat { 4 | 5 | type NotUsed = akka.NotUsed 6 | def NotUsed = akka.NotUsed 7 | 8 | type Done = akka.Done 9 | def Done = akka.Done 10 | 11 | object stream { 12 | type Attributes = akka.stream.Attributes 13 | 14 | def ActorAttributes = akka.stream.ActorAttributes 15 | 16 | type Inlet[T] = akka.stream.Inlet[T] 17 | def Inlet = akka.stream.Inlet 18 | 19 | type Outlet[T] = akka.stream.Outlet[T] 20 | def Outlet = akka.stream.Outlet 21 | 22 | type Shape = akka.stream.Shape 23 | 24 | type FlowShape[I, O] = akka.stream.FlowShape[I, O] 25 | def FlowShape = akka.stream.FlowShape 26 | 27 | object stage { 28 | type GraphStage[S <: Shape] = akka.stream.stage.GraphStage[S] 29 | 30 | type GraphStageLogic = akka.stream.stage.GraphStageLogic 31 | 32 | type TimerGraphStageLogic = akka.stream.stage.TimerGraphStageLogic 33 | 34 | type InHandler = akka.stream.stage.InHandler 35 | 36 | type OutHandler = akka.stream.stage.OutHandler 37 | } 38 | 39 | object scaladsl { 40 | type Source[Out, Mat] = akka.stream.scaladsl.Source[Out, Mat] 41 | def Source = akka.stream.scaladsl.Source 42 | 43 | type Flow[In, Out, Mat] = akka.stream.scaladsl.Flow[In, Out, Mat] 44 | def Flow[T] = akka.stream.scaladsl.Flow[T] 45 | 46 | def Keep = akka.stream.scaladsl.Keep 47 | 48 | type Sink[In, Mat] = akka.stream.scaladsl.Sink[In, Mat] 49 | def Sink = akka.stream.scaladsl.Sink 50 | } 51 | } 52 | 53 | object pattern { 54 | type AskSupport = akka.pattern.AskSupport 55 | } 56 | 57 | object actor { 58 | type Actor = akka.actor.Actor 59 | def Actor = akka.actor.Actor 60 | 61 | type ActorRef = akka.actor.ActorRef 62 | def ActorRef = akka.actor.ActorRef 63 | 64 | type CoordinatedShutdown = akka.actor.CoordinatedShutdown 65 | def CoordinatedShutdown = akka.actor.CoordinatedShutdown 66 | 67 | type Cancellable = akka.actor.Cancellable 68 | 69 | type Props = akka.actor.Props 70 | def Props = akka.actor.Props 71 | 72 | type Scheduler = akka.actor.Scheduler 73 | 74 | type ActorSystem = akka.actor.ActorSystem 75 | def ActorSystem = akka.actor.ActorSystem 76 | } 77 | 78 | object util { 79 | type Timeout = akka.util.Timeout 80 | def Timeout = akka.util.Timeout 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /akkaPekko/src/main/scala-pekko-jvm/com/github/mjakubowski84/parquet4s/ScalaCompat.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | object ScalaCompat { 4 | 5 | type NotUsed = org.apache.pekko.NotUsed 6 | def NotUsed = org.apache.pekko.NotUsed 7 | 8 | type Done = org.apache.pekko.Done 9 | def Done = org.apache.pekko.Done 10 | 11 | object stream { 12 | type Attributes = org.apache.pekko.stream.Attributes 13 | 14 | def ActorAttributes = org.apache.pekko.stream.ActorAttributes 15 | 16 | type Inlet[T] = org.apache.pekko.stream.Inlet[T] 17 | def Inlet = org.apache.pekko.stream.Inlet 18 | 19 | type Outlet[T] = org.apache.pekko.stream.Outlet[T] 20 | def Outlet = org.apache.pekko.stream.Outlet 21 | 22 | type Shape = org.apache.pekko.stream.Shape 23 | 24 | type FlowShape[I, O] = org.apache.pekko.stream.FlowShape[I, O] 25 | 
def FlowShape = org.apache.pekko.stream.FlowShape 26 | 27 | object stage { 28 | type GraphStage[S <: Shape] = org.apache.pekko.stream.stage.GraphStage[S] 29 | 30 | type GraphStageLogic = org.apache.pekko.stream.stage.GraphStageLogic 31 | 32 | type TimerGraphStageLogic = org.apache.pekko.stream.stage.TimerGraphStageLogic 33 | 34 | type InHandler = org.apache.pekko.stream.stage.InHandler 35 | 36 | type OutHandler = org.apache.pekko.stream.stage.OutHandler 37 | } 38 | 39 | object scaladsl { 40 | type Source[Out, Mat] = org.apache.pekko.stream.scaladsl.Source[Out, Mat] 41 | def Source = org.apache.pekko.stream.scaladsl.Source 42 | 43 | type Flow[In, Out, Mat] = org.apache.pekko.stream.scaladsl.Flow[In, Out, Mat] 44 | def Flow[T] = org.apache.pekko.stream.scaladsl.Flow[T] 45 | 46 | def Keep = org.apache.pekko.stream.scaladsl.Keep 47 | 48 | type Sink[In, Mat] = org.apache.pekko.stream.scaladsl.Sink[In, Mat] 49 | def Sink = org.apache.pekko.stream.scaladsl.Sink 50 | } 51 | } 52 | 53 | object pattern { 54 | type AskSupport = org.apache.pekko.pattern.AskSupport 55 | } 56 | 57 | object actor { 58 | type Actor = org.apache.pekko.actor.Actor 59 | def Actor = org.apache.pekko.actor.Actor 60 | 61 | type ActorRef = org.apache.pekko.actor.ActorRef 62 | def ActorRef = org.apache.pekko.actor.ActorRef 63 | 64 | type CoordinatedShutdown = org.apache.pekko.actor.CoordinatedShutdown 65 | def CoordinatedShutdown = org.apache.pekko.actor.CoordinatedShutdown 66 | 67 | type Cancellable = org.apache.pekko.actor.Cancellable 68 | 69 | type Props = org.apache.pekko.actor.Props 70 | def Props = org.apache.pekko.actor.Props 71 | 72 | type Scheduler = org.apache.pekko.actor.Scheduler 73 | 74 | type ActorSystem = org.apache.pekko.actor.ActorSystem 75 | def ActorSystem = org.apache.pekko.actor.ActorSystem 76 | } 77 | 78 | object util { 79 | type Timeout = org.apache.pekko.util.Timeout 80 | def Timeout = org.apache.pekko.util.Timeout 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /akkaPekko/src/main/scala/com/github/mjakubowski84/parquet4s/ParquetStreams.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | /** Holds factory of Akka Streams / Pekko Streams sources and sinks that allow reading from and writing to Parquet 4 | * files. 5 | */ 6 | object ParquetStreams { 7 | 8 | /** Creates a [[com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Source]] that reads Parquet data from 9 | * the specified path. If there are multiple files at path then the order in which files are loaded is determined by 10 | * underlying filesystem.
Path can refer to a local file, HDFS, AWS S3, Google Storage, Azure, etc. Please refer 11 | * to the Hadoop client documentation or to your data provider to learn how to configure the connection.
Can 12 | * also read partitioned directories. Filters apply to partition values as well. Partition values are set as 13 | * fields in read entities at the path defined by the partition name. The path can be a simple column name or a dot-separated 14 | * path to a nested field. Missing intermediate fields are automatically created for each read record.
Allows 15 | * turning on a projection over the original file schema in order to boost read performance when not all columns are 16 | * required to be read.
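* A usage sketch for illustration; the `User` case class and the path are hypothetical, and the builder calls
* (`as`, `read`) are assumed from the project documentation:
* {{{
*   case class User(id: Long, name: String)
*   // Emits every User record found under the given path (including partitioned subdirectories).
*   val users = ParquetStreams.fromParquet.as[User].read(Path("file:///data/users"))
* }}}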
Provides explicit API for both custom data types and generic records. 17 | * @return 18 | * Builder of the source. 19 | */ 20 | def fromParquet: ParquetSource.FromParquet = ParquetSource.FromParquetImpl 21 | 22 | /** Creates a [[com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Sink]] that writes Parquet data to 23 | * single file at the specified path (including file name).
Path can refer to a local file, HDFS, AWS S3, Google 24 | * Storage, Azure, etc. Please refer to the Hadoop client documentation or to your data provider to learn how to 25 | * configure the connection.
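* A usage sketch for illustration; the `User` case class and the target file name are hypothetical, and the builder
* calls (`of`, `write`) are assumed from the project documentation:
* {{{
*   case class User(id: Long, name: String)
*   // Materialising a stream of User records into this sink writes them all to the single file below.
*   val sink = ParquetStreams.toParquetSingleFile.of[User].write(Path("file:///data/users/part-00000.parquet"))
* }}}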
Provides explicit API for both custom data types and generic records. 26 | * @return 27 | * Builder of a sink that writes Parquet file 28 | */ 29 | def toParquetSingleFile: SingleFileParquetSink.ToParquet = SingleFileParquetSink.ToParquetImpl 30 | 31 | /** Builds a flow that:
  1. Is designed to write Parquet files indefinitely
  2. Is able to (optionally) 32 | * partition data by a list of provided fields
  3. Flushes and rotates files after a given number of rows is 33 | * written to the partition or a given time period elapses
  4. Outputs the incoming message after it is written, but 34 | * can output the effect of a provided message transformation instead.
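* A usage sketch for illustration; the `User` case class, the partition column, and the rotation limits are
* hypothetical, and the builder calls (`of`, `partitionBy`, `maxCount`, `maxDuration`, `write`) are assumed from the
* project documentation:
* {{{
*   import scala.concurrent.duration.*
*   case class User(id: Long, country: String, name: String)
*   // Writes an indefinite stream of User records, partitioned by country, rotating files by row count and time.
*   val flow = ParquetStreams.viaParquet
*     .of[User]
*     .partitionBy(Col("country"))
*     .maxCount(1024 * 1024)
*     .maxDuration(30.seconds)
*     .write(Path("file:///data/users"))
* }}}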

Provides explicit API for both custom 35 | * data types and generic records. 36 | * @return 37 | * Builder of the flow. 38 | */ 39 | def viaParquet: ParquetPartitioningFlow.ViaParquet = ParquetPartitioningFlow.ViaParquetImpl 40 | } 41 | -------------------------------------------------------------------------------- /core/src/it/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/ParquetWriterAndSparkCompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.Case.CaseDef 4 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.* 5 | import org.scalatest.BeforeAndAfter 6 | import org.scalatest.freespec.AnyFreeSpec 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | class ParquetWriterAndSparkCompatibilityItSpec extends AnyFreeSpec with Matchers with BeforeAndAfter with SparkHelper { 10 | 11 | before { 12 | clearTemp() 13 | } 14 | 15 | private def runTestCase(testCase: CaseDef): Unit = 16 | testCase.description in { 17 | ParquetWriter.of[testCase.DataType](testCase.encoder, testCase.resolver).writeAndClose(tempPath, testCase.data) 18 | readFromTemp(testCase.typeTag) should contain theSameElementsAs testCase.data 19 | } 20 | 21 | "Spark should be able to read file saved by ParquetWriter if the file contains" - 22 | CompatibilityTestCases.cases(Writer, Spark).foreach(runTestCase) 23 | 24 | } 25 | -------------------------------------------------------------------------------- /core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/SparkAndParquetReaderCompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.* 4 | import SparkAndParquetReaderCompatibilityItSpec.Partitioned 5 | import org.scalatest.BeforeAndAfter 6 | import org.scalatest.freespec.AnyFreeSpec 7 | import org.scalatest.matchers.should.Matchers 8 | import scala.annotation.nowarn 9 | import scala.util.Using 10 | 11 | object SparkAndParquetReaderCompatibilityItSpec { 12 | case class Partitioned(partition: String, s: String) 13 | } 14 | 15 | @nowarn 16 | class SparkAndParquetReaderCompatibilityItSpec extends AnyFreeSpec with Matchers with BeforeAndAfter with SparkHelper { 17 | 18 | before { 19 | clearTemp() 20 | } 21 | 22 | private def runTestCase(testCase: Case.CaseDef): Unit = 23 | testCase.description in { 24 | writeToTemp(testCase.data)(testCase.typeTag) 25 | Using.resource(ParquetReader.as[testCase.DataType].read(tempPath)(testCase.decoder)) { parquetIterable => 26 | parquetIterable should contain theSameElementsAs testCase.data 27 | } 28 | } 29 | 30 | "ParquetReader should be able to read file saved by Spark if the file contains" - 31 | CompatibilityTestCases.cases(Spark, Reader).foreach(runTestCase) 32 | 33 | "ParquetReader should read data partitioned by Spark" in { 34 | import sparkSession.implicits.* 35 | val data = Seq( 36 | Partitioned(partition = "a", s = "a"), 37 | Partitioned(partition = "a=1", s = "a") 38 | ) 39 | 
data.toDS().write.partitionBy("partition").parquet(tempPath.toString) 40 | Using.resource(ParquetReader.as[Partitioned].read(tempPath)) { parquetIterable => 41 | parquetIterable should contain theSameElementsAs data 42 | } 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/SparkHelper.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.scalatest.{BeforeAndAfterAll, Suite} 5 | 6 | import scala.reflect.runtime.universe.TypeTag 7 | 8 | trait SparkHelper extends BeforeAndAfterAll with TestUtils { 9 | 10 | this: Suite => 11 | 12 | sealed trait OutputTimestampType 13 | case object Int96 extends OutputTimestampType 14 | case object TIMESTAMP_MICROS extends OutputTimestampType 15 | case object TIMESTAMP_MILLIS extends OutputTimestampType 16 | 17 | private var sparkStarted = false 18 | 19 | protected def outputTimestampType: OutputTimestampType = Int96 20 | 21 | lazy val sparkSession: SparkSession = { 22 | sparkStarted = true 23 | SparkSession 24 | .builder() 25 | .master("local[2]") 26 | .appName(getClass.getSimpleName) 27 | .config("spark.sql.parquet.outputTimestampType", outputTimestampType.toString) 28 | .config("spark.sql.session.timeZone", "UTC") 29 | .getOrCreate() 30 | } 31 | 32 | override def afterAll(): Unit = { 33 | super.afterAll() 34 | if (sparkStarted) sparkSession.stop() 35 | } 36 | 37 | def writeToTemp[T <: Product: TypeTag](data: Seq[T]): Unit = { 38 | import sparkSession.implicits.* 39 | data.toDS().write.parquet(tempPath.toString) 40 | } 41 | 42 | def readFromTemp[T <: Product: TypeTag]: Seq[T] = { 43 | import sparkSession.implicits.* 44 | sparkSession.read.parquet(tempPath.toString).as[T].collect().toSeq 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/TestCaseSupport.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.CompatibilityParty 4 | 5 | import scala.reflect.runtime.universe.TypeTag 6 | 7 | object CompatibilityParty { 8 | sealed trait CompatibilityParty 9 | case object Spark extends CompatibilityParty 10 | case object Reader extends CompatibilityParty 11 | case object Writer extends CompatibilityParty 12 | 13 | val All: Set[CompatibilityParty] = Set(Spark, Reader, Writer) 14 | } 15 | 16 | object Case { 17 | 18 | type CaseDef = Case[? 
<: Product] 19 | 20 | def apply[T <: Product: TypeTag: ParquetRecordDecoder: ParquetRecordEncoder: ParquetSchemaResolver]( 21 | description: String, 22 | data: Seq[T], 23 | compatibilityParties: Set[CompatibilityParty] = CompatibilityParty.All 24 | ): Case[T] = 25 | new Case( 26 | description = description, 27 | compatibilityParties = compatibilityParties, 28 | _data = data, 29 | _decoder = implicitly[ParquetRecordDecoder[T]], 30 | _encoder = implicitly[ParquetRecordEncoder[T]], 31 | _resolver = implicitly[ParquetSchemaResolver[T]], 32 | _typeTag = implicitly[TypeTag[T]] 33 | ) 34 | } 35 | 36 | class Case[T <: Product]( 37 | val description: String, 38 | val compatibilityParties: Set[CompatibilityParty], 39 | _data: Seq[T], 40 | _decoder: ParquetRecordDecoder[T], 41 | _encoder: ParquetRecordEncoder[T], 42 | _resolver: ParquetSchemaResolver[T], 43 | _typeTag: TypeTag[T] 44 | ) { 45 | type DataType = T 46 | def data: Seq[DataType] = _data 47 | def decoder: ParquetRecordDecoder[DataType] = _decoder 48 | def encoder: ParquetRecordEncoder[DataType] = _encoder 49 | def resolver: ParquetSchemaResolver[DataType] = _resolver 50 | def typeTag: TypeTag[DataType] = _typeTag 51 | } 52 | 53 | trait TestCaseSupport { 54 | 55 | def caseDefinitions: Seq[Case.CaseDef] 56 | 57 | def cases(compatibilityParties: Set[CompatibilityParty] = CompatibilityParty.All): Seq[Case.CaseDef] = 58 | caseDefinitions.filter { caseDefinition => 59 | compatibilityParties.forall(caseDefinition.compatibilityParties.contains) 60 | } 61 | 62 | def cases(compatibilityParty: CompatibilityParty*): Seq[Case.CaseDef] = cases(compatibilityParty.toSet) 63 | 64 | def only[T: TypeTag]: Case.CaseDef = { 65 | val targetTpe = implicitly[TypeTag[T]].tpe 66 | caseDefinitions 67 | .find(_.typeTag.tpe =:= targetTpe) 68 | .getOrElse(throw new NoSuchElementException(s"Case $targetTpe is not defined")) 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/TimeEncodingCompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives 4 | import com.github.mjakubowski84.parquet4s.TimeValueCodecs.* 5 | import org.scalatest.BeforeAndAfter 6 | import org.scalatest.freespec.AnyFreeSpecLike 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | import java.sql.Date 10 | import java.time.{LocalDate, LocalDateTime} 11 | import java.util.TimeZone 12 | 13 | abstract class TimeEncodingCompatibilityItSpec 14 | extends AnyFreeSpecLike 15 | with Matchers 16 | with BeforeAndAfter 17 | with SparkHelper { 18 | 19 | private val newYearMidnight = LocalDateTime.of(2019, 1, 1, 0, 0, 0) 20 | private val newYear = Date.valueOf(LocalDate.of(2019, 1, 1)) 21 | private val timeZones = List( 22 | TimeZone.getTimeZone("GMT-1"), 23 | TimeZone.getTimeZone("UTC"), 24 | TimeZone.getTimeZone("GMT+1") 25 | ) 26 | 27 | protected val parquetWriter: ParquetWriter.Builder[TimePrimitives] = ParquetWriter.of[TimePrimitives] 28 | 29 | before { 30 | clearTemp() 31 | } 32 | 33 | private def writeWithSpark(data: TimePrimitives): Unit = writeToTemp(Seq(data)) 34 | 35 | private def readWithSpark: TimePrimitives = readFromTemp[TimePrimitives].head 36 | 37 | private def writeWithParquet4S(data: TimePrimitives, timeZone: TimeZone): Unit = 38 | parquetWriter 39 | .options(ParquetWriter.Options(timeZone = timeZone)) 
40 | .writeAndClose(tempPath, Seq(data)) 41 | 42 | private def readWithParquet4S(timeZone: TimeZone): TimePrimitives = { 43 | val parquetIterable = 44 | ParquetReader.as[TimePrimitives].options(ParquetReader.Options(timeZone = timeZone)).read(tempPath) 45 | try parquetIterable.head 46 | finally parquetIterable.close() 47 | } 48 | 49 | "For time zone of" - 50 | timeZones.foreach { timeZone => 51 | val data = TimePrimitives(timestamp = localDateTimeToTimestamp(newYearMidnight, timeZone), date = newYear) 52 | timeZone.getDisplayName - { 53 | "Spark should read data written by Parquet4s" in { 54 | writeWithParquet4S(data, timeZone) 55 | readWithSpark should be(data) 56 | } 57 | "Parquet4s should read data written by Spark" in { 58 | writeWithSpark(data) 59 | readWithParquet4S(timeZone) should be(data) 60 | } 61 | } 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/TimeEncodingInt64MicrosCompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives 4 | 5 | class TimeEncodingInt64MicrosCompatibilityItSpec extends TimeEncodingCompatibilityItSpec { 6 | 7 | import TimestampFormat.Implicits.Micros.* 8 | 9 | override val outputTimestampType: OutputTimestampType = Int96 10 | override protected val parquetWriter: ParquetWriter.Builder[TimePrimitives] = ParquetWriter.of[TimePrimitives] 11 | 12 | } 13 | -------------------------------------------------------------------------------- /core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/TimeEncodingInt64MillisCompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives 4 | 5 | class TimeEncodingInt64MillisCompatibilityItSpec extends TimeEncodingCompatibilityItSpec { 6 | 7 | import TimestampFormat.Implicits.Millis.* 8 | 9 | override val outputTimestampType: OutputTimestampType = TIMESTAMP_MILLIS 10 | override protected val parquetWriter: ParquetWriter.Builder[TimePrimitives] = ParquetWriter.of[TimePrimitives] 11 | 12 | } 13 | -------------------------------------------------------------------------------- /core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/TimeEncodingInt64NanosCompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives 4 | import com.github.mjakubowski84.parquet4s.TimeValueCodecs.localDateTimeToTimestamp 5 | import org.scalatest.BeforeAndAfter 6 | import org.scalatest.freespec.AnyFreeSpec 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | import java.sql.Date 10 | import java.time.{LocalDate, LocalDateTime} 11 | import java.util.TimeZone 12 | import TimestampFormat.Implicits.Nanos.* 13 | 14 | class TimeEncodingInt64NanosCompatibilityItSpec extends AnyFreeSpec with Matchers with BeforeAndAfter with TestUtils { 15 | 16 | private val newYearMidnight = LocalDateTime.of(2019, 1, 1, 0, 0, 0) 17 | private val newYear = Date.valueOf(LocalDate.of(2019, 1, 1)) 18 | private val timeZones = List( 19 | TimeZone.getTimeZone("GMT-1"), 20 | TimeZone.getTimeZone("UTC"), 21 | 
TimeZone.getTimeZone("GMT+1") 22 | ) 23 | 24 | before { 25 | clearTemp() 26 | } 27 | 28 | private def writeWithParquet4S(data: TimePrimitives, timeZone: TimeZone): Unit = 29 | ParquetWriter 30 | .of[TimePrimitives] 31 | .options(ParquetWriter.Options(timeZone = timeZone)) 32 | .writeAndClose(tempPath, Seq(data)) 33 | 34 | private def readWithParquet4S(timeZone: TimeZone): TimePrimitives = { 35 | val parquetIterable = 36 | ParquetReader.as[TimePrimitives].options(ParquetReader.Options(timeZone = timeZone)).read(tempPath) 37 | try parquetIterable.head 38 | finally parquetIterable.close() 39 | } 40 | 41 | "Parquet4s should read data written with time zone of" - 42 | timeZones.foreach { timeZone => 43 | val data = TimePrimitives(timestamp = localDateTimeToTimestamp(newYearMidnight, timeZone), date = newYear) 44 | timeZone.getDisplayName in { 45 | writeWithParquet4S(data, timeZone) 46 | readWithParquet4S(timeZone) should be(data) 47 | } 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /core/src/it/scala-2.12/com/github/mjakubowski84/parquet4s/TimeEncodingInt96CompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | class TimeEncodingInt96MillisCompatibilityItSpec extends TimeEncodingCompatibilityItSpec { 4 | 5 | override val outputTimestampType: OutputTimestampType = Int96 6 | 7 | } 8 | -------------------------------------------------------------------------------- /core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/ParquetWriterAndSparkCompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.Case.CaseDef 4 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.* 5 | import org.scalatest.BeforeAndAfter 6 | import org.scalatest.freespec.AnyFreeSpec 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | class ParquetWriterAndSparkCompatibilityItSpec extends AnyFreeSpec with Matchers with BeforeAndAfter with SparkHelper { 10 | 11 | before { 12 | clearTemp() 13 | } 14 | 15 | private def runTestCase(testCase: CaseDef): Unit = 16 | testCase.description in { 17 | ParquetWriter.of[testCase.DataType](testCase.encoder, testCase.resolver).writeAndClose(tempPath, testCase.data) 18 | readFromTemp(testCase.typeTag) should contain theSameElementsAs testCase.data 19 | } 20 | 21 | "Spark should be able to read file saved by ParquetWriter if the file contains" - 22 | CompatibilityTestCases.cases(Writer, Spark).foreach(runTestCase) 23 | 24 | } 25 | -------------------------------------------------------------------------------- /core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/SparkAndParquetReaderCompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.* 4 | import org.scalatest.BeforeAndAfter 5 | import org.scalatest.freespec.AnyFreeSpec 6 | import org.scalatest.matchers.should.Matchers 7 | 8 | class SparkAndParquetReaderCompatibilityItSpec extends AnyFreeSpec with Matchers with BeforeAndAfter with SparkHelper { 9 | 10 | before { 11 | clearTemp() 12 | } 13 | 14 | private def runTestCase(testCase: Case.CaseDef): Unit = 15 | testCase.description in { 16 | writeToTemp(testCase.data)(testCase.typeTag) 17 | val 
parquetIterable = ParquetReader.as[testCase.DataType].read(tempPath)(testCase.decoder) 18 | try parquetIterable should contain theSameElementsAs testCase.data 19 | finally parquetIterable.close() 20 | } 21 | 22 | "ParquetReader should be able to read file saved by Spark if the file contains" - 23 | CompatibilityTestCases.cases(Spark, Reader).foreach(runTestCase) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/SparkHelper.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.scalatest.{BeforeAndAfterAll, Suite} 5 | 6 | import scala.reflect.runtime.universe.TypeTag 7 | 8 | trait SparkHelper extends BeforeAndAfterAll with TestUtils { 9 | 10 | this: Suite => 11 | 12 | sealed trait OutputTimestampType 13 | case object Int96 extends OutputTimestampType 14 | case object TIMESTAMP_MICROS extends OutputTimestampType 15 | case object TIMESTAMP_MILLIS extends OutputTimestampType 16 | 17 | private var sparkStarted = false 18 | 19 | protected def outputTimestampType: OutputTimestampType = Int96 20 | 21 | lazy val sparkSession: SparkSession = { 22 | sparkStarted = true 23 | SparkSession 24 | .builder() 25 | .master("local[2]") 26 | .appName(getClass.getSimpleName) 27 | .config("spark.sql.parquet.outputTimestampType", outputTimestampType.toString) 28 | .config("spark.sql.session.timeZone", "UTC") 29 | .getOrCreate() 30 | } 31 | 32 | override def afterAll(): Unit = { 33 | super.afterAll() 34 | if (sparkStarted) sparkSession.stop() 35 | } 36 | 37 | def writeToTemp[T <: Product: TypeTag](data: Seq[T]): Unit = { 38 | import sparkSession.implicits.* 39 | data.toDS().write.parquet(tempPath.toString) 40 | } 41 | 42 | def readFromTemp[T <: Product: TypeTag]: Seq[T] = { 43 | import sparkSession.implicits.* 44 | sparkSession.read.parquet(tempPath.toString).as[T].collect().toSeq 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/TestCaseSupport.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.CompatibilityParty 4 | 5 | import scala.reflect.runtime.universe.TypeTag 6 | 7 | object CompatibilityParty { 8 | sealed trait CompatibilityParty 9 | case object Spark extends CompatibilityParty 10 | case object Reader extends CompatibilityParty 11 | case object Writer extends CompatibilityParty 12 | 13 | val All: Set[CompatibilityParty] = Set(Spark, Reader, Writer) 14 | } 15 | 16 | object Case { 17 | 18 | type CaseDef = Case[? 
<: Product] 19 | 20 | def apply[T <: Product: TypeTag: ParquetRecordDecoder: ParquetRecordEncoder: ParquetSchemaResolver]( 21 | description: String, 22 | data: Seq[T], 23 | compatibilityParties: Set[CompatibilityParty] = CompatibilityParty.All 24 | ): Case[T] = 25 | new Case( 26 | description = description, 27 | compatibilityParties = compatibilityParties, 28 | _data = data, 29 | _decoder = implicitly[ParquetRecordDecoder[T]], 30 | _encoder = implicitly[ParquetRecordEncoder[T]], 31 | _resolver = implicitly[ParquetSchemaResolver[T]], 32 | _typeTag = implicitly[TypeTag[T]] 33 | ) 34 | } 35 | 36 | class Case[T <: Product]( 37 | val description: String, 38 | val compatibilityParties: Set[CompatibilityParty], 39 | _data: Seq[T], 40 | _decoder: ParquetRecordDecoder[T], 41 | _encoder: ParquetRecordEncoder[T], 42 | _resolver: ParquetSchemaResolver[T], 43 | _typeTag: TypeTag[T] 44 | ) { 45 | type DataType = T 46 | def data: Seq[DataType] = _data 47 | def decoder: ParquetRecordDecoder[DataType] = _decoder 48 | def encoder: ParquetRecordEncoder[DataType] = _encoder 49 | def resolver: ParquetSchemaResolver[DataType] = _resolver 50 | def typeTag: TypeTag[DataType] = _typeTag 51 | } 52 | 53 | trait TestCaseSupport { 54 | 55 | def caseDefinitions: Seq[Case.CaseDef] 56 | 57 | def cases(compatibilityParties: Set[CompatibilityParty] = CompatibilityParty.All): Seq[Case.CaseDef] = 58 | caseDefinitions.filter { caseDefinition => 59 | compatibilityParties.forall(caseDefinition.compatibilityParties.contains) 60 | } 61 | 62 | def cases(compatibilityParty: CompatibilityParty*): Seq[Case.CaseDef] = cases(compatibilityParty.toSet) 63 | 64 | def only[T: TypeTag]: Case.CaseDef = { 65 | val targetTpe = implicitly[TypeTag[T]].tpe 66 | caseDefinitions 67 | .find(_.typeTag.tpe =:= targetTpe) 68 | .getOrElse(throw new NoSuchElementException(s"Case $targetTpe is not defined")) 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/TimeEncodingCompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives 4 | import com.github.mjakubowski84.parquet4s.TimeValueCodecs.* 5 | import org.scalatest.BeforeAndAfter 6 | import org.scalatest.freespec.AnyFreeSpecLike 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | import java.sql.Date 10 | import java.time.{LocalDate, LocalDateTime} 11 | import java.util.TimeZone 12 | 13 | abstract class TimeEncodingCompatibilityItSpec 14 | extends AnyFreeSpecLike 15 | with Matchers 16 | with BeforeAndAfter 17 | with SparkHelper { 18 | 19 | private val newYearMidnight = LocalDateTime.of(2019, 1, 1, 0, 0, 0) 20 | private val newYear = Date.valueOf(LocalDate.of(2019, 1, 1)) 21 | private val timeZones = List( 22 | TimeZone.getTimeZone("GMT-1"), 23 | TimeZone.getTimeZone("UTC"), 24 | TimeZone.getTimeZone("GMT+1") 25 | ) 26 | 27 | protected val parquetWriter: ParquetWriter.Builder[TimePrimitives] = ParquetWriter.of[TimePrimitives] 28 | 29 | before { 30 | clearTemp() 31 | } 32 | 33 | private def writeWithSpark(data: TimePrimitives): Unit = writeToTemp(Seq(data)) 34 | 35 | private def readWithSpark: TimePrimitives = readFromTemp[TimePrimitives].head 36 | 37 | private def writeWithParquet4S(data: TimePrimitives, timeZone: TimeZone): Unit = 38 | parquetWriter 39 | .options(ParquetWriter.Options(timeZone = timeZone)) 
40 | .writeAndClose(tempPath, Seq(data)) 41 | 42 | private def readWithParquet4S(timeZone: TimeZone): TimePrimitives = { 43 | val parquetIterable = 44 | ParquetReader.as[TimePrimitives].options(ParquetReader.Options(timeZone = timeZone)).read(tempPath) 45 | try parquetIterable.head 46 | finally parquetIterable.close() 47 | } 48 | 49 | "For time zone of" - 50 | timeZones.foreach { timeZone => 51 | val data = TimePrimitives(timestamp = localDateTimeToTimestamp(newYearMidnight, timeZone), date = newYear) 52 | timeZone.getDisplayName - { 53 | "Spark should read data written by Parquet4s" in { 54 | writeWithParquet4S(data, timeZone) 55 | readWithSpark should be(data) 56 | } 57 | "Parquet4s should read data written by Spark" in { 58 | writeWithSpark(data) 59 | readWithParquet4S(timeZone) should be(data) 60 | } 61 | } 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/TimeEncodingInt64MicrosCompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives 4 | 5 | class TimeEncodingInt64MicrosCompatibilityItSpec extends TimeEncodingCompatibilityItSpec { 6 | 7 | import TimestampFormat.Implicits.Micros.* 8 | 9 | override val outputTimestampType: OutputTimestampType = Int96 10 | override protected val parquetWriter: ParquetWriter.Builder[TimePrimitives] = ParquetWriter.of[TimePrimitives] 11 | 12 | } 13 | -------------------------------------------------------------------------------- /core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/TimeEncodingInt64MillisCompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives 4 | 5 | class TimeEncodingInt64MillisCompatibilityItSpec extends TimeEncodingCompatibilityItSpec { 6 | 7 | import TimestampFormat.Implicits.Millis.* 8 | 9 | override val outputTimestampType: OutputTimestampType = TIMESTAMP_MILLIS 10 | override protected val parquetWriter: ParquetWriter.Builder[TimePrimitives] = ParquetWriter.of[TimePrimitives] 11 | 12 | } 13 | -------------------------------------------------------------------------------- /core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/TimeEncodingInt64NanosCompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.CompatibilityTestCases.TimePrimitives 4 | import com.github.mjakubowski84.parquet4s.TimeValueCodecs.localDateTimeToTimestamp 5 | import org.scalatest.BeforeAndAfter 6 | import org.scalatest.freespec.AnyFreeSpec 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | import java.sql.Date 10 | import java.time.{LocalDate, LocalDateTime} 11 | import java.util.TimeZone 12 | import TimestampFormat.Implicits.Nanos.* 13 | 14 | class TimeEncodingInt64NanosCompatibilityItSpec extends AnyFreeSpec with Matchers with BeforeAndAfter with TestUtils { 15 | 16 | private val newYearMidnight = LocalDateTime.of(2019, 1, 1, 0, 0, 0) 17 | private val newYear = Date.valueOf(LocalDate.of(2019, 1, 1)) 18 | private val timeZones = List( 19 | TimeZone.getTimeZone("GMT-1"), 20 | TimeZone.getTimeZone("UTC"), 21 | 
TimeZone.getTimeZone("GMT+1") 22 | ) 23 | 24 | before { 25 | clearTemp() 26 | } 27 | 28 | private def writeWithParquet4S(data: TimePrimitives, timeZone: TimeZone): Unit = 29 | ParquetWriter 30 | .of[TimePrimitives] 31 | .options(ParquetWriter.Options(timeZone = timeZone)) 32 | .writeAndClose(tempPath, Seq(data)) 33 | 34 | private def readWithParquet4S(timeZone: TimeZone): TimePrimitives = { 35 | val parquetIterable = 36 | ParquetReader.as[TimePrimitives].options(ParquetReader.Options(timeZone = timeZone)).read(tempPath) 37 | try parquetIterable.head 38 | finally parquetIterable.close() 39 | } 40 | 41 | "Parquet4s should read data written with time zone of" - 42 | timeZones.foreach { timeZone => 43 | val data = TimePrimitives(timestamp = localDateTimeToTimestamp(newYearMidnight, timeZone), date = newYear) 44 | timeZone.getDisplayName in { 45 | writeWithParquet4S(data, timeZone) 46 | readWithParquet4S(timeZone) should be(data) 47 | } 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /core/src/it/scala-2.13/com/github/mjakubowski84/parquet4s/TimeEncodingInt96CompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | class TimeEncodingInt96MillisCompatibilityItSpec extends TimeEncodingCompatibilityItSpec { 4 | 5 | override val outputTimestampType: OutputTimestampType = Int96 6 | 7 | } 8 | -------------------------------------------------------------------------------- /core/src/it/scala-3/com/github/mjakubowski84/parquet4s/TestCaseSupport.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.CompatibilityParty 4 | 5 | object CompatibilityParty { 6 | sealed trait CompatibilityParty 7 | case object Spark extends CompatibilityParty 8 | case object Reader extends CompatibilityParty 9 | case object Writer extends CompatibilityParty 10 | 11 | val All: Set[CompatibilityParty] = Set(Spark, Reader, Writer) 12 | } 13 | 14 | object Case { 15 | 16 | type CaseDef = Case[? 
<: Product] 17 | 18 | def apply[T <: Product: ParquetRecordDecoder: ParquetRecordEncoder: ParquetSchemaResolver]( 19 | description: String, 20 | data: Seq[T], 21 | compatibilityParties: Set[CompatibilityParty] = CompatibilityParty.All 22 | ): Case[T] = 23 | new Case( 24 | description = description, 25 | compatibilityParties = compatibilityParties, 26 | _data = data, 27 | _decoder = implicitly[ParquetRecordDecoder[T]], 28 | _encoder = implicitly[ParquetRecordEncoder[T]], 29 | _resolver = implicitly[ParquetSchemaResolver[T]] 30 | ) 31 | } 32 | 33 | class Case[T <: Product]( 34 | val description: String, 35 | val compatibilityParties: Set[CompatibilityParty], 36 | _data: Seq[T], 37 | _decoder: ParquetRecordDecoder[T], 38 | _encoder: ParquetRecordEncoder[T], 39 | _resolver: ParquetSchemaResolver[T] 40 | ) { 41 | opaque type DataType = T 42 | def data: Seq[DataType] = _data 43 | def decoder: ParquetRecordDecoder[DataType] = _decoder 44 | def encoder: ParquetRecordEncoder[DataType] = _encoder 45 | def resolver: ParquetSchemaResolver[DataType] = _resolver 46 | } 47 | 48 | trait TestCaseSupport { 49 | 50 | def caseDefinitions: Seq[Case.CaseDef] 51 | 52 | def cases(compatibilityParties: Set[CompatibilityParty] = CompatibilityParty.All): Seq[Case.CaseDef] = 53 | caseDefinitions.filter { caseDefinition => 54 | compatibilityParties.forall(caseDefinition.compatibilityParties.contains) 55 | } 56 | 57 | def cases(compatibilityParty: CompatibilityParty*): Seq[Case.CaseDef] = cases(compatibilityParty.toSet) 58 | 59 | } 60 | -------------------------------------------------------------------------------- /core/src/it/scala/com/github/mjakubowski84/parquet4s/ParquetWriterAndParquetReaderCompatibilityItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.Case.CaseDef 4 | import com.github.mjakubowski84.parquet4s.CompatibilityParty.* 5 | import org.scalatest.matchers.should.Matchers 6 | import org.scalatest.BeforeAndAfter 7 | import org.scalatest.freespec.AnyFreeSpec 8 | 9 | class ParquetWriterAndParquetReaderCompatibilityItSpec 10 | extends AnyFreeSpec 11 | with Matchers 12 | with BeforeAndAfter 13 | with TestUtils { 14 | 15 | before { 16 | clearTemp() 17 | } 18 | 19 | private def runTestCase(testCase: CaseDef): Unit = 20 | testCase.description in { 21 | ParquetWriter 22 | .of(testCase.encoder, testCase.resolver) 23 | .writeAndClose(tempPath, testCase.data) 24 | val parquetIterable = ParquetReader.as[testCase.DataType].read(tempPath)(testCase.decoder) 25 | try parquetIterable should contain theSameElementsAs testCase.data 26 | finally parquetIterable.close() 27 | } 28 | 29 | "Spark should be able to read file saved by ParquetWriter if the file contains" - 30 | CompatibilityTestCases.cases(Writer, Reader).foreach(runTestCase) 31 | 32 | } 33 | -------------------------------------------------------------------------------- /core/src/it/scala/com/github/mjakubowski84/parquet4s/RecordFilterItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import org.scalatest.matchers.should.Matchers 5 | import scala.util.Using 6 | 7 | class RecordFilterSpec extends AnyFlatSpec with Matchers { 8 | 9 | private case class Data(i: Int) 10 | 11 | private val data = (0 to 10).map(Data(_)) 12 | 13 | "RecordFilter" should "filter data by record index" in { 14 
| val outFile = InMemoryOutputFile(initBufferSize = 1024) 15 | ParquetWriter.of[Data].writeAndClose(outFile, data) 16 | val inFile = outFile.toInputFile 17 | Using.resource(ParquetReader.as[Data].filter(RecordFilter(i => i >= 1 && i < 10)).read(inFile)) { iterable => 18 | val result = iterable.toVector 19 | result should have size 9 20 | result.head should be(Data(1)) 21 | result.last should be(Data(9)) 22 | } 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /core/src/it/scala/com/github/mjakubowski84/parquet4s/TestUtils.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.fs.{FileSystem, FileUtil} 5 | 6 | import java.io.File 7 | import java.nio.file.Files 8 | 9 | trait TestUtils { 10 | 11 | protected val tempPath: Path = Path(Path(Files.createTempDirectory("example")), "testOutputPath") 12 | protected lazy val configuration = new Configuration() 13 | protected lazy val fileSystem: FileSystem = tempPath.toHadoop.getFileSystem(configuration) 14 | private val tempDir = new File(tempPath.toUri) 15 | 16 | def clearTemp(): Unit = { 17 | FileUtil.fullyDelete(tempDir) 18 | () 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/ParquetRecordDecoder.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import shapeless.labelled.{FieldType, field} 4 | import shapeless.{::, HList, HNil, LabelledGeneric, Lazy, Witness} 5 | 6 | import scala.annotation.{implicitNotFound, unused} 7 | import scala.util.control.NonFatal 8 | 9 | /** Type class that allows to decode instances of [[RowParquetRecord]] 10 | * @tparam T 11 | * represents schema of [[RowParquetRecord]] 12 | */ 13 | @implicitNotFound( 14 | "ParquetRecordDecoder. Cannot read data of type ${T}. " + 15 | "Please check if there is implicit ValueDecoder available for each field and subfield of ${T}." 
16 | ) 17 | trait ParquetRecordDecoder[T] extends MetadataReader { 18 | 19 | /** @param record 20 | * to be decoded to instance of given type 21 | * @param configuration 22 | * [ValueCodecConfiguration] used by some codecs 23 | * @return 24 | * instance of product type decoded from record 25 | */ 26 | def decode(record: RowParquetRecord, configuration: ValueCodecConfiguration): T 27 | 28 | def setMetadata(@unused metadata: collection.Map[String, String]): Unit = {} 29 | } 30 | 31 | object ParquetRecordDecoder { 32 | 33 | object DecodingException { 34 | def apply(msg: String, cause: Throwable): DecodingException = { 35 | val decodingException = DecodingException(msg) 36 | decodingException.initCause(cause) 37 | decodingException 38 | } 39 | } 40 | 41 | case class DecodingException(msg: String) extends RuntimeException(msg) 42 | 43 | def apply[T](implicit ev: ParquetRecordDecoder[T]): ParquetRecordDecoder[T] = ev 44 | 45 | def decode[T](record: RowParquetRecord, configuration: ValueCodecConfiguration = ValueCodecConfiguration.Default)( 46 | implicit ev: ParquetRecordDecoder[T] 47 | ): T = ev.decode(record, configuration) 48 | 49 | implicit val nilDecoder: ParquetRecordDecoder[HNil] = (_, _) => HNil 50 | 51 | implicit def headValueDecoder[FieldName <: Symbol, Head, Tail <: HList](implicit 52 | witness: Witness.Aux[FieldName], 53 | headDecoder: ValueDecoder[Head], 54 | tailDecoder: ParquetRecordDecoder[Tail] 55 | ): ParquetRecordDecoder[FieldType[FieldName, Head] :: Tail] = 56 | (record: RowParquetRecord, configuration: ValueCodecConfiguration) => { 57 | val fieldName = witness.value.name 58 | val decodedFieldOpt = 59 | try record.get[Head](fieldName, configuration) 60 | catch { 61 | case NonFatal(cause) => 62 | throw DecodingException(s"Failed to decode field $fieldName of record: $record", cause) 63 | } 64 | decodedFieldOpt match { 65 | case Some(decodedFieldValue) => 66 | field[FieldName](decodedFieldValue) :: tailDecoder.decode(record, configuration) 67 | case None => 68 | throw DecodingException(s"Missing required field $fieldName in a record: $record") 69 | } 70 | } 71 | 72 | implicit def genericDecoder[A, R](implicit 73 | gen: LabelledGeneric.Aux[A, R], 74 | decoder: Lazy[ParquetRecordDecoder[R]] 75 | ): ParquetRecordDecoder[A] = 76 | (record: RowParquetRecord, configuration: ValueCodecConfiguration) => 77 | gen.from(decoder.value.decode(record, configuration)) 78 | 79 | } 80 | -------------------------------------------------------------------------------- /core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/ProductDecoders.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import shapeless.LowPriority 4 | 5 | import scala.annotation.nowarn 6 | 7 | trait ProductDecoders { 8 | 9 | implicit def productDecoder[T](implicit 10 | @nowarn ev: LowPriority, 11 | decoder: ParquetRecordDecoder[T] 12 | ): OptionalValueDecoder[T] = 13 | (value, configuration) => 14 | value match { 15 | case record: RowParquetRecord => decoder.decode(record, configuration) 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/ProductEncoders.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import shapeless.LowPriority 4 | 5 | import scala.annotation.nowarn 6 | 7 | trait ProductEncoders { 8 | implicit def 
productEncoder[T](implicit 9 | @nowarn ev: LowPriority, 10 | encoder: ParquetRecordEncoder[T] 11 | ): OptionalValueEncoder[T] = 12 | (data, configuration) => encoder.encode(data, null, configuration) 13 | } 14 | -------------------------------------------------------------------------------- /core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/ProductSchemaDefs.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import shapeless.LowPriority 4 | 5 | import scala.annotation.nowarn 6 | 7 | trait ProductSchemaDefs { 8 | implicit def productSchema[T](implicit 9 | @nowarn ev: LowPriority, 10 | parquetSchemaResolver: ParquetSchemaResolver[T] 11 | ): TypedSchemaDef[T] = 12 | SchemaDef.group(parquetSchemaResolver.resolveSchema(Cursor.simple)*).withMetadata(SchemaDef.Meta.Generated).typed[T] 13 | 14 | } 15 | -------------------------------------------------------------------------------- /core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/compat/CursorCompat.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.compat 2 | 3 | import com.github.mjakubowski84.parquet4s.Cursor 4 | import shapeless.Witness 5 | 6 | trait CursorCompat { 7 | 8 | this: Cursor => 9 | 10 | /** @tparam FieldName 11 | * symbol of the field that cursor shall advance 12 | * @return 13 | * a new cursor or None if advance to given field is disallowed 14 | */ 15 | def advance[FieldName <: Symbol: Witness.Aux]: Option[Cursor] = 16 | advanceByFieldName(implicitly[Witness.Aux[FieldName]].value.name) 17 | 18 | } 19 | -------------------------------------------------------------------------------- /core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/compat/IteratorCompat.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.compat 2 | 3 | import scala.collection.AbstractIterator 4 | 5 | object IteratorCompat { 6 | 7 | private class UnfoldIterator[A, S](init: S, f: S => Option[(A, S)]) extends AbstractIterator[A] { 8 | private var state: S = init 9 | private var nextValue: Option[(A, S)] = null 10 | 11 | override def hasNext: Boolean = { 12 | if (nextValue == null) { 13 | nextValue = f(state) 14 | } 15 | nextValue.isDefined 16 | } 17 | 18 | override def next(): A = 19 | if (hasNext) { 20 | val (out, nextState) = nextValue.get 21 | state = nextState 22 | nextValue = null 23 | out 24 | } else { 25 | Iterator.empty.next() 26 | } 27 | 28 | } 29 | 30 | @inline 31 | def unfold[A, S](init: S)(f: S => Option[(A, S)]): Iterator[A] = new UnfoldIterator[A, S](init, f) 32 | } 33 | -------------------------------------------------------------------------------- /core/src/main/scala-2.12/com/github/mjakubowski84/parquet4s/compat/MapCompat.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.compat 2 | 3 | import com.github.mjakubowski84.parquet4s.{MapParquetRecord, Value} 4 | 5 | object MapCompat { 6 | @inline 7 | def remove[K, V](map: Map[K, V], key: K): Map[K, V] = map - key 8 | } 9 | 10 | trait MapCompat { 11 | 12 | this: MapParquetRecord => 13 | 14 | /** Removes a single entry from this map. 15 | * 16 | * @param key 17 | * the key of the entry to remove. 
18 | * @return 19 | * the [[MapParquetRecord]] itself 20 | */ 21 | override def -(key: Value): MapParquetRecord = 22 | new MapParquetRecord(MapCompat.remove(entries, key)) 23 | 24 | /** Adds a single entry to this map. 25 | * 26 | * @param entry 27 | * the element to add 28 | * @return 29 | * map of inner values entries with the entry added 30 | */ 31 | override def +[V1 >: Value](entry: (Value, V1)): Map[Value, V1] = 32 | entries + entry 33 | 34 | } 35 | -------------------------------------------------------------------------------- /core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/ParquetRecordDecoder.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import shapeless.labelled.{FieldType, field} 4 | import shapeless.{::, HList, HNil, LabelledGeneric, Lazy, Witness} 5 | 6 | import scala.annotation.{implicitNotFound, unused} 7 | import scala.util.control.NonFatal 8 | 9 | /** Type class that allows to decode instances of [[RowParquetRecord]] 10 | * @tparam T 11 | * represents schema of [[RowParquetRecord]] 12 | */ 13 | @implicitNotFound( 14 | "ParquetRecordDecoder. Cannot read data of type ${T}. " + 15 | "Please check if there is implicit ValueDecoder available for each field and subfield of ${T}." 16 | ) 17 | trait ParquetRecordDecoder[T] extends MetadataReader { 18 | 19 | /** @param record 20 | * to be decoded to instance of given type 21 | * @param configuration 22 | * [ValueCodecConfiguration] used by some codecs 23 | * @return 24 | * instance of product type decoded from record 25 | */ 26 | def decode(record: RowParquetRecord, configuration: ValueCodecConfiguration): T 27 | 28 | override def setMetadata(@unused metadata: collection.Map[String, String]): Unit = {} 29 | } 30 | 31 | object ParquetRecordDecoder { 32 | 33 | object DecodingException { 34 | def apply(msg: String, cause: Throwable): DecodingException = { 35 | val decodingException = DecodingException(msg) 36 | decodingException.initCause(cause) 37 | decodingException 38 | } 39 | } 40 | 41 | case class DecodingException(msg: String) extends RuntimeException(msg) 42 | 43 | def apply[T](implicit ev: ParquetRecordDecoder[T]): ParquetRecordDecoder[T] = ev 44 | 45 | def decode[T](record: RowParquetRecord, configuration: ValueCodecConfiguration = ValueCodecConfiguration.Default)( 46 | implicit ev: ParquetRecordDecoder[T] 47 | ): T = ev.decode(record, configuration) 48 | 49 | implicit val nilDecoder: ParquetRecordDecoder[HNil] = (_, _) => HNil 50 | 51 | implicit def headValueDecoder[FieldName <: Symbol, Head, Tail <: HList](implicit 52 | witness: Witness.Aux[FieldName], 53 | headDecoder: ValueDecoder[Head], 54 | tailDecoder: ParquetRecordDecoder[Tail] 55 | ): ParquetRecordDecoder[FieldType[FieldName, Head] :: Tail] = 56 | (record: RowParquetRecord, configuration: ValueCodecConfiguration) => { 57 | val fieldName = witness.value.name 58 | val decodedFieldOpt = 59 | try record.get[Head](fieldName, configuration) 60 | catch { 61 | case NonFatal(cause) => 62 | throw DecodingException(s"Failed to decode field $fieldName of record: $record", cause) 63 | } 64 | decodedFieldOpt match { 65 | case Some(decodedFieldValue) => 66 | field[FieldName](decodedFieldValue) :: tailDecoder.decode(record, configuration) 67 | case None => 68 | throw DecodingException(s"Missing required field $fieldName in a record: $record") 69 | } 70 | } 71 | 72 | implicit def genericDecoder[A, R](implicit 73 | gen: LabelledGeneric.Aux[A, R], 74 | decoder: 
Lazy[ParquetRecordDecoder[R]] 75 | ): ParquetRecordDecoder[A] = 76 | (record: RowParquetRecord, configuration: ValueCodecConfiguration) => 77 | gen.from(decoder.value.decode(record, configuration)) 78 | 79 | } 80 | -------------------------------------------------------------------------------- /core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/ProductDecoders.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import shapeless.LowPriority 4 | 5 | import scala.annotation.nowarn 6 | 7 | trait ProductDecoders { 8 | 9 | implicit def productDecoder[T](implicit 10 | @nowarn 11 | ev: LowPriority, 12 | decoder: ParquetRecordDecoder[T] 13 | ): OptionalValueDecoder[T] = 14 | (value, configuration) => 15 | value match { 16 | case record: RowParquetRecord => decoder.decode(record, configuration) 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/ProductEncoders.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import shapeless.LowPriority 4 | 5 | import scala.annotation.nowarn 6 | 7 | trait ProductEncoders { 8 | implicit def productEncoder[T](implicit 9 | @nowarn ev: LowPriority, 10 | encoder: ParquetRecordEncoder[T] 11 | ): OptionalValueEncoder[T] = 12 | (data, configuration) => encoder.encode(data, null, configuration) 13 | } 14 | -------------------------------------------------------------------------------- /core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/ProductSchemaDefs.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import shapeless.LowPriority 4 | 5 | import scala.annotation.nowarn 6 | 7 | trait ProductSchemaDefs { 8 | implicit def productSchema[T](implicit 9 | @nowarn ev: LowPriority, 10 | parquetSchemaResolver: ParquetSchemaResolver[T] 11 | ): TypedSchemaDef[T] = 12 | SchemaDef 13 | .group(parquetSchemaResolver.resolveSchema(Cursor.simple)*) 14 | .withMetadata(SchemaDef.Meta.Generated) 15 | .typed[T] 16 | 17 | } 18 | -------------------------------------------------------------------------------- /core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/compat/CursorCompat.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.compat 2 | 3 | import com.github.mjakubowski84.parquet4s.Cursor 4 | import shapeless.Witness 5 | 6 | trait CursorCompat { 7 | 8 | this: Cursor => 9 | 10 | /** @tparam FieldName 11 | * symbol of the field that cursor shall advance 12 | * @return 13 | * a new cursor or None if advance to given field is disallowed 14 | */ 15 | def advance[FieldName <: Symbol: Witness.Aux]: Option[Cursor] = 16 | advanceByFieldName(implicitly[Witness.Aux[FieldName]].value.name) 17 | 18 | } 19 | -------------------------------------------------------------------------------- /core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/compat/IteratorCompat.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.compat 2 | 3 | object IteratorCompat { 4 | 5 | @inline 6 | def unfold[A, S](init: S)(f: S => Option[(A, S)]): Iterator[A] = Iterator.unfold[A, S](init)(f) 7 | 8 | } 9 | 
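// A usage sketch for the Scala 2.13 type classes and compat helpers above. `MyData`, `record` and
// `CompatUsageSketch` are hypothetical names; the calls themselves (IteratorCompat.unfold and
// ParquetRecordDecoder.decode with its default ValueCodecConfiguration) come from the code shown above.
import com.github.mjakubowski84.parquet4s.compat.IteratorCompat
import com.github.mjakubowski84.parquet4s.{ParquetRecordDecoder, RowParquetRecord}

object CompatUsageSketch {

  // On 2.13 unfold delegates to Iterator.unfold; the 2.12 variant uses the hand-rolled UnfoldIterator.
  val counted: List[Int] =
    IteratorCompat.unfold(0)(s => if (s < 3) Some((s, s + 1)) else None).toList // List(0, 1, 2)

  // Decoding a RowParquetRecord (e.g. obtained from a reader) into a product type
  // through the shapeless-derived ParquetRecordDecoder:
  case class MyData(id: Int, text: String)

  def toMyData(record: RowParquetRecord): MyData = ParquetRecordDecoder.decode[MyData](record)
}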
-------------------------------------------------------------------------------- /core/src/main/scala-2.13/com/github/mjakubowski84/parquet4s/compat/MapCompat.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.compat 2 | 3 | import com.github.mjakubowski84.parquet4s.{MapParquetRecord, Value} 4 | 5 | object MapCompat { 6 | @inline 7 | def remove[K, V](map: Map[K, V], key: K): Map[K, V] = map.removed(key) 8 | } 9 | 10 | trait MapCompat { 11 | 12 | this: MapParquetRecord => 13 | 14 | /** Removes a single entry from this map. 15 | * 16 | * @param key 17 | * key of the element to remove 18 | * @return 19 | * map of inner entries with the element removed 20 | */ 21 | override def removed(key: Value): MapParquetRecord = 22 | new MapParquetRecord(MapCompat.remove(entries, key)) 23 | 24 | /** Adds a single entry to this map. 25 | * 26 | * @param key 27 | * key of the entry to add 28 | * @param value 29 | * value of the entry to add 30 | * @return 31 | * map of inner entries updated with the new entry added 32 | */ 33 | override def updated[V1 >: Value](key: Value, value: V1): Map[Value, V1] = 34 | entries.updated(key, value) 35 | 36 | } 37 | -------------------------------------------------------------------------------- /core/src/main/scala-3/com/github/mjakubowski84/parquet4s/ProductDecoders.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import scala.util.NotGiven 4 | 5 | trait ProductDecoders: 6 | 7 | given productDecoder[T](using 8 | ev: NotGiven[ValueDecoder[T]], 9 | decoder: ParquetRecordDecoder[T] 10 | ): OptionalValueDecoder[T] = 11 | (value, configuration) => 12 | value match 13 | case record: RowParquetRecord => decoder.decode(record, configuration) 14 | 15 | end ProductDecoders 16 | -------------------------------------------------------------------------------- /core/src/main/scala-3/com/github/mjakubowski84/parquet4s/ProductEncoders.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import scala.util.NotGiven 4 | 5 | trait ProductEncoders: 6 | 7 | given productEncoder[T](using 8 | ev: NotGiven[ValueEncoder[T]], 9 | encoder: ParquetRecordEncoder[T] 10 | ): OptionalValueEncoder[T] = 11 | (data, configuration) => encoder.encode(data, null, configuration) 12 | 13 | end ProductEncoders 14 | -------------------------------------------------------------------------------- /core/src/main/scala-3/com/github/mjakubowski84/parquet4s/ProductSchemaDefs.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import scala.util.NotGiven 4 | 5 | trait ProductSchemaDefs: 6 | 7 | given productSchema[T <: Product: ParquetSchemaResolver](using 8 | NotGiven[TypedSchemaDef[T]] 9 | ): TypedSchemaDef[T] = 10 | SchemaDef 11 | .group(summon[ParquetSchemaResolver[T]].resolveSchema(Cursor.simple)*) 12 | .withMetadata(SchemaDef.Meta.Generated) 13 | .typed[T] 14 | 15 | end ProductSchemaDefs 16 | -------------------------------------------------------------------------------- /core/src/main/scala-3/com/github/mjakubowski84/parquet4s/compat/CursorCompat.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.compat 2 | 3 | import com.github.mjakubowski84.parquet4s.Cursor 4 | 5 | trait 
CursorCompat: 6 | 7 | this: Cursor => 8 | 9 | /** @tparam FieldName 10 | * String & Singleton of the field that cursor shall advance 11 | * @return 12 | * a new cursor or None if advance to given field is disallowed 13 | */ 14 | def advance[FieldName <: String & Singleton: ValueOf]: Option[Cursor] = 15 | advanceByFieldName(summon[ValueOf[FieldName]].value) 16 | 17 | end CursorCompat 18 | -------------------------------------------------------------------------------- /core/src/main/scala-3/com/github/mjakubowski84/parquet4s/compat/IteratorCompat.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.compat 2 | 3 | object IteratorCompat: 4 | inline def unfold[A, S](init: S)(f: S => Option[(A, S)]): Iterator[A] = Iterator.unfold[A, S](init)(f) 5 | -------------------------------------------------------------------------------- /core/src/main/scala-3/com/github/mjakubowski84/parquet4s/compat/MapCompat.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.compat 2 | 3 | import com.github.mjakubowski84.parquet4s.{MapParquetRecord, Value} 4 | 5 | object MapCompat: 6 | inline def remove[K, V](map: Map[K, V], key: K): Map[K, V] = map.removed(key) 7 | 8 | trait MapCompat: 9 | 10 | this: MapParquetRecord => 11 | 12 | /** Removes a single entry from this map. 13 | * 14 | * @param key 15 | * key of the element to remove 16 | * @return 17 | * map of inner entries with the element removed 18 | */ 19 | override def removed(key: Value): MapParquetRecord = 20 | new MapParquetRecord(MapCompat.remove(entries, key)) 21 | 22 | /** Adds a single entry to this map. 23 | * 24 | * @param key 25 | * key of the entry to add 26 | * @param value 27 | * value of the entry to add 28 | * @return 29 | * map of inner entries updated with the new entry added 30 | */ 31 | override def updated[V1 >: Value](key: Value, value: V1): Map[Value, V1] = 32 | entries.updated(key, value) 33 | 34 | end MapCompat 35 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/ColumnProjection.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | object ColumnProjection { 4 | def apply(typedColumnPath: TypedColumnPath[?], ordinal: Int): ColumnProjection = 5 | ColumnProjection(typedColumnPath, ordinal, typedColumnPath.alias) 6 | } 7 | 8 | /** Column projection extracts a value from a field at given path and sets it at given position, optionally with a new 9 | * name. 
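  * A minimal sketch (assuming that `Col("user.name").as[String]`, as exercised in ColumnPathSpec, produces
  * the TypedColumnPath expected by the companion's apply; the column name is hypothetical):
  * {{{
  *   val projection = ColumnProjection(Col("user.name").as[String], ordinal = 0)
  * }}}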
10 | * @param columnPath 11 | * path to the field 12 | * @param ordinal 13 | * position of a column in a schema defined by the projection 14 | * @param alias 15 | * optional new name of the field 16 | */ 17 | case class ColumnProjection(columnPath: ColumnPath, ordinal: Int, alias: Option[String]) { 18 | val length: Int = columnPath.elements.length 19 | } 20 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/HadoopParquetReader.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.apache.parquet.filter2.compat.FilterCompat 4 | import org.apache.parquet.hadoop.api.ReadSupport 5 | import org.apache.parquet.io.InputFile 6 | import org.apache.parquet.schema.MessageType 7 | 8 | object HadoopParquetReader { 9 | 10 | private class Builder( 11 | inputFile: InputFile, 12 | projectedSchemaOpt: Option[MessageType], 13 | columnProjections: Seq[ColumnProjection], 14 | metadataReader: MetadataReader 15 | ) extends org.apache.parquet.hadoop.ParquetReader.Builder[RowParquetRecord](inputFile) { 16 | override lazy val getReadSupport: ReadSupport[RowParquetRecord] = new ParquetReadSupport( 17 | projectedSchemaOpt = projectedSchemaOpt, 18 | columnProjections = columnProjections, 19 | metadataReader = metadataReader 20 | ) 21 | } 22 | 23 | def apply( 24 | inputFile: InputFile, 25 | projectedSchemaOpt: Option[MessageType], 26 | columnProjections: Seq[ColumnProjection], 27 | filter: FilterCompat.Filter, 28 | metadataReader: MetadataReader 29 | ): org.apache.parquet.hadoop.ParquetReader.Builder[RowParquetRecord] = 30 | new Builder(inputFile, projectedSchemaOpt, columnProjections, metadataReader).withFilter(filter) 31 | 32 | } 33 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/InMemoryInputFile.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.apache.parquet.io.{InputFile, SeekableInputStream} 4 | 5 | import java.io.EOFException 6 | import java.nio.ByteBuffer 7 | import scala.util.control.NoStackTrace 8 | 9 | object InMemoryInputFile { 10 | def fromBytesUnsafe(bytes: Array[Byte]): InMemoryInputFile = new InMemoryInputFile(bytes) 11 | 12 | def fromBytes(bytes: Array[Byte]): InMemoryInputFile = new InMemoryInputFile(bytes.clone()) 13 | } 14 | 15 | @experimental 16 | class InMemoryInputFile private (content: Array[Byte]) extends InputFile { 17 | 18 | override def getLength: Long = content.length.toLong 19 | 20 | override def newStream(): SeekableInputStream = new SeekableInputStream { 21 | private var pos: Int = 0 22 | 23 | override def getPos: Long = pos.toLong 24 | 25 | override def seek(newPos: Long): Unit = pos = newPos.toInt 26 | 27 | override def readFully(bytes: Array[Byte]): Unit = readFully(bytes, 0, bytes.length) 28 | 29 | override def readFully(bytes: Array[Byte], start: Int, len: Int): Unit = { 30 | if (content.length - pos < len) throw new EOFException with NoStackTrace 31 | System.arraycopy(content, pos, bytes, start, len) 32 | pos += len 33 | } 34 | 35 | override def read(buf: ByteBuffer): Int = { 36 | val avail = remaining 37 | if (avail == 0) -1 38 | else { 39 | val len = avail.min(buf.remaining()) 40 | if (len > 0) { 41 | buf.put(content, pos, len) 42 | pos += len 43 | } 44 | len 45 | } 46 | } 47 | 48 | override def 
readFully(buf: ByteBuffer): Unit = { 49 | val availSpace = buf.remaining 50 | if (remaining < availSpace) throw new EOFException with NoStackTrace 51 | if (availSpace > 0) buf.put(content, pos, availSpace) 52 | pos += availSpace 53 | } 54 | 55 | override def read(): Int = 56 | if (remaining == 0) -1 57 | else { 58 | val next = content(pos) & 0xff 59 | pos += 1 60 | next 61 | } 62 | 63 | private def remaining: Int = content.length - pos 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/InMemoryOutputFile.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.apache.hadoop.fs.FileAlreadyExistsException 4 | import org.apache.parquet.io.{InputFile, OutputFile, PositionOutputStream} 5 | 6 | import java.io.ByteArrayOutputStream 7 | 8 | object InMemoryOutputFile { 9 | val DefaultBlockSize: Int = 64 << 10 10 | 11 | def apply( 12 | initBufferSize: Int, 13 | maxBufferSize: Option[Int] = None, 14 | blockSize: Int = DefaultBlockSize 15 | ): InMemoryOutputFile = new InMemoryOutputFile(initBufferSize, maxBufferSize.getOrElse(3 * initBufferSize), blockSize) 16 | } 17 | 18 | /** Reusable in-memory `OutputFile` based on `ByteArrayOutputStream` 19 | * 20 | * @param initBufferSize 21 | * size of the `ByteArrayOutputStream`'s internal buffer when it is created 22 | * @param maxBufferSize 23 | * a threshold beyond which the internal buffer will be recreated with the initBufferSize 24 | * @param blockSize 25 | * size of a row group being buffered in memory. This limits the memory usage when writing 26 | */ 27 | class InMemoryOutputFile private (initBufferSize: Int, maxBufferSize: Int, blockSize: Int) extends OutputFile { 28 | private val os = new ReusableByteArrayOutputStream(initBufferSize, maxBufferSize) 29 | 30 | override def create(blockSizeHint: Long): PositionOutputStream = { 31 | if (os.size() > 0) throw new FileAlreadyExistsException(s"In-memory file already exists") 32 | new PositionOutputStream { 33 | override def getPos: Long = os.size().toLong 34 | override def write(b: Int): Unit = os.write(b) 35 | override def write(b: Array[Byte], off: Int, len: Int): Unit = os.write(b, off, len) 36 | } 37 | } 38 | 39 | override def createOrOverwrite(blockSizeHint: Long): PositionOutputStream = { 40 | os.reset() 41 | create(blockSizeHint) 42 | } 43 | 44 | override def supportsBlockSize(): Boolean = true 45 | 46 | override def defaultBlockSize(): Long = blockSize.toLong 47 | 48 | /** Return an Array[Byte] copied from the current content of the internal buffer, and reset the internal state. The 49 | * [[InMemoryOutputFile]] could then be reused without allocating the internal buffer. 50 | * 51 | * @return 52 | * bytes copied from the current content of internal buffer 53 | */ 54 | def take(): Array[Byte] = os.take 55 | 56 | def contentLength: Int = os.size() 57 | 58 | /** Creates an [[org.apache.parquet.io.InputFile]] from the content of this [[org.apache.parquet.io.OutputFile]]. 
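  *
  * A usage sketch (the `Data` type and `data` collection are assumed; the writer and reader calls mirror
  * this project's tests):
  * {{{
  *   val outFile = InMemoryOutputFile(initBufferSize = 1024)
  *   ParquetWriter.of[Data].writeAndClose(outFile, data)
  *   val inFile  = outFile.toInputFile // copies the buffer and resets this OutputFile for reuse
  *   val records = ParquetReader.as[Data].read(inFile)
  *   try records.foreach(println)
  *   finally records.close()
  * }}}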
59 | */ 60 | def toInputFile: InputFile = InMemoryInputFile.fromBytes(take()) 61 | } 62 | 63 | class ReusableByteArrayOutputStream(initBufferSize: Int, maxBufferSize: Int) 64 | extends ByteArrayOutputStream(initBufferSize) { 65 | def take: Array[Byte] = { 66 | val content = toByteArray 67 | if (buf.length > maxBufferSize) { 68 | buf = new Array[Byte](initBufferSize) 69 | } 70 | count = 0 71 | content 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/MetadataWriter.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | private[parquet4s] object MetadataWriter { 4 | val NoOp: MetadataWriter = () => Map.empty 5 | } 6 | 7 | private[parquet4s] trait MetadataWriter { 8 | def getMetadata(): Map[String, String] 9 | } 10 | 11 | private[parquet4s] object MetadataReader { 12 | val NoOp: MetadataReader = _ => {} 13 | } 14 | private[parquet4s] trait MetadataReader { 15 | def setMetadata(metadata: collection.Map[String, String]): Unit 16 | } 17 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/Path.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.fs.Path as HadoopPath 5 | import org.apache.parquet.hadoop.util.{HadoopInputFile, HadoopOutputFile} 6 | import org.apache.parquet.io.{InputFile, OutputFile} 7 | 8 | import java.net.URI 9 | import java.nio.file.{Paths, Path as NioPath} 10 | 11 | object Path { 12 | 13 | def apply(hadoopPath: HadoopPath): Path = new Path(hadoopPath) 14 | 15 | def apply(pathString: String): Path = apply(new HadoopPath(pathString)) 16 | 17 | def apply(nioPath: NioPath): Path = apply(new URI("file", null, nioPath.toAbsolutePath.toString, null, null)) 18 | 19 | def apply(uri: URI): Path = apply(new HadoopPath(uri)) 20 | 21 | def apply(parent: Path, child: String): Path = apply(new HadoopPath(parent.toHadoop, child)) 22 | 23 | val Separator: String = HadoopPath.SEPARATOR 24 | } 25 | 26 | /** Represents path/URI to Parquet file or directory containing Parquet files. 
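  *
  * For example:
  * {{{
  *   val dir  = Path("file:///tmp/parquet4s")
  *   val file = Path(dir, "data.parquet")
  *   file.name   // "data.parquet"
  *   file.parent // Some(dir)
  * }}}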
27 | */ 28 | class Path private (val hadoopPath: HadoopPath) extends AnyVal { 29 | 30 | def append(element: String): Path = new Path(new HadoopPath(hadoopPath, element)) 31 | 32 | def parent: Option[Path] = Option(hadoopPath.getParent).map(Path.apply) 33 | 34 | def name: String = hadoopPath.getName 35 | 36 | def toUri: URI = hadoopPath.toUri 37 | 38 | def toNio: NioPath = Paths.get(toUri) 39 | 40 | def toHadoop: HadoopPath = hadoopPath 41 | 42 | def canEqual(other: Any): Boolean = other.isInstanceOf[Path] 43 | 44 | def toOutputFile(conf: Configuration): OutputFile = HadoopOutputFile.fromPath(hadoopPath, conf) 45 | 46 | def toOutputFile(options: ParquetWriter.Options): OutputFile = toOutputFile(options.hadoopConf) 47 | 48 | def toInputFile(conf: Configuration): InputFile = HadoopInputFile.fromPath(hadoopPath, conf) 49 | 50 | def toInputFile(options: ParquetReader.Options): InputFile = HadoopInputFile.fromPath(hadoopPath, options.hadoopConf) 51 | 52 | override def toString: String = hadoopPath.toString 53 | 54 | } 55 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/UDP.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.apache.parquet.filter2.predicate.{Statistics, UserDefinedPredicate} 4 | 5 | /** Extend this trait in order to build a non-standard filter.
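  * A minimal sketch of such a predicate over a column of java.lang.Integer values (the predicate itself is
  * hypothetical; the overridden members are exactly those declared by this trait):
  * {{{
  *   object NonZeroPredicate extends UDP[java.lang.Integer] {
  *     override def keep(value: java.lang.Integer): Boolean = value.intValue != 0
  *     // a row group holding nothing but zeros can be skipped entirely
  *     override def canDrop(statistics: FilterStatistics[java.lang.Integer]): Boolean =
  *       statistics.min.intValue == 0 && statistics.max.intValue == 0
  *     // for the negated predicate ("keep zeros") the group can be skipped when zero is outside its range
  *     override def inverseCanDrop(statistics: FilterStatistics[java.lang.Integer]): Boolean =
  *       statistics.min.intValue > 0 || statistics.max.intValue < 0
  *     override def name: String = "NonZeroPredicate"
  *   }
  * }}}
  *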
Please note! When defining V, use Java 6 | * types supported by Parquet such as: 7 | * 1. [[java.lang.Boolean]] 8 | * 1. [[java.lang.Integer]] 9 | * 1. [[java.lang.Long]] 10 | * 1. [[java.lang.Double]] 11 | * 1. [[java.lang.Float]] 12 | * 1. [[org.apache.parquet.io.api.Binary]] 13 | * 14 | * @tparam V 15 | * Type of column custom filter refers to 16 | */ 17 | trait UDP[V] { 18 | 19 | /** Used to filter record by record. 20 | * 21 | * @param value 22 | * column value of record that is filtered 23 | * @return 24 | * `true` if record containing given value should be kept 25 | */ 26 | def keep(value: V): Boolean 27 | 28 | /** Used to drop a whole row group if collected statistics do not match your requirements. 29 | * 30 | * @param statistics 31 | * of the row group 32 | * @return 33 | * `true` if the whole row group can be omitted 34 | */ 35 | def canDrop(statistics: FilterStatistics[V]): Boolean 36 | 37 | /** It is an opposite of `canDrop`. There is a separate function for inverse comparison as the some types may require 38 | * quite a different logic for that. This function will be called when processing `not` predicates. 39 | * 40 | * @param statistics 41 | * of the row group 42 | * @return 43 | * `true` if the whole row group can be omitted for inverse filter 44 | */ 45 | def inverseCanDrop(statistics: FilterStatistics[V]): Boolean 46 | 47 | /** @return 48 | * name of the filter 49 | */ 50 | def name: String 51 | } 52 | 53 | /** Row group statistics then can be used in [[UDP]] to drop unwanted data. 54 | * @param min 55 | * minimum value of `V` in a row group 56 | * @param max 57 | * maximum value of `V` in a row group 58 | * @param ordering 59 | * [[scala.Ordering]] of `V` 60 | * @tparam V 61 | * user type of column 62 | */ 63 | class FilterStatistics[V](val min: V, val max: V)(implicit val ordering: Ordering[V]) 64 | 65 | private[parquet4s] class UDPAdapter[V <: Comparable[V]](udp: UDP[V])(implicit ordering: Ordering[V]) 66 | extends UserDefinedPredicate[V] 67 | with Serializable { 68 | 69 | override def keep(value: V): Boolean = udp.keep(value) 70 | 71 | override def canDrop(statistics: Statistics[V]): Boolean = 72 | udp.canDrop(convert(statistics)) 73 | 74 | override def inverseCanDrop(statistics: Statistics[V]): Boolean = 75 | udp.inverseCanDrop(convert(statistics)) 76 | 77 | override def toString: String = udp.name 78 | 79 | private def convert(statistics: Statistics[V]) = new FilterStatistics[V](statistics.getMin, statistics.getMax) 80 | 81 | } 82 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/Value.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.apache.parquet.io.api.{Binary, RecordConsumer} 4 | import org.apache.parquet.schema.Type 5 | import java.math.BigInteger 6 | 7 | /** Basic structure element which Parquet data is built from. Represents any data element that can be read from or can 8 | * be written to Parquet files. 9 | */ 10 | trait Value extends Any { 11 | 12 | /** Writes the value content to Parquet 13 | * @param schema 14 | * schema of that value 15 | * @param recordConsumer 16 | * has to be used to write the data to the file 17 | */ 18 | def write(schema: Type, recordConsumer: RecordConsumer): Unit 19 | 20 | } 21 | 22 | /** Primitive value like integer or long. 
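  *
  * Example instances (all constructors are defined below in this file):
  * {{{
  *   IntValue(42).value        // 42
  *   DoubleValue(3.14).value   // 3.14
  *   BinaryValue("text").value // Binary.fromString("text")
  * }}}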
23 | * @tparam T 24 | * type of the value 25 | */ 26 | trait PrimitiveValue[T] extends Any with Value { 27 | 28 | /** Content of the value 29 | */ 30 | def value: T 31 | 32 | } 33 | 34 | case class LongValue(value: Long) extends AnyVal with PrimitiveValue[Long] { 35 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addLong(value) 36 | } 37 | 38 | case class IntValue(value: Int) extends AnyVal with PrimitiveValue[Int] { 39 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addInteger(value) 40 | } 41 | 42 | case class FloatValue(value: Float) extends AnyVal with PrimitiveValue[Float] { 43 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addFloat(value) 44 | } 45 | 46 | case class DoubleValue(value: Double) extends AnyVal with PrimitiveValue[Double] { 47 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addDouble(value) 48 | } 49 | 50 | object BinaryValue { 51 | def apply(bytes: Array[Byte]): BinaryValue = BinaryValue(Binary.fromReusedByteArray(bytes)) 52 | def apply(string: String): BinaryValue = BinaryValue(Binary.fromString(string)) 53 | } 54 | 55 | case class BinaryValue(value: Binary) extends PrimitiveValue[Binary] { 56 | 57 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addBinary(value) 58 | 59 | } 60 | 61 | case class BooleanValue(value: Boolean) extends AnyVal with PrimitiveValue[Boolean] { 62 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addBoolean(value) 63 | } 64 | 65 | case class DateTimeValue(value: Long, format: TimestampFormat.Format) extends Value { 66 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = recordConsumer.addLong(value) 67 | } 68 | 69 | case class DecimalValue(value: BigInteger, format: DecimalFormat.Format) extends Value { 70 | 71 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = format.write(recordConsumer, value) 72 | 73 | def toBigDecimal: BigDecimal = BigDecimal(value, format.scale, format.mathContext) 74 | } 75 | 76 | /** Special instance of [[Value]] that represents lack of the value. [[NullValue]] does not hold any data so it cannot 77 | * be written. 
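  * Optional codecs handle it explicitly: OptionalValueEncoder (defined later in this module) emits NullValue
  * for a null reference, and OptionalValueDecoder maps it back to null.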
78 | */ 79 | case object NullValue extends Value { 80 | override def write(schema: Type, recordConsumer: RecordConsumer): Unit = 81 | throw new UnsupportedOperationException("Null values cannot be written.") 82 | } 83 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/ValueCodec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | /** Represents both [[ValueEncoder]] and [[ValueDecoder]] 4 | */ 5 | trait ValueCodec[T] extends ValueEncoder[T] with ValueDecoder[T] 6 | 7 | /** Represents both [[RequiredValueEncoder]] and [[RequiredValueDecoder]] 8 | */ 9 | trait RequiredValueCodec[T] extends ValueCodec[T] with RequiredValueEncoder[T] with RequiredValueDecoder[T] 10 | 11 | /** Represents both [[OptionalValueEncoder]] and [[OptionalValueDecoder]] 12 | */ 13 | trait OptionalValueCodec[T] extends ValueCodec[T] with OptionalValueEncoder[T] with OptionalValueDecoder[T] 14 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/ValueCodecConfiguration.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import java.util.TimeZone 4 | 5 | /** Configuration necessary for some of codecs 6 | * 7 | * @param timeZone 8 | * used when encoding and decoding time-based values 9 | */ 10 | case class ValueCodecConfiguration(timeZone: TimeZone) 11 | 12 | object ValueCodecConfiguration { 13 | val Default: ValueCodecConfiguration = ValueCodecConfiguration(timeZone = TimeZone.getDefault) 14 | 15 | def apply(readerOptions: ParquetReader.Options): ValueCodecConfiguration = 16 | ValueCodecConfiguration(readerOptions.timeZone) 17 | 18 | def apply(writerOptions: ParquetWriter.Options): ValueCodecConfiguration = 19 | ValueCodecConfiguration(writerOptions.timeZone) 20 | 21 | } 22 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/ValueDecoder.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import scala.annotation.implicitNotFound 4 | 5 | trait AllValueDecoders extends PrimitiveValueDecoders with TimeValueDecoders with ComplexValueDecoders 6 | 7 | /** Contains implicit instances of all [[ValueDecoder]] 8 | */ 9 | object ValueDecoder extends AllValueDecoders 10 | 11 | @implicitNotFound( 12 | "Missing ValueDecoder for value type ${T}. Implement your own decoder in order to deserialise your data." 
13 | ) 14 | trait ValueDecoder[T] { 15 | 16 | /** @param value 17 | * source Parquet [[Value]] 18 | * @param configuration 19 | * [ValueCodecConfiguration] used by some codecs 20 | * @return 21 | * data decoded from [[Value]] 22 | */ 23 | def decode(value: Value, configuration: ValueCodecConfiguration): T 24 | } 25 | 26 | /** Decoder for non-null type of [[Value]] 27 | * @tparam T 28 | * data type to decode from 29 | */ 30 | trait RequiredValueDecoder[T] extends ValueDecoder[T] { 31 | 32 | final override def decode(value: Value, configuration: ValueCodecConfiguration): T = 33 | value match { 34 | case NullValue => 35 | throw new IllegalArgumentException("NullValue cannot be decoded to required type") 36 | case other => 37 | decodeNonNull(other, configuration) 38 | } 39 | 40 | protected def decodeNonNull(value: Value, configuration: ValueCodecConfiguration): T 41 | 42 | } 43 | 44 | /** Decoder for [[Value]] that can be null. 45 | * @tparam T 46 | * data type to decode from 47 | */ 48 | trait OptionalValueDecoder[T] extends ValueDecoder[T] { 49 | 50 | final override def decode(value: Value, configuration: ValueCodecConfiguration): T = 51 | value match { 52 | case NullValue => null.asInstanceOf[T] 53 | case other => decodeNonNull(other, configuration) 54 | } 55 | 56 | protected def decodeNonNull(value: Value, configuration: ValueCodecConfiguration): T 57 | 58 | } 59 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/ValueEncoder.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import scala.annotation.implicitNotFound 4 | 5 | trait AllValueEncoders extends PrimitiveValueEncoders with TimeValueEncoders with ComplexValueEncoders 6 | 7 | /** Contains implicit instances of all [[ValueEncoder]] 8 | */ 9 | object ValueEncoder extends AllValueEncoders 10 | 11 | @implicitNotFound( 12 | "Missing ValueEncoder for value type ${T}. Implement your own encoder in order to serialise your data." 13 | ) 14 | trait ValueEncoder[T] { 15 | 16 | /** @param data 17 | * source data 18 | * @param configuration 19 | * [ValueCodecConfiguration] used by some codecs 20 | * @return 21 | * encoded Parquet [[Value]] 22 | */ 23 | def encode(data: T, configuration: ValueCodecConfiguration): Value 24 | } 25 | 26 | /** Encoder for non-null type of [[Value]] 27 | * @tparam T 28 | * data type to encode to 29 | */ 30 | trait RequiredValueEncoder[T] extends ValueEncoder[T] { 31 | override def encode(data: T, configuration: ValueCodecConfiguration): Value = 32 | Option(data) match { 33 | case None => 34 | throw new IllegalArgumentException("Cannot encode null instance of required type") 35 | case Some(other) => 36 | encodeNonNull(other, configuration) 37 | } 38 | 39 | protected def encodeNonNull(data: T, configuration: ValueCodecConfiguration): Value 40 | } 41 | 42 | /** Encoder for [[Value]] that can be null. 
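  *
  * For a custom type an encoder/decoder pair can be supplied as SAM instances of the optional variants,
  * mirroring ProductEncoders and ProductDecoders in this module (a sketch; `Email` is a hypothetical wrapper
  * and writing it as a field would typically also need a matching TypedSchemaDef):
  * {{{
  *   case class Email(address: String)
  *
  *   implicit val emailEncoder: OptionalValueEncoder[Email] =
  *     (data, _) => BinaryValue(data.address)
  *
  *   implicit val emailDecoder: OptionalValueDecoder[Email] =
  *     (value, _) =>
  *       value match {
  *         case BinaryValue(binary) => Email(binary.toStringUsingUTF8)
  *       }
  * }}}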
43 | * @tparam T 44 | * data type to encode to 45 | */ 46 | trait OptionalValueEncoder[T] extends ValueEncoder[T] { 47 | override def encode(data: T, configuration: ValueCodecConfiguration): Value = 48 | Option(data).fold[Value](NullValue)(nonNullData => encodeNonNull(nonNullData, configuration)) 49 | 50 | protected def encodeNonNull(data: T, configuration: ValueCodecConfiguration): Value 51 | } 52 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/ValueImplicits.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import java.time.{LocalDate, LocalDateTime} 4 | 5 | /** Provides simple conversion methods for primitives. 6 | */ 7 | object ValueImplicits { 8 | 9 | import ValueCodecConfiguration.* 10 | 11 | implicit class IntWrapper(v: Int)(implicit encoder: ValueEncoder[Int]) { 12 | def value: Value = encoder.encode(v, Default) 13 | } 14 | implicit class LongWrapper(v: Long)(implicit encoder: ValueEncoder[Long]) { 15 | def value: Value = encoder.encode(v, Default) 16 | } 17 | implicit class FloatWrapper(v: Float)(implicit encoder: ValueEncoder[Float]) { 18 | def value: Value = encoder.encode(v, Default) 19 | } 20 | implicit class DoubleWrapper(v: Double)(implicit encoder: ValueEncoder[Double]) { 21 | def value: Value = encoder.encode(v, Default) 22 | } 23 | implicit class ByteWrapper(v: Byte)(implicit encoder: ValueEncoder[Byte]) { 24 | def value: Value = encoder.encode(v, Default) 25 | } 26 | implicit class ShortWrapper(v: Short)(implicit encoder: ValueEncoder[Short]) { 27 | def value: Value = encoder.encode(v, Default) 28 | } 29 | implicit class BooleanWrapper(v: Boolean)(implicit encoder: ValueEncoder[Boolean]) { 30 | def value: Value = encoder.encode(v, Default) 31 | } 32 | implicit class StringWrapper(v: String)(implicit encoder: ValueEncoder[String]) { 33 | def value: Value = encoder.encode(v, Default) 34 | } 35 | implicit class CharWrapper(v: Char)(implicit encoder: ValueEncoder[Char]) { 36 | def value: Value = encoder.encode(v, Default) 37 | } 38 | implicit class BigDecimalWrapper(v: BigDecimal)(implicit encoder: ValueEncoder[BigDecimal]) { 39 | def value: Value = encoder.encode(v, Default) 40 | } 41 | implicit class LocalDateTimeWrapper(v: LocalDateTime)(implicit encoder: ValueEncoder[LocalDateTime]) { 42 | def value: Value = encoder.encode(v, Default) 43 | } 44 | implicit class LocalDateWrapper(v: LocalDate)(implicit encoder: ValueEncoder[LocalDate]) { 45 | def value: Value = encoder.encode(v, Default) 46 | } 47 | implicit class TimestampWrapper(v: java.sql.Timestamp)(implicit encoder: ValueEncoder[java.sql.Timestamp]) { 48 | def value: Value = encoder.encode(v, Default) 49 | } 50 | implicit class DateWrapper(v: java.sql.Date)(implicit encoder: ValueEncoder[java.sql.Date]) { 51 | def value: Value = encoder.encode(v, Default) 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/etl/CompoundParquetIterable.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.etl 2 | 3 | import com.github.mjakubowski84.parquet4s.* 4 | import com.github.mjakubowski84.parquet4s.stats.CompoundStats 5 | 6 | private[parquet4s] class CompoundParquetIterable[T](components: Iterable[ParquetIterable[T]]) 7 | extends ParquetIterable[T] { 8 | 9 | 
override val stats: Stats = new CompoundStats(components.map(_.stats)) 10 | 11 | override lazy val valueCodecConfiguration: ValueCodecConfiguration = 12 | components.headOption.map(_.valueCodecConfiguration).getOrElse(ValueCodecConfiguration.Default) 13 | 14 | override def iterator: Iterator[T] = 15 | components.foldLeft[Iterator[T]](Iterator.empty)(_ ++ _.iterator) 16 | 17 | override def close(): Unit = components.foreach(_.close()) 18 | 19 | override private[parquet4s] def appendTransformation( 20 | transformation: RowParquetRecord => Iterable[RowParquetRecord] 21 | ): ParquetIterable[T] = 22 | new CompoundParquetIterable[T](components.map(_.appendTransformation(transformation))) 23 | 24 | override private[parquet4s] def changeDecoder[U: ParquetRecordDecoder]: ParquetIterable[U] = 25 | new CompoundParquetIterable[U](components.map(_.changeDecoder[U])) 26 | 27 | override def concat(other: ParquetIterable[T]): ParquetIterable[T] = 28 | new CompoundParquetIterable(components ++ Iterable(other)) 29 | } 30 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/etl/InMemoryParquetIterable.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.etl 2 | 3 | import com.github.mjakubowski84.parquet4s.* 4 | import com.github.mjakubowski84.parquet4s.stats.InMemoryStats 5 | 6 | private[parquet4s] class InMemoryParquetIterable[T]( 7 | data: => Iterable[RowParquetRecord], 8 | override val valueCodecConfiguration: ValueCodecConfiguration = ValueCodecConfiguration.Default, 9 | transformations: Seq[RowParquetRecord => Iterable[RowParquetRecord]] = Seq.empty, 10 | decode: RowParquetRecord => T = identity[RowParquetRecord] _ 11 | ) extends ParquetIterable[T] { 12 | 13 | override private[parquet4s] def appendTransformation( 14 | transformation: RowParquetRecord => Iterable[RowParquetRecord] 15 | ): ParquetIterable[T] = 16 | new InMemoryParquetIterable[T]( 17 | data = data, 18 | valueCodecConfiguration = valueCodecConfiguration, 19 | transformations = transformations :+ transformation, 20 | decode = decode 21 | ) 22 | 23 | override private[parquet4s] def changeDecoder[U: ParquetRecordDecoder]: ParquetIterable[U] = 24 | new InMemoryParquetIterable[U]( 25 | data = data, 26 | valueCodecConfiguration = valueCodecConfiguration, 27 | transformations = transformations, 28 | decode = record => ParquetRecordDecoder.decode[U](record, valueCodecConfiguration) 29 | ) 30 | 31 | override private[parquet4s] lazy val stats: Stats = new InMemoryStats(data, valueCodecConfiguration) 32 | 33 | override def close(): Unit = () 34 | 35 | override def iterator: Iterator[T] = 36 | if (transformations.isEmpty) data.iterator.map(decode) 37 | else 38 | data.iterator.flatMap(record => 39 | transformations 40 | .foldLeft(Iterator(record)) { case (iterator, transformation) => 41 | iterator.flatMap(transformation) 42 | } 43 | .map(decode) 44 | ) 45 | } 46 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/experimental.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | /** Experimental feature. API and functionality may change or be removed completely. 
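  * Usage sketch (the annotated class is hypothetical; InMemoryInputFile in this module is annotated the
  * same way):
  * {{{
  *   @experimental
  *   class MyShinyNewApi
  * }}}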
4 | */ 5 | class experimental extends scala.annotation.Annotation 6 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/stats/CompoundStats.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.stats 2 | 3 | import com.github.mjakubowski84.parquet4s.{ColumnPath, Stats, ValueDecoder} 4 | 5 | /** Calculates [[Stats]] from multiple files. 6 | */ 7 | private[parquet4s] class CompoundStats(statsSeq: Iterable[Stats]) extends Stats { 8 | 9 | override lazy val recordCount: Long = statsSeq.map(_.recordCount).sum 10 | 11 | override def min[V](columnPath: ColumnPath, currentMin: Option[V])(implicit 12 | decoder: ValueDecoder[V], 13 | ordering: Ordering[V] 14 | ): Option[V] = 15 | statsSeq.foldLeft(currentMin) { case (acc, stats) => 16 | stats.min(columnPath, acc) 17 | } 18 | 19 | override def max[V](columnPath: ColumnPath, currentMax: Option[V])(implicit 20 | decoder: ValueDecoder[V], 21 | ordering: Ordering[V] 22 | ): Option[V] = 23 | statsSeq.foldLeft(currentMax) { case (acc, stats) => 24 | stats.max(columnPath, acc) 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/stats/FileStats.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.stats 2 | 3 | import com.github.mjakubowski84.parquet4s.* 4 | import org.apache.parquet.ParquetReadOptions 5 | import org.apache.parquet.column.statistics.Statistics 6 | import org.apache.parquet.hadoop.ParquetFileReader 7 | import org.apache.parquet.io.InputFile 8 | import org.apache.parquet.schema.MessageType 9 | 10 | import scala.jdk.CollectionConverters.* 11 | import scala.util.Using 12 | 13 | /** Calculates statistics from unfiltered Parquet files. 14 | */ 15 | private[parquet4s] class FileStats( 16 | inputFile: InputFile, 17 | vcc: ValueCodecConfiguration, 18 | projectionSchemaOpt: Option[MessageType] 19 | ) extends Stats { 20 | 21 | private val readerOptions = ParquetReadOptions.builder().build() 22 | 23 | abstract private class StatsReader extends AutoCloseable { 24 | protected val reader: ParquetFileReader = ParquetFileReader.open(inputFile, readerOptions) 25 | projectionSchemaOpt.foreach(reader.setRequestedSchema) 26 | override def close(): Unit = reader.close() 27 | } 28 | 29 | private class RecordCountReader extends StatsReader { 30 | def recordCount: Long = reader.getRecordCount 31 | } 32 | 33 | private class MinMaxReader[V](columnPath: ColumnPath, currentExtreme: Option[V])(implicit 34 | decoder: ValueDecoder[V], 35 | ordering: Ordering[V] 36 | ) extends StatsReader { 37 | private val dotString = columnPath.toString 38 | 39 | private def extreme(statsValue: Statistics[?] 
=> Option[Value], choose: (V, V) => V) = 40 | reader.getRowGroups.asScala.iterator 41 | .map(block => block.getColumns.asScala.find(_.getPath.toDotString == dotString)) 42 | .flatMap { 43 | case Some(column) => statsValue(column.getStatistics).map(value => decoder.decode(value, vcc)) 44 | case None => None 45 | } 46 | .foldLeft(currentExtreme) { 47 | case (None, v) => Option(v) 48 | case (Some(a), b) => Option(choose(a, b)) 49 | } 50 | 51 | def min: Option[V] = extreme(statsMinValue, ordering.min) 52 | def max: Option[V] = extreme(statsMaxValue, ordering.max) 53 | 54 | } 55 | 56 | override def recordCount: Long = 57 | Using.resource(new RecordCountReader)(_.recordCount) 58 | 59 | override def min[V](columnPath: ColumnPath, currentMin: Option[V])(implicit 60 | decoder: ValueDecoder[V], 61 | ordering: Ordering[V] 62 | ): Option[V] = 63 | Using.resource(new MinMaxReader[V](columnPath, currentMin))(_.min) 64 | 65 | override def max[V](columnPath: ColumnPath, currentMax: Option[V])(implicit 66 | decoder: ValueDecoder[V], 67 | ordering: Ordering[V] 68 | ): Option[V] = 69 | Using.resource(new MinMaxReader[V](columnPath, currentMax))(_.max) 70 | 71 | } 72 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/stats/InMemoryStats.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.stats 2 | 3 | import com.github.mjakubowski84.parquet4s.* 4 | 5 | private[parquet4s] class InMemoryStats(iterable: Iterable[RowParquetRecord], vcc: ValueCodecConfiguration) 6 | extends Stats { 7 | 8 | override lazy val recordCount: Long = iterable.size.toLong 9 | 10 | override protected[parquet4s] def min[V](columnPath: ColumnPath, currentMin: Option[V])(implicit 11 | decoder: ValueDecoder[V], 12 | ordering: Ordering[V] 13 | ): Option[V] = iterable.foldLeft(currentMin) { case (currOpt, record) => 14 | (record.get(columnPath).map(decoder.decode(_, vcc)), currOpt) match { 15 | case (Some(v), Some(curr)) => Some(ordering.min(curr, v)) 16 | case (Some(v), None) => Some(v) 17 | case (None, _) => currOpt 18 | } 19 | } 20 | 21 | override protected[parquet4s] def max[V](columnPath: ColumnPath, currentMax: Option[V])(implicit 22 | decoder: ValueDecoder[V], 23 | ordering: Ordering[V] 24 | ): Option[V] = iterable.foldLeft(currentMax) { case (currOpt, record) => 25 | (record.get(columnPath).map(decoder.decode(_, vcc)), currOpt) match { 26 | case (Some(v), Some(curr)) => Some(ordering.max(curr, v)) 27 | case (Some(v), None) => Some(v) 28 | case (None, _) => currOpt 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/stats/LazyDelegateStats.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.stats 2 | 3 | import com.github.mjakubowski84.parquet4s.* 4 | import org.apache.parquet.filter2.compat.FilterCompat 5 | import org.apache.parquet.filter2.compat.FilterCompat.NoOpFilter 6 | import org.apache.parquet.io.InputFile 7 | import org.apache.parquet.schema.MessageType 8 | 9 | private[parquet4s] class LazyDelegateStats( 10 | inputFile: InputFile, 11 | vcc: ValueCodecConfiguration, 12 | projectionSchemaOpt: Option[MessageType], 13 | filter: FilterCompat.Filter, 14 | partitionViewOpt: Option[PartitionView] 15 | ) extends Stats { 16 | private lazy val delegate: Stats = { 17 | val fileStats 
= 18 | if (filter.isInstanceOf[NoOpFilter]) 19 | new FileStats(inputFile, vcc, projectionSchemaOpt) 20 | else 21 | new FilteredFileStats(inputFile, vcc, projectionSchemaOpt, filter) 22 | partitionViewOpt match { 23 | case Some(partitionView) if partitionView.nonEmpty => 24 | new PartitionedFileStats(fileStats, partitionView) 25 | case _ => 26 | fileStats 27 | } 28 | } 29 | 30 | override def recordCount: Long = delegate.recordCount 31 | 32 | override protected[parquet4s] def min[V](columnPath: ColumnPath, currentMin: Option[V])(implicit 33 | decoder: ValueDecoder[V], 34 | ordering: Ordering[V] 35 | ): Option[V] = 36 | delegate.min(columnPath, currentMin) 37 | 38 | override protected[parquet4s] def max[V](columnPath: ColumnPath, currentMax: Option[V])(implicit 39 | decoder: ValueDecoder[V], 40 | ordering: Ordering[V] 41 | ): Option[V] = 42 | delegate.max(columnPath, currentMax) 43 | } 44 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/mjakubowski84/parquet4s/stats/PartitionedFileStats.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.stats 2 | 3 | import com.github.mjakubowski84.parquet4s._ 4 | 5 | private[parquet4s] class PartitionedFileStats(wrapped: Stats, partitionView: PartitionView) extends Stats { 6 | override def recordCount = wrapped.recordCount 7 | 8 | override protected[parquet4s] def min[V](columnPath: ColumnPath, currentMin: Option[V])(implicit 9 | decoder: ValueDecoder[V], 10 | ordering: Ordering[V] 11 | ): Option[V] = 12 | (partitionView.value(columnPath).map(_.toStringUsingUTF8.asInstanceOf[V]), currentMin) match { 13 | case (Some(partitionValue), Some(cm)) => Some(ordering.min(partitionValue, cm)) 14 | case (Some(partitionValue), None) => Some(partitionValue) 15 | case _ => wrapped.min[V](columnPath, currentMin) 16 | } 17 | 18 | override protected[parquet4s] def max[V](columnPath: ColumnPath, currentMax: Option[V])(implicit 19 | decoder: ValueDecoder[V], 20 | ordering: Ordering[V] 21 | ): Option[V] = 22 | (partitionView.value(columnPath).map(_.toStringUsingUTF8.asInstanceOf[V]), currentMax) match { 23 | case (Some(partitionValue), Some(cm)) => Some(ordering.max(partitionValue, cm)) 24 | case (Some(partitionValue), None) => Some(partitionValue) 25 | case _ => wrapped.max[V](columnPath, currentMax) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /core/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/mjakubowski84/parquet4s/ColumnPathSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.LogicalTypes.StringType 4 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY 5 | import org.apache.parquet.schema.Type.Repetition.OPTIONAL 6 | import org.apache.parquet.schema.Types 7 | import org.scalatest.flatspec.AnyFlatSpec 8 | import org.scalatest.matchers.should.Matchers 9 | 10 | class ColumnPathSpec extends AnyFlatSpec with Matchers { 11 | 12 | "ColumnPath" should "be created with proper elements" in { 13 | Col("path").elements 
should be(Seq("path")) 14 | Col("path.subPath").elements should be(Seq("path", "subPath")) 15 | } 16 | 17 | it should "be appendable" in { 18 | Col("path").appendElement("subPath").elements should be(Seq("path", "subPath")) 19 | } 20 | 21 | it should "turn to dot path" in { 22 | Col("path").toString should be("path") 23 | Col("path.subPath").toString should be("path.subPath") 24 | } 25 | 26 | it should "be able to turn to typed" in { 27 | Col("path").as[String].toType should be(Types.primitive(BINARY, OPTIONAL).as(StringType).named("path")) 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/mjakubowski84/parquet4s/IOOpsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.scalatest.Inside 4 | import org.scalatest.flatspec.AnyFlatSpec 5 | import org.scalatest.matchers.should.Matchers 6 | 7 | class IOOpsSpec extends AnyFlatSpec with Matchers with Inside with PartitionTestUtils { 8 | 9 | "PartitionRegexp" should "match valid partition names and values" in 10 | forAll(ValidPartitionsTable) { case (name, value) => 11 | inside(s"$name=$value") { case IOOps.PartitionRegexp(`name`, `value`) => 12 | succeed 13 | } 14 | } 15 | 16 | it should "not match invalid partition names and values" in 17 | forAll(InvalidPartitionsTable) { case (name, value) => 18 | s"$name=$value" match { 19 | case IOOps.PartitionRegexp(capturedName, capturedValue) => 20 | fail( 21 | s"Expected no match for name [$name] and value [$value] " + 22 | s"but one was found: [$capturedName, $capturedValue]" 23 | ) 24 | case _ => 25 | succeed 26 | } 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/mjakubowski84/parquet4s/InMemoryFileSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.apache.commons.lang3.RandomStringUtils 4 | import org.scalatest.flatspec.AnyFlatSpec 5 | import org.scalatest.matchers.should.Matchers 6 | 7 | import java.nio.file.Files 8 | 9 | class InMemoryFileSpec extends AnyFlatSpec with Matchers { 10 | it should "write to in-memory output file" in { 11 | case class Data(id: Int, text: String) 12 | 13 | val count = 100 14 | val data = (1 to count).map(i => Data(id = i, text = RandomStringUtils.randomPrint(4))) 15 | val file = InMemoryOutputFile(initBufferSize = 1024) 16 | 17 | // write 18 | ParquetWriter.of[Data].writeAndClose(file, data) 19 | 20 | val inputFile = Files.createTempFile("in-memory-output-file-test", ".parquet") 21 | Files.write(inputFile, file.take()) 22 | 23 | // read 24 | val readData = ParquetReader.as[Data].read(Path(inputFile)) 25 | try readData.toSeq shouldBe data 26 | finally readData.close() 27 | } 28 | 29 | it should "read from in-memory input file" in { 30 | case class Data(id: Int, text: String) 31 | 32 | val count = 100 33 | val data = (1 to count).map(i => Data(id = i, text = RandomStringUtils.randomPrint(4))) 34 | val outputFile = InMemoryOutputFile(initBufferSize = 1024) 35 | 36 | // write 37 | ParquetWriter.of[Data].writeAndClose(outputFile, data) 38 | 39 | val inputFile = InMemoryInputFile.fromBytesUnsafe(outputFile.take()) 40 | 41 | // read 42 | val readData = ParquetReader.as[Data].read(inputFile) 43 | try readData.toSeq shouldBe data 44 | finally readData.close() 45 | } 46 | } 47 | 
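Note: the two InMemoryFileSpec tests above exercise InMemoryOutputFile and InMemoryInputFile separately, and the first one still round-trips through a temporary file on disk. Below is a minimal sketch of a fully in-memory round trip, composed only of calls that already appear in the spec above; the object name and sample data are illustrative and not part of the repository.

import com.github.mjakubowski84.parquet4s.{InMemoryInputFile, InMemoryOutputFile, ParquetReader, ParquetWriter}

object InMemoryRoundTripSketch extends App {
  case class Data(id: Int, text: String)

  val data = (1 to 10).map(i => Data(id = i, text = s"row-$i"))

  // write records into an in-memory output file (no filesystem involved)
  val outputFile = InMemoryOutputFile(initBufferSize = 1024)
  ParquetWriter.of[Data].writeAndClose(outputFile, data)

  // wrap the produced bytes in an in-memory input file and read them back
  val inputFile = InMemoryInputFile.fromBytesUnsafe(outputFile.take())
  val readData  = ParquetReader.as[Data].read(inputFile)
  try readData.foreach(println)
  finally readData.close()
}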
-------------------------------------------------------------------------------- /core/src/test/scala/com/github/mjakubowski84/parquet4s/PartitionTestUtils.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.scalatest.prop.{TableDrivenPropertyChecks, TableFor2} 4 | 5 | trait PartitionTestUtils extends TableDrivenPropertyChecks { 6 | private val allChars: Seq[Char] = (Byte.MinValue to Byte.MaxValue).map(_.toChar) 7 | private val alphaNumericChars: Seq[Char] = ('a' to 'z') ++ ('A' to 'Z') ++ ('0' to '9') 8 | 9 | private val allowedPartitionNameChars: Seq[Char] = alphaNumericChars ++ Seq('.', '_') 10 | private val allowedPartitionValueChars: Seq[Char] = alphaNumericChars ++ Seq( 11 | '!', '?', '-', '+', '_', '.', ',', '*', '\'', '(', ')', '&', '@', ':', ';', '/', ' ' 12 | ) 13 | 14 | private val disallowedPartitionNameChars: Seq[Char] = allChars.filterNot(allowedPartitionNameChars.contains) 15 | private val disallowedPartitionValueChars: Seq[Char] = allChars.filterNot(allowedPartitionValueChars.contains) 16 | 17 | private val validNames = generatePartitionStrings(prefix = "testValue", withChars = allowedPartitionNameChars) 18 | private val validValues = generatePartitionStrings(prefix = "testName", withChars = allowedPartitionValueChars) 19 | 20 | private val invalidNames = generatePartitionStrings(prefix = "testValue", withChars = disallowedPartitionNameChars) 21 | private val invalidValues = generatePartitionStrings(prefix = "testName", withChars = disallowedPartitionValueChars) 22 | 23 | private def generatePartitionStrings(prefix: String, withChars: Seq[Char]) = withChars.map(char => s"$prefix$char") 24 | 25 | val ValidPartitionsTable: TableFor2[String, String] = Table( 26 | ("name", "value"), 27 | validNames.flatMap(name => validValues.map(value => name -> value))* 28 | ) 29 | val InvalidPartitionsTable: TableFor2[String, String] = Table( 30 | ("name", "value"), 31 | invalidNames.flatMap(name => invalidValues.map(value => name -> value))* 32 | ) 33 | 34 | } 35 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/mjakubowski84/parquet4s/TestCases.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | object TestCases { 4 | 5 | case class Empty() 6 | 7 | // Primitives 8 | case class Primitives( 9 | boolean: Boolean, 10 | int: Int, 11 | long: Long, 12 | float: Float, 13 | double: Double, 14 | string: String, 15 | short: Short, 16 | byte: Byte, 17 | char: Char, 18 | bigDecimal: BigDecimal 19 | ) 20 | case class TimePrimitives( 21 | localDateTime: java.time.LocalDateTime, 22 | sqlTimestamp: java.sql.Timestamp, 23 | localDate: java.time.LocalDate, 24 | sqlDate: java.sql.Date 25 | ) 26 | case class ContainsOption(optional: Option[Int]) 27 | 28 | // Collections of primitives 29 | case class Collections( 30 | list: List[Int], 31 | seq: Seq[Int], 32 | vector: Vector[Int], 33 | set: Set[Int], 34 | array: Array[Int] 35 | ) { 36 | override def equals(obj: Any): Boolean = 37 | obj match { 38 | case other @ Collections(otherList, otherSeq, otherVector, otherSet, otherArray) => 39 | (other canEqual this) && 40 | list == otherList && 41 | seq == otherSeq && 42 | vector == otherVector && 43 | set == otherSet && 44 | array.sameElements(otherArray) 45 | case _ => false 46 | } 47 | } 48 | case class ArrayOfBytes(bytes: Array[Byte]) { 49 | override def 
equals(obj: Any): Boolean = 50 | obj match { 51 | case other @ ArrayOfBytes(bytes) => 52 | (other canEqual this) && this.bytes.sameElements(bytes) 53 | case _ => false 54 | } 55 | } 56 | case class ContainsCollectionOfOptionalPrimitives(list: List[Option[Int]]) 57 | case class ContainsCollectionOfCollections(listOfSets: List[Set[Int]]) 58 | case class ContainsMapOfPrimitives(map: Map[String, Int]) 59 | case class ContainsMapOfOptionalPrimitives(map: Map[String, Option[Int]]) 60 | case class ContainsMapOfCollectionsOfPrimitives(map: Map[String, List[Int]]) 61 | 62 | // Nested class 63 | case class Nested(int: Int) 64 | case class ContainsNestedClass(nested: Nested) 65 | case class ContainsOptionalNestedClass(nestedOptional: Option[Nested]) 66 | 67 | // Collections of nested class 68 | case class CollectionsOfNestedClass( 69 | list: List[Nested], 70 | seq: Seq[Nested], 71 | vector: Vector[Nested], 72 | set: Set[Nested], 73 | array: Array[Nested] 74 | ) { 75 | override def equals(obj: Any): Boolean = 76 | obj match { 77 | case other @ CollectionsOfNestedClass(otherList, otherSeq, otherVector, otherSet, otherArray) => 78 | (other canEqual this) && 79 | list == otherList && 80 | seq == otherSeq && 81 | vector == otherVector && 82 | set == otherSet && 83 | array.sameElements(otherArray) 84 | case _ => false 85 | } 86 | } 87 | case class ContainsMapOfNestedClassAsValue(nested: Map[String, Nested]) 88 | case class ContainsMapOfNestedClassAsKey(nested: Map[Nested, String]) 89 | case class ContainsMapOfOptionalNestedClassAsValue(nested: Map[String, Option[Nested]]) 90 | case class ContainsMapOfCollectionsOfNestedClassAsValue(nested: Map[String, List[Nested]]) 91 | 92 | } 93 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/mjakubowski84/parquet4s/ValueCodecsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import org.scalatest.matchers.should.Matchers 5 | 6 | import java.time.Instant 7 | import java.util.TimeZone 8 | 9 | class ValueCodecsSpec extends AnyFlatSpec with Matchers { 10 | private val defaultConfiguration = ValueCodecConfiguration(TimeZone.getTimeZone("Africa/Nairobi")) 11 | 12 | behavior of "Default timestamp format (INT96)" 13 | 14 | it should "be able to encode Instant and decode it back" in { 15 | val instant = Instant.ofEpochMilli(1234567L) 16 | val decodedInstant = codec(instant) 17 | decodedInstant should be(instant) 18 | } 19 | 20 | private def codec[A](a: A)(implicit encoder: ValueEncoder[A], decoder: ValueDecoder[A]): A = { 21 | val value = encoder.encode(a, defaultConfiguration) 22 | decoder.decode(value, defaultConfiguration) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/mjakubowski84/parquet4s/ValueEncodingAndDecodingSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import org.scalatest.matchers.should.Matchers 5 | 6 | class ValueEncodingAndDecodingSpec extends AnyFlatSpec with Matchers { 7 | 8 | case class TestType(i: Int) 9 | 10 | val requiredValueEncoder: RequiredValueEncoder[TestType] = (data, _) => IntValue(data.i) 11 | val requiredValueDecoder: RequiredValueDecoder[TestType] = (value, _) => 12 | value match { 13 | case IntValue(i) => 
TestType(i) 14 | } 15 | val optionalValueEncoder: OptionalValueEncoder[TestType] = (data, _) => IntValue(data.i) 16 | val optionalValueDecoder: OptionalValueDecoder[TestType] = (value, _) => 17 | value match { 18 | case IntValue(i) => TestType(i) 19 | } 20 | 21 | val testType: TestType = TestType(42) 22 | val testValue: IntValue = IntValue(testType.i) 23 | val configuration: ValueCodecConfiguration = ValueCodecConfiguration.Default 24 | 25 | "Required value encoder" should "encode non-null value" in { 26 | requiredValueEncoder.encode(testType, configuration) should be(testValue) 27 | } 28 | 29 | it should "throw an exception when encoding null" in { 30 | an[IllegalArgumentException] should be thrownBy requiredValueEncoder.encode( 31 | null.asInstanceOf[TestType], 32 | configuration 33 | ) 34 | } 35 | 36 | "Required value decoder" should "decode non-null value" in { 37 | requiredValueDecoder.decode(testValue, configuration) should be(testType) 38 | } 39 | 40 | it should "throw an exception when decoding null-value" in { 41 | an[IllegalArgumentException] should be thrownBy requiredValueDecoder.decode(NullValue, configuration) 42 | } 43 | 44 | "Optional value encoder" should "encode non-null value" in { 45 | optionalValueEncoder.encode(testType, configuration) should be(testValue) 46 | } 47 | 48 | it should "throw an exception when encoding null" in { 49 | optionalValueEncoder.encode(null.asInstanceOf[TestType], configuration) should be(NullValue) 50 | } 51 | 52 | "Optional value decoder" should "throw an exception when decoding null-value" in { 53 | optionalValueDecoder.decode(NullValue, configuration) should be(null) 54 | } 55 | 56 | it should "decode non-null value" in { 57 | optionalValueDecoder.decode(testValue, configuration) should be(testType) 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /examples/src/main/protobuf/data.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | option java_package = "com.github.mjakubowski84.parquet4s.protobuf"; 4 | 5 | message Data { 6 | int32 id = 1; 7 | string text = 2; 8 | } 9 | -------------------------------------------------------------------------------- /examples/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /examples/src/main/scala-akka-jvm/com/github/mjakubowski84/parquet4s/ScalaKafkaCompat.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | object ScalaKafkaCompat { 4 | object kafka { 5 | 6 | type CommitterSettings = akka.kafka.CommitterSettings 7 | def CommitterSettings = akka.kafka.CommitterSettings 8 | 9 | val ConsumerMessage = akka.kafka.ConsumerMessage 10 | 11 | type ConsumerSettings[K, V] = akka.kafka.ConsumerSettings[K, V] 12 | def ConsumerSettings = akka.kafka.ConsumerSettings 13 | 14 | def Subscriptions = akka.kafka.Subscriptions 15 | 16 | type Subscription = akka.kafka.Subscription 17 | 18 | object scaladsl { 19 | 20 | val Consumer = akka.kafka.scaladsl.Consumer 21 | 22 | def Committer = akka.kafka.scaladsl.Committer 23 | 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- 
/examples/src/main/scala-pekko-jvm/com/github/mjakubowski84/parquet4s/ScalaKafkaCompat.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | object ScalaKafkaCompat { 4 | object kafka { 5 | 6 | type CommitterSettings = org.apache.pekko.kafka.CommitterSettings 7 | def CommitterSettings = org.apache.pekko.kafka.CommitterSettings 8 | 9 | val ConsumerMessage = org.apache.pekko.kafka.ConsumerMessage 10 | 11 | type ConsumerSettings[K, V] = org.apache.pekko.kafka.ConsumerSettings[K, V] 12 | def ConsumerSettings = org.apache.pekko.kafka.ConsumerSettings 13 | 14 | def Subscriptions = org.apache.pekko.kafka.Subscriptions 15 | 16 | type Subscription = org.apache.pekko.kafka.Subscription 17 | 18 | object scaladsl { 19 | 20 | val Consumer = org.apache.pekko.kafka.scaladsl.Consumer 21 | 22 | def Committer = org.apache.pekko.kafka.scaladsl.Committer 23 | 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/CustomType.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType} 4 | 5 | import scala.util.Random 6 | 7 | object CustomType { 8 | 9 | object Dict { 10 | 11 | sealed trait Type 12 | case object A extends Type 13 | case object B extends Type 14 | case object C extends Type 15 | case object D extends Type 16 | 17 | val values: List[Type] = List(A, B, C, D) 18 | def valueOf(name: String): Type = values 19 | .find(_.toString == name) 20 | .getOrElse(throw new IllegalArgumentException(s"Invalid dict name: $name")) 21 | 22 | def random: Type = values(Random.nextInt(values.length)) 23 | 24 | // required for reading 25 | implicit val decoder: OptionalValueDecoder[Type] = 26 | (value: Value, _: ValueCodecConfiguration) => 27 | value match { 28 | case BinaryValue(binary) => valueOf(binary.toStringUsingUTF8) 29 | } 30 | // required for writing 31 | implicit val encoder: OptionalValueEncoder[Type] = 32 | (data: Type, _: ValueCodecConfiguration) => BinaryValue(data.toString) 33 | // required for writing 34 | implicit val schema: TypedSchemaDef[Type] = 35 | SchemaDef 36 | .primitive( 37 | primitiveType = PrimitiveType.PrimitiveTypeName.BINARY, 38 | required = false, 39 | logicalTypeAnnotation = Option(LogicalTypeAnnotation.stringType()) 40 | ) 41 | .typed[Type] 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/CustomAvroWriteAndReadAkkaPekkoApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.akkaPekko 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem 4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Source 5 | import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path} 6 | import org.apache.parquet.avro.AvroParquetWriter 7 | 8 | import java.nio.file.Files 9 | import scala.util.Random 10 | import org.apache.avro.generic.GenericRecordBuilder 11 | import com.github.mjakubowski84.parquet4s.ParquetWriter 12 | import org.apache.parquet.avro.AvroParquetReader 13 | import com.github.mjakubowski84.parquet4s.ParquetReader 14 | import org.apache.avro.SchemaBuilder 15 | import 
org.apache.avro.generic.GenericRecord 16 | 17 | object CustomAvroWriteAndReadAkkaPekkoApp extends App { 18 | val avroSchema = SchemaBuilder 19 | .record("data") 20 | .namespace("example") 21 | .fields() 22 | .requiredInt("i") 23 | .requiredString("text") 24 | .endRecord() 25 | val count = 100 26 | val data = (1 to count).map { i => 27 | new GenericRecordBuilder(avroSchema) 28 | .set("i", i) 29 | .set("text", Random.nextString(4)) 30 | .build() 31 | } 32 | val path = Path(Files.createTempDirectory("example")).append("data.parquet") 33 | 34 | implicit val system: ActorSystem = ActorSystem() 35 | import system.dispatcher 36 | 37 | lazy val writerBuilder = AvroParquetWriter 38 | .builder[GenericRecord](path.toOutputFile(ParquetWriter.Options())) 39 | .withSchema(avroSchema) 40 | 41 | lazy val writerSink = ParquetStreams.toParquetSingleFile 42 | .custom[GenericRecord, AvroParquetWriter.Builder[GenericRecord]](writerBuilder) 43 | .write 44 | 45 | lazy val readerBuilder = AvroParquetReader 46 | .builder[GenericRecord](path.toInputFile(ParquetReader.Options())) 47 | 48 | lazy val readerSource = ParquetStreams.fromParquet 49 | .custom[GenericRecord](readerBuilder) 50 | .read() 51 | 52 | val stream = for { 53 | _ <- Source(data).runWith(writerSink) 54 | _ <- readerSource.runForeach(println) 55 | } yield () 56 | 57 | stream.andThen { case _ => 58 | system.terminate() 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/CustomPartitioningAvroWriteAkkaPekkoApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.akkaPekko 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem 4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Sink 5 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Source 6 | import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path} 7 | import org.apache.parquet.avro.AvroParquetWriter 8 | 9 | import java.nio.file.Files 10 | import scala.util.Random 11 | import org.apache.avro.generic.GenericRecordBuilder 12 | import com.github.mjakubowski84.parquet4s.ParquetWriter 13 | import org.apache.avro.SchemaBuilder 14 | import org.apache.avro.generic.GenericRecord 15 | import com.github.mjakubowski84.parquet4s.ValueCodecConfiguration 16 | 17 | object CustomPartitioningAvroWriteAkkaPekkoApp extends App { 18 | val inputDataAvroSchema = SchemaBuilder 19 | .record("data") 20 | .namespace("example") 21 | .fields() 22 | .requiredInt("i") 23 | .requiredString("text") 24 | .requiredString("partition") 25 | .endRecord() 26 | val partitionedDataAvroSchema = SchemaBuilder 27 | .record("data") 28 | .namespace("example") 29 | .fields() 30 | .requiredInt("i") 31 | .requiredString("text") 32 | .endRecord() 33 | 34 | val count = 100 35 | val data = (1 to count).map { i => 36 | new GenericRecordBuilder(inputDataAvroSchema) 37 | .set("i", i) 38 | .set("text", Random.nextString(4)) 39 | .set("partition", (i % 4).toString()) 40 | .build() 41 | } 42 | val basePath = Path(Files.createTempDirectory("example")) 43 | val vcc = ValueCodecConfiguration.Default 44 | 45 | implicit val system: ActorSystem = ActorSystem() 46 | import system.dispatcher 47 | 48 | lazy val writerFlow = ParquetStreams.viaParquet 49 | .custom[GenericRecord, AvroParquetWriter.Builder[GenericRecord]](path => 50 | AvroParquetWriter 51 | 
.builder[GenericRecord](path.toOutputFile(ParquetWriter.Options())) 52 | .withSchema(partitionedDataAvroSchema) 53 | ) 54 | .partitionUsing { case (path, record) => 55 | val partitionValue = record.get("partition") 56 | val partitionedRecord = new GenericRecordBuilder(partitionedDataAvroSchema) 57 | .set("i", record.get("i")) 58 | .set("text", record.get("text")) 59 | .build() 60 | (path.append(s"partition=$partitionValue"), partitionedRecord) 61 | } 62 | .write(basePath) 63 | 64 | val stream = for { 65 | _ <- Source(data).via(writerFlow).runWith(Sink.ignore) 66 | _ <- ParquetStreams.fromParquet.generic 67 | .read(basePath) 68 | .runForeach(r => 69 | println( 70 | s"i=${r.get[Int]("i", vcc)}, text=${r.get[String]("text", vcc)}, partition=${r.get[String]("partition", vcc)}" 71 | ) 72 | ) 73 | } yield () 74 | 75 | stream.andThen { case _ => 76 | system.terminate() 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/CustomProtobufWriteAndReadAkkaPekkoApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.akkaPekko 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem 4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Source 5 | import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path} 6 | import com.github.mjakubowski84.parquet4s.protobuf.DataOuterClass.Data 7 | import org.apache.parquet.proto.ProtoParquetWriter 8 | 9 | import java.nio.file.Files 10 | import scala.util.Random 11 | import org.apache.parquet.proto.ProtoParquetReader 12 | import com.google.protobuf.TextFormat 13 | 14 | /** Please note! This is an example of Java Protobuf + Parquet4s using custom readers and writers. You can also use 15 | * Scala Protobuf with regular Parquet4s functions thanks to ScalaPB module of Parquet4s. 
16 | */ 17 | object CustomProtobufWriteAndReadAkkaPekkoApp extends App { 18 | val count = 100 19 | val data = (1 to count).map(i => Data.newBuilder.setId(i).setText(Random.nextString(4)).build) 20 | val path = Path(Files.createTempDirectory("example")) 21 | 22 | implicit val system: ActorSystem = ActorSystem() 23 | 24 | import system.dispatcher 25 | 26 | lazy val writerBuilder = 27 | ProtoParquetWriter.builder[Data](path.append("data.parquet").hadoopPath).withMessage(classOf[Data]) 28 | 29 | lazy val writerSink = ParquetStreams.toParquetSingleFile 30 | .custom[Data, ProtoParquetWriter.Builder[Data]](writerBuilder) 31 | .write 32 | 33 | lazy val readerBuilder = ProtoParquetReader.builder[Data.Builder](path.hadoopPath) 34 | 35 | lazy val readerSource = ParquetStreams.fromParquet 36 | .custom[Data.Builder](readerBuilder) 37 | .read[Data](_.build()) 38 | 39 | val stream = for { 40 | _ <- Source(data).runWith(writerSink) 41 | _ <- readerSource.runForeach(data => println(TextFormat.printer().escapingNonAscii(false).printToString(data))) 42 | } yield () 43 | 44 | stream.andThen { 45 | // finish 46 | case _ => system.terminate() 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/WriteAndReadAkkaPekkoApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.akkaPekko 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem 4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Source 5 | import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path} 6 | 7 | import java.nio.file.Files 8 | import scala.util.Random 9 | 10 | object WriteAndReadAkkaPekkoApp extends App { 11 | 12 | case class Data(id: Int, text: String) 13 | 14 | val count = 100 15 | val data = (1 to count).map(i => Data(id = i, text = Random.nextString(4))) 16 | val path = Path(Files.createTempDirectory("example")) 17 | 18 | implicit val system: ActorSystem = ActorSystem() 19 | import system.dispatcher 20 | 21 | val stream = for { 22 | // write 23 | _ <- Source(data).runWith(ParquetStreams.toParquetSingleFile.of[Data].write(path.append("data.parquet"))) 24 | // read 25 | _ <- ParquetStreams.fromParquet.as[Data].read(path).runForeach(println) 26 | } yield () 27 | 28 | stream.andThen { 29 | // finish 30 | case _ => system.terminate() 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/WriteAndReadCustomTypeAkkaPekkoApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.akkaPekko 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem 4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.{Sink, Source} 5 | import com.github.mjakubowski84.parquet4s.CustomType.* 6 | import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path} 7 | 8 | import java.nio.file.Files 9 | 10 | object WriteAndReadCustomTypeAkkaPekkoApp extends App { 11 | 12 | object Data { 13 | def generate(count: Int): Iterator[Data] = Iterator.range(1, count).map(i => Data(id = i, dict = Dict.random)) 14 | } 15 | case class Data(id: Int, dict: Dict.Type) 16 | 17 | val data = () => Data.generate(count = 100) 18 | val path = Path(Files.createTempDirectory("example")) 19 | 20 | implicit val 
system: ActorSystem = ActorSystem() 21 | import system.dispatcher 22 | 23 | val stream = for { 24 | // write 25 | _ <- Source 26 | .fromIterator(data) 27 | .runWith(ParquetStreams.toParquetSingleFile.of[Data].write(path.append("data.parquet"))) 28 | // read 29 | // hint: you can filter by dict using string value, for example: filter = Col("dict") === "A" 30 | _ <- ParquetStreams.fromParquet.as[Data].read(path).runWith(Sink.foreach(println)) 31 | } yield () 32 | 33 | stream.andThen { 34 | // finish 35 | case _ => system.terminate() 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/WriteAndReadFilteredAkkaPekkoApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.akkaPekko 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem 4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.{Sink, Source} 5 | import com.github.mjakubowski84.parquet4s.{Col, ParquetStreams, Path} 6 | 7 | import java.nio.file.Files 8 | import scala.concurrent.Future 9 | import scala.util.Random 10 | 11 | object WriteAndReadFilteredAkkaPekkoApp extends App { 12 | 13 | object Dict { 14 | val A = "A" 15 | val B = "B" 16 | val C = "C" 17 | val D = "D" 18 | 19 | val values: List[String] = List(A, B, C, D) 20 | def random: String = values(Random.nextInt(values.length)) 21 | } 22 | 23 | case class Data(id: Int, dict: String) 24 | 25 | val count = 100 26 | val data = (1 to count).map(i => Data(id = i, dict = Dict.random)) 27 | val path = Path(Files.createTempDirectory("example")) 28 | 29 | implicit val system: ActorSystem = ActorSystem() 30 | import system.dispatcher 31 | 32 | val printingSink = Sink.foreach(println) 33 | 34 | val stream = for { 35 | // write 36 | _ <- Source(data).runWith(ParquetStreams.toParquetSingleFile.of[Data].write(path.append("data.parquet"))) 37 | // read filtered 38 | _ <- Future(println("""dict == "A"""")) 39 | _ <- ParquetStreams.fromParquet.as[Data].filter(Col("dict") === Dict.A).read(path).runWith(printingSink) 40 | _ <- Future(println("""id >= 20 && id < 40""")) 41 | _ <- ParquetStreams.fromParquet.as[Data].filter(Col("id") >= 20 && Col("id") < 40).read(path).runWith(printingSink) 42 | } yield () 43 | 44 | stream.andThen { 45 | // finish 46 | case _ => system.terminate() 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/WriteAndReadGenericAkkaPekkoApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.akkaPekko 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.ActorSystem 4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.{Sink, Source} 5 | import com.github.mjakubowski84.parquet4s.{ParquetStreams, Path, RowParquetRecord, ValueCodecConfiguration} 6 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, INT32, INT64} 7 | import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED} 8 | import org.apache.parquet.schema.{LogicalTypeAnnotation, MessageType, Types} 9 | 10 | import java.nio.file.Files 11 | import java.time.LocalDate 12 | 13 | object WriteAndReadGenericAkkaPekkoApp extends App { 14 | 15 | val ID = "id" 16 | val Name = "name" 17 | val Birthday = "birthday" 18 | val 
SchemaName = "user_schema" 19 | 20 | val Schema: MessageType = Types 21 | .buildMessage() 22 | .addField(Types.primitive(INT64, REQUIRED).as(LogicalTypeAnnotation.intType(64, true)).named(ID)) 23 | .addField(Types.primitive(BINARY, OPTIONAL).as(LogicalTypeAnnotation.stringType()).named(Name)) 24 | .addField(Types.primitive(INT32, OPTIONAL).as(LogicalTypeAnnotation.dateType()).named(Birthday)) 25 | .named(SchemaName) 26 | 27 | val Vcc = ValueCodecConfiguration.Default 28 | 29 | val users = List( 30 | (1L, "Alice", LocalDate.of(2000, 1, 1)), 31 | (2L, "Bob", LocalDate.of(1980, 2, 28)), 32 | (3L, "Cecilia", LocalDate.of(1977, 3, 15)) 33 | ).map { case (id, name, birthday) => 34 | RowParquetRecord 35 | .emptyWithSchema(ID, Name, Birthday) 36 | .updated(ID, id, Vcc) 37 | .updated(Name, name, Vcc) 38 | .updated(Birthday, birthday, Vcc) 39 | } 40 | 41 | val path = Path(Files.createTempDirectory("example")) 42 | 43 | implicit val system: ActorSystem = ActorSystem() 44 | import system.dispatcher 45 | 46 | val stream = for { 47 | // write 48 | _ <- Source(users).runWith(ParquetStreams.toParquetSingleFile.generic(Schema).write(path.append("data.parquet"))) 49 | // read 50 | _ <- ParquetStreams.fromParquet.generic.read(path).runWith(Sink.foreach(println)) 51 | } yield () 52 | 53 | stream.andThen { 54 | // finish 55 | case _ => system.terminate() 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/indefinite/AkkaPekko.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.akkaPekko.indefinite 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.{ActorSystem, CoordinatedShutdown} 4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.pattern.AskSupport 5 | 6 | import scala.concurrent.ExecutionContext 7 | 8 | trait AkkaPekko extends AskSupport { 9 | 10 | this: Logger => 11 | 12 | implicit lazy val system: ActorSystem = ActorSystem() 13 | implicit def executionContext: ExecutionContext = system.dispatcher 14 | val coordinatedShutdown: CoordinatedShutdown = CoordinatedShutdown(system) 15 | 16 | } 17 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/indefinite/ExampleApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.akkaPekko.indefinite 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.Done 4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.CoordinatedShutdown 5 | import com.github.mjakubowski84.parquet4s.ScalaKafkaCompat.kafka.scaladsl.Consumer.DrainingControl 6 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Keep 7 | 8 | object ExampleApp 9 | extends App 10 | with Logger 11 | with AkkaPekko 12 | with Kafka 13 | with RandomDataProducer 14 | with MessageSource 15 | with MessageSink { 16 | 17 | startKafka() 18 | startDataProducer() 19 | 20 | logger.info(s"Starting stream that reads messages from Kafka and writes them to $baseWritePath...") 21 | val streamControl: DrainingControl[Done] = messageSource 22 | .toMat(messageSink)(Keep.both) 23 | .mapMaterializedValue(DrainingControl.apply[Done]) 24 | .run() 25 | 26 | coordinatedShutdown.addTask(CoordinatedShutdown.PhaseServiceStop, "Stopping stream") { () => 27 | logger.info("Stopping stream...") 28 | 
streamControl.drainAndShutdown() 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/indefinite/Kafka.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.akkaPekko.indefinite 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.Done 4 | import com.github.mjakubowski84.parquet4s.ScalaCompat.actor.CoordinatedShutdown 5 | import io.github.embeddedkafka.EmbeddedKafka 6 | 7 | import scala.concurrent.Future 8 | 9 | trait Kafka { 10 | 11 | this: Logger & AkkaPekko => 12 | 13 | private lazy val broker = { 14 | logger.info("Starting Kafka...") 15 | EmbeddedKafka.start() 16 | } 17 | 18 | lazy val kafkaAddress = s"localhost:${broker.config.kafkaPort}" 19 | val topic = "exampleTopic" 20 | val groupId = "exampleGroupId" 21 | 22 | def sendKafkaMessage(message: String): Unit = EmbeddedKafka.publishStringMessageToKafka(topic, message) 23 | 24 | def startKafka(): Unit = { 25 | broker 26 | coordinatedShutdown.addTask(CoordinatedShutdown.PhaseBeforeActorSystemTerminate, "Stop kafka") { () => 27 | Future { 28 | logger.info("Stopping Kafka...") 29 | EmbeddedKafka.stop() 30 | Done 31 | } 32 | } 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/indefinite/Logger.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.akkaPekko.indefinite 2 | 3 | trait Logger { 4 | lazy val logger: org.slf4j.Logger = org.slf4j.LoggerFactory.getLogger(this.getClass) 5 | } 6 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/indefinite/MessageSink.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.akkaPekko.indefinite 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaCompat.Done 4 | import com.github.mjakubowski84.parquet4s.ScalaKafkaCompat.kafka.CommitterSettings 5 | import com.github.mjakubowski84.parquet4s.ScalaKafkaCompat.kafka.ConsumerMessage.CommittableOffsetBatch 6 | import com.github.mjakubowski84.parquet4s.ScalaKafkaCompat.kafka.scaladsl.Committer 7 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.FlowShape 8 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.{Flow, Keep, Sink} 9 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.stage.GraphStage 10 | import com.github.mjakubowski84.parquet4s.{Col, ParquetStreams, ParquetWriter, Path} 11 | import org.apache.parquet.hadoop.metadata.CompressionCodecName 12 | 13 | import java.nio.file.Files 14 | import java.sql.Timestamp 15 | import scala.concurrent.Future 16 | import scala.concurrent.duration.* 17 | 18 | object MessageSink { 19 | 20 | case class Data( 21 | year: String, 22 | month: String, 23 | day: String, 24 | timestamp: Timestamp, 25 | word: String 26 | ) 27 | 28 | val MaxChunkSize: Int = 128 29 | val ChunkWriteTimeWindow: FiniteDuration = 10.seconds 30 | val WriteDirectoryName: String = "messages" 31 | 32 | } 33 | 34 | trait MessageSink { 35 | 36 | this: AkkaPekko & Logger => 37 | 38 | import MessageSink.* 39 | import MessageSource.* 40 | 41 | protected val baseWritePath: Path = 
Path(Files.createTempDirectory("example")).append(WriteDirectoryName) 42 | 43 | private val writerOptions = ParquetWriter.Options(compressionCodecName = CompressionCodecName.SNAPPY) 44 | 45 | lazy val messageSink: Sink[Message, Future[Done]] = 46 | Flow[Message] 47 | .via(saveDataToParquetFlow) 48 | .map(_.committableOffset) 49 | .grouped(MaxChunkSize) 50 | .map(CommittableOffsetBatch.apply) 51 | .toMat(Committer.sink(CommitterSettings(system)))(Keep.right) 52 | 53 | private lazy val saveDataToParquetFlow: GraphStage[FlowShape[Message, Message]] = 54 | ParquetStreams.viaParquet 55 | .of[Message] 56 | .preWriteTransformation { message => 57 | val timestamp = new Timestamp(message.record.timestamp()) 58 | val localDateTime = timestamp.toLocalDateTime 59 | Some( 60 | Data( 61 | year = localDateTime.getYear.toString, 62 | month = localDateTime.getMonthValue.toString, 63 | day = localDateTime.getDayOfMonth.toString, 64 | timestamp = timestamp, 65 | word = message.record.value() 66 | ) 67 | ) 68 | } 69 | .partitionBy(Col("year"), Col("month"), Col("day")) 70 | .maxCount(MaxChunkSize.toLong) 71 | .maxDuration(ChunkWriteTimeWindow) 72 | .options(writerOptions) 73 | .postWriteHandler { state => 74 | logger.info(s"Just wrote to ${state.modifiedPartitions}") 75 | } 76 | .write(baseWritePath) 77 | 78 | } 79 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/akkaPekko/indefinite/MessageSource.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.akkaPekko.indefinite 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaKafkaCompat.kafka.scaladsl.Consumer 4 | import com.github.mjakubowski84.parquet4s.ScalaKafkaCompat.kafka.{ConsumerMessage, ConsumerSettings, Subscriptions} 5 | import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Source 6 | import org.apache.kafka.common.serialization.StringDeserializer 7 | 8 | import scala.concurrent.duration.Duration 9 | 10 | object MessageSource { 11 | 12 | type Message = ConsumerMessage.CommittableMessage[String, String] 13 | 14 | } 15 | 16 | trait MessageSource { 17 | 18 | this: AkkaPekko & Kafka => 19 | 20 | import MessageSource.* 21 | 22 | private val consumerSettings = ConsumerSettings(system, new StringDeserializer(), new StringDeserializer()) 23 | .withBootstrapServers(kafkaAddress) 24 | .withGroupId(groupId) 25 | .withStopTimeout(Duration.Zero) 26 | private val subscription = Subscriptions.topics(topic) 27 | 28 | lazy val messageSource: Source[Message, Consumer.Control] = Consumer.committableSource(consumerSettings, subscription) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/ColumnProjectionAndDataConcatenationApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.core 2 | 3 | import com.github.mjakubowski84.parquet4s.* 4 | 5 | import java.nio.file.Files 6 | import java.time.LocalDate 7 | import scala.util.Using 8 | 9 | object ColumnProjectionAndDataConcatenationApp extends App { 10 | 11 | val ID = "id" 12 | val Name = "name" 13 | val FirstName = "firstName" 14 | val Birthday = "birthday" 15 | 16 | case class User1(id: Long, name: String, birthday: LocalDate) 17 | case class User2(id: Int, firstName: String, lastName: String) 18 | case class UserName(id: Long, name: String) 
19 | 20 | val path = Path(Files.createTempDirectory("example")) 21 | val path1 = path.append("users1.parquet") 22 | val path2 = path.append("users2.parquet") 23 | 24 | val vcc = ValueCodecConfiguration.Default 25 | 26 | val users1 = List( 27 | User1(1L, "Alice", LocalDate.of(2000, 1, 1)), 28 | User1(2L, "Bob", LocalDate.of(1980, 2, 28)), 29 | User1(3L, "Cecilia", LocalDate.of(1977, 3, 15)) 30 | ) 31 | val users2 = List( 32 | User2(4, "Derek", "Smith"), 33 | User2(5, "Emilia", "Doe"), 34 | User2(6, "Fred", "Johnson") 35 | ) 36 | 37 | // write 38 | ParquetWriter.of[User1].writeAndClose(path1, users1) 39 | ParquetWriter.of[User2].writeAndClose(path2, users2) 40 | 41 | // define 1st dataset 42 | val readUsers1 = ParquetReader 43 | .projectedGeneric( 44 | Col(ID).as[Long], 45 | Col(Name).as[String] 46 | ) 47 | .read(path1) 48 | .as[UserName] 49 | 50 | // define 2nd dataset 51 | val readUsers2 = ParquetReader 52 | .projectedGeneric( 53 | Col(ID).as[Int], 54 | Col(FirstName).as[String].alias(Name) 55 | ) 56 | .read(path2) 57 | .as[UserName] 58 | 59 | // define concatenation of datasets 60 | val readAllUserNames = readUsers1.concat(readUsers2) 61 | 62 | // execute 63 | Using.resource(readAllUserNames)(_.foreach(println)) 64 | 65 | } 66 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/ETLApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.core 2 | 3 | import com.github.mjakubowski84.parquet4s.* 4 | 5 | import java.nio.file.Files 6 | import scala.util.Using 7 | 8 | object ETLApp extends App { 9 | 10 | case class Owner(id: Long, name: String) 11 | case class Pet(id: Long, name: String, ownerId: Long) 12 | case class PetOwner(id: Long, name: String, petId: Long, petName: String) 13 | 14 | val path = Path(Files.createTempDirectory("example")) 15 | val ownerPath = path.append("owners.parquet") 16 | val petsPath = path.append("pets.parquet") 17 | val outputPath = path.append("output.parquet") 18 | 19 | val owners = List( 20 | Owner(1L, "Alice"), 21 | Owner(2L, "Bob"), 22 | Owner(3L, "Cecilia") 23 | ) 24 | val pets = List( 25 | Pet(1L, "Rex", 2L), 26 | Pet(2L, "Felix", 3L), 27 | Pet(3L, "Molly", 3L), 28 | Pet(4L, "Sunshine", 4L) 29 | ) 30 | 31 | // prepare input data 32 | ParquetWriter.of[Owner].writeAndClose(ownerPath, owners) 33 | ParquetWriter.of[Pet].writeAndClose(petsPath, pets) 34 | 35 | // define 1st dataset 36 | val readOwners = ParquetReader 37 | .projectedGeneric( 38 | Col("id").as[Long], 39 | Col("name").as[String] 40 | ) 41 | .read(ownerPath) 42 | 43 | // define 2nd dataset 44 | val readPets = ParquetReader 45 | .projectedGeneric( 46 | Col("id").as[Long].alias("petId"), 47 | Col("name").as[String].alias("petName"), 48 | Col("ownerId").as[Long] 49 | ) 50 | .read(petsPath) 51 | 52 | // perform ETL 53 | Using.resources(readOwners, readPets) { case (owners, pets) => 54 | owners 55 | .innerJoin(right = pets, onLeft = Col("id"), onRight = Col("ownerId")) // define join operation 56 | .as[PetOwner] // set typed schema and codecs 57 | .writeAndClose(outputPath) // execute all operations defined above and write results to disk 58 | } 59 | 60 | // take note that all operations defined above writeAndClose are lazy and are not executed 61 | // before writeAndClose is called 62 | 63 | // read ETL results 64 | Using.resource(ParquetReader.as[PetOwner].read(outputPath))(_.foreach(println)) 65 | } 66 | 
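As the comment in ETLApp above notes, everything defined before writeAndClose is lazy and the join only runs when the result is written. A short hedged sketch of reading that output back with a column filter follows; it reuses only ParquetReader, Col, filter and Path calls that appear elsewhere in these examples, and the object name and output path are placeholders.

import com.github.mjakubowski84.parquet4s.{Col, ParquetReader, Path}

import java.nio.file.Paths
import scala.util.Using

object ReadEtlOutputSketch extends App {
  case class PetOwner(id: Long, name: String, petId: Long, petName: String)

  // placeholder: point this at the output.parquet produced by ETLApp
  val outputPath = Path(Paths.get("/path/to/output.parquet"))

  // the filter is handed to the underlying Parquet reader and applied while reading
  val ceciliasPets = ParquetReader.as[PetOwner].filter(Col("name") === "Cecilia").read(outputPath)
  Using.resource(ceciliasPets)(_.foreach(println))
}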
-------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/WriteAndReadApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.core 2 | 3 | import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path} 4 | 5 | import java.nio.file.Files 6 | import scala.util.Random 7 | import scala.util.Using 8 | 9 | object WriteAndReadApp extends App { 10 | 11 | case class Data(id: Int, text: String) 12 | 13 | val count = 100 14 | val data = (1 to count).map(i => Data(id = i, text = Random.nextString(4))) 15 | val path = Path(Files.createTempDirectory("example")) 16 | 17 | // write 18 | ParquetWriter.of[Data].writeAndClose(path.append("data.parquet"), data) 19 | 20 | // read 21 | Using.resource(ParquetReader.as[Data].read(path))(_.foreach(println)) 22 | } 23 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/WriteAndReadCustomTypeApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.core 2 | 3 | import com.github.mjakubowski84.parquet4s.CustomType.* 4 | import com.github.mjakubowski84.parquet4s.ParquetSchemaResolver.* 5 | import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path} 6 | 7 | import java.nio.file.Files 8 | import scala.util.Using 9 | 10 | object WriteAndReadCustomTypeApp extends App { 11 | 12 | object Data { 13 | def generate(count: Int): Iterable[Data] = (1 to count).map(i => Data(id = i, dict = Dict.random)) 14 | } 15 | case class Data(id: Int, dict: Dict.Type) 16 | 17 | val data = Data.generate(count = 100) 18 | val path = Path(Files.createTempDirectory("example")) 19 | 20 | // write 21 | ParquetWriter.of[Data].writeAndClose(path.append("data.parquet"), data) 22 | 23 | // read 24 | // hint: you can filter by dict using string value, for example: filter = Col("dict") === "A" 25 | Using.resource(ParquetReader.as[Data].read(path))(_.foreach(println)) 26 | 27 | } 28 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/WriteAndReadFilteredApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.core 2 | 3 | import com.github.mjakubowski84.parquet4s.{Col, ParquetReader, ParquetWriter, Path} 4 | 5 | import java.nio.file.Files 6 | import scala.util.Random 7 | import scala.util.Using 8 | 9 | object WriteAndReadFilteredApp extends App { 10 | 11 | object Dict { 12 | val A = "A" 13 | val B = "B" 14 | val C = "C" 15 | val D = "D" 16 | 17 | val values: List[String] = List(A, B, C, D) 18 | def random: String = values(Random.nextInt(values.length)) 19 | } 20 | 21 | case class Data(id: Int, dict: String) 22 | 23 | val count = 100 24 | val data = (1 to count).map(i => Data(id = i, dict = Dict.random)) 25 | val path = Path(Files.createTempDirectory("example")) 26 | 27 | // write 28 | ParquetWriter.of[Data].writeAndClose(path.append("data.parquet"), data) 29 | 30 | // read filtered 31 | println("""dict == "A"""") 32 | val dictIsOnlyA = ParquetReader.as[Data].filter(Col("dict") === Dict.A).read(path) 33 | Using.resource(dictIsOnlyA)(_.foreach(println)) 34 | 35 | println("""id >= 20 && id < 40""") 36 | val idIsBetween10And90 = 
ParquetReader.as[Data].filter(Col("id") >= 20 && Col("id") < 40).read(path) 37 | Using.resource(idIsBetween10And90)(_.foreach(println)) 38 | 39 | } 40 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/WriteAndReadGenericApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.core 2 | 3 | import com.github.mjakubowski84.parquet4s.* 4 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, INT32, INT64} 5 | import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED} 6 | import org.apache.parquet.schema.{LogicalTypeAnnotation, MessageType, Types} 7 | 8 | import java.nio.file.Files 9 | import java.time.LocalDate 10 | import scala.util.Using 11 | 12 | object WriteAndReadGenericApp extends App { 13 | 14 | val ID = "id" 15 | val Name = "name" 16 | val Birthday = "birthday" 17 | val SchemaName = "user_schema" 18 | 19 | val path = Path(Files.createTempDirectory("example")) 20 | val vcc = ValueCodecConfiguration.Default 21 | 22 | val users = List( 23 | (1L, "Alice", LocalDate.of(2000, 1, 1)), 24 | (2L, "Bob", LocalDate.of(1980, 2, 28)), 25 | (3L, "Cecilia", LocalDate.of(1977, 3, 15)) 26 | ).map { case (id, name, birthday) => 27 | RowParquetRecord 28 | .emptyWithSchema(ID, Name, Birthday) 29 | .updated(ID, id, vcc) 30 | .updated(Name, name, vcc) 31 | .updated(Birthday, birthday, vcc) 32 | } 33 | 34 | // write 35 | val schema: MessageType = Types 36 | .buildMessage() 37 | .addField(Types.primitive(INT64, REQUIRED).as(LogicalTypeAnnotation.intType(64, true)).named(ID)) 38 | .addField(Types.primitive(BINARY, OPTIONAL).as(LogicalTypeAnnotation.stringType()).named(Name)) 39 | .addField(Types.primitive(INT32, OPTIONAL).as(LogicalTypeAnnotation.dateType()).named(Birthday)) 40 | .named(SchemaName) 41 | 42 | ParquetWriter.generic(schema).writeAndClose(path.append("users.parquet"), users) 43 | 44 | // read 45 | Using.resource(ParquetReader.generic.read(path)) { readData => 46 | readData.foreach { record => 47 | val id = record.get[Long](ID, vcc) 48 | val name = record.get[String](Name, vcc) 49 | val birthday = record.get[LocalDate](Birthday, vcc) 50 | println(s"User[$ID=$id,$Name=$name,$Birthday=$birthday]") 51 | } 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/WriteAndReadUsingRecordFilterApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.core 2 | 3 | import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path} 4 | 5 | import java.nio.file.Files 6 | import scala.util.Random 7 | import scala.util.Using 8 | import com.github.mjakubowski84.parquet4s.RecordFilter 9 | 10 | object WriteAndReadUsingRecordFilterApp extends App { 11 | 12 | case class Data(id: Int, text: String) 13 | 14 | val count = 100 15 | val data = (1 to count).map(i => Data(id = i, text = Random.nextString(4))) 16 | val path = Path(Files.createTempDirectory("example")) 17 | 18 | // write 19 | ParquetWriter.of[Data].writeAndClose(path.append("data.parquet"), data) 20 | 21 | // skips all but last 3 records (out of 100) 22 | Using.resource(ParquetReader.as[Data].filter(RecordFilter(_ >= 97)).read(path))(_.foreach(println)) 23 | } 24 | -------------------------------------------------------------------------------- 
/examples/src/main/scala/com/github/mjakubowski84/parquet4s/core/WriteIncrementallyAndReadApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.core 2 | 3 | import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path} 4 | 5 | import java.nio.file.Files 6 | import scala.util.Random 7 | import scala.util.Using 8 | 9 | object WriteIncrementallyAndReadApp extends App { 10 | 11 | case class Data(id: Int, text: String) 12 | 13 | val count = 100 14 | val data = (1 to count).map(i => Data(id = i, text = Random.nextString(4))) 15 | val path = Path(Files.createTempDirectory("example")) 16 | 17 | // write 18 | val writer = ParquetWriter.of[Data].build(path.append("data.parquet")) 19 | try data.foreach(entity => writer.write(entity)) 20 | finally writer.close() 21 | 22 | // read 23 | Using.resource(ParquetReader.as[Data].read(path))(_.foreach(println)) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/fs2/CustomAvroPartitioningWriteFS2App.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.fs2 2 | 3 | import cats.effect.{IO, IOApp} 4 | import com.github.mjakubowski84.parquet4s.Path 5 | import com.github.mjakubowski84.parquet4s.parquet.* 6 | import fs2.io.file.Files 7 | import fs2.Stream 8 | 9 | import scala.util.Random 10 | import org.apache.avro.SchemaBuilder 11 | import org.apache.avro.generic.GenericRecordBuilder 12 | import org.apache.parquet.avro.AvroParquetWriter 13 | import org.apache.avro.generic.GenericRecord 14 | import com.github.mjakubowski84.parquet4s.ParquetWriter 15 | import com.github.mjakubowski84.parquet4s.ValueCodecConfiguration 16 | 17 | object CustomAvroPartitioningWriteFS2App extends IOApp.Simple { 18 | private val Count = 100 19 | private val InputAvroSchema = SchemaBuilder 20 | .record("data") 21 | .namespace("example") 22 | .fields() 23 | .requiredInt("i") 24 | .requiredString("text") 25 | .requiredString("partition") 26 | .endRecord() 27 | private val PartitionedDataAvroSchema = SchemaBuilder 28 | .record("data") 29 | .namespace("example") 30 | .fields() 31 | .requiredInt("i") 32 | .requiredString("text") 33 | .endRecord() 34 | 35 | val data = (1 to Count).map { i => 36 | new GenericRecordBuilder(InputAvroSchema) 37 | .set("i", i) 38 | .set("text", Random.nextString(4)) 39 | .set("partition", (i % 4).toString()) 40 | .build() 41 | } 42 | 43 | val vcc = ValueCodecConfiguration.Default 44 | 45 | def write(basePath: Path) = 46 | viaParquet[IO] 47 | .custom[GenericRecord, AvroParquetWriter.Builder[GenericRecord]](path => 48 | AvroParquetWriter 49 | .builder[GenericRecord](path.toOutputFile(ParquetWriter.Options())) 50 | .withSchema(PartitionedDataAvroSchema) 51 | ) 52 | .partitionUsing { case (path, record) => 53 | val partitionValue = record.get("partition") 54 | val partitionedRecord = new GenericRecordBuilder(PartitionedDataAvroSchema) 55 | .set("i", record.get("i")) 56 | .set("text", record.get("text")) 57 | .build() 58 | (path.append(s"partition=$partitionValue"), partitionedRecord) 59 | } 60 | .write(basePath) 61 | 62 | def read(basePath: Path) = 63 | fromParquet[IO].generic 64 | .read(basePath) 65 | 66 | override def run: IO[Unit] = { 67 | val stream = for { 68 | path <- Stream 69 | .resource(Files[IO].tempDirectory(None, "", None)) 70 | .map(fs2Path => 
Path(fs2Path.toNioPath).append("data.parquet")) 71 | _ <- Stream 72 | .iterable(data) 73 | .through(write(path)) 74 | .append( 75 | read(path).evalTap(r => 76 | IO.println( 77 | s"i=${r.get[Int]("i", vcc)}, text=${r.get[String]("text", vcc)}, partition=${r.get[String]("partition", vcc)}" 78 | ) 79 | ) 80 | ) 81 | } yield () 82 | 83 | stream.compile.drain 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/fs2/CustomAvroWriteAndReadFS2App.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.fs2 2 | 3 | import cats.effect.{IO, IOApp} 4 | import com.github.mjakubowski84.parquet4s.Path 5 | import com.github.mjakubowski84.parquet4s.parquet.* 6 | import fs2.io.file.Files 7 | import fs2.{Pipe, Stream} 8 | 9 | import scala.util.Random 10 | import org.apache.avro.SchemaBuilder 11 | import org.apache.avro.generic.GenericRecordBuilder 12 | import org.apache.parquet.avro.AvroParquetWriter 13 | import org.apache.avro.generic.GenericRecord 14 | import com.github.mjakubowski84.parquet4s.ParquetWriter 15 | import com.github.mjakubowski84.parquet4s.ParquetReader 16 | import org.apache.parquet.avro.AvroParquetReader 17 | 18 | object CustomAvroWriteAndReadFS2App extends IOApp.Simple { 19 | val Count = 100 20 | val AvroSchema = SchemaBuilder 21 | .record("data") 22 | .namespace("example") 23 | .fields() 24 | .requiredInt("i") 25 | .requiredString("text") 26 | .endRecord() 27 | 28 | val data = (1 to Count).map { i => 29 | new GenericRecordBuilder(AvroSchema) 30 | .set("i", i) 31 | .set("text", Random.nextString(4)) 32 | .build() 33 | } 34 | 35 | def write(path: Path): Pipe[IO, GenericRecord, Nothing] = { 36 | val builder = 37 | AvroParquetWriter.builder[GenericRecord](path.toOutputFile(ParquetWriter.Options())).withSchema(AvroSchema) 38 | 39 | writeSingleFile[IO].custom[GenericRecord, AvroParquetWriter.Builder[GenericRecord]](builder).write 40 | } 41 | 42 | def read(path: Path) = 43 | fromParquet[IO] 44 | .custom(AvroParquetReader.builder[GenericRecord](path.toInputFile(ParquetReader.Options()))) 45 | .read() 46 | 47 | override def run: IO[Unit] = { 48 | val stream = for { 49 | path <- Stream 50 | .resource(Files[IO].tempDirectory(None, "", None)) 51 | .map(fs2Path => Path(fs2Path.toNioPath).append("data.parquet")) 52 | _ <- Stream 53 | .iterable(data) 54 | .through(write(path)) 55 | .append(read(path).printlns) 56 | } yield () 57 | 58 | stream.compile.drain 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/fs2/CustomProtobufWriteAndReadFS2App.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.fs2 2 | 3 | import cats.effect.{IO, IOApp} 4 | import com.github.mjakubowski84.parquet4s.Path 5 | import com.github.mjakubowski84.parquet4s.parquet.* 6 | import com.github.mjakubowski84.parquet4s.protobuf.DataOuterClass.Data 7 | import fs2.io.file.Files 8 | import fs2.{Pipe, Stream} 9 | import org.apache.parquet.proto._ 10 | 11 | import scala.util.Random 12 | import com.google.protobuf.TextFormat 13 | 14 | /** Please note! This is an example of Java Protobuf + Parquet4s using custom readers and writers. You can also use 15 | * Scala Protobuf with regular Parquet4s functions thanks to ScalaPB module of Parquet4s. 
16 | */ 17 | object CustomProtobufWriteAndReadFS2App extends IOApp.Simple { 18 | private val Count = 100 19 | 20 | def write(path: Path): Pipe[IO, Data, Nothing] = { 21 | val builder = ProtoParquetWriter.builder[Data](path.hadoopPath).withMessage(classOf[Data]) 22 | writeSingleFile[IO] 23 | .custom[Data, ProtoParquetWriter.Builder[Data]](builder) 24 | .write 25 | } 26 | 27 | def read(path: Path) = 28 | fromParquet[IO] 29 | .custom(ProtoParquetReader.builder[Data.Builder](path.hadoopPath)) 30 | .read(_.build) 31 | 32 | override def run: IO[Unit] = { 33 | 34 | val stream = for { 35 | path <- Stream 36 | .resource(Files[IO].tempDirectory(None, "", None)) 37 | .map(fs2Path => Path(fs2Path.toNioPath).append("data.parquet")) 38 | _ <- Stream 39 | .range[IO, Int](start = 0, stopExclusive = Count) 40 | .map(i => Data.newBuilder.setId(i).setText(Random.nextString(4)).build) 41 | .through(write(path)) 42 | .append( 43 | read(path).evalMapChunk(data => IO.println(TextFormat.printer().escapingNonAscii(false).printToString(data))) 44 | ) 45 | } yield () 46 | 47 | stream.compile.drain 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/fs2/WriteAndReadFS2App.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.fs2 2 | 3 | import cats.Show 4 | import cats.effect.{IO, IOApp} 5 | import com.github.mjakubowski84.parquet4s.Path 6 | import com.github.mjakubowski84.parquet4s.parquet.* 7 | import fs2.Stream 8 | import fs2.io.file.Files 9 | 10 | import scala.util.Random 11 | 12 | object WriteAndReadFS2App extends IOApp.Simple { 13 | 14 | case class Data(id: Int, text: String) 15 | 16 | implicit private val showData: Show[Data] = Show.fromToString 17 | private val Count = 100 18 | 19 | override def run: IO[Unit] = { 20 | val stream = for { 21 | path <- Stream.resource(Files[IO].tempDirectory(None, "", None)).map(fs2Path => Path(fs2Path.toNioPath)) 22 | _ <- Stream 23 | .range[IO, Int](start = 0, stopExclusive = Count) 24 | .map(i => Data(id = i, text = Random.nextString(4))) 25 | .through(writeSingleFile[IO].of[Data].write(path.append("data.parquet"))) 26 | .append(fromParquet[IO].as[Data].read(path).printlns.drain) 27 | } yield () 28 | 29 | stream.compile.drain 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/fs2/WriteAndReadFilteredFS2App.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.fs2 2 | 3 | import cats.Show 4 | import cats.effect.{IO, IOApp} 5 | import com.github.mjakubowski84.parquet4s.{Col, Path} 6 | import com.github.mjakubowski84.parquet4s.parquet.* 7 | import fs2.Stream 8 | import fs2.io.file.Files 9 | 10 | import scala.util.Random 11 | 12 | object WriteAndReadFilteredFS2App extends IOApp.Simple { 13 | 14 | object Dict { 15 | val A = "A" 16 | val B = "B" 17 | val C = "C" 18 | val D = "D" 19 | 20 | val values: List[String] = List(A, B, C, D) 21 | def random: String = values(Random.nextInt(values.length)) 22 | } 23 | 24 | case class Data(id: Int, dict: String) 25 | 26 | implicit private val showData: Show[Data] = Show.fromToString 27 | private val Count = 100 28 | 29 | override def run: IO[Unit] = { 30 | val stream = for { 31 | path <- Stream.resource(Files[IO].tempDirectory(None, "", None)).map(fs2Path => 
Path(fs2Path.toNioPath)) 32 | _ <- Stream 33 | .range[IO, Int](start = 0, stopExclusive = Count) 34 | .map(i => Data(id = i, dict = Dict.random)) 35 | .through(writeSingleFile[IO].of[Data].write(path.append("data.parquet"))) 36 | .append(Stream.exec(IO.println("""dict == "A""""))) 37 | .append(fromParquet[IO].as[Data].filter(Col("dict") === Dict.A).read(path).printlns.drain) 38 | .append(Stream.exec(IO.println("""id >= 20 && id < 40"""))) 39 | .append( 40 | fromParquet[IO] 41 | .as[Data] 42 | .filter(Col("id") >= 20 && Col("id") < 40) 43 | .read(path) 44 | .printlns 45 | .drain 46 | ) 47 | } yield () 48 | 49 | stream.compile.drain 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/fs2/WriteAndReadGenericFS2App.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.fs2 2 | 3 | import cats.Show 4 | import cats.effect.{IO, IOApp} 5 | import com.github.mjakubowski84.parquet4s.parquet.* 6 | import com.github.mjakubowski84.parquet4s.{Path, RowParquetRecord, ValueCodecConfiguration} 7 | import fs2.Stream 8 | import fs2.io.file.Files 9 | import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, INT32, INT64} 10 | import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED} 11 | import org.apache.parquet.schema.{LogicalTypeAnnotation, MessageType, Types} 12 | 13 | import java.time.LocalDate 14 | 15 | object WriteAndReadGenericFS2App extends IOApp.Simple { 16 | 17 | private val ID = "id" 18 | private val Name = "name" 19 | private val Birthday = "birthday" 20 | private val SchemaName = "user_schema" 21 | 22 | val schema: MessageType = Types 23 | .buildMessage() 24 | .addField(Types.primitive(INT64, REQUIRED).as(LogicalTypeAnnotation.intType(64, true)).named(ID)) 25 | .addField(Types.primitive(BINARY, OPTIONAL).as(LogicalTypeAnnotation.stringType()).named(Name)) 26 | .addField(Types.primitive(INT32, OPTIONAL).as(LogicalTypeAnnotation.dateType()).named(Birthday)) 27 | .named(SchemaName) 28 | 29 | implicit private val showRecords: Show[RowParquetRecord] = Show.fromToString 30 | 31 | private val vcc = ValueCodecConfiguration.Default 32 | 33 | private val users = List( 34 | (1L, "Alice", LocalDate.of(2000, 1, 1)), 35 | (2L, "Bob", LocalDate.of(1980, 2, 28)), 36 | (3L, "Cecilia", LocalDate.of(1977, 3, 15)) 37 | ).map { case (id, name, birthday) => 38 | RowParquetRecord 39 | .emptyWithSchema(ID, Name, Birthday) 40 | .updated(ID, id, vcc) 41 | .updated(Name, name, vcc) 42 | .updated(Birthday, birthday, vcc) 43 | } 44 | 45 | override def run: IO[Unit] = { 46 | val stream = for { 47 | path <- Stream.resource(Files[IO].tempDirectory(None, "", None)).map(fs2Path => Path(fs2Path.toNioPath)) 48 | _ <- Stream 49 | .iterable[IO, RowParquetRecord](users) 50 | .through(writeSingleFile[IO].generic(schema).write(path.append("data.parquet"))) 51 | .append(fromParquet[IO].generic.read(path).printlns.drain) 52 | } yield () 53 | 54 | stream.compile.drain 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/scalapb/WriteAndReadApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.scalapb 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits.* 4 | import com.github.mjakubowski84.parquet4s.protobuf.Data 5 | import 
com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path} 6 | 7 | import java.nio.file.Files 8 | import scala.util.Using 9 | 10 | object WriteAndReadApp extends App { 11 | val data = (1 to 100).map(id => Data(id = id, text = id.toString)) 12 | val path = Path(Files.createTempDirectory("example")) 13 | 14 | // write 15 | ParquetWriter.of[Data].writeAndClose(path.append("data.parquet"), data) 16 | 17 | // read 18 | Using.resource(ParquetReader.as[Data].read(path))(_.foreach(println)) 19 | } 20 | -------------------------------------------------------------------------------- /examples/src/main/scala/com/github/mjakubowski84/parquet4s/scalapb/WriteIncrementallyAndReadApp.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.scalapb 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits.* 4 | import com.github.mjakubowski84.parquet4s.protobuf.Data 5 | import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path} 6 | 7 | import java.nio.file.Files 8 | 9 | object WriteIncrementallyAndReadApp extends App { 10 | val count = 100 11 | val data = (1 to count).map(id => Data(id = id, text = id.toString)) 12 | val path = Path(Files.createTempDirectory("example")) 13 | 14 | // write 15 | val writer = ParquetWriter.of[Data].build(path.append("data.parquet")) 16 | try data.foreach(entity => writer.write(entity)) 17 | finally writer.close() 18 | 19 | // read 20 | val readData = ParquetReader.as[Data].read(path) 21 | try readData.foreach(println) 22 | finally readData.close() 23 | } 24 | -------------------------------------------------------------------------------- /fs2/src/it/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /fs2/src/main/scala/com/github/mjakubowski84/parquet4s/parquet/logger.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.parquet 2 | 3 | import cats.effect.Sync 4 | import cats.implicits.* 5 | import org.slf4j.LoggerFactory 6 | 7 | private[parquet] object logger { 8 | 9 | class Logger[F[_]](wrapped: org.slf4j.Logger)(implicit F: Sync[F]) { 10 | 11 | // FIXME replace with debug with format and params 12 | def debug(msg: => String): F[Unit] = 13 | F.catchNonFatal(wrapped.isDebugEnabled).flatMap { 14 | case true => 15 | F.delay(wrapped.debug(msg)) 16 | case false => 17 | F.unit 18 | } 19 | 20 | } 21 | 22 | def apply[F[_]](name: String)(implicit F: Sync[F]): F[Logger[F]] = 23 | F.delay(LoggerFactory.getLogger(name)).map(new Logger(_)) 24 | 25 | def apply[F[_]: Sync](clazz: Class[?]): F[Logger[F]] = 26 | apply(clazz.getCanonicalName) 27 | 28 | } 29 | -------------------------------------------------------------------------------- /fs2/src/main/scala/com/github/mjakubowski84/parquet4s/parquet/package.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import cats.effect.{Async, Sync} 4 | 5 | package object parquet { 6 | 7 | /** Creates a [[fs2.Stream]] that reads Parquet data from the specified path. If there are multiple files at path then 8 | * the order in which files are loaded is determined by underlying filesystem.
The path can refer to a local file, 9 | * HDFS, AWS S3, Google Storage, Azure, etc. Please refer to the Hadoop client documentation or your data provider 10 | * to learn how to configure the connection.
Partitioned directories can also be read, and the filter applies 11 | * to partition values as well. Partition values are set as fields in the read entities at the path defined by the partition name. 12 | * The path can be a simple column name or a dot-separated path to a nested field. Missing intermediate fields are 13 | * automatically created for each read record.
Allows turning on a projection over the original file schema 14 | * in order to boost read performance when not all columns need to be read.
The builder allows creating a 15 | * stream of data of a given type or of generic records. 16 | * @tparam F 17 | * effect type 18 | * @return 19 | * Builder of the [[fs2.Stream]] 20 | */ 21 | def fromParquet[F[_]: Sync]: reader.FromParquet[F] = new reader.FromParquetImpl[F] 22 | 23 | /** Builds a [[fs2.Pipe]] that writes Parquet data to a single file at the specified path (including the file name). The 24 | * resulting stream returns nothing, that is, it doesn't emit any element.
The path can refer to a local file, 25 | * HDFS, AWS S3, Google Storage, Azure, etc. Please refer to the Hadoop client documentation or your data provider 26 | * to learn how to configure the connection.
The builder allows creating a pipe for a given data type or for 27 | * generic records. 28 | * @tparam F 29 | * effect type 30 | * @return 31 | * [[fs2.Pipe]] builder 32 | */ 33 | def writeSingleFile[F[_]: Sync] = new writer.ToParquetImpl[F] 34 | 35 | /** Builds a [[fs2.Pipe]] that:
  1. Is designed to write Parquet files indefinitely
  2. Is able to 36 | * (optionally) partition data by a list of provided fields
  3. Flushes and rotates files after a given number of 37 | * rows is written to the partition or a given time period elapses
  4. Outputs each incoming message after it is 38 | * written, and can additionally evaluate an effect of a provided message transformation.

Builder allows to create a 39 | * pipe for given data type or for generic records. 40 | * @tparam F 41 | * effect type 42 | * @return 43 | * [[fs2.Pipe]] builder 44 | */ 45 | def viaParquet[F[_]: Async]: rotatingWriter.ViaParquet[F] = new rotatingWriter.ViaParquetImpl[F] 46 | 47 | } 48 | -------------------------------------------------------------------------------- /fs2/src/test/scala/com/github/mjakubowski84/parquet4s/parquet/IoSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.parquet 2 | 3 | import com.github.mjakubowski84.parquet4s.PartitionTestUtils 4 | import org.scalatest.Inside 5 | import org.scalatest.flatspec.AnyFlatSpec 6 | import org.scalatest.matchers.should.Matchers 7 | 8 | class IoSpec extends AnyFlatSpec with Matchers with Inside with PartitionTestUtils { 9 | 10 | "PartitionRegexp" should "match valid partition names and values" in 11 | forAll(ValidPartitionsTable) { case (name, value) => 12 | inside(s"$name=$value") { case io.PartitionRegexp(`name`, `value`) => 13 | succeed 14 | } 15 | } 16 | 17 | it should "not match invalid partition names and values" in 18 | forAll(InvalidPartitionsTable) { case (name, value) => 19 | s"$name=$value" match { 20 | case io.PartitionRegexp(capturedName, capturedValue) => 21 | fail( 22 | s"Expected no match for name [$name] and value [$value] " + 23 | s"but one was found: [$capturedName, $capturedValue]" 24 | ) 25 | case _ => 26 | succeed 27 | } 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /project/ActorLibCross.scala: -------------------------------------------------------------------------------- 1 | case class ActorLibCross( 2 | override val idSuffix: String, 3 | override val directorySuffix: String 4 | ) extends sbt.VirtualAxis.WeakAxis 5 | -------------------------------------------------------------------------------- /project/Compilation.scala: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | import sbt.CrossVersion 3 | 4 | object Compilation { 5 | 6 | lazy val compilationSettings = Seq( 7 | scalacOptions ++= 8 | Seq( 9 | "-encoding", 10 | "UTF-8", 11 | "-release", 12 | "8", 13 | "-feature", 14 | "-language:implicitConversions", 15 | "-language:higherKinds", 16 | "-Xfatal-warnings", 17 | "-Wconf:src=src_managed/.*:silent" 18 | ) ++ { 19 | CrossVersion.partialVersion(scalaVersion.value) match { 20 | case Some((3, _)) => 21 | Seq( 22 | "-unchecked", 23 | "-explain-types", 24 | "-Wunused:implicits", // Warn if an implicit parameter is unused. 25 | "-Wunused:explicits", // Warn if an explicit parameter is unused. 26 | "-Wunused:imports", // Warn if an import selector is not referenced. 27 | "-Wunused:locals", // Warn if a local definition is unused. 28 | "-Wunused:params", // Warn if a value parameter is unused. 29 | "-Wunused:privates" // Warn if a private member is unused. 30 | ) 31 | case Some((2, 13)) => 32 | Seq( 33 | "-deprecation", 34 | "-Xsource:3", 35 | "-explaintypes", 36 | "-Wextra-implicit", // Warn when more than one implicit parameter section is defined. 37 | "-Wnumeric-widen", // Warn when numerics are widened. 38 | "-Wunused:implicits", // Warn if an implicit parameter is unused. 39 | "-Wunused:explicits", // Warn if an explicit parameter is unused. 40 | "-Wunused:imports", // Warn if an import selector is not referenced. 41 | "-Wunused:locals", // Warn if a local definition is unused. 
42 | "-Wunused:params", // Warn if a value parameter is unused. 43 | "-Wunused:patvars", // Warn if a variable bound in a pattern is unused. 44 | "-Wunused:privates", // Warn if a private member is unused. 45 | "-Wunnamed-boolean-literal" // Warn if boolean literal is unnamed. 46 | ) 47 | case _ => 48 | Seq( 49 | "-deprecation", 50 | "-Xsource:3", 51 | "-explaintypes", 52 | "-Ywarn-extra-implicit", // Warn when more than one implicit parameter section is defined. 53 | "-Ywarn-inaccessible", // Warn about inaccessible types in method signatures. 54 | "-Ywarn-numeric-widen", // Warn when numerics are widened. 55 | "-Ywarn-unused:implicits", // Warn if an implicit parameter is unused. 56 | "-Ywarn-unused:imports", // Warn if an import selector is not referenced. 57 | "-Ywarn-unused:locals", // Warn if a local definition is unused. 58 | "-Ywarn-unused:params", // Warn if a value parameter is unused. 59 | "-Ywarn-unused:patvars", // Warn if a variable bound in a pattern is unused. 60 | "-Ywarn-unused:privates" // Warn if a private member is unused. 61 | ) 62 | } 63 | } 64 | ) 65 | 66 | } 67 | -------------------------------------------------------------------------------- /project/DependecyVersions.scala: -------------------------------------------------------------------------------- 1 | object DependecyVersions { 2 | val parquetVersion = "1.15.2" 3 | val shapelessVersion = "2.3.13" 4 | val sparkVersion = "3.5.5" 5 | val hadoopVersion = "3.4.1" 6 | val slf4jVersion = "2.0.17" 7 | val logbackVersion = "1.3.15" // stick to 1.3.x for JDK-8 compatibility 8 | val akkaVersion = "2.6.21" // non-licensed version 9 | val fs2Version = "3.12.0" 10 | val catsEffectVersion = "3.6.1" 11 | val scalaCollectionCompatVersion = "2.13.0" 12 | val scalatestVersion = "3.2.19" 13 | val mockitoVersion = "4.11.0" // stick to 4.x for JDK-8 compatibility 14 | val pekkoVersion = "1.1.3" 15 | val jacksonVersion = "2.19.0" 16 | val testcontainersVersion = "0.43.0" 17 | } 18 | -------------------------------------------------------------------------------- /project/Documentation.scala: -------------------------------------------------------------------------------- 1 | import com.typesafe.sbt.site.SitePlugin.autoImport.makeSite 2 | import mdoc.MdocPlugin.autoImport._ 3 | import microsites.MicrositeFavicon 4 | import microsites.MicrositeKeys._ 5 | import sbt.Keys._ 6 | import sbt.Compile 7 | import sbt.{Def, url} 8 | import sbt.io.FileFilter._ 9 | import sbt.io.syntax._ 10 | 11 | object Documentation { 12 | 13 | lazy val documentationSettings: Seq[Def.Setting[_]] = 14 | Seq( 15 | name := "Parquet4s", 16 | description := "Read and write Parquet files using Scala", 17 | organizationName := "Marcin Jakubowski", 18 | organizationHomepage := Some(url("https://github.com/mjakubowski84")), 19 | micrositeDocumentationUrl := "docs", 20 | micrositeFooterText := None, 21 | micrositeBaseUrl := "parquet4s", 22 | micrositeGitterChannel := false, 23 | micrositeGithubOwner := "mjakubowski84", 24 | micrositeGithubRepo := "parquet4s", 25 | micrositeGithubToken := sys.env.get("PARQUET4S_DOCS_GITHUB_TOKEN"), 26 | micrositePushSiteWith := GitHub4s, 27 | makeSite / includeFilter := "*.html" || "*.css" || "*.png" || "*.jpg" || "*.gif" || "*.js" || "*.md" || "*.svg", 28 | micrositeDataDirectory := (Compile / resourceDirectory).value / "docs" / "data", 29 | micrositeImgDirectory := (Compile / resourceDirectory).value / "docs" / "images", 30 | micrositePalette := Map( 31 | "brand-primary" -> "#F1606A", 32 | "brand-secondary" -> "#F1606A", 33 | 
"white-color" -> "#FFFFFF" 34 | ), 35 | micrositeFavicons := Seq( 36 | MicrositeFavicon("favicon-16x16.png", "16x16"), 37 | MicrositeFavicon("favicon-32x32.png", "32x32") 38 | ), 39 | mdocVariables := Map( 40 | "VERSION" -> version.value 41 | ), 42 | mdocIn := (Compile / resourceDirectory).value / "docs" 43 | ) 44 | 45 | } 46 | -------------------------------------------------------------------------------- /project/Releasing.scala: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | import sbt.{Credentials, Def, Developer, IntegrationTest, Opts, ScmInfo, Test, url} 3 | import xerial.sbt.Sonatype._ 4 | import xerial.sbt.Sonatype.autoImport.{sonatypeProfileName, sonatypeProjectHosting} 5 | 6 | object Releasing { 7 | 8 | lazy val publishSettings: Seq[Def.Setting[_]] = 9 | Seq( 10 | credentials ++= Seq( 11 | Credentials( 12 | realm = "Sonatype Nexus Repository Manager", 13 | host = "oss.sonatype.org", 14 | userName = sys.env.getOrElse( 15 | "SONATYPE_USERNAME", { 16 | streams.value.log.warn("Undefined environment variable: SONATYPE_USERNAME") 17 | "UNDEFINED" 18 | } 19 | ), 20 | passwd = sys.env.getOrElse( 21 | "SONATYPE_PASSWORD", { 22 | streams.value.log.warn("Undefined environment variable: SONATYPE_PASSWORD") 23 | "UNDEFINED" 24 | } 25 | ) 26 | ) 27 | ), 28 | licenses := Seq("MIT" -> url("https://opensource.org/licenses/MIT")), 29 | homepage := Some(url("https://github.com/mjakubowski84/parquet4s")), 30 | scmInfo := Some( 31 | ScmInfo( 32 | browseUrl = url("https://github.com/mjakubowski84/parquet4s"), 33 | connection = "scm:git@github.com:mjakubowski84/parquet4s.git" 34 | ) 35 | ), 36 | sonatypeProjectHosting := Some( 37 | GitHubHosting(user = "mjakubowski84", repository = "parquet4s", email = "mjakubowski84@gmail.com") 38 | ), 39 | sonatypeProfileName := "com.github.mjakubowski84", 40 | developers := List( 41 | Developer( 42 | id = "mjakubowski84", 43 | name = "Marcin Jakubowski", 44 | email = "mjakubowski84@gmail.com", 45 | url = url("https://github.com/mjakubowski84") 46 | ) 47 | ), 48 | publishMavenStyle := true, 49 | publishTo := Some( 50 | if (isSnapshot.value) 51 | Opts.resolver.mavenLocalFile 52 | else 53 | Opts.resolver.sonatypeStaging 54 | ), 55 | Test / publishArtifact := false, 56 | IntegrationTest / publishArtifact := false 57 | ) ++ (if (sys.env contains "SONATYPE_USERNAME") Signing.signingSettings else Seq.empty) 58 | 59 | } 60 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.10.7 2 | -------------------------------------------------------------------------------- /project/metals.sbt: -------------------------------------------------------------------------------- 1 | // format: off 2 | // DO NOT EDIT! This file is auto-generated. 3 | 4 | // This file enables sbt-bloop to create bloop config files. 
5 | 6 | addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "2.0.10") 7 | 8 | // format: on 9 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-projectmatrix" % "0.11.0") 2 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.21") 3 | addSbtPlugin("com.thoughtworks.sbt-api-mappings" % "sbt-api-mappings" % "3.0.2") 4 | addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.7") 5 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.4") 6 | addSbtPlugin("com.47deg" % "sbt-microsites" % "1.4.3") // 1.4.4 causes problems with JDK8 7 | addSbtPlugin("com.thesamet" % "sbt-protoc" % "1.0.6") 8 | 9 | libraryDependencies += "com.thesamet.scalapb" %% "compilerplugin" % "0.11.17" 10 | -------------------------------------------------------------------------------- /s3Test/src/it/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /s3Test/src/it/scala/com/github/mjakubowski84/parquet4s/s3/S3ItSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s.s3 2 | 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import org.scalatest.matchers.should.Matchers 5 | import com.dimafeng.testcontainers.scalatest.TestContainerForAll 6 | import com.dimafeng.testcontainers.LocalStackV2Container 7 | import org.testcontainers.containers.localstack.LocalStackContainer.Service 8 | import org.apache.hadoop.conf.Configuration 9 | import scala.util.Using 10 | import com.github.mjakubowski84.parquet4s.Path 11 | import com.github.mjakubowski84.parquet4s.ParquetWriter 12 | import com.github.mjakubowski84.parquet4s.ParquetReader 13 | 14 | class S3ItSpec extends AnyFlatSpec with Matchers with TestContainerForAll { 15 | 16 | case class Data(i: Int, text: String) 17 | 18 | val bucket = "data" 19 | val data = Seq(Data(1, "a"), Data(2, "b")) 20 | val path = Path(s"s3a://$bucket/file.parquet") 21 | 22 | override val containerDef: LocalStackV2Container.Def = 23 | LocalStackV2Container.Def( 24 | tag = "latest", 25 | services = Seq(Service.S3) 26 | ) 27 | 28 | override def afterContainersStart(containers: LocalStackV2Container): Unit = 29 | containers.execInContainer("awslocal", "s3api", "create-bucket", "--bucket", bucket) 30 | 31 | "Parquet4s" should "write and read data to/from S3" in 32 | withContainers { s3Container => 33 | val configuration = new Configuration() 34 | 35 | configuration.set("fs.s3a.access.key", s3Container.container.getAccessKey()) 36 | configuration.set("fs.s3a.secret.key", s3Container.container.getSecretKey()) 37 | configuration.set("fs.s3a.endpoint", s3Container.container.getEndpoint().toString()) 38 | configuration.set("fs.s3a.endpoint.region", s3Container.container.getRegion()) 39 | 40 | ParquetWriter.of[Data].options(ParquetWriter.Options(hadoopConf = configuration)).writeAndClose(path, data) 41 | 42 | Using.resource(ParquetReader.as[Data].options(ParquetReader.Options(hadoopConf = configuration)).read(path)) { 43 | _.toSeq should be(data) 44 | } 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- 
/scalapb/src/main/scala/com/github/mjakubowski84/parquet4s/ScalaPBParquetSchemaResolver.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits.* 4 | import org.apache.parquet.schema.{Type, Types} 5 | import scalapb.{GeneratedMessage, GeneratedMessageCompanion} 6 | 7 | import scala.jdk.CollectionConverters.* 8 | 9 | class ScalaPBParquetSchemaResolver[T <: GeneratedMessage: GeneratedMessageCompanion] extends ParquetSchemaResolver[T] { 10 | private val cmp = implicitly[GeneratedMessageCompanion[T]] 11 | 12 | override def schemaName: Option[String] = Option(cmp.scalaDescriptor.name) 13 | 14 | override def resolveSchema(cursor: Cursor): List[Type] = { 15 | val md = cmp.scalaDescriptor 16 | Types 17 | .buildMessage() 18 | .addFields(md.fields) 19 | .named(md.fullName) 20 | .getFields 21 | .iterator() 22 | .asScala 23 | .toList 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /scalapb/src/test/protobuf/data.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | option java_package = "com.github.mjakubowski84.parquet4s"; 4 | 5 | message Data { 6 | enum ABC { 7 | A = 0; 8 | B = 1; 9 | C = 2; 10 | } 11 | 12 | message Inner { 13 | string text = 1; 14 | } 15 | 16 | // primitive types 17 | bool bool = 1; 18 | int32 int = 2; 19 | int64 long = 3; 20 | float float = 4; 21 | double double = 5; 22 | string text = 6; 23 | ABC abc = 7; 24 | 25 | // message type 26 | Inner inner = 8; 27 | 28 | // map types 29 | map map = 9; 30 | map enum_map = 10; 31 | map msg_map = 11; 32 | 33 | // list types 34 | repeated bool bool_list = 101; 35 | repeated int32 int_list = 102; 36 | repeated int64 long_list = 103; 37 | repeated float float_list = 104; 38 | repeated double double_list = 105; 39 | repeated string text_list = 106; 40 | repeated ABC enum_list = 107; 41 | repeated Inner msg_list = 108; 42 | } 43 | -------------------------------------------------------------------------------- /scalapb/src/test/scala/com/github/mjakubowski84/parquet4s/Parquet4sScalaPBCoreSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits.* 4 | import org.apache.hadoop.conf.Configuration 5 | import org.apache.parquet.proto.{ProtoParquetReader, ProtoParquetWriter, ProtoReadSupport, ProtoWriteSupport} 6 | import org.scalatest.flatspec.AnyFlatSpec 7 | import org.scalatest.matchers.should.Matchers 8 | import com.github.mjakubowski84.parquet4s.DataOuterClass.Data as JData 9 | 10 | import TestData.* 11 | 12 | class Parquet4sScalaPBCoreSpec extends AnyFlatSpec with Matchers { 13 | 14 | "core module" should "be able to read data written with parquet-protobuf" in { 15 | val outFile = InMemoryOutputFile(initBufferSize = 4800) 16 | val hadoopConf = new Configuration() 17 | hadoopConf.setBoolean(ProtoWriteSupport.PB_SPECS_COMPLIANT_WRITE, true) 18 | 19 | ParquetWriter 20 | .custom[JData, ProtoParquetWriter.Builder[JData]]( 21 | ProtoParquetWriter.builder[JData](outFile).withMessage(classOf[JData]) 22 | ) 23 | .options(ParquetWriter.Options(hadoopConf = hadoopConf)) 24 | .writeAndClose(javaData) 25 | 26 | ParquetReader.as[Data].read(outFile.toInputFile).toSeq should be(scalaData) 27 | } 28 | 29 | it should "write data compliant with 
parquet-protobuf" in { 30 | val outFile = InMemoryOutputFile(initBufferSize = 4800) 31 | val hadoopConf = new Configuration() 32 | hadoopConf.setClass(ProtoReadSupport.PB_CLASS, classOf[JData], classOf[com.google.protobuf.GeneratedMessageV3]) 33 | 34 | ParquetWriter.of[Data].writeAndClose(outFile, scalaData) 35 | 36 | ParquetReader 37 | .custom[JData.Builder](ProtoParquetReader.builder[JData.Builder](outFile.toInputFile)) 38 | .options(ParquetReader.Options(hadoopConf = hadoopConf)) 39 | .read 40 | .map(_.build()) should be(javaData) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /scalapb/src/test/scala/com/github/mjakubowski84/parquet4s/Parquet4sScalaPBFS2Spec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import cats.effect.IO 4 | import cats.effect.testing.scalatest.AsyncIOSpec 5 | import com.github.mjakubowski84.parquet4s.DataOuterClass.Data as JData 6 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits.* 7 | import fs2.Stream 8 | import org.apache.hadoop.conf.Configuration 9 | import org.apache.parquet.proto.{ProtoParquetReader, ProtoParquetWriter, ProtoReadSupport, ProtoWriteSupport} 10 | import org.scalatest.flatspec.AsyncFlatSpec 11 | import org.scalatest.matchers.should.Matchers 12 | 13 | import TestData.* 14 | 15 | class Parquet4sScalaPBFS2Spec extends AsyncFlatSpec with AsyncIOSpec with Matchers { 16 | 17 | "fs2 module" should "be compatible with parquet-protobuf" in { 18 | val outFile = InMemoryOutputFile(initBufferSize = 4800) 19 | val hadoopConf = new Configuration() 20 | hadoopConf.setBoolean(ProtoWriteSupport.PB_SPECS_COMPLIANT_WRITE, true) 21 | 22 | def write: Stream[IO, Nothing] = 23 | Stream 24 | .iterable(javaData) 25 | .through( 26 | parquet 27 | .writeSingleFile[IO] 28 | .custom[JData, ProtoParquetWriter.Builder[JData]]( 29 | ProtoParquetWriter.builder[JData](outFile).withMessage(classOf[JData]) 30 | ) 31 | .options(ParquetWriter.Options(hadoopConf = hadoopConf)) 32 | .write 33 | ) 34 | 35 | def read: Stream[IO, Vector[Data]] = 36 | parquet.fromParquet[IO].as[Data].read(outFile.toInputFile).fold(Vector.empty[Data])(_ :+ _) 37 | 38 | (write ++ read).map(_ should be(scalaData)).compile.lastOrError 39 | } 40 | 41 | it should "write data compliant with parquet-protobuf" in { 42 | val outFile = InMemoryOutputFile(initBufferSize = 4800) 43 | val hadoopConf = new Configuration() 44 | hadoopConf.setClass(ProtoReadSupport.PB_CLASS, classOf[JData], classOf[com.google.protobuf.GeneratedMessageV3]) 45 | 46 | def write: Stream[IO, Nothing] = 47 | Stream 48 | .iterable(scalaData) 49 | .through( 50 | parquet 51 | .writeSingleFile[IO] 52 | .of[Data] 53 | .write(outFile) 54 | ) 55 | 56 | def read: Stream[IO, Vector[JData]] = 57 | parquet 58 | .fromParquet[IO] 59 | .custom[JData.Builder](ProtoParquetReader.builder[JData.Builder](outFile.toInputFile)) 60 | .options(ParquetReader.Options(hadoopConf = hadoopConf)) 61 | .read(_.build) 62 | .fold(Vector.empty[JData])(_ :+ _) 63 | 64 | (write ++ read) 65 | .map(_ should be(javaData)) 66 | .compile 67 | .lastOrError 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /scalapb/src/test/scala/com/github/mjakubowski84/parquet4s/Parquet4sScalaPBSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import 
org.scalatest.matchers.should.Matchers 5 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits.* 6 | 7 | class Parquet4sScalaPBSpec extends AnyFlatSpec with Matchers { 8 | 9 | def testWithData(newData: Int => Data): Unit = { 10 | val data = (1 to 100).map(newData) 11 | 12 | val outFile = InMemoryOutputFile(initBufferSize = 4800) 13 | ParquetWriter.of[Data].writeAndClose(outFile, data) 14 | 15 | val inFile = InMemoryInputFile.fromBytes(outFile.take()) 16 | ParquetReader.as[Data].read(inFile).toSeq shouldBe data 17 | } 18 | 19 | "parquet4s-scalapb" should "work with primitive types" in { 20 | testWithData(i => Data(bool = i % 2 == 0)) 21 | testWithData(i => Data(int = i)) 22 | testWithData(i => Data(long = i.toLong)) 23 | testWithData(i => Data(float = i.toFloat)) 24 | testWithData(i => Data(double = i.toDouble)) 25 | testWithData(i => Data(text = i.toString)) 26 | testWithData(i => Data(abc = Data.ABC.fromValue(i % 3))) 27 | } 28 | 29 | it should "work with message types" in 30 | testWithData(i => Data(inner = Some(Data.Inner(i.toString)))) 31 | 32 | it should "work with unrecognized enum values" in 33 | testWithData(i => Data(abc = Data.ABC.fromValue(i % 5))) 34 | 35 | it should "work with map types" in { 36 | testWithData(i => Data(map = Map("original" -> i, "doubled" -> 2 * i))) 37 | testWithData(i => Data(enumMap = Map(i -> Data.ABC.fromValue(i % 5)))) 38 | testWithData(i => Data(msgMap = Map(i.toLong -> Data.Inner(text = "level1")))) 39 | } 40 | 41 | it should "work with list types" in { 42 | testWithData(i => Data(boolList = (i to i + 100).map(_ % 2 == 0))) 43 | testWithData(i => Data(intList = i to i + 100)) 44 | testWithData(i => Data(longList = (i to i + 100).map(_.toLong))) 45 | testWithData(i => Data(floatList = (i to i + 100).map(_.toFloat))) 46 | testWithData(i => Data(doubleList = (i to i + 100).map(_.toDouble))) 47 | testWithData(i => Data(textList = (i to i + 100).map(_.toString))) 48 | testWithData(i => Data(enumList = (i to i + 100).map(Data.ABC.fromValue))) 49 | testWithData(i => Data(msgList = (i to i + 100).map(i => Data.Inner(i.toString)))) 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /scalapb/src/test/scala/com/github/mjakubowski84/parquet4s/TestData.scala: -------------------------------------------------------------------------------- 1 | package com.github.mjakubowski84.parquet4s 2 | 3 | import com.github.mjakubowski84.parquet4s.DataOuterClass.Data as JData 4 | import scala.jdk.CollectionConverters.* 5 | 6 | object TestData { 7 | 8 | val javaData: Seq[JData] = (1 to 100) 9 | .map(i => 10 | JData 11 | .newBuilder() 12 | .setBool(i % 2 == 0) 13 | .setInt(i) 14 | .setLong(i.toLong) 15 | .setFloat(i.toFloat) 16 | .setDouble(i.toDouble) 17 | .setText(i.toString) 18 | .setAbcValue(i % JData.ABC.values().length) 19 | .setInner(JData.Inner.newBuilder().setText(i.toString).build()) 20 | .addAllBoolList((i to i + 100).map(_ % 2 == 0).map(java.lang.Boolean.valueOf).asJava) 21 | .addAllIntList((i to i + 100).map(Integer.valueOf).asJava) 22 | .addAllLongList((i to i + 100).map(_.toLong).map(java.lang.Long.valueOf).asJava) 23 | .addAllFloatList((i to i + 100).map(_.toFloat).map(java.lang.Float.valueOf).asJava) 24 | .addAllDoubleList((i to i + 100).map(_.toDouble).map(java.lang.Double.valueOf).asJava) 25 | .addAllTextList((i to i + 100).map(_.toString).asJava) 26 | .addAllEnumListValue((i to i + 100).map(_ % JData.ABC.values().length).map(Integer.valueOf).asJava) 27 | .addAllMsgList((i to i + 100).map(i => 
JData.Inner.newBuilder().setText(i.toString).build()).asJava) 28 | .build() 29 | ) 30 | val scalaData: Seq[Data] = javaData.map(d => Data.parseFrom(d.toByteArray)) 31 | 32 | } 33 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/data/menu.yml: -------------------------------------------------------------------------------- 1 | options: 2 | - title: Introduction 3 | url: docs 4 | 5 | - title: Quick Start 6 | url: docs/quick_start 7 | 8 | - title: Integration with Akka Streams 9 | url: docs/akka 10 | 11 | - title: Integration with Pekko Streams 12 | url: docs/pekko 13 | 14 | - title: Integration with FS2 15 | url: docs/fs2 16 | 17 | - title: Supported storage types 18 | url: docs/storage_types 19 | 20 | - title: Records, types and schema 21 | url: docs/records_and_schema 22 | 23 | - title: Projection 24 | url: docs/projection 25 | 26 | - title: Filtering 27 | url: docs/filtering 28 | 29 | - title: Partitioning 30 | url: docs/partitioning 31 | 32 | - title: Statistics 33 | url: docs/statistics 34 | 35 | - title: Examples 36 | url: docs/examples 37 | 38 | - title: Migration from 1.x 39 | url: docs/migration 40 | 41 | - title: (Experimental) ETL 42 | url: docs/etl 43 | 44 | - title: (Experimental) Protobuf with ScalaPB 45 | url: docs/protobuf 46 | 47 | - title: Distinguished Sponsors 48 | url: docs/sponsors 49 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/docs/etl.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: (Experimental) ETL 4 | permalink: docs/etl/ 5 | --- 6 | 7 | # (Experimental) ETL 8 | 9 | Version 2.1.0 of Parquet4s introduces advanced operations on generic datasets, that is on `ParquetIterable[RowParquetRecord]`, to the core module. Now users can join and concat two or more datasets which can simplify some ETL jobs a lot. 10 | 11 | Available operations: 12 | 13 | - Left join 14 | - Right join 15 | - Inner join 16 | - Full join 17 | - Concat (appending one dataset to another) 18 | - Write called directly on a dataset. 19 | 20 | Mind that joins require loading the right-side dataset into memory, so those operations are not applicable for very large datasets. Consider switching the position of datasets in your join operation (the left dataset is iterated over). Or use e.g. Apache Spark which distributes data across multiple machines for performing join operations. 21 | 22 | Please note that this is an experimental feature. API may change in the future, and some functionalities may be added or removed. 
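Besides joins, one dataset can be appended to another. Here is a minimal, hypothetical sketch of concatenation (the paths and the `Event` case class are made up for illustration, and it assumes that the concat operation listed above is exposed as `concat` on `ParquetIterable`; like joins, everything stays lazy until the final write):

```scala
import com.github.mjakubowski84.parquet4s.{ParquetReader, Path}
import scala.util.Using

case class Event(id: Long, name: String)

// define two generic datasets sharing the same layout
val readArchived = ParquetReader.generic.read(Path("/events/archive"))
val readRecent   = ParquetReader.generic.read(Path("/events/recent"))

Using.resources(readArchived, readRecent) { case (archived, recent) =>
  archived
    .concat(recent)                                  // append one dataset to the other
    .as[Event]                                       // set typed schema and codecs
    .writeAndClose(Path("/events/all/file.parquet")) // execute all, including the write to disk
}
```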
23 | 24 | ```scala mdoc:compile-only 25 | import com.github.mjakubowski84.parquet4s.{Col, ParquetReader, Path} 26 | import scala.util.Using 27 | 28 | case class PetOwner(id: Long, name: String, petId: Long, petName: String) 29 | 30 | // define 1st dataset 31 | val readOwners = ParquetReader 32 | .projectedGeneric( 33 | Col("id").as[Long], 34 | Col("name").as[String] 35 | ) 36 | .read(Path("/owners")) 37 | 38 | // define 2nd dataset 39 | val readPets = ParquetReader 40 | .projectedGeneric( 41 | Col("id").as[Long].alias("petId"), 42 | Col("name").as[String].alias("petName"), 43 | Col("ownerId").as[Long] 44 | ) 45 | .read(Path("/pets")) 46 | 47 | // join and write output dataset 48 | Using.resources(readOwners, readPets) { case (owners, pets) => 49 | owners 50 | .innerJoin(right = pets, onLeft = Col("id"), onRight = Col("ownerId")) // define join operation 51 | .as[PetOwner] // set typed schema and codecs 52 | .writeAndClose(Path("/pet_owners/file.parquet")) // execute all including write to the disk 53 | } 54 | 55 | // take note that all operations defined above writeAndClose are lazy and are not executed before 56 | // writeAndClose is called 57 | ``` 58 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/docs/examples.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: Examples 4 | permalink: docs/examples/ 5 | --- 6 | 7 | # Examples 8 | 9 | Please check [examples](https://github.com/mjakubowski84/parquet4s/blob/master/examples) where you can find simple code covering basics for `core`, `akkaPekko` and `fs2` modules. 10 | 11 | Moreover, examples contain two simple applications comprising Akka Streams / Pekko Streams or FS2 and Kafka. They show how you can write partitioned Parquet files with data coming from an indefinite stream. 12 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/docs/introduction.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: Introduction 4 | permalink: docs/ 5 | --- 6 | 7 | # Introduction 8 | 9 | Parquet4s is a simple I/O for [Parquet](https://parquet.apache.org/). Allows you to easily read and write Parquet files in [Scala](https://www.scala-lang.org/). 10 | 11 | Use just a Scala case class to define the schema of your data. No need to use Avro, Protobuf, Thrift or other data serialisation systems. You can use generic records if you don't want to use the case class, too. 12 | 13 | Compatible with files generated with [Apache Spark](https://spark.apache.org/). However, unlike in Spark, you do not have to start a cluster to perform I/O operations. 14 | 15 | Based on the official [Parquet library](https://github.com/apache/parquet-mr), [Hadoop Client](https://github.com/apache/hadoop) and [Shapeless](https://github.com/milessabin/shapeless) (Shapeless is not in use in a version for Scala 3). 16 | 17 | As it is based on Hadoop Client then you can connect to any Hadoop-compatible storage like AWS S3 or Google Cloud Storage. 18 | 19 | Integrations for [Akka Streams](https://doc.akka.io/docs/akka/current/stream/index.html), [Pekko Streams](https://pekko.apache.org/docs/pekko/current/stream/index.html) and [FS2](https://fs2.io/). 20 | 21 | Released for Scala 2.12.x, 2.13.x and 3.3.x. 
22 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/docs/projection.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: Projection 4 | permalink: docs/projection/ 5 | --- 6 | 7 | # Projection 8 | 9 | Schema projection is a way of optimization of reads. When calling `ParquetReader.as[MyData]` Parquet4s reads the whole content of each Parquet record even when you provide a case class that maps only a part of stored columns. The same happens when you use generic records by calling `ParquetReader.generic`. However, you can explicitly tell Parquet4s to use a different schema. In effect, all columns not matching your schema will be skipped and not read. You can define the projection schema in numerous ways: 10 | 11 | 1. by defining case class for typed read using `projectedAs`, 12 | 2. by defining generic column projection (allows reference to nested fields and aliases) using `projectedGeneric`, 13 | 3. by providing your own instance of Parquet's `MessageType` for generic read using `projectedGeneric`. 14 | 15 | ```scala mdoc:compile-only 16 | import com.github.mjakubowski84.parquet4s.{Col, ParquetIterable, ParquetReader, Path, RowParquetRecord} 17 | import org.apache.parquet.schema.MessageType 18 | 19 | // typed read 20 | case class MyData(column1: Int, columnX: String) 21 | val myData: ParquetIterable[MyData] = 22 | ParquetReader 23 | .projectedAs[MyData] 24 | .read(Path("file.parquet")) 25 | 26 | // generic read with column projection 27 | val records1: ParquetIterable[RowParquetRecord] = 28 | ParquetReader 29 | .projectedGeneric( 30 | Col("column1").as[Int], 31 | Col("columnX").as[String].alias("my_column"), 32 | ) 33 | .read(Path("file.parquet")) 34 | 35 | // generic read with own instance of Parquet schema 36 | val schemaOverride: MessageType = ??? 37 | val records2: ParquetIterable[RowParquetRecord] = 38 | ParquetReader 39 | .projectedGeneric(schemaOverride) 40 | .read(Path("file.parquet")) 41 | ``` 42 | 43 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/docs/protobuf.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: Read and write Parquet from and to Protobuf 4 | permalink: docs/protobuf/ 5 | --- 6 | 7 | # Read and write Parquet from and to Protobuf 8 | 9 | Using the original Java Parquet library, you can read and write parquet to and from Protbuf. Parquet4s has `custom` functions in its API, which could be leveraged for that. However, Parquet Protobuf can only be used with Java models, not to mention other issues that make it hard to use, especially in Scala. You would prefer to use [ScalaPB](https://scalapb.github.io/) in Scala projects, right? Thanks to Parquet4S, you can! Import ScalaPB extension to any Parquet4S project, either it is Akka / Pekko, FS2 or plain Scala: 10 | 11 | ```scala 12 | "com.github.mjakubowski84" %% "parquet4s-scalapb" % "@VERSION@" 13 | ``` 14 | 15 | Follow the ScalaPB [documentation](https://scalapb.github.io/docs/installation) to generate your Scala model from `.proto` files. 16 | 17 | Then, import Parquet4S type classes tailored for Protobuf. The rest of the code stays the same as in regular Parquet4S - no matter if that is Akka / Pekko, FS2 or core! 
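In FS2, for example, a rough sketch could look like the one below (the `GeneratedProtobufData` class stands in for a message generated by ScalaPB, and the object name and path are made up; the Akka / Pekko variant is analogous):

```scala
import cats.Show
import cats.effect.{IO, IOApp}
import com.github.mjakubowski84.parquet4s.Path
import com.github.mjakubowski84.parquet4s.ScalaPBImplicits._
import com.github.mjakubowski84.parquet4s.parquet._
import fs2.Stream

object ProtobufFS2App extends IOApp.Simple {

  // stands in for a class generated by ScalaPB from your .proto files
  case class GeneratedProtobufData(someField: Int)

  implicit val showData: Show[GeneratedProtobufData] = Show.fromToString

  val path = Path("data.parquet")
  val data = (1 to 100).map(i => GeneratedProtobufData(someField = i))

  override def run: IO[Unit] =
    Stream
      .iterable(data)
      .through(writeSingleFile[IO].of[GeneratedProtobufData].write(path)) // write a single Parquet file
      .append(fromParquet[IO].as[GeneratedProtobufData].read(path).printlns.drain) // read it back and print
      .compile
      .drain
}
```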
18 | 19 | ```scala mdoc:compile-only 20 | import com.github.mjakubowski84.parquet4s.ScalaPBImplicits._ 21 | import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, Path} 22 | 23 | import scala.util.Using 24 | 25 | case class GeneratedProtobufData(someField: Int) 26 | 27 | val data: Iterable[GeneratedProtobufData] = ??? // your data 28 | val path: Path = ??? // path to write to / to read from 29 | 30 | // write 31 | ParquetWriter.of[GeneratedProtobufData].writeAndClose(path.append("data.parquet"), data) 32 | 33 | // read 34 | Using.resource(ParquetReader.as[GeneratedProtobufData].read(path))(_.foreach(println)) 35 | ``` 36 | 37 | Please follow the [examples](https://github.com/mjakubowski84/parquet4s/tree/master/examples/src/main/scala/com/github/mjakubowski84/parquet4s/scalapb) to learn more. 38 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/docs/quick_start.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: Quick start 4 | permalink: docs/quick_start/ 5 | --- 6 | 7 | # Quick start 8 | 9 | ## SBT 10 | 11 | ```scala 12 | libraryDependencies ++= Seq( 13 | "com.github.mjakubowski84" %% "parquet4s-core" % "@VERSION@", 14 | "org.apache.hadoop" % "hadoop-client" % yourHadoopVersion 15 | ) 16 | ``` 17 | 18 | ## Mill 19 | 20 | ```scala 21 | def ivyDeps = Agg( 22 | ivy"com.github.mjakubowski84::parquet4s-core:@VERSION@", 23 | ivy"org.apache.hadoop:hadoop-client:$yourHadoopVersion" 24 | ) 25 | ``` 26 | 27 | ```scala mdoc:compile-only 28 | import com.github.mjakubowski84.parquet4s.{ ParquetReader, ParquetWriter, Path } 29 | 30 | case class User(userId: String, name: String, created: java.sql.Timestamp) 31 | 32 | val users: Iterable[User] = Seq( 33 | User("1", "parquet", new java.sql.Timestamp(1L)) 34 | ) 35 | val path = Path("path/to/local/file.parquet") 36 | 37 | // writing 38 | ParquetWriter.of[User].writeAndClose(path, users) 39 | 40 | // reading 41 | val parquetIterable = ParquetReader.as[User].read(path) 42 | try { 43 | parquetIterable.foreach(println) 44 | } finally parquetIterable.close() 45 | ``` 46 | 47 | ## AWS S3 48 | 49 | Parquet4s works with AWS S3 and [many other distributed storage types]({% link docs/storage_types.md %}). 50 | 51 | In order to connect to AWS S3 you need to define one more dependency: 52 | 53 | ```scala 54 | "org.apache.hadoop" % "hadoop-aws" % yourHadoopVersion 55 | ``` 56 | 57 | Next, the most common way is to define following environmental variables: 58 | 59 | ```bash 60 | export AWS_ACCESS_KEY_ID=my.aws.key 61 | export AWS_SECRET_ACCESS_KEY=my.secret.key 62 | ``` 63 | 64 | Please refer to [documentation of Hadoop AWS](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/#Authenticating_with_S3) for more information on how to authenticate with S3. 65 | 66 | You may need to set some configuration properties to access your storage, e.g. `fs.s3a.path.style.access`. 67 | Please follow [documentation of Hadoop AWS](https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html) for more details and troubleshooting. 68 | 69 | Moreover, you refer to Parquet4s' [integration test](https://github.com/mjakubowski84/parquet4s/tree/master/s3Test/src/it) that proves that integration with S3 works. 70 | 71 | ## Passing Hadoop Configs Programmatically 72 | 73 | File system configs for S3, GCS, Hadoop, etc. 
can also be set programmatically to the `ParquetReader` and `ParquetWriter` by passing the `Configuration` to the `ParqetReader.Options` and `ParquetWriter.Options` case classes. 74 | 75 | ```scala mdoc:compile-only 76 | import com.github.mjakubowski84.parquet4s.{ ParquetReader, ParquetWriter, Path } 77 | import org.apache.parquet.hadoop.metadata.CompressionCodecName 78 | import org.apache.hadoop.conf.Configuration 79 | 80 | case class User(userId: String, name: String, created: java.sql.Timestamp) 81 | 82 | val users: Iterable[User] = Seq( 83 | User("1", "parquet", new java.sql.Timestamp(1L)) 84 | ) 85 | 86 | val writerOptions = ParquetWriter.Options( 87 | compressionCodecName = CompressionCodecName.SNAPPY, 88 | hadoopConf = new Configuration() 89 | ) 90 | 91 | ParquetWriter 92 | .of[User] 93 | .options(writerOptions) 94 | .writeAndClose(Path("path/to/local/file.parquet"), users) 95 | ``` 96 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/docs/sponsors.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: Distinguished Sponsors 4 | permalink: docs/sponsors/ 5 | --- 6 | 7 | # Distinguished Sponsors 8 | 9 | - [calvinlfer](https://github.com/calvinlfer) 10 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/docs/statistics.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: Statistics 4 | permalink: docs/statistics/ 5 | --- 6 | 7 | # Statistics 8 | 9 | Parquet files contain metadata that are used to optimize [filtering]({% link docs/filtering.md %}). Additionally, Parquet4s leverages metadata to provide insight about datasets in an efficient way: 10 | 11 | - Number of records 12 | - Min value of a column 13 | - Max value of a column 14 | 15 | Parquet4s will try to resolve those statistics without iterating over each record if possible. Statistics can also be queried using a filter — but please mind that speed of the query might decrease as, due to filtering, the algorithm might need to iterate over the content of a row group to resolve min/max values. The performance of the query is the best in the case of sorted datasets. 16 | 17 | Parquet4s provides separate API for Statistics. It is also leveraged in `ParqueIterable`e.g. to efficiently calculate `size`. 
18 | 19 | ```scala mdoc:compile-only 20 | import com.github.mjakubowski84.parquet4s.{Col, Path, Stats} 21 | 22 | import java.time.LocalDate 23 | case class User(id: Long, age: Int, registered: LocalDate) 24 | 25 | // stats of users that registered in year 2020 26 | val userStats = Stats 27 | .builder 28 | .filter(Col("registered") >= LocalDate.of(2020, 1, 1) && Col("registered") < LocalDate.of(2021, 1, 1)) 29 | .projection[User] 30 | .stats(Path("users")) 31 | 32 | val numberOfUsers = userStats.recordCount 33 | val minAge = userStats.min[Int](Col("age")) 34 | val maxAge = userStats.max[Int](Col("age")) 35 | ``` 36 | 37 | ```scala mdoc:compile-only 38 | import com.github.mjakubowski84.parquet4s.{Col, ParquetReader, Path, Stats} 39 | 40 | import java.time.LocalDate 41 | case class User(id: Long, age: Int, registered: LocalDate) 42 | 43 | // users that registered in year 2020 44 | val users = ParquetReader 45 | .projectedAs[User] 46 | .filter(Col("registered") >= LocalDate.of(2020, 1, 1) && Col("registered") < LocalDate.of(2021, 1, 1)) 47 | .read(Path("users")) 48 | 49 | try { 50 | val numberOfUsers = users.size 51 | val minAge = users.min[Int](Col("age")) 52 | val maxAge = users.max[Int](Col("age")) 53 | } finally { 54 | users.close() 55 | } 56 | ``` 57 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/docs/storage_types.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: Supported storage types 4 | permalink: docs/storage_types/ 5 | --- 6 | 7 | # Supported storage types 8 | 9 | As it is based on Hadoop Client, Parquet4s can read and write from a variety of file systems: 10 | 11 | - Local files 12 | - HDFS 13 | - [Amazon S3](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) 14 | - [Google Storage](https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage) 15 | - [Azure Blob Storage](https://hadoop.apache.org/docs/stable/hadoop-azure/index.html) 16 | - [Azure Data Lake Storage](https://hadoop.apache.org/docs/stable/hadoop-azure-datalake/index.html) 17 | - and any other storage compatible with Hadoop... 18 | 19 | Please refer to Hadoop Client documentation or your storage provider to check how to connect to your storage. 
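As a rough illustration (the bucket name and credentials below are placeholders, and the `fs.s3a.*` properties cover only the S3 case; other storages use their own Hadoop properties), connection settings are typically passed through a Hadoop `Configuration` supplied via the reader and writer options:

```scala
import com.github.mjakubowski84.parquet4s.{ParquetReader, Path}
import org.apache.hadoop.conf.Configuration
import scala.util.Using

case class User(userId: String, name: String)

// the Hadoop configuration carries the storage-specific connection settings
val conf = new Configuration()
conf.set("fs.s3a.access.key", sys.env("AWS_ACCESS_KEY_ID"))
conf.set("fs.s3a.secret.key", sys.env("AWS_SECRET_ACCESS_KEY"))

// pass the configuration to the reader; ParquetWriter.Options works the same way
Using.resource(
  ParquetReader
    .as[User]
    .options(ParquetReader.Options(hadoopConf = conf))
    .read(Path("s3a://my-bucket/users"))
)(_.foreach(println))
```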
20 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/images/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjakubowski84/parquet4s/036d8a03c1febb087813309f797414ed860e7992/site/src/main/resources/docs/images/favicon-16x16.png -------------------------------------------------------------------------------- /site/src/main/resources/docs/images/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjakubowski84/parquet4s/036d8a03c1febb087813309f797414ed860e7992/site/src/main/resources/docs/images/favicon-32x32.png -------------------------------------------------------------------------------- /site/src/main/resources/docs/images/features-header.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/images/light-navbar-brand.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/images/light-sidebar-brand.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /site/src/main/resources/docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: homeFeatures 3 | features: 4 | - first: ["Quick start", "How to use Parquet4s in just a few steps", "quick_start"] 5 | - third: ["Documentation", "All you need to know about Parquet4s"] 6 | --- 7 | --------------------------------------------------------------------------------