├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── examples ├── csv.py ├── csv.scala ├── json.py ├── json.scala ├── parquet.py ├── parquet.scala ├── people.csv ├── people.json └── people.parquet ├── project ├── assembly.sbt ├── build.properties └── plugins.sbt ├── scalastyle-config.xml ├── src ├── main │ ├── resources │ │ └── META-INF │ │ │ └── services │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── scala │ │ └── io │ │ └── minio │ │ └── spark │ │ └── select │ │ ├── Credentials.scala │ │ ├── FilterPushdown.scala │ │ ├── Select.scala │ │ ├── SelectCSVRelation.scala │ │ ├── SelectCSVSource.scala │ │ ├── SelectJSONRelation.scala │ │ ├── SelectJSONSource.scala │ │ ├── SelectParquetRelation.scala │ │ ├── SelectParquetSource.scala │ │ └── util │ │ ├── S3URI.scala │ │ └── TypeCast.scala └── test │ └── scala │ └── io │ └── minio │ └── spark │ └── select │ └── .keep └── version.sbt /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | 19 | .idea/workspace.xml 20 | logs 21 | project/project 22 | project/target 23 | target 24 | tmp 25 | .history 26 | dist 27 | .idea/ 28 | /*.iml 29 | /out 30 | /.idea_modules 31 | /.classpath 32 | /.project 33 | /RUNNING_PID 34 | /.settings 35 | projectFilesBackup/ 36 | */target 37 | metastore_db/ 38 | *.log 39 | *.class 40 | *.log 41 | *.zip 42 | *.gz 43 | *.jpeg 44 | *.jpg 45 | *.png 46 | *~ 47 | .settings/ 48 | .cache/ 49 | .history/ 50 | .lib/ 51 | dist/* 52 | target/ 53 | lib_managed/ 54 | src_managed/ 55 | project/boot/ 56 | project/plugins/project/ 57 | .project 58 | .classpath 59 | .cache 60 | .sbtserver.lock 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # MinIO Spark Select
 2 | MinIO Spark Select enables retrieving only the required data from an object using the S3 Select API.
 3 | 
 4 | ## Requirements
 5 | This library requires:
 6 | - Spark 2.3+
 7 | - Scala 2.11+
 8 | 
 9 | ## Features
10 | - S3 Select is supported with CSV, JSON and Parquet files using the `minioSelectCSV`, `minioSelectJSON` and `minioSelectParquet` values to specify the data format.
11 | - S3 Select is supported across multiple objects.
12 | - S3 Select supports querying SSE-C encrypted objects.
13 | 
14 | ### Limitations
15 | - Spark CSV and JSON options such as nanValue, positiveInf, negativeInf, and options related to corrupt records (for example, failfast and dropmalformed mode) are not supported.
16 | - Using commas (,) within decimals is not supported. For example, 10,000 is not supported but 10000 is.
17 | - The following filters are not pushed down to MinIO:
18 |   - Aggregate functions such as COUNT() and SUM().
19 |   - Filters that CAST() an attribute. For example, CAST(stringColumn as INT) = 1.
20 |   - Filters with an attribute that is an object or is complex. For example, intArray[1] = 1, objectColumn.objectNumber = 1.
21 |   - Filters for which the value is not a literal value. For example, intColumn1 = intColumn2.
22 | - Only the Select [Supported Data Types](https://docs.aws.amazon.com/AmazonS3/latest/dev/s3-glacier-select-sql-reference-data-types.html) are supported, with the documented limitations.
23 | 
24 | ### HowTo
25 | Include this package in your Spark applications using one of the following:
26 | 
27 | #### *spark-shell*, *pyspark*, or *spark-submit*
28 | ```
29 | > $SPARK_HOME/bin/spark-shell --packages io.minio:spark-select_2.11:2.1
30 | ```
31 | 
32 | #### *sbt*
33 | If you use the [sbt-spark-package plugin](http://github.com/databricks/sbt-spark-package), add the following to your sbt build file:
34 | ```
35 | spDependencies += "minio/spark-select:2.1"
36 | ```
37 | Otherwise,
38 | ```
39 | libraryDependencies += "io.minio" % "spark-select_2.11" % "2.1"
40 | ```
41 | 
42 | #### *Maven*
43 | In your `pom.xml`, add:
44 | ```xml
45 | <dependencies>
46 |   <!-- ... -->
47 |   <dependency>
48 |     <groupId>io.minio</groupId>
49 |     <artifactId>spark-select_2.11</artifactId>
50 |     <version>2.1</version>
51 |   </dependency>
52 | </dependencies>
53 | ```
54 | 
55 | #### *Source*
56 | 
57 | Set up all required environment variables.
58 | > NOTE: It is assumed that you have already installed Hadoop 2.8.5 and Spark 2.3.1 locally.
59 | ```
60 | export HADOOP_HOME=${HOME}/spark/hadoop-2.8.5/
61 | export PATH=${PATH}:${HADOOP_HOME}/bin
62 | export SPARK_DIST_CLASSPATH=$(hadoop classpath)
63 | 
64 | export SPARK_HOME=${HOME}/spark/spark-2.3.1-bin-without-hadoop/
65 | export PATH=${PATH}:${SPARK_HOME}/bin
66 | export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64/
67 | 
68 | git clone https://github.com/minio/spark-select
69 | sbt assembly
70 | spark-shell --jars target/scala-2.11/spark-select-assembly-2.1.jar
71 | ```
72 | 
73 | Once `spark-shell` has been invoked successfully, load one of the examples:
74 | ```
75 | scala> :load examples/csv.scala
76 | Loading examples/csv.scala...
77 | import org.apache.spark.sql._ 78 | import org.apache.spark.sql.types._ 79 | schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,IntegerType,false)) 80 | df: org.apache.spark.sql.DataFrame = [name: string, age: int] 81 | +-------+---+ 82 | | name|age| 83 | +-------+---+ 84 | |Michael| 31| 85 | | Andy| 30| 86 | | Justin| 19| 87 | +-------+---+ 88 | 89 | scala> 90 | ``` 91 | 92 | ### API 93 | 94 | #### *PySpark* 95 | ```py 96 | spark 97 | .read 98 | .format("minioSelectCSV") // "minioSelectJSON" for JSON or "minioSelectParquet" for Parquet 99 | .schema(...) // mandatory 100 | .options(...) // optional 101 | .load("s3://path/to/my/datafiles") 102 | ``` 103 | 104 | #### *R* 105 | ``` 106 | read.df("s3://path/to/my/datafiles", "minioSelectCSV", schema) 107 | ``` 108 | 109 | #### *Scala* 110 | ``` 111 | spark 112 | .read 113 | .format("minioSelectCSV") // "minioSelectJSON" for JSON or "minioSelectParquet" for Parquet 114 | .schema(...) // mandatory 115 | .options(...) // optional. Examples: 116 | // .options(Map("quote" -> "\'", "header" -> "true")) or 117 | // .option("quote", "\'").option("header", "true") 118 | .load("s3://path/to/my/datafiles") 119 | ``` 120 | 121 | #### *SQL* 122 | ``` 123 | CREATE TEMPORARY VIEW MyView (number INT, name STRING) USING minioSelectCSV OPTIONS (path "s3://path/to/my/datafiles") 124 | ``` 125 | 126 | ### Options 127 | The following options are available when using `minioSelectCSV` and `minioSelectJSON`. If not specified, default values are used. 128 | 129 | #### *Options with minioSelectCSV* 130 | | Option | Default | Usage | 131 | |---|---|---| 132 | | `compression` | "none" | Indicates whether compression is used. "gzip", "bzip2" are values supported besides "none". 133 | | `delimiter` | "," | Specifies the field delimiter. 134 | | `quote` | '"' | Specifies the quote character. Specifying an empty string is not supported and results in a malformed XML error. 135 | | `escape` | '"' | Specifies the quote escape character. 136 | | `header` | "true" | "false" specifies that there is no header. "true" specifies that a header is in the first line. Only headers in the first line are supported, and empty lines before a header are not supported. 137 | | `comment` | "#" | Specifies the comment character. 138 | 139 | #### *Options with minioSelectJSON* 140 | | Option | Default | Usage | 141 | |---|---|---| 142 | | `compression` | "none" | Indicates whether compression is used. "gzip", "bzip2" are values supported besides "none". 143 | | `multiline` | "false" | "false" specifies that the JSON is in Select LINES format, meaning that each line in the input data contains a single JSON object. "true" specifies that the JSON is in Select DOCUMENT format, meaning that a JSON object can span multiple lines in the input data. 144 | 145 | #### *Options with minioSelectParquet* 146 | There are no **options** needed with Parquet files. 147 | 148 | ### Full Examples 149 | 150 | #### *Scala* 151 | 152 | Schema with two columns for `CSV`. 
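Before running the full examples, the connector needs to know where the MinIO (or S3) endpoint is and how to authenticate. It reads the standard Hadoop S3A keys (`fs.s3a.endpoint`, `fs.s3a.path.style.access`, `fs.s3a.region`, `fs.s3a.access.key`, `fs.s3a.secret.key`) from the active `SparkSession`'s Hadoop configuration; if unset, the endpoint defaults to `https://s3.amazonaws.com` and the region to `us-east-1`. A minimal setup sketch; the endpoint and credential values below are illustrative placeholders, not defaults:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("spark-select example")
  .getOrCreate()

// These are the keys read by Credentials.scala and the Select*Relation classes;
// replace the placeholder values with your own MinIO server and access keys.
val hadoopConf = spark.sparkContext.hadoopConfiguration
hadoopConf.set("fs.s3a.endpoint", "http://127.0.0.1:9000")
hadoopConf.set("fs.s3a.path.style.access", "true")
hadoopConf.set("fs.s3a.access.key", "YOUR-ACCESS-KEY")
hadoopConf.set("fs.s3a.secret.key", "YOUR-SECRET-KEY")
```

With the endpoint and credentials configured, the two-column CSV example below runs unchanged.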
153 | ```scala 154 | import org.apache.spark.sql._ 155 | import org.apache.spark.sql.types._ 156 | 157 | object app { 158 | def main(args: Array[String]) { 159 | val schema = StructType( 160 | List( 161 | StructField("name", StringType, true), 162 | StructField("age", IntegerType, false) 163 | ) 164 | ) 165 | 166 | val df = spark 167 | .read 168 | .format("minioSelectCSV") 169 | .schema(schema) 170 | .load("s3://sjm-airlines/people.csv") 171 | 172 | println(df.show()) 173 | 174 | println(df.select("*").filter("age > 19").show()) 175 | 176 | } 177 | } 178 | ``` 179 | 180 | With custom schema for `JSON`. 181 | ```scala 182 | import org.apache.spark.sql._ 183 | import org.apache.spark.sql.types._ 184 | 185 | object app { 186 | def main(args: Array[String]) { 187 | val schema = StructType( 188 | List( 189 | StructField("name", StringType, true), 190 | StructField("age", IntegerType, false) 191 | ) 192 | ) 193 | 194 | val df = spark 195 | .read 196 | .format("minioSelectJSON") 197 | .schema(schema) 198 | .load("s3://sjm-airlines/people.json") 199 | 200 | println(df.show()) 201 | 202 | println(df.select("*").filter("age > 19").show()) 203 | 204 | } 205 | } 206 | ``` 207 | 208 | With custom schema for `Parquet`. 209 | ```scala 210 | import org.apache.spark.sql._ 211 | import org.apache.spark.sql.types._ 212 | 213 | object app { 214 | def main(args: Array[String]) { 215 | val schema = StructType( 216 | List( 217 | StructField("name", StringType, true), 218 | StructField("age", IntegerType, false) 219 | ) 220 | ) 221 | 222 | val df = spark 223 | .read 224 | .format("minioSelectParquet") 225 | .schema(schema) 226 | .load("s3://sjm-airlines/people.parquet") 227 | 228 | println(df.show()) 229 | 230 | println(df.select("*").filter("age > 19").show()) 231 | 232 | } 233 | } 234 | ``` 235 | 236 | #### *Python* 237 | 238 | Schema with two columns for `CSV`. 239 | ```py 240 | from pyspark.sql import * 241 | from pyspark.sql.types import * 242 | 243 | if __name__ == "__main__": 244 | # create SparkSession 245 | spark = SparkSession.builder \ 246 | .master("local") \ 247 | .appName("spark-select in python") \ 248 | .getOrCreate() 249 | 250 | # filtered schema 251 | st = StructType([ 252 | StructField("name", StringType(), True), 253 | StructField("age", IntegerType(), False), 254 | ]) 255 | 256 | df = spark \ 257 | .read \ 258 | .format('minioSelectCSV') \ 259 | .schema(st) \ 260 | .load("s3://testbucket/people.csv") 261 | 262 | # show all rows. 263 | df.show() 264 | 265 | # show only filtered rows. 
266 | df.select("*").filter("age > 19").show() 267 | ``` 268 | 269 | ``` 270 | > $SPARK_HOME/bin/spark-submit --packages io.minio:spark-select_2.11:2.1 271 | ``` 272 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-select" 2 | 3 | organization := "io.minio" 4 | 5 | scalaVersion := "2.11.12" 6 | 7 | crossScalaVersions := Seq("2.11.12") 8 | 9 | spName := "minio/spark-select" 10 | 11 | spAppendScalaVersion := true 12 | 13 | spIncludeMaven := true 14 | 15 | spIgnoreProvided := true 16 | 17 | sparkVersion := "2.3.1" 18 | 19 | val testSparkVersion = settingKey[String]("The version of Spark to test against.") 20 | 21 | testSparkVersion := sys.props.get("spark.testVersion").getOrElse(sparkVersion.value) 22 | 23 | // used spark components 24 | sparkComponents := Seq("sql") 25 | 26 | assemblyMergeStrategy in assembly := { 27 | case "META-INF/io.netty.versions.properties" => MergeStrategy.concat 28 | case x => 29 | val oldStrategy = (assemblyMergeStrategy in assembly).value 30 | oldStrategy(x) 31 | } 32 | 33 | // Dependent libraries 34 | libraryDependencies ++= Seq( 35 | "com.amazonaws" % "aws-java-sdk" % "1.11.434" exclude("com.fasterxml.jackson.core", "jackson-databind"), 36 | "org.apache.commons" % "commons-csv" % "1.7", 37 | "org.slf4j" % "slf4j-api" % "1.7.5" % "provided", 38 | "org.mockito" % "mockito-core" % "2.0.31-beta" 39 | ) 40 | 41 | libraryDependencies ++= Seq( 42 | "org.apache.spark" %% "spark-core" % testSparkVersion.value % "test" force(), 43 | "org.apache.spark" %% "spark-sql" % testSparkVersion.value % "test" force(), 44 | "org.scala-lang" % "scala-library" % scalaVersion.value % "compile" 45 | ) 46 | 47 | /** 48 | * release settings 49 | */ 50 | 51 | publishMavenStyle := true 52 | 53 | releaseCrossBuild := true 54 | 55 | licenses += ("Apache-2.0", url("http://www.apache.org/licenses/LICENSE-2.0")) 56 | 57 | releasePublishArtifactsAction := PgpKeys.publishSigned.value 58 | 59 | publishArtifact in Test := false 60 | 61 | pomIncludeRepository := { _ => false } 62 | 63 | publishTo := { 64 | val nexus = "https://oss.sonatype.org/" 65 | if (version.value.endsWith("SNAPSHOT")) 66 | Some("snapshots" at nexus + "content/repositories/snapshots") 67 | else 68 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 69 | } 70 | 71 | pomExtra := ( 72 | https://github.com/minio/spark-select 73 | 74 | scm:git:github.com/minio/spark-select 75 | scm:git:git@github.com:minio/spark-select 76 | github.com/minio/spark-select 77 | 78 | 79 | 80 | minio 81 | MinIO 82 | http://www.minio.io 83 | 84 | ) 85 | 86 | // Skip tests during assembly 87 | test in assembly := {} 88 | 89 | ScoverageSbtPlugin.ScoverageKeys.coverageHighlighting := { 90 | if (scalaBinaryVersion.value == "2.10") false 91 | else true 92 | } 93 | 94 | import ReleaseTransformations._ 95 | 96 | // Add publishing to spark packages as another step. 
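// (spPublish is provided by the sbt-spark-package plugin declared in
// project/plugins.sbt; it runs last, after the Sonatype publishing steps below.)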
97 | releaseProcess := Seq[ReleaseStep]( 98 | checkSnapshotDependencies, 99 | inquireVersions, 100 | runTest, 101 | setReleaseVersion, 102 | commitReleaseVersion, 103 | tagRelease, 104 | publishArtifacts, 105 | setNextVersion, 106 | commitNextVersion, 107 | pushChanges, 108 | releaseStepTask(spPublish) 109 | ) 110 | -------------------------------------------------------------------------------- /examples/csv.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import * 2 | from pyspark.sql.types import * 3 | 4 | if __name__ == "__main__": 5 | # create SparkSession 6 | spark = SparkSession.builder \ 7 | .master("local") \ 8 | .appName("spark-select in python") \ 9 | .getOrCreate() 10 | 11 | # filtered schema 12 | st = StructType([ 13 | StructField("name", StringType(), True), 14 | StructField("age", IntegerType(), False), 15 | ]) 16 | 17 | df = spark \ 18 | .read \ 19 | .format('minioSelectCSV') \ 20 | .schema(st) \ 21 | .load("s3://testbucket/people.csv") 22 | 23 | # show all rows. 24 | df.show() 25 | 26 | # show only filtered rows. 27 | df.select("*").filter("age > 19").show() 28 | -------------------------------------------------------------------------------- /examples/csv.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql._ 2 | import org.apache.spark.sql.types._ 3 | 4 | object app { 5 | def main(args: Array[String]) { 6 | val schema = StructType( 7 | List( 8 | StructField("name", StringType, true), 9 | StructField("age", IntegerType, false) 10 | ) 11 | ) 12 | 13 | val df = spark 14 | .read 15 | .format("minioSelectCSV") 16 | .schema(schema) 17 | .load("s3://sjm-airlines/people.csv") 18 | 19 | println(df.show()) 20 | 21 | println(df.select("*").filter("age > 19").show()) 22 | 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /examples/json.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import * 3 | 4 | if __name__ == "__main__": 5 | # create SparkSession 6 | spark = SparkSession.builder \ 7 | .master("local") \ 8 | .appName("spark-select in python") \ 9 | .getOrCreate() 10 | 11 | # filtered schema 12 | st = StructType([ 13 | StructField("name", StringType(), True), 14 | StructField("age", IntegerType(), False), 15 | ]) 16 | 17 | df = spark \ 18 | .read \ 19 | .format('minioSelectJSON') \ 20 | .schema(st) \ 21 | .load("s3://testbucket/people.json") 22 | 23 | # show all rows. 24 | df.show() 25 | 26 | # show only filtered rows. 
27 | df.select("*").filter("age > 19").show() 28 | -------------------------------------------------------------------------------- /examples/json.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql._ 2 | import org.apache.spark.sql.types._ 3 | 4 | object app { 5 | def main(args: Array[String]) { 6 | val schema = StructType( 7 | List( 8 | StructField("name", StringType, true), 9 | StructField("age", IntegerType, false) 10 | ) 11 | ) 12 | 13 | val df = spark 14 | .read 15 | .format("minioSelectJSON") 16 | .schema(schema) 17 | .load("s3://sjm-airlines/people.json") 18 | 19 | println(df.show()) 20 | 21 | println(df.select("*").filter("age > 19").show()) 22 | 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /examples/parquet.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import * 3 | 4 | if __name__ == "__main__": 5 | # create SparkSession 6 | spark = SparkSession.builder \ 7 | .master("local") \ 8 | .appName("spark-select in python") \ 9 | .getOrCreate() 10 | 11 | # filtered schema 12 | st = StructType([ 13 | StructField("name", StringType(), True), 14 | StructField("age", IntegerType(), False), 15 | ]) 16 | 17 | df = spark \ 18 | .read \ 19 | .format('minioSelectParquet') \ 20 | .schema(st) \ 21 | .load("s3://testbucket/people.parquet") 22 | 23 | # show all rows. 24 | df.show() 25 | 26 | # show only filtered rows. 27 | df.select("*").filter("age > 19").show() 28 | -------------------------------------------------------------------------------- /examples/parquet.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql._ 2 | import org.apache.spark.sql.types._ 3 | 4 | object app { 5 | def main(args: Array[String]) { 6 | val schema = StructType( 7 | List( 8 | StructField("name", StringType, true), 9 | StructField("age", IntegerType, false) 10 | ) 11 | ) 12 | 13 | val df = spark 14 | .read 15 | .format("minioSelectParquet") 16 | .schema(schema) 17 | .load("s3://sjm-airlines/people.parquet") 18 | 19 | println(df.show()) 20 | 21 | println(df.select("*").filter("age > 19").show()) 22 | 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /examples/people.csv: -------------------------------------------------------------------------------- 1 | name,age 2 | Michael,31 3 | Andy,30 4 | Justin,19 5 | -------------------------------------------------------------------------------- /examples/people.json: -------------------------------------------------------------------------------- 1 | {"name":"Michael", "age": 31} 2 | {"name":"Andy", "age": 30} 3 | {"name":"Justin", "age": 19} 4 | -------------------------------------------------------------------------------- /examples/people.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minio/spark-select/d8721b46e84639690b7382d4a4e0877996e64b3d/examples/people.parquet -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 2 | -------------------------------------------------------------------------------- /project/build.properties: 
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to the Apache Software Foundation (ASF) under one or more
 3 | # contributor license agreements. See the NOTICE file distributed with
 4 | # this work for additional information regarding copyright ownership.
 5 | # The ASF licenses this file to You under the Apache License, Version 2.0
 6 | # (the "License"); you may not use this file except in compliance with
 7 | # the License. You may obtain a copy of the License at
 8 | #
 9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | sbt.version=0.13.17
18 | 
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
 1 | scalaVersion := "2.10.6"
 2 | 
 3 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
 4 | 
 5 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"
 6 | 
 7 | resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/"
 8 | 
 9 | resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven"
10 | 
11 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "4.0.0")
12 | 
13 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0")
14 | 
15 | addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.4")
16 | 
17 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.6.0")
18 | 
19 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.1.0")
20 | 
21 | addSbtPlugin("org.scoverage" % "sbt-coveralls" % "1.0.0")
22 | 
23 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.10")
24 | 
25 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1")
26 | 
27 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3")
28 | 
--------------------------------------------------------------------------------
/scalastyle-config.xml:
--------------------------------------------------------------------------------
(Scalastyle standard configuration. The XML markup of this file did not survive extraction; only bare line numbers remain, so the rule definitions are omitted here. See scalastyle-config.xml in the repository for the full rule set.)
--------------------------------------------------------------------------------
/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
 1 | io.minio.spark.select.SelectCSVSource
 2 | io.minio.spark.select.SelectJSONSource
 3 | io.minio.spark.select.SelectParquetSource
 4 | 
--------------------------------------------------------------------------------
/src/main/scala/io/minio/spark/select/Credentials.scala:
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package io.minio.spark.select 17 | 18 | import java.net.URI 19 | 20 | // For BasicAWSCredentials 21 | import com.amazonaws.auth.AWSCredentials 22 | import com.amazonaws.auth.AWSCredentialsProvider 23 | import com.amazonaws.auth.BasicAWSCredentials 24 | import com.amazonaws.auth.BasicSessionCredentials 25 | import com.amazonaws.auth.DefaultAWSCredentialsProviderChain 26 | 27 | import org.apache.hadoop.conf.Configuration 28 | 29 | private[spark] object Credentials { 30 | private def staticCredentialsProvider(credentials: AWSCredentials): AWSCredentialsProvider = { 31 | new AWSCredentialsProvider { 32 | override def getCredentials: AWSCredentials = credentials 33 | override def refresh(): Unit = {} 34 | } 35 | } 36 | 37 | def load(location: Option[String], hadoopConfiguration: Configuration): AWSCredentialsProvider = { 38 | val uri = new URI(location.getOrElse("")) 39 | val uriScheme = uri.getScheme 40 | 41 | uriScheme match { 42 | case "s3" | "s3a" => 43 | // This matches what S3A does, with one exception: we don't 44 | // support anonymous credentials. First, try to parse from URI: 45 | Option(uri.getUserInfo).flatMap { userInfo => 46 | if (userInfo.contains(":")) { 47 | val Array(accessKey, secretKey) = userInfo.split(":") 48 | Some(staticCredentialsProvider(new BasicAWSCredentials(accessKey, secretKey))) 49 | } else { 50 | None 51 | } 52 | }.orElse { 53 | val accessKey = hadoopConfiguration.get(s"fs.s3a.access.key", null) 54 | val secretKey = hadoopConfiguration.get(s"fs.s3a.secret.key", null) 55 | val sessionToken = hadoopConfiguration.get(s"fs.s3a.session.token", null) 56 | if (accessKey != null && secretKey != null) { 57 | if (sessionToken != null) { 58 | Some(staticCredentialsProvider(new BasicSessionCredentials(accessKey, secretKey, sessionToken))) 59 | } else { 60 | Some(staticCredentialsProvider(new BasicAWSCredentials(accessKey, secretKey))) 61 | } 62 | } else { 63 | None 64 | } 65 | }.getOrElse { 66 | // Finally, fall back on the instance profile provider 67 | new DefaultAWSCredentialsProviderChain() 68 | } 69 | case other => 70 | throw new IllegalArgumentException(s"Unrecognized scheme $other; expected s3, or s3a") 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/FilterPushdown.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.minio.spark.select 18 | 19 | import java.sql.{Date, Timestamp} 20 | 21 | import org.apache.spark.sql.sources._ 22 | import org.apache.spark.sql.types._ 23 | 24 | /** 25 | * Helper methods for pushing filters into Select queries. 26 | */ 27 | private[spark] object FilterPushdown { 28 | /** 29 | * Build a SQL WHERE clause for the given filters. If a filter cannot be pushed down then no 30 | * condition will be added to the WHERE clause. If none of the filters can be pushed down then 31 | * an empty string will be returned. 32 | * 33 | * @param schema the schema of the table being queried 34 | * @param filters an array of filters, the conjunction of which is the filter condition for the 35 | * scan. 36 | */ 37 | def buildWhereClause(schema: StructType, filters: Seq[Filter]): String = { 38 | val filterExpressions = filters.flatMap(f => buildFilterExpression(schema, f)).mkString(" AND ") 39 | if (filterExpressions.isEmpty) "" else "WHERE " + filterExpressions 40 | } 41 | 42 | /** 43 | * Attempt to convert the given filter into a Select expression. Returns None if the expression 44 | * could not be converted. 45 | */ 46 | def buildFilterExpression(schema: StructType, filter: Filter): Option[String] = { 47 | def buildComparison(attr: String, value: Any, comparisonOp: String): Option[String] = { 48 | getTypeForAttribute(schema, attr).map { dataType => 49 | val sqlEscapedValue: String = dataType match { 50 | case StringType => s""""${value.toString.replace("'", "\\'\\'")}"""" 51 | case DateType => s""""${value.asInstanceOf[Date]}"""" 52 | case TimestampType => s""""${value.asInstanceOf[Timestamp]}"""" 53 | case _ => value.toString 54 | } 55 | s"s."+s""""$attr""""+s" $comparisonOp $sqlEscapedValue" 56 | } 57 | } 58 | 59 | filter match { 60 | case EqualTo(attr, value) => buildComparison(attr, value, "=") 61 | case LessThan(attr, value) => buildComparison(attr, value, "<") 62 | case GreaterThan(attr, value) => buildComparison(attr, value, ">") 63 | case LessThanOrEqual(attr, value) => buildComparison(attr, value, "<=") 64 | case GreaterThanOrEqual(attr, value) => buildComparison(attr, value, ">=") 65 | case _ => None 66 | } 67 | } 68 | 69 | /** 70 | * Use the given schema to look up the attribute's data type. Returns None if the attribute could 71 | * not be resolved. 
72 | */ 73 | private def getTypeForAttribute(schema: StructType, attribute: String): Option[DataType] = { 74 | if (schema.fieldNames.contains(attribute)) { 75 | Some(schema(attribute).dataType) 76 | } else { 77 | None 78 | } 79 | } 80 | 81 | def queryFromSchema(schema: StructType, filters: Array[Filter]): String = { 82 | var columnList = schema.fields.map(x => s"s."+s""""${x.name}"""").mkString(",") 83 | if (columnList.length == 0) { 84 | columnList = "*" 85 | } 86 | val whereClause = buildWhereClause(schema, filters) 87 | if (whereClause.length == 0) { 88 | s"select $columnList from S3Object s" 89 | } else { 90 | s"select $columnList from S3Object s $whereClause" 91 | } 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/Select.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package io.minio.spark.select 17 | 18 | import org.apache.hadoop.conf.Configuration 19 | 20 | // Select API 21 | import com.amazonaws.services.s3.model.JSONInput 22 | import com.amazonaws.services.s3.model.JSONType 23 | import com.amazonaws.services.s3.model.CSVInput 24 | import com.amazonaws.services.s3.model.CSVOutput 25 | import com.amazonaws.services.s3.model.ParquetInput 26 | import com.amazonaws.services.s3.model.CompressionType 27 | import com.amazonaws.services.s3.model.ExpressionType 28 | import com.amazonaws.services.s3.model.SSECustomerKey 29 | import com.amazonaws.services.s3.model.InputSerialization 30 | import com.amazonaws.services.s3.model.OutputSerialization 31 | import com.amazonaws.services.s3.model.SelectObjectContentRequest 32 | import com.amazonaws.services.s3.model.SelectObjectContentResult 33 | import com.amazonaws.services.s3.model.SelectObjectContentEvent 34 | import com.amazonaws.services.s3.model.SelectObjectContentEvent.RecordsEvent 35 | import com.amazonaws.services.s3.model.FileHeaderInfo 36 | 37 | import org.apache.spark.sql.types._ 38 | import org.apache.spark.sql.sources._ 39 | 40 | private[spark] object Select { 41 | private val SERVER_ENCRYPTION_ALGORITHM = s"fs.s3a.server-side-encryption-algorithm" 42 | private val SERVER_ENCRYPTION_KEY = s"fs.s3a.server-side-encryption.key" 43 | 44 | private def compressionType(params: Map[String, String]): CompressionType = { 45 | params.getOrElse("compression", "none") match { 46 | case "none" => CompressionType.NONE 47 | case "gzip" => CompressionType.GZIP 48 | case "bzip2" => CompressionType.BZIP2 49 | } 50 | } 51 | 52 | private def jsonType(params: Map[String, String]): JSONType = { 53 | params.getOrElse("multiline", "false") match { 54 | case "false" => JSONType.LINES 55 | case "true" => JSONType.DOCUMENT 56 | } 57 | } 58 | 59 | private def headerInfo(params: Map[String, String]): FileHeaderInfo = { 60 | params.getOrElse("header", "true") match { 61 | case "false" => 
FileHeaderInfo.NONE 62 | case "true" => FileHeaderInfo.USE 63 | } 64 | } 65 | 66 | private def sseCustomerKey(algo: String, key: String): SSECustomerKey = { 67 | algo match { 68 | case "SSE-C" => 69 | if (key != null) { 70 | new SSECustomerKey(key) 71 | } else { 72 | null 73 | } 74 | case other => 75 | throw new IllegalArgumentException(s"Unrecognized algorithm $algo; expected SSE-C") 76 | } 77 | } 78 | 79 | def requestParquet(bucket: String, key: String, params: Map[String, String], 80 | schema: StructType, filters: Array[Filter], 81 | hadoopConfiguration: Configuration): SelectObjectContentRequest = { 82 | 83 | new SelectObjectContentRequest() { request => 84 | request.setBucketName(bucket) 85 | request.setKey(key) 86 | request.setExpression(FilterPushdown.queryFromSchema(schema, filters)) 87 | request.setExpressionType(ExpressionType.SQL) 88 | val algo = hadoopConfiguration.get(SERVER_ENCRYPTION_ALGORITHM, null) 89 | if (algo != null) { 90 | request.withSSECustomerKey(sseCustomerKey(algo, 91 | hadoopConfiguration.get(SERVER_ENCRYPTION_KEY, null))) 92 | } 93 | 94 | val inputSerialization = new InputSerialization() 95 | val parquetInput = new ParquetInput() 96 | inputSerialization.setParquet(parquetInput) 97 | request.setInputSerialization(inputSerialization) 98 | 99 | val outputSerialization = new OutputSerialization() 100 | val csvOutput = new CSVOutput() 101 | outputSerialization.setCsv(csvOutput) 102 | request.setOutputSerialization(outputSerialization) 103 | } 104 | } 105 | 106 | def requestJSON(bucket: String, key: String, params: Map[String, String], 107 | schema: StructType, filters: Array[Filter], 108 | hadoopConfiguration: Configuration): SelectObjectContentRequest = { 109 | 110 | new SelectObjectContentRequest() { request => 111 | request.setBucketName(bucket) 112 | request.setKey(key) 113 | request.setExpression(FilterPushdown.queryFromSchema(schema, filters)) 114 | request.setExpressionType(ExpressionType.SQL) 115 | val algo = hadoopConfiguration.get(SERVER_ENCRYPTION_ALGORITHM, null) 116 | if (algo != null) { 117 | request.withSSECustomerKey(sseCustomerKey(algo, 118 | hadoopConfiguration.get(SERVER_ENCRYPTION_KEY, null))) 119 | } 120 | 121 | val inputSerialization = new InputSerialization() 122 | val jsonInput = new JSONInput() 123 | jsonInput.withType(jsonType(params)) 124 | inputSerialization.setJson(jsonInput) 125 | inputSerialization.setCompressionType(compressionType(params)) 126 | request.setInputSerialization(inputSerialization) 127 | 128 | val outputSerialization = new OutputSerialization() 129 | val csvOutput = new CSVOutput() 130 | outputSerialization.setCsv(csvOutput) 131 | request.setOutputSerialization(outputSerialization) 132 | } 133 | } 134 | 135 | 136 | def requestCSV(bucket: String, key: String, params: Map[String, String], 137 | schema: StructType, filters: Array[Filter], 138 | hadoopConfiguration: Configuration): SelectObjectContentRequest = { 139 | new SelectObjectContentRequest() { request => 140 | request.setBucketName(bucket) 141 | request.setKey(key) 142 | request.setExpression(FilterPushdown.queryFromSchema(schema, filters)) 143 | request.setExpressionType(ExpressionType.SQL) 144 | val algo = hadoopConfiguration.get(SERVER_ENCRYPTION_ALGORITHM, null) 145 | if (algo != null) { 146 | request.withSSECustomerKey(sseCustomerKey(algo, 147 | hadoopConfiguration.get(SERVER_ENCRYPTION_KEY, null))) 148 | } 149 | 150 | val inputSerialization = new InputSerialization() 151 | val csvInput = new CSVInput() 152 | csvInput.withFileHeaderInfo(headerInfo(params)) 153 
| csvInput.withRecordDelimiter('\n') 154 | csvInput.withQuoteCharacter(params.getOrElse(s"quote", "\"")) 155 | csvInput.withQuoteEscapeCharacter(params.getOrElse(s"escape", "\"")) 156 | csvInput.withComments(params.getOrElse(s"comment", "#")) 157 | csvInput.withFieldDelimiter(params.getOrElse(s"delimiter", ",")) 158 | inputSerialization.setCsv(csvInput) 159 | inputSerialization.setCompressionType(compressionType(params)) 160 | request.setInputSerialization(inputSerialization) 161 | 162 | val outputSerialization = new OutputSerialization() 163 | val csvOutput = new CSVOutput() 164 | csvOutput.withRecordDelimiter('\n') 165 | csvOutput.withFieldDelimiter(params.getOrElse("delimiter", ",")) 166 | outputSerialization.setCsv(csvOutput) 167 | request.setOutputSerialization(outputSerialization) 168 | } 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/SelectCSVRelation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package io.minio.spark.select 17 | 18 | import scala.collection.JavaConversions.asScalaBuffer 19 | import scala.collection.JavaConverters._ 20 | 21 | import scala.util.control.NonFatal 22 | 23 | import org.slf4j.LoggerFactory 24 | 25 | import java.io.InputStreamReader 26 | import java.io.BufferedReader 27 | 28 | // Import all utilities 29 | import io.minio.spark.select.util._ 30 | 31 | // Apache commons 32 | import org.apache.commons.csv._ 33 | 34 | // For AmazonS3 client 35 | import com.amazonaws.services.s3.AmazonS3 36 | import com.amazonaws.services.s3.AmazonS3URI 37 | import com.amazonaws.services.s3.AmazonS3ClientBuilder 38 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration 39 | 40 | import com.amazonaws.services.s3.model.ListObjectsV2Request 41 | import com.amazonaws.services.s3.model.ListObjectsV2Result 42 | import com.amazonaws.services.s3.model.S3ObjectSummary 43 | 44 | import org.apache.spark.rdd.RDD 45 | import org.apache.spark.sql.types._ 46 | import org.apache.spark.sql.sources._ 47 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 48 | import org.apache.spark.sql.{DataFrame, Row, SQLContext} 49 | 50 | import scala.collection.mutable.{ListBuffer, ArrayBuffer} 51 | 52 | /** 53 | * Abstract relation class to download data from S3 compatible storage 54 | */ 55 | case class SelectCSVRelation protected[spark] ( 56 | location: Option[String], 57 | params: Map[String, String], 58 | userSchema: StructType = null)(@transient val sqlContext: SQLContext) 59 | extends BaseRelation 60 | with TableScan 61 | with PrunedScan 62 | with PrunedFilteredScan { 63 | 64 | private val API_PATH_STYLE_ACCESS = s"fs.s3a.path.style.access" 65 | private val SERVER_ENDPOINT = s"fs.s3a.endpoint" 66 | private val SERVER_REGION = s"fs.s3a.region" 67 | 68 | private val hadoopConfiguration = 
sqlContext.sparkContext.hadoopConfiguration 69 | private val pathStyleAccess = hadoopConfiguration.get(API_PATH_STYLE_ACCESS, "false") == "true" 70 | private val endpoint = hadoopConfiguration.get(SERVER_ENDPOINT, "https://s3.amazonaws.com") 71 | private val region = hadoopConfiguration.get(SERVER_REGION, "us-east-1") 72 | private val s3Client = 73 | AmazonS3ClientBuilder.standard() 74 | .withCredentials(Credentials.load(location, hadoopConfiguration)) 75 | .withPathStyleAccessEnabled(pathStyleAccess) 76 | .withEndpointConfiguration(new EndpointConfiguration(endpoint, region)) 77 | .build() 78 | 79 | private val logger = LoggerFactory.getLogger(SelectCSVRelation.getClass) 80 | 81 | override lazy val schema: StructType = Option(userSchema).getOrElse({ 82 | // With no schema we return error. 83 | throw new RuntimeException(s"Schema cannot be empty") 84 | }) 85 | 86 | private def getRows(prunedSchema: StructType, filters: Array[Filter]): Seq[Row] = { 87 | var records = new ListBuffer[Row] 88 | var req = new ListObjectsV2Request() 89 | var result = new ListObjectsV2Result() 90 | var s3URI = S3URI.toAmazonS3URI(location.getOrElse("")) 91 | 92 | req.withBucketName(s3URI.getBucket()) 93 | req.withPrefix(s3URI.getKey().stripSuffix("*")) 94 | req.withMaxKeys(1000) 95 | 96 | val csvFormat = CSVFormat.DEFAULT 97 | .withHeader(prunedSchema.fields.map(x => x.name): _*) 98 | .withRecordSeparator("\n") 99 | .withDelimiter(params.getOrElse("delimiter", ",").charAt(0)) 100 | .withQuote(params.getOrElse("quote", "\"").charAt(0)) 101 | .withEscape(params.getOrElse(s"escape", "\"").charAt(0)) 102 | .withCommentMarker(params.getOrElse(s"comment", "#").charAt(0)) 103 | 104 | do { 105 | result = s3Client.listObjectsV2(req) 106 | asScalaBuffer(result.getObjectSummaries()).foreach(objectSummary => { 107 | val in = s3Client.selectObjectContent( 108 | Select.requestCSV( 109 | objectSummary.getBucketName(), 110 | objectSummary.getKey(), 111 | params, 112 | prunedSchema, 113 | filters, 114 | hadoopConfiguration) 115 | ).getPayload().getRecordsInputStream() 116 | var parser = CSVParser.parse(in, java.nio.charset.Charset.forName("UTF-8"), csvFormat) 117 | try { 118 | for (record <- parser.asScala) { 119 | records += Row.fromSeq(prunedSchema.fields.map(x => { 120 | TypeCast.castTo(record.get(x.name), x.dataType, x.nullable) 121 | })) 122 | } 123 | } catch { 124 | case NonFatal(e) => { 125 | logger.error(s"Exception while parsing ", e) 126 | } 127 | } 128 | parser.close() 129 | }) 130 | req.setContinuationToken(result.getNextContinuationToken()) 131 | } while (result.isTruncated()) 132 | records.toList 133 | } 134 | 135 | override def toString: String = s"SelectCSVRelation()" 136 | 137 | private def tokenRDD(schema: StructType, filters: Array[Filter]): RDD[Row] = { 138 | sqlContext.sparkContext.makeRDD(getRows(schema, filters)) 139 | } 140 | 141 | override def buildScan(): RDD[Row] = { 142 | tokenRDD(schema, null) 143 | } 144 | 145 | override def buildScan(columns: Array[String]): RDD[Row] = { 146 | tokenRDD(pruneSchema(schema, columns), null) 147 | } 148 | 149 | override def buildScan(columns: Array[String], filters: Array[Filter]): RDD[Row] = { 150 | tokenRDD(pruneSchema(schema, columns), filters) 151 | } 152 | 153 | private def pruneSchema(schema: StructType, columns: Array[String]): StructType = { 154 | val fieldMap = Map(schema.fields.map(x => x.name -> x): _*) 155 | new StructType(columns.map(name => fieldMap(name))) 156 | } 157 | } 158 | 
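The relation above is where column pruning and filter pushdown meet: `buildScan(columns, filters)` prunes the user schema down to the requested columns and passes the remaining filters to `FilterPushdown.queryFromSchema`, which builds the S3 Select SQL placed in each `SelectObjectContentRequest`. A rough sketch of the expression produced for the README example; note that `FilterPushdown` is `private[spark]`, so this is illustrative rather than a public API:

```scala
import org.apache.spark.sql.sources.{Filter, GreaterThan}
import org.apache.spark.sql.types._

// The same two-column schema used in the examples.
val schema = StructType(List(
  StructField("name", StringType, true),
  StructField("age", IntegerType, false)))

// Spark hands df.filter("age > 19") to the relation as GreaterThan("age", 19).
val query = FilterPushdown.queryFromSchema(schema, Array[Filter](GreaterThan("age", 19)))
// query == """select s."name",s."age" from S3Object s WHERE s."age" > 19"""
```

Filters that cannot be converted (for example casts or column-to-column comparisons) make `buildFilterExpression` return `None`; they are dropped from the WHERE clause and left for Spark to evaluate on the returned rows.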
-------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/SelectCSVSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package io.minio.spark.select 17 | 18 | // Java standard libraries 19 | import java.io.File 20 | 21 | // Spark internal libraries 22 | import org.apache.spark.sql.SQLContext 23 | import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} 24 | import org.apache.spark.sql.types.StructType 25 | 26 | import org.apache.spark.sql.sources.DataSourceRegister 27 | 28 | class SelectCSVSource 29 | extends SchemaRelationProvider 30 | with DataSourceRegister { 31 | 32 | private def checkPath(parameters: Map[String, String]): String = { 33 | parameters.getOrElse("path", sys.error("'path' must be specified for CSV data.")) 34 | } 35 | 36 | /** 37 | * Short alias for spark-select data source. 38 | */ 39 | override def shortName(): String = "minioSelectCSV" 40 | 41 | override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectCSVRelation = { 42 | val path = checkPath(params) 43 | SelectCSVRelation(Some(path), params, schema)(sqlContext) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/SelectJSONRelation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package io.minio.spark.select 17 | 18 | import scala.collection.JavaConversions.asScalaBuffer 19 | 20 | import java.io.InputStreamReader 21 | import java.io.BufferedReader 22 | 23 | // Import all utilities 24 | import io.minio.spark.select.util._ 25 | 26 | // Apache commons 27 | import org.apache.commons.csv.{CSVFormat, QuoteMode} 28 | 29 | // For AmazonS3 client 30 | import com.amazonaws.services.s3.AmazonS3 31 | import com.amazonaws.services.s3.AmazonS3URI 32 | import com.amazonaws.services.s3.AmazonS3ClientBuilder 33 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration 34 | 35 | import com.amazonaws.services.s3.model.ListObjectsV2Request 36 | import com.amazonaws.services.s3.model.ListObjectsV2Result 37 | import com.amazonaws.services.s3.model.S3ObjectSummary 38 | 39 | import org.apache.spark.rdd.RDD 40 | import org.apache.spark.sql.types._ 41 | import org.apache.spark.sql.sources._ 42 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 43 | import org.apache.spark.sql.{DataFrame, Row, SQLContext} 44 | 45 | import scala.collection.mutable.{ListBuffer, ArrayBuffer} 46 | 47 | /** 48 | * Abstract relation class to download data from S3 compatible storage 49 | */ 50 | case class SelectJSONRelation protected[spark] ( 51 | location: Option[String], 52 | params: Map[String, String], 53 | userSchema: StructType = null)(@transient val sqlContext: SQLContext) 54 | extends BaseRelation 55 | with TableScan 56 | with PrunedScan 57 | with PrunedFilteredScan { 58 | 59 | private val API_PATH_STYLE_ACCESS = s"fs.s3a.path.style.access" 60 | private val SERVER_ENDPOINT = s"fs.s3a.endpoint" 61 | private val SERVER_REGION = s"fs.s3a.region" 62 | 63 | private val hadoopConfiguration = sqlContext.sparkContext.hadoopConfiguration 64 | private val pathStyleAccess = hadoopConfiguration.get(API_PATH_STYLE_ACCESS, "false") == "true" 65 | private val endpoint = hadoopConfiguration.get(SERVER_ENDPOINT, "https://s3.amazonaws.com") 66 | private val region = hadoopConfiguration.get(SERVER_REGION, "us-east-1") 67 | private val s3Client = 68 | AmazonS3ClientBuilder.standard() 69 | .withCredentials(Credentials.load(location, hadoopConfiguration)) 70 | .withPathStyleAccessEnabled(pathStyleAccess) 71 | .withEndpointConfiguration(new EndpointConfiguration(endpoint, region)) 72 | .build() 73 | 74 | override lazy val schema: StructType = Option(userSchema).getOrElse({ 75 | // With no schema we return error. 
76 | throw new RuntimeException(s"Schema cannot be empty") 77 | }) 78 | 79 | 80 | private def getRows(schema: StructType, filters: Array[Filter]): Seq[Row] = { 81 | var records = new ListBuffer[Row] 82 | var req = new ListObjectsV2Request() 83 | var result = new ListObjectsV2Result() 84 | var s3URI = S3URI.toAmazonS3URI(location.getOrElse("")) 85 | 86 | req.withBucketName(s3URI.getBucket()) 87 | req.withPrefix(s3URI.getKey().stripSuffix("*")) 88 | req.withMaxKeys(1000) 89 | 90 | do { 91 | result = s3Client.listObjectsV2(req) 92 | asScalaBuffer(result.getObjectSummaries()).foreach(objectSummary => { 93 | val br = new BufferedReader(new InputStreamReader( 94 | s3Client.selectObjectContent( 95 | Select.requestJSON( 96 | objectSummary.getBucketName(), 97 | objectSummary.getKey(), 98 | params, 99 | schema, 100 | filters, 101 | hadoopConfiguration) 102 | ).getPayload().getRecordsInputStream())) 103 | var line : String = null 104 | while ( {line = br.readLine(); line != null}) { 105 | var row = new Array[Any](schema.fields.length) 106 | var rowValues = line.split(",") 107 | var index = 0 108 | while (index < rowValues.length) { 109 | val field = schema.fields(index) 110 | row(index) = TypeCast.castTo(rowValues(index), field.dataType, 111 | field.nullable) 112 | index += 1 113 | } 114 | records += Row.fromSeq(row) 115 | } 116 | br.close() 117 | }) 118 | req.setContinuationToken(result.getNextContinuationToken()) 119 | } while (result.isTruncated()) 120 | records.toList 121 | } 122 | 123 | override def toString: String = s"SelectJSONRelation()" 124 | 125 | private def tokenRDD(schema: StructType, filters: Array[Filter]): RDD[Row] = { 126 | sqlContext.sparkContext.makeRDD(getRows(schema, filters)) 127 | } 128 | 129 | override def buildScan(): RDD[Row] = { 130 | tokenRDD(schema, null) 131 | } 132 | 133 | override def buildScan(columns: Array[String]): RDD[Row] = { 134 | tokenRDD(pruneSchema(schema, columns), null) 135 | } 136 | 137 | override def buildScan(columns: Array[String], filters: Array[Filter]): RDD[Row] = { 138 | tokenRDD(pruneSchema(schema, columns), filters) 139 | } 140 | 141 | private def pruneSchema(schema: StructType, columns: Array[String]): StructType = { 142 | val fieldMap = Map(schema.fields.map(x => x.name -> x): _*) 143 | new StructType(columns.map(name => fieldMap(name))) 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/SelectJSONSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package io.minio.spark.select 17 | 18 | // Java standard libraries 19 | import java.io.File 20 | 21 | // Spark internal libraries 22 | import org.apache.spark.sql.SQLContext 23 | import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} 24 | import org.apache.spark.sql.types.StructType 25 | 26 | import org.apache.spark.sql.sources.DataSourceRegister 27 | 28 | class SelectJSONSource 29 | extends SchemaRelationProvider 30 | with DataSourceRegister { 31 | 32 | private def checkPath(parameters: Map[String, String]): String = { 33 | parameters.getOrElse("path", sys.error("'path' must be specified for JSON data.")) 34 | } 35 | 36 | /** 37 | * Short alias for spark-select data source. 38 | */ 39 | override def shortName(): String = "minioSelectJSON" 40 | 41 | override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectJSONRelation = { 42 | val path = checkPath(params) 43 | SelectJSONRelation(Some(path), params, schema)(sqlContext) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/SelectParquetRelation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package io.minio.spark.select 17 | 18 | import scala.collection.JavaConversions.asScalaBuffer 19 | 20 | import java.io.InputStreamReader 21 | import java.io.BufferedReader 22 | 23 | // Import all utilities 24 | import io.minio.spark.select.util._ 25 | 26 | // For AmazonS3 client 27 | import com.amazonaws.services.s3.AmazonS3 28 | import com.amazonaws.services.s3.AmazonS3URI 29 | import com.amazonaws.services.s3.AmazonS3ClientBuilder 30 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration 31 | 32 | import com.amazonaws.services.s3.model.ListObjectsV2Request 33 | import com.amazonaws.services.s3.model.ListObjectsV2Result 34 | import com.amazonaws.services.s3.model.S3ObjectSummary 35 | 36 | import org.apache.spark.rdd.RDD 37 | import org.apache.spark.sql.types._ 38 | import org.apache.spark.sql.sources._ 39 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 40 | import org.apache.spark.sql.{DataFrame, Row, SQLContext} 41 | 42 | import scala.collection.mutable.{ListBuffer, ArrayBuffer} 43 | 44 | /** 45 | * Abstract relation class to download data from S3 compatible storage 46 | */ 47 | case class SelectParquetRelation protected[spark] ( 48 | location: Option[String], 49 | params: Map[String, String], 50 | userSchema: StructType = null)(@transient val sqlContext: SQLContext) 51 | extends BaseRelation 52 | with TableScan 53 | with PrunedScan 54 | with PrunedFilteredScan { 55 | 56 | private val API_PATH_STYLE_ACCESS = s"fs.s3a.path.style.access" 57 | private val SERVER_ENDPOINT = s"fs.s3a.endpoint" 58 | private val SERVER_REGION = s"fs.s3a.region" 59 | 60 | private val hadoopConfiguration = sqlContext.sparkContext.hadoopConfiguration 61 | private val pathStyleAccess = hadoopConfiguration.get(API_PATH_STYLE_ACCESS, "false") == "true" 62 | private val endpoint = hadoopConfiguration.get(SERVER_ENDPOINT, "https://s3.amazonaws.com") 63 | private val region = hadoopConfiguration.get(SERVER_REGION, "us-east-1") 64 | private val s3Client = 65 | AmazonS3ClientBuilder.standard() 66 | .withCredentials(Credentials.load(location, hadoopConfiguration)) 67 | .withPathStyleAccessEnabled(pathStyleAccess) 68 | .withEndpointConfiguration(new EndpointConfiguration(endpoint, region)) 69 | .build() 70 | 71 | override lazy val schema: StructType = Option(userSchema).getOrElse({ 72 | // With no schema we return error. 
73 | throw new RuntimeException(s"Schema cannot be empty") 74 | }) 75 | 76 | private def getRows(schema: StructType, filters: Array[Filter]): Seq[Row] = { 77 | var records = new ListBuffer[Row] 78 | var req = new ListObjectsV2Request() 79 | var result = new ListObjectsV2Result() 80 | var s3URI = S3URI.toAmazonS3URI(location.getOrElse("")) 81 | 82 | req.withBucketName(s3URI.getBucket()) 83 | req.withPrefix(s3URI.getKey().stripSuffix("*")) 84 | req.withMaxKeys(1000) 85 | 86 | do { 87 | result = s3Client.listObjectsV2(req) 88 | asScalaBuffer(result.getObjectSummaries()).foreach(objectSummary => { 89 | val br = new BufferedReader(new InputStreamReader( 90 | s3Client.selectObjectContent( 91 | Select.requestParquet( 92 | objectSummary.getBucketName(), 93 | objectSummary.getKey(), 94 | params, 95 | schema, 96 | filters, 97 | hadoopConfiguration) 98 | ).getPayload().getRecordsInputStream())) 99 | var line : String = null 100 | while ( {line = br.readLine(); line != null}) { 101 | var row = new Array[Any](schema.fields.length) 102 | var rowValues = line.split(",") 103 | var index = 0 104 | while (index < rowValues.length) { 105 | val field = schema.fields(index) 106 | row(index) = TypeCast.castTo(rowValues(index), field.dataType, 107 | field.nullable) 108 | index += 1 109 | } 110 | records += Row.fromSeq(row) 111 | } 112 | br.close() 113 | }) 114 | req.setContinuationToken(result.getNextContinuationToken()) 115 | } while (result.isTruncated()) 116 | records.toList 117 | } 118 | 119 | override def toString: String = s"SelectParquetRelation()" 120 | 121 | private def tokenRDD(schema: StructType, filters: Array[Filter]): RDD[Row] = { 122 | sqlContext.sparkContext.makeRDD(getRows(schema, filters)) 123 | } 124 | 125 | override def buildScan(): RDD[Row] = { 126 | tokenRDD(schema, null) 127 | } 128 | 129 | override def buildScan(columns: Array[String]): RDD[Row] = { 130 | tokenRDD(pruneSchema(schema, columns), null) 131 | } 132 | 133 | override def buildScan(columns: Array[String], filters: Array[Filter]): RDD[Row] = { 134 | tokenRDD(pruneSchema(schema, columns), filters) 135 | } 136 | 137 | private def pruneSchema(schema: StructType, columns: Array[String]): StructType = { 138 | val fieldMap = Map(schema.fields.map(x => x.name -> x): _*) 139 | new StructType(columns.map(name => fieldMap(name))) 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/SelectParquetSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package io.minio.spark.select 17 | 18 | // Java standard libraries 19 | import java.io.File 20 | 21 | // Spark internal libraries 22 | import org.apache.spark.sql.SQLContext 23 | import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} 24 | import org.apache.spark.sql.types.StructType 25 | 26 | import org.apache.spark.sql.sources.DataSourceRegister 27 | 28 | class SelectParquetSource 29 | extends SchemaRelationProvider 30 | with DataSourceRegister { 31 | 32 | private def checkPath(parameters: Map[String, String]): String = { 33 | parameters.getOrElse("path", sys.error("'path' must be specified for Parquet data.")) 34 | } 35 | 36 | /** 37 | * Short alias for spark-select data source. 38 | */ 39 | override def shortName(): String = "minioSelectParquet" 40 | 41 | override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectParquetRelation = { 42 | val path = checkPath(params) 43 | SelectParquetRelation(Some(path), params, schema)(sqlContext) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/util/S3URI.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package io.minio.spark.select.util 17 | 18 | import java.net.URI 19 | import com.amazonaws.services.s3.AmazonS3URI 20 | 21 | object S3URI { 22 | private[select] def toAmazonS3URI( 23 | location: String): AmazonS3URI = { 24 | val uri = new URI(location) 25 | val uriScheme = uri.getScheme 26 | uriScheme match { 27 | case "s3" => 28 | new AmazonS3URI(uri) 29 | case "s3a" | "s3n" => 30 | new AmazonS3URI(new URI("s3", uri.getUserInfo, uri.getHost, uri.getPort, uri.getPath, 31 | uri.getQuery, uri.getFragment)) 32 | case other => 33 | throw new IllegalArgumentException(s"Unrecognized scheme $other; expected s3, or s3a or s3n") 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/util/TypeCast.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */
16 | package io.minio.spark.select.util
17 |
18 | import java.math.BigDecimal
19 | import java.sql.{Date, Timestamp}
20 | import java.text.{SimpleDateFormat, NumberFormat}
21 | import java.util.Locale
22 |
23 | import org.apache.spark.sql.types._
24 |
25 | import scala.util.Try
26 |
27 | /**
28 | * Utility functions for type casting
29 | */
30 | object TypeCast {
31 |
32 | /**
33 | * Casts the given string datum to the specified type.
34 | * Currently we do not support complex types (ArrayType, MapType, StructType).
35 | *
36 | * For string types, this is simply the datum.
37 | * For other nullable types, this is null if the string datum is empty; otherwise the datum is parsed into the target type.
38 | *
39 | * @param datum string value
40 | * @param castType SparkSQL type
41 | */
42 | private[select] def castTo(
43 | datum: String,
44 | castType: DataType,
45 | nullable: Boolean = true,
46 | treatEmptyValuesAsNulls: Boolean = false,
47 | nullValue: String = "",
48 | dateFormatter: SimpleDateFormat = null): Any = {
49 | if ((datum == nullValue &&
50 | nullable) ||
51 | (treatEmptyValuesAsNulls && datum == "")) {
52 | null
53 | } else {
54 | castType match {
55 | case _: ByteType => datum.toByte
56 | case _: ShortType => datum.toShort
57 | case _: IntegerType => datum.toInt
58 | case _: LongType => datum.toLong
59 | case _: FloatType => Try(datum.toFloat)
60 | .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).floatValue())
61 | case _: DoubleType => Try(datum.toDouble)
62 | .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).doubleValue())
63 | case _: BooleanType => datum.toBoolean
64 | case _: DecimalType => new BigDecimal(datum.replaceAll(",", ""))
65 | case _: TimestampType if dateFormatter != null =>
66 | new Timestamp(dateFormatter.parse(datum).getTime)
67 | case _: TimestampType => Timestamp.valueOf(datum)
68 | case _: DateType if dateFormatter != null =>
69 | new Date(dateFormatter.parse(datum).getTime)
70 | case _: DateType => Date.valueOf(datum)
71 | case _: StringType => datum
72 | case _ => throw new RuntimeException(s"Unsupported type: ${castType.typeName}")
73 | }
74 | }
75 | }
76 |
77 | /**
78 | * Helper method that converts the string representation of a character to the actual character.
79 | * It handles some Java escape sequences and throws an exception if the given string is longer than one
80 | * character.
81 | * 82 | */ 83 | @throws[IllegalArgumentException] 84 | private[select] def toChar(str: String): Char = { 85 | if (str.charAt(0) == '\\') { 86 | str.charAt(1) 87 | match { 88 | case 't' => '\t' 89 | case 'r' => '\r' 90 | case 'b' => '\b' 91 | case 'f' => '\f' 92 | case '\"' => '\"' // In case user changes quote char and uses \" as delimiter in options 93 | case '\'' => '\'' 94 | case 'u' if str == """\u0000""" => '\u0000' 95 | case _ => 96 | throw new IllegalArgumentException(s"Unsupported special character for delimiter: $str") 97 | } 98 | } else if (str.length == 1) { 99 | str.charAt(0) 100 | } else { 101 | throw new IllegalArgumentException(s"Delimiter cannot be more than one character: $str") 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/test/scala/io/minio/spark/select/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minio/spark-select/d8721b46e84639690b7382d4a4e0877996e64b3d/src/test/scala/io/minio/spark/select/.keep -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "2.1" 2 | --------------------------------------------------------------------------------
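The relations above lean on the two package-private helpers in util. The sketch below (not part of the repository; the UtilSketch object and all values are illustrative assumptions) shows their behaviour under the defaults seen in the source. It is declared in the io.minio.spark.select package because castTo, toChar and toAmazonS3URI are private[select].

// Hypothetical sketch exercising S3URI and TypeCast; not shipped with the project.
package io.minio.spark.select

import java.text.SimpleDateFormat

import org.apache.spark.sql.types.{DateType, IntegerType, StringType}

import io.minio.spark.select.util.{S3URI, TypeCast}

object UtilSketch {
  def main(args: Array[String]): Unit = {
    // s3a:// and s3n:// locations are normalized to the s3:// form before parsing.
    val uri = S3URI.toAmazonS3URI("s3a://example-bucket/data/people.csv")
    println(uri.getBucket) // example-bucket
    println(uri.getKey)    // data/people.csv

    // Non-empty datums are parsed into the target type; an empty datum maps to null
    // for nullable fields because nullValue defaults to "".
    println(TypeCast.castTo("42", IntegerType))               // 42
    println(TypeCast.castTo("", StringType, nullable = true)) // null
    println(TypeCast.castTo("2019-01-31", DateType,
      dateFormatter = new SimpleDateFormat("yyyy-MM-dd")))    // 2019-01-31

    // Escaped delimiter strings resolve to single characters.
    println(TypeCast.toChar("\\t") == '\t')                   // true
  }
}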