├── .gitignore
├── LICENSE
├── README.md
├── build.sbt
├── examples
│   ├── csv.py
│   ├── csv.scala
│   ├── json.py
│   ├── json.scala
│   ├── parquet.py
│   ├── parquet.scala
│   ├── people.csv
│   ├── people.json
│   └── people.parquet
├── project
│   ├── assembly.sbt
│   ├── build.properties
│   └── plugins.sbt
├── scalastyle-config.xml
├── src
│   ├── main
│   │   ├── resources
│   │   │   └── META-INF
│   │   │       └── services
│   │   │           └── org.apache.spark.sql.sources.DataSourceRegister
│   │   └── scala
│   │       └── io
│   │           └── minio
│   │               └── spark
│   │                   └── select
│   │                       ├── Credentials.scala
│   │                       ├── FilterPushdown.scala
│   │                       ├── Select.scala
│   │                       ├── SelectCSVRelation.scala
│   │                       ├── SelectCSVSource.scala
│   │                       ├── SelectJSONRelation.scala
│   │                       ├── SelectJSONSource.scala
│   │                       ├── SelectParquetRelation.scala
│   │                       ├── SelectParquetSource.scala
│   │                       └── util
│   │                           ├── S3URI.scala
│   │                           └── TypeCast.scala
│   └── test
│       └── scala
│           └── io
│               └── minio
│                   └── spark
│                       └── select
│                           └── .keep
└── version.sbt
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 |
4 | # sbt specific
5 | .cache
6 | .history
7 | .lib/
8 | dist/*
9 | target/
10 | lib_managed/
11 | src_managed/
12 | project/boot/
13 | project/plugins/project/
14 |
15 | # Scala-IDE specific
16 | .scala_dependencies
17 | .worksheet
18 |
19 | .idea/workspace.xml
20 | logs
21 | project/project
22 | project/target
23 | target
24 | tmp
25 | .history
26 | dist
27 | .idea/
28 | /*.iml
29 | /out
30 | /.idea_modules
31 | /.classpath
32 | /.project
33 | /RUNNING_PID
34 | /.settings
35 | projectFilesBackup/
36 | */target
37 | metastore_db/
38 | *.log
39 | *.class
40 | *.log
41 | *.zip
42 | *.gz
43 | *.jpeg
44 | *.jpg
45 | *.png
46 | *~
47 | .settings/
48 | .cache/
49 | .history/
50 | .lib/
51 | dist/*
52 | target/
53 | lib_managed/
54 | src_managed/
55 | project/boot/
56 | project/plugins/project/
57 | .project
58 | .classpath
59 | .cache
60 | .sbtserver.lock
61 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MinIO Spark Select
2 | MinIO Spark Select enables retrieving only the required data from an object using the S3 Select API.
3 |
4 | ## Requirements
5 | This library requires
6 | - Spark 2.3+
7 | - Scala 2.11+
8 |
9 | ## Features
10 | - S3 Select is supported for CSV, JSON and Parquet objects; use the `minioSelectCSV`, `minioSelectJSON` and `minioSelectParquet` format values to specify the data format.
11 | - S3 Select supports running the select across multiple objects that share a common prefix.
12 | - S3 Select supports querying SSE-C encrypted objects (see the configuration sketch below).
13 |
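The SSE-C algorithm and key are read from the Hadoop configuration of the active Spark session, using the same `fs.s3a.*` property names this library looks up. A minimal sketch; the key value is a placeholder, and only "SSE-C" is accepted as the algorithm:

```scala
// Minimal sketch: property names are the ones read by spark-select; the key value is a placeholder.
val hc = spark.sparkContext.hadoopConfiguration
hc.set("fs.s3a.server-side-encryption-algorithm", "SSE-C")
hc.set("fs.s3a.server-side-encryption.key", "<base64-encoded-customer-key>")
```
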
14 | ### Limitations
15 | - Spark CSV and JSON options such as nanValue, positiveInf, negativeInf, and options related to corrupt records (for example, failfast and dropmalformed mode) are not supported.
16 | - Using commas (,) within decimals is not supported. For example, 10,000 is not supported and 10000 is.
17 | - The following filters are not pushed down to MinIO (see the pushdown sketch after this list):
18 |   - Aggregate functions such as COUNT() and SUM().
19 |   - Filters that CAST() an attribute. For example, CAST(stringColumn as INT) = 1.
20 |   - Filters with an attribute that is an object or is complex. For example, intArray[1] = 1, objectColumn.objectNumber = 1.
21 |   - Filters for which the value is not a literal value. For example, intColumn1 = intColumn2.
22 | - Only Select [Supported Data Types](https://docs.aws.amazon.com/AmazonS3/latest/dev/s3-glacier-select-sql-reference-data-types.html) are supported with the documented limitations.
23 |
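Simple comparison filters (`=`, `<`, `>`, `<=`, `>=`) on plain columns *are* pushed down and become the `WHERE` clause of the generated S3 Select query. A sketch, assuming a DataFrame `df` loaded with the two-column (name, age) schema used in the examples below:

```scala
// Pushed down: the generated S3 Select query is roughly
//   select s."name",s."age" from S3Object s WHERE s."age" > 19
df.filter("age > 19").show()

// Not pushed down (CAST on an attribute): Spark applies this filter after reading all rows.
df.filter("CAST(age AS STRING) = '19'").show()
```
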
24 | ### HowTo
25 | Include this package in your Spark Applications using:
26 |
27 | #### *spark-shell*, *pyspark*, or *spark-submit*
28 | ```
29 | > $SPARK_HOME/bin/spark-shell --packages io.minio:spark-select_2.11:2.1
30 | ```
31 |
32 | #### *sbt*
33 | If you use the [sbt-spark-package plugin](http://github.com/databricks/sbt-spark-package), add the following to your sbt build file:
34 | ```
35 | spDependencies += "minio/spark-select:2.1"
36 | ```
37 | Otherwise,
38 | ```
39 | libraryDependencies += "io.minio" % "spark-select_2.11" % "2.1"
40 | ```
41 |
42 | #### *Maven*
43 | In your pom.xml, add:
44 | ```xml
45 |
46 | <dependencies>
47 |   <dependency>
48 |     <groupId>io.minio</groupId>
49 |     <artifactId>spark-select_2.11</artifactId>
50 |     <version>2.1</version>
51 |   </dependency>
52 | </dependencies>
53 | ```
54 |
55 | #### *Source*
56 |
57 | Set up all the required environment variables.
58 | > NOTE: It is assumed that you have already installed Hadoop 2.8.5 and Spark 2.3.1 locally.
59 | ```
60 | export HADOOP_HOME=${HOME}/spark/hadoop-2.8.5/
61 | export PATH=${PATH}:${HADOOP_HOME}/bin
62 | export SPARK_DIST_CLASSPATH=$(hadoop classpath)
63 |
64 | export SPARK_HOME=${HOME}/spark/spark-2.3.1-bin-without-hadoop/
65 | export PATH=${PATH}:${SPARK_HOME}/bin
66 | export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64/
67 |
68 | git clone https://github.com/minio/spark-select
69 | cd spark-select && sbt assembly
70 | spark-shell --jars target/scala-2.11/spark-select-assembly-2.1.jar
71 | ```
72 |
73 | Once `spark-shell` has been invoked successfully, load one of the examples:
74 | ```
75 | scala> :load examples/csv.scala
76 | Loading examples/csv.scala...
77 | import org.apache.spark.sql._
78 | import org.apache.spark.sql.types._
79 | schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,IntegerType,false))
80 | df: org.apache.spark.sql.DataFrame = [name: string, age: int]
81 | +-------+---+
82 | | name|age|
83 | +-------+---+
84 | |Michael| 31|
85 | | Andy| 30|
86 | | Justin| 19|
87 | +-------+---+
88 |
89 | scala>
90 | ```
91 |
92 | ### API
93 |
94 | #### *PySpark*
95 | ```py
96 | spark
97 | .read
98 | .format("minioSelectCSV") # "minioSelectJSON" for JSON or "minioSelectParquet" for Parquet
99 | .schema(...) # mandatory
100 | .options(...) # optional
101 | .load("s3://path/to/my/datafiles")
102 | ```
103 |
104 | #### *R*
105 | ```
106 | read.df("s3://path/to/my/datafiles", "minioSelectCSV", schema)
107 | ```
108 |
109 | #### *Scala*
110 | ```
111 | spark
112 | .read
113 | .format("minioSelectCSV") // "minioSelectJSON" for JSON or "minioSelectParquet" for Parquet
114 | .schema(...) // mandatory
115 | .options(...) // optional. Examples:
116 | // .options(Map("quote" -> "\'", "header" -> "true")) or
117 | // .option("quote", "\'").option("header", "true")
118 | .load("s3://path/to/my/datafiles")
119 | ```
120 |
121 | #### *SQL*
122 | ```
123 | CREATE TEMPORARY VIEW MyView (number INT, name STRING) USING minioSelectCSV OPTIONS (path "s3://path/to/my/datafiles")
124 | ```
125 |
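The connection to the object store is taken from the standard `fs.s3a.*` Hadoop settings (endpoint, credentials, path-style access). A minimal sketch for a local MinIO deployment; the endpoint and credentials below are placeholders:

```scala
// Minimal sketch: replace the endpoint and credentials with your own deployment's values.
val hc = spark.sparkContext.hadoopConfiguration
hc.set("fs.s3a.endpoint", "http://localhost:9000")
hc.set("fs.s3a.access.key", "minioadmin")
hc.set("fs.s3a.secret.key", "minioadmin")
hc.set("fs.s3a.path.style.access", "true")
```
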
126 | ### Options
127 | The following options are available when using `minioSelectCSV` and `minioSelectJSON`. If not specified, default values are used.
128 |
129 | #### *Options with minioSelectCSV*
130 | | Option | Default | Usage |
131 | |---|---|---|
132 | | `compression` | "none" | Whether the object is compressed. Supported values are "none", "gzip", and "bzip2".
133 | | `delimiter` | "," | Specifies the field delimiter.
134 | | `quote` | '"' | Specifies the quote character. Specifying an empty string is not supported and results in a malformed XML error.
135 | | `escape` | '"' | Specifies the quote escape character.
136 | | `header` | "true" | "false" specifies that there is no header. "true" specifies that a header is in the first line. Only headers in the first line are supported, and empty lines before a header are not supported.
137 | | `comment` | "#" | Specifies the comment character.
138 |
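For example, a sketch that combines the options above to read a gzip-compressed, pipe-delimited CSV with a header row (the bucket and object names are hypothetical, and `schema` is the StructType from the examples below):

```scala
val df = spark
  .read
  .format("minioSelectCSV")
  .schema(schema)
  .options(Map("compression" -> "gzip", "delimiter" -> "|", "header" -> "true"))
  .load("s3://testbucket/people.csv.gz")
```
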
139 | #### *Options with minioSelectJSON*
140 | | Option | Default | Usage |
141 | |---|---|---|
142 | | `compression` | "none" | Whether the object is compressed. Supported values are "none", "gzip", and "bzip2".
143 | | `multiline` | "false" | "false" specifies that the JSON is in Select LINES format, meaning that each line in the input data contains a single JSON object. "true" specifies that the JSON is in Select DOCUMENT format, meaning that a JSON object can span multiple lines in the input data.
144 |
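With the default `multiline=false` each line of the object must hold one complete JSON object, as in `examples/people.json`; with `multiline=true` a single JSON document may span multiple lines. A sketch for the latter, with a hypothetical object name:

```scala
val df = spark
  .read
  .format("minioSelectJSON")
  .schema(schema)
  .option("multiline", "true")
  .load("s3://testbucket/people-document.json")
```
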
145 | #### *Options with minioSelectParquet*
146 | There are no **options** needed with Parquet files.
147 |
148 | ### Full Examples
149 |
150 | #### *Scala*
151 |
152 | Schema with two columns for `CSV`.
153 | ```scala
154 | import org.apache.spark.sql._
155 | import org.apache.spark.sql.types._
156 |
157 | object app {
158 | def main(args: Array[String]) {
159 | val schema = StructType(
160 | List(
161 | StructField("name", StringType, true),
162 | StructField("age", IntegerType, false)
163 | )
164 | )
165 |
166 | val df = spark
167 | .read
168 | .format("minioSelectCSV")
169 | .schema(schema)
170 | .load("s3://sjm-airlines/people.csv")
171 |
172 | println(df.show())
173 |
174 | println(df.select("*").filter("age > 19").show())
175 |
176 | }
177 | }
178 | ```
179 |
180 | With custom schema for `JSON`.
181 | ```scala
182 | import org.apache.spark.sql._
183 | import org.apache.spark.sql.types._
184 |
185 | object app {
186 | def main(args: Array[String]) {
187 | val schema = StructType(
188 | List(
189 | StructField("name", StringType, true),
190 | StructField("age", IntegerType, false)
191 | )
192 | )
193 |
194 | val df = spark
195 | .read
196 | .format("minioSelectJSON")
197 | .schema(schema)
198 | .load("s3://sjm-airlines/people.json")
199 |
200 | println(df.show())
201 |
202 | println(df.select("*").filter("age > 19").show())
203 |
204 | }
205 | }
206 | ```
207 |
208 | With custom schema for `Parquet`.
209 | ```scala
210 | import org.apache.spark.sql._
211 | import org.apache.spark.sql.types._
212 |
213 | object app {
214 | def main(args: Array[String]) {
215 | val schema = StructType(
216 | List(
217 | StructField("name", StringType, true),
218 | StructField("age", IntegerType, false)
219 | )
220 | )
221 |
222 | val df = spark
223 | .read
224 | .format("minioSelectParquet")
225 | .schema(schema)
226 | .load("s3://sjm-airlines/people.parquet")
227 |
228 | println(df.show())
229 |
230 | println(df.select("*").filter("age > 19").show())
231 |
232 | }
233 | }
234 | ```
235 |
236 | #### *Python*
237 |
238 | Schema with two columns for `CSV`.
239 | ```py
240 | from pyspark.sql import *
241 | from pyspark.sql.types import *
242 |
243 | if __name__ == "__main__":
244 | # create SparkSession
245 | spark = SparkSession.builder \
246 | .master("local") \
247 | .appName("spark-select in python") \
248 | .getOrCreate()
249 |
250 | # filtered schema
251 | st = StructType([
252 | StructField("name", StringType(), True),
253 | StructField("age", IntegerType(), False),
254 | ])
255 |
256 | df = spark \
257 | .read \
258 | .format('minioSelectCSV') \
259 | .schema(st) \
260 | .load("s3://testbucket/people.csv")
261 |
262 | # show all rows.
263 | df.show()
264 |
265 | # show only filtered rows.
266 | df.select("*").filter("age > 19").show()
267 | ```
268 |
269 | ```
270 | > $SPARK_HOME/bin/spark-submit --packages io.minio:spark-select_2.11:2.1 examples/csv.py
271 | ```
272 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | name := "spark-select"
2 |
3 | organization := "io.minio"
4 |
5 | scalaVersion := "2.11.12"
6 |
7 | crossScalaVersions := Seq("2.11.12")
8 |
9 | spName := "minio/spark-select"
10 |
11 | spAppendScalaVersion := true
12 |
13 | spIncludeMaven := true
14 |
15 | spIgnoreProvided := true
16 |
17 | sparkVersion := "2.3.1"
18 |
19 | val testSparkVersion = settingKey[String]("The version of Spark to test against.")
20 |
21 | testSparkVersion := sys.props.get("spark.testVersion").getOrElse(sparkVersion.value)
22 |
23 | // used spark components
24 | sparkComponents := Seq("sql")
25 |
26 | assemblyMergeStrategy in assembly := {
27 | case "META-INF/io.netty.versions.properties" => MergeStrategy.concat
28 | case x =>
29 | val oldStrategy = (assemblyMergeStrategy in assembly).value
30 | oldStrategy(x)
31 | }
32 |
33 | // Dependent libraries
34 | libraryDependencies ++= Seq(
35 | "com.amazonaws" % "aws-java-sdk" % "1.11.434" exclude("com.fasterxml.jackson.core", "jackson-databind"),
36 | "org.apache.commons" % "commons-csv" % "1.7",
37 | "org.slf4j" % "slf4j-api" % "1.7.5" % "provided",
38 | "org.mockito" % "mockito-core" % "2.0.31-beta"
39 | )
40 |
41 | libraryDependencies ++= Seq(
42 | "org.apache.spark" %% "spark-core" % testSparkVersion.value % "test" force(),
43 | "org.apache.spark" %% "spark-sql" % testSparkVersion.value % "test" force(),
44 | "org.scala-lang" % "scala-library" % scalaVersion.value % "compile"
45 | )
46 |
47 | /**
48 | * release settings
49 | */
50 |
51 | publishMavenStyle := true
52 |
53 | releaseCrossBuild := true
54 |
55 | licenses += ("Apache-2.0", url("http://www.apache.org/licenses/LICENSE-2.0"))
56 |
57 | releasePublishArtifactsAction := PgpKeys.publishSigned.value
58 |
59 | publishArtifact in Test := false
60 |
61 | pomIncludeRepository := { _ => false }
62 |
63 | publishTo := {
64 | val nexus = "https://oss.sonatype.org/"
65 | if (version.value.endsWith("SNAPSHOT"))
66 | Some("snapshots" at nexus + "content/repositories/snapshots")
67 | else
68 | Some("releases" at nexus + "service/local/staging/deploy/maven2")
69 | }
70 |
71 | pomExtra := (
72 |   <url>https://github.com/minio/spark-select</url>
73 |   <scm>
74 |     <connection>scm:git:github.com/minio/spark-select</connection>
75 |     <developerConnection>scm:git:git@github.com:minio/spark-select</developerConnection>
76 |     <url>github.com/minio/spark-select</url>
77 |   </scm>
78 |   <developers>
79 |     <developer>
80 |       <id>minio</id>
81 |       <name>MinIO</name>
82 |       <url>http://www.minio.io</url>
83 |     </developer>
84 |   </developers>)
85 |
86 | // Skip tests during assembly
87 | test in assembly := {}
88 |
89 | ScoverageSbtPlugin.ScoverageKeys.coverageHighlighting := {
90 | if (scalaBinaryVersion.value == "2.10") false
91 | else true
92 | }
93 |
94 | import ReleaseTransformations._
95 |
96 | // Add publishing to spark packages as another step.
97 | releaseProcess := Seq[ReleaseStep](
98 | checkSnapshotDependencies,
99 | inquireVersions,
100 | runTest,
101 | setReleaseVersion,
102 | commitReleaseVersion,
103 | tagRelease,
104 | publishArtifacts,
105 | setNextVersion,
106 | commitNextVersion,
107 | pushChanges,
108 | releaseStepTask(spPublish)
109 | )
110 |
--------------------------------------------------------------------------------
/examples/csv.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import *
2 | from pyspark.sql.types import *
3 |
4 | if __name__ == "__main__":
5 | # create SparkSession
6 | spark = SparkSession.builder \
7 | .master("local") \
8 | .appName("spark-select in python") \
9 | .getOrCreate()
10 |
11 | # filtered schema
12 | st = StructType([
13 | StructField("name", StringType(), True),
14 | StructField("age", IntegerType(), False),
15 | ])
16 |
17 | df = spark \
18 | .read \
19 | .format('minioSelectCSV') \
20 | .schema(st) \
21 | .load("s3://testbucket/people.csv")
22 |
23 | # show all rows.
24 | df.show()
25 |
26 | # show only filtered rows.
27 | df.select("*").filter("age > 19").show()
28 |
--------------------------------------------------------------------------------
/examples/csv.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.sql._
2 | import org.apache.spark.sql.types._
3 |
4 | object app {
5 | def main(args: Array[String]) {
6 | val schema = StructType(
7 | List(
8 | StructField("name", StringType, true),
9 | StructField("age", IntegerType, false)
10 | )
11 | )
12 |
13 | val df = spark
14 | .read
15 | .format("minioSelectCSV")
16 | .schema(schema)
17 | .load("s3://sjm-airlines/people.csv")
18 |
19 | println(df.show())
20 |
21 | println(df.select("*").filter("age > 19").show())
22 |
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/examples/json.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql.types import *
3 |
4 | if __name__ == "__main__":
5 | # create SparkSession
6 | spark = SparkSession.builder \
7 | .master("local") \
8 | .appName("spark-select in python") \
9 | .getOrCreate()
10 |
11 | # filtered schema
12 | st = StructType([
13 | StructField("name", StringType(), True),
14 | StructField("age", IntegerType(), False),
15 | ])
16 |
17 | df = spark \
18 | .read \
19 | .format('minioSelectJSON') \
20 | .schema(st) \
21 | .load("s3://testbucket/people.json")
22 |
23 | # show all rows.
24 | df.show()
25 |
26 | # show only filtered rows.
27 | df.select("*").filter("age > 19").show()
28 |
--------------------------------------------------------------------------------
/examples/json.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.sql._
2 | import org.apache.spark.sql.types._
3 |
4 | object app {
5 | def main(args: Array[String]) {
6 | val schema = StructType(
7 | List(
8 | StructField("name", StringType, true),
9 | StructField("age", IntegerType, false)
10 | )
11 | )
12 |
13 | val df = spark
14 | .read
15 | .format("minioSelectJSON")
16 | .schema(schema)
17 | .load("s3://sjm-airlines/people.json")
18 |
19 | println(df.show())
20 |
21 | println(df.select("*").filter("age > 19").show())
22 |
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/examples/parquet.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql.types import *
3 |
4 | if __name__ == "__main__":
5 | # create SparkSession
6 | spark = SparkSession.builder \
7 | .master("local") \
8 | .appName("spark-select in python") \
9 | .getOrCreate()
10 |
11 | # filtered schema
12 | st = StructType([
13 | StructField("name", StringType(), True),
14 | StructField("age", IntegerType(), False),
15 | ])
16 |
17 | df = spark \
18 | .read \
19 | .format('minioSelectParquet') \
20 | .schema(st) \
21 | .load("s3://testbucket/people.parquet")
22 |
23 | # show all rows.
24 | df.show()
25 |
26 | # show only filtered rows.
27 | df.select("*").filter("age > 19").show()
28 |
--------------------------------------------------------------------------------
/examples/parquet.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.sql._
2 | import org.apache.spark.sql.types._
3 |
4 | object app {
5 | def main(args: Array[String]) {
6 | val schema = StructType(
7 | List(
8 | StructField("name", StringType, true),
9 | StructField("age", IntegerType, false)
10 | )
11 | )
12 |
13 | val df = spark
14 | .read
15 | .format("minioSelectParquet")
16 | .schema(schema)
17 | .load("s3://sjm-airlines/people.parquet")
18 |
19 | println(df.show())
20 |
21 | println(df.select("*").filter("age > 19").show())
22 |
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/examples/people.csv:
--------------------------------------------------------------------------------
1 | name,age
2 | Michael,31
3 | Andy,30
4 | Justin,19
5 |
--------------------------------------------------------------------------------
/examples/people.json:
--------------------------------------------------------------------------------
1 | {"name":"Michael", "age": 31}
2 | {"name":"Andy", "age": 30}
3 | {"name":"Justin", "age": 19}
4 |
--------------------------------------------------------------------------------
/examples/people.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minio/spark-select/d8721b46e84639690b7382d4a4e0877996e64b3d/examples/people.parquet
--------------------------------------------------------------------------------
/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0")
2 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | sbt.version=0.13.17
18 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | scalaVersion := "2.10.6"
2 |
3 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
4 |
5 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"
6 |
7 | resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/"
8 |
9 | resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven"
10 |
11 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "4.0.0")
12 |
13 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0")
14 |
15 | addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.4")
16 |
17 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.6.0")
18 |
19 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.1.0")
20 |
21 | addSbtPlugin("org.scoverage" % "sbt-coveralls" % "1.0.0")
22 |
23 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.10")
24 |
25 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1")
26 |
27 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3")
28 |
--------------------------------------------------------------------------------
/scalastyle-config.xml:
--------------------------------------------------------------------------------
1 | <scalastyle>
2 |   <name>Scalastyle standard configuration</name>
3 |   <!-- check definitions omitted -->
4 | </scalastyle>
--------------------------------------------------------------------------------
/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | io.minio.spark.select.SelectCSVSource
2 | io.minio.spark.select.SelectJSONSource
3 | io.minio.spark.select.SelectParquetSource
4 |
--------------------------------------------------------------------------------
/src/main/scala/io/minio/spark/select/Credentials.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 MinIO, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.minio.spark.select
17 |
18 | import java.net.URI
19 |
20 | // For BasicAWSCredentials
21 | import com.amazonaws.auth.AWSCredentials
22 | import com.amazonaws.auth.AWSCredentialsProvider
23 | import com.amazonaws.auth.BasicAWSCredentials
24 | import com.amazonaws.auth.BasicSessionCredentials
25 | import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
26 |
27 | import org.apache.hadoop.conf.Configuration
28 |
29 | private[spark] object Credentials {
30 | private def staticCredentialsProvider(credentials: AWSCredentials): AWSCredentialsProvider = {
31 | new AWSCredentialsProvider {
32 | override def getCredentials: AWSCredentials = credentials
33 | override def refresh(): Unit = {}
34 | }
35 | }
36 |
37 | def load(location: Option[String], hadoopConfiguration: Configuration): AWSCredentialsProvider = {
38 | val uri = new URI(location.getOrElse(""))
39 | val uriScheme = uri.getScheme
40 |
41 | uriScheme match {
42 | case "s3" | "s3a" =>
43 | // This matches what S3A does, with one exception: we don't
44 | // support anonymous credentials. First, try to parse from URI:
45 | Option(uri.getUserInfo).flatMap { userInfo =>
46 | if (userInfo.contains(":")) {
47 | val Array(accessKey, secretKey) = userInfo.split(":")
48 | Some(staticCredentialsProvider(new BasicAWSCredentials(accessKey, secretKey)))
49 | } else {
50 | None
51 | }
52 | }.orElse {
53 | val accessKey = hadoopConfiguration.get(s"fs.s3a.access.key", null)
54 | val secretKey = hadoopConfiguration.get(s"fs.s3a.secret.key", null)
55 | val sessionToken = hadoopConfiguration.get(s"fs.s3a.session.token", null)
56 | if (accessKey != null && secretKey != null) {
57 | if (sessionToken != null) {
58 | Some(staticCredentialsProvider(new BasicSessionCredentials(accessKey, secretKey, sessionToken)))
59 | } else {
60 | Some(staticCredentialsProvider(new BasicAWSCredentials(accessKey, secretKey)))
61 | }
62 | } else {
63 | None
64 | }
65 | }.getOrElse {
66 | // Finally, fall back on the instance profile provider
67 | new DefaultAWSCredentialsProviderChain()
68 | }
69 | case other =>
70 | throw new IllegalArgumentException(s"Unrecognized scheme $other; expected s3, or s3a")
71 | }
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/main/scala/io/minio/spark/select/FilterPushdown.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 MinIO, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.minio.spark.select
18 |
19 | import java.sql.{Date, Timestamp}
20 |
21 | import org.apache.spark.sql.sources._
22 | import org.apache.spark.sql.types._
23 |
24 | /**
25 | * Helper methods for pushing filters into Select queries.
26 | */
27 | private[spark] object FilterPushdown {
28 | /**
29 | * Build a SQL WHERE clause for the given filters. If a filter cannot be pushed down then no
30 | * condition will be added to the WHERE clause. If none of the filters can be pushed down then
31 | * an empty string will be returned.
32 | *
33 | * @param schema the schema of the table being queried
34 | * @param filters an array of filters, the conjunction of which is the filter condition for the
35 | * scan.
36 | */
37 | def buildWhereClause(schema: StructType, filters: Seq[Filter]): String = {
38 | val filterExpressions = filters.flatMap(f => buildFilterExpression(schema, f)).mkString(" AND ")
39 | if (filterExpressions.isEmpty) "" else "WHERE " + filterExpressions
40 | }
41 |
42 | /**
43 | * Attempt to convert the given filter into a Select expression. Returns None if the expression
44 | * could not be converted.
45 | */
46 | def buildFilterExpression(schema: StructType, filter: Filter): Option[String] = {
47 | def buildComparison(attr: String, value: Any, comparisonOp: String): Option[String] = {
48 | getTypeForAttribute(schema, attr).map { dataType =>
49 | val sqlEscapedValue: String = dataType match {
50 | case StringType => s""""${value.toString.replace("'", "\\'\\'")}""""
51 | case DateType => s""""${value.asInstanceOf[Date]}""""
52 | case TimestampType => s""""${value.asInstanceOf[Timestamp]}""""
53 | case _ => value.toString
54 | }
55 | s"s."+s""""$attr""""+s" $comparisonOp $sqlEscapedValue"
56 | }
57 | }
58 |
59 | filter match {
60 | case EqualTo(attr, value) => buildComparison(attr, value, "=")
61 | case LessThan(attr, value) => buildComparison(attr, value, "<")
62 | case GreaterThan(attr, value) => buildComparison(attr, value, ">")
63 | case LessThanOrEqual(attr, value) => buildComparison(attr, value, "<=")
64 | case GreaterThanOrEqual(attr, value) => buildComparison(attr, value, ">=")
65 | case _ => None
66 | }
67 | }
68 |
69 | /**
70 | * Use the given schema to look up the attribute's data type. Returns None if the attribute could
71 | * not be resolved.
72 | */
73 | private def getTypeForAttribute(schema: StructType, attribute: String): Option[DataType] = {
74 | if (schema.fieldNames.contains(attribute)) {
75 | Some(schema(attribute).dataType)
76 | } else {
77 | None
78 | }
79 | }
80 |
81 | def queryFromSchema(schema: StructType, filters: Array[Filter]): String = {
82 | var columnList = schema.fields.map(x => s"s."+s""""${x.name}"""").mkString(",")
83 | if (columnList.length == 0) {
84 | columnList = "*"
85 | }
86 | val whereClause = buildWhereClause(schema, filters)
87 | if (whereClause.length == 0) {
88 | s"select $columnList from S3Object s"
89 | } else {
90 | s"select $columnList from S3Object s $whereClause"
91 | }
92 | }
93 |
94 | }
95 |
--------------------------------------------------------------------------------
/src/main/scala/io/minio/spark/select/Select.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 MinIO, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.minio.spark.select
17 |
18 | import org.apache.hadoop.conf.Configuration
19 |
20 | // Select API
21 | import com.amazonaws.services.s3.model.JSONInput
22 | import com.amazonaws.services.s3.model.JSONType
23 | import com.amazonaws.services.s3.model.CSVInput
24 | import com.amazonaws.services.s3.model.CSVOutput
25 | import com.amazonaws.services.s3.model.ParquetInput
26 | import com.amazonaws.services.s3.model.CompressionType
27 | import com.amazonaws.services.s3.model.ExpressionType
28 | import com.amazonaws.services.s3.model.SSECustomerKey
29 | import com.amazonaws.services.s3.model.InputSerialization
30 | import com.amazonaws.services.s3.model.OutputSerialization
31 | import com.amazonaws.services.s3.model.SelectObjectContentRequest
32 | import com.amazonaws.services.s3.model.SelectObjectContentResult
33 | import com.amazonaws.services.s3.model.SelectObjectContentEvent
34 | import com.amazonaws.services.s3.model.SelectObjectContentEvent.RecordsEvent
35 | import com.amazonaws.services.s3.model.FileHeaderInfo
36 |
37 | import org.apache.spark.sql.types._
38 | import org.apache.spark.sql.sources._
39 |
40 | private[spark] object Select {
41 | private val SERVER_ENCRYPTION_ALGORITHM = s"fs.s3a.server-side-encryption-algorithm"
42 | private val SERVER_ENCRYPTION_KEY = s"fs.s3a.server-side-encryption.key"
43 |
44 | private def compressionType(params: Map[String, String]): CompressionType = {
45 | params.getOrElse("compression", "none") match {
46 | case "none" => CompressionType.NONE
47 | case "gzip" => CompressionType.GZIP
48 | case "bzip2" => CompressionType.BZIP2
49 | }
50 | }
51 |
52 | private def jsonType(params: Map[String, String]): JSONType = {
53 | params.getOrElse("multiline", "false") match {
54 | case "false" => JSONType.LINES
55 | case "true" => JSONType.DOCUMENT
56 | }
57 | }
58 |
59 | private def headerInfo(params: Map[String, String]): FileHeaderInfo = {
60 | params.getOrElse("header", "true") match {
61 | case "false" => FileHeaderInfo.NONE
62 | case "true" => FileHeaderInfo.USE
63 | }
64 | }
65 |
66 | private def sseCustomerKey(algo: String, key: String): SSECustomerKey = {
67 | algo match {
68 | case "SSE-C" =>
69 | if (key != null) {
70 | new SSECustomerKey(key)
71 | } else {
72 | null
73 | }
74 | case other =>
75 | throw new IllegalArgumentException(s"Unrecognized algorithm $algo; expected SSE-C")
76 | }
77 | }
78 |
79 | def requestParquet(bucket: String, key: String, params: Map[String, String],
80 | schema: StructType, filters: Array[Filter],
81 | hadoopConfiguration: Configuration): SelectObjectContentRequest = {
82 |
83 | new SelectObjectContentRequest() { request =>
84 | request.setBucketName(bucket)
85 | request.setKey(key)
86 | request.setExpression(FilterPushdown.queryFromSchema(schema, filters))
87 | request.setExpressionType(ExpressionType.SQL)
88 | val algo = hadoopConfiguration.get(SERVER_ENCRYPTION_ALGORITHM, null)
89 | if (algo != null) {
90 | request.withSSECustomerKey(sseCustomerKey(algo,
91 | hadoopConfiguration.get(SERVER_ENCRYPTION_KEY, null)))
92 | }
93 |
94 | val inputSerialization = new InputSerialization()
95 | val parquetInput = new ParquetInput()
96 | inputSerialization.setParquet(parquetInput)
97 | request.setInputSerialization(inputSerialization)
98 |
99 | val outputSerialization = new OutputSerialization()
100 | val csvOutput = new CSVOutput()
101 | outputSerialization.setCsv(csvOutput)
102 | request.setOutputSerialization(outputSerialization)
103 | }
104 | }
105 |
106 | def requestJSON(bucket: String, key: String, params: Map[String, String],
107 | schema: StructType, filters: Array[Filter],
108 | hadoopConfiguration: Configuration): SelectObjectContentRequest = {
109 |
110 | new SelectObjectContentRequest() { request =>
111 | request.setBucketName(bucket)
112 | request.setKey(key)
113 | request.setExpression(FilterPushdown.queryFromSchema(schema, filters))
114 | request.setExpressionType(ExpressionType.SQL)
115 | val algo = hadoopConfiguration.get(SERVER_ENCRYPTION_ALGORITHM, null)
116 | if (algo != null) {
117 | request.withSSECustomerKey(sseCustomerKey(algo,
118 | hadoopConfiguration.get(SERVER_ENCRYPTION_KEY, null)))
119 | }
120 |
121 | val inputSerialization = new InputSerialization()
122 | val jsonInput = new JSONInput()
123 | jsonInput.withType(jsonType(params))
124 | inputSerialization.setJson(jsonInput)
125 | inputSerialization.setCompressionType(compressionType(params))
126 | request.setInputSerialization(inputSerialization)
127 |
128 | val outputSerialization = new OutputSerialization()
129 | val csvOutput = new CSVOutput()
130 | outputSerialization.setCsv(csvOutput)
131 | request.setOutputSerialization(outputSerialization)
132 | }
133 | }
134 |
135 |
136 | def requestCSV(bucket: String, key: String, params: Map[String, String],
137 | schema: StructType, filters: Array[Filter],
138 | hadoopConfiguration: Configuration): SelectObjectContentRequest = {
139 | new SelectObjectContentRequest() { request =>
140 | request.setBucketName(bucket)
141 | request.setKey(key)
142 | request.setExpression(FilterPushdown.queryFromSchema(schema, filters))
143 | request.setExpressionType(ExpressionType.SQL)
144 | val algo = hadoopConfiguration.get(SERVER_ENCRYPTION_ALGORITHM, null)
145 | if (algo != null) {
146 | request.withSSECustomerKey(sseCustomerKey(algo,
147 | hadoopConfiguration.get(SERVER_ENCRYPTION_KEY, null)))
148 | }
149 |
150 | val inputSerialization = new InputSerialization()
151 | val csvInput = new CSVInput()
152 | csvInput.withFileHeaderInfo(headerInfo(params))
153 | csvInput.withRecordDelimiter('\n')
154 | csvInput.withQuoteCharacter(params.getOrElse(s"quote", "\""))
155 | csvInput.withQuoteEscapeCharacter(params.getOrElse(s"escape", "\""))
156 | csvInput.withComments(params.getOrElse(s"comment", "#"))
157 | csvInput.withFieldDelimiter(params.getOrElse(s"delimiter", ","))
158 | inputSerialization.setCsv(csvInput)
159 | inputSerialization.setCompressionType(compressionType(params))
160 | request.setInputSerialization(inputSerialization)
161 |
162 | val outputSerialization = new OutputSerialization()
163 | val csvOutput = new CSVOutput()
164 | csvOutput.withRecordDelimiter('\n')
165 | csvOutput.withFieldDelimiter(params.getOrElse("delimiter", ","))
166 | outputSerialization.setCsv(csvOutput)
167 | request.setOutputSerialization(outputSerialization)
168 | }
169 | }
170 | }
171 |
--------------------------------------------------------------------------------
/src/main/scala/io/minio/spark/select/SelectCSVRelation.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 MinIO, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.minio.spark.select
17 |
18 | import scala.collection.JavaConversions.asScalaBuffer
19 | import scala.collection.JavaConverters._
20 |
21 | import scala.util.control.NonFatal
22 |
23 | import org.slf4j.LoggerFactory
24 |
25 | import java.io.InputStreamReader
26 | import java.io.BufferedReader
27 |
28 | // Import all utilities
29 | import io.minio.spark.select.util._
30 |
31 | // Apache commons
32 | import org.apache.commons.csv._
33 |
34 | // For AmazonS3 client
35 | import com.amazonaws.services.s3.AmazonS3
36 | import com.amazonaws.services.s3.AmazonS3URI
37 | import com.amazonaws.services.s3.AmazonS3ClientBuilder
38 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration
39 |
40 | import com.amazonaws.services.s3.model.ListObjectsV2Request
41 | import com.amazonaws.services.s3.model.ListObjectsV2Result
42 | import com.amazonaws.services.s3.model.S3ObjectSummary
43 |
44 | import org.apache.spark.rdd.RDD
45 | import org.apache.spark.sql.types._
46 | import org.apache.spark.sql.sources._
47 | import org.apache.spark.sql.catalyst.encoders.RowEncoder
48 | import org.apache.spark.sql.{DataFrame, Row, SQLContext}
49 |
50 | import scala.collection.mutable.{ListBuffer, ArrayBuffer}
51 |
52 | /**
53 | * Abstract relation class to download data from S3 compatible storage
54 | */
55 | case class SelectCSVRelation protected[spark] (
56 | location: Option[String],
57 | params: Map[String, String],
58 | userSchema: StructType = null)(@transient val sqlContext: SQLContext)
59 | extends BaseRelation
60 | with TableScan
61 | with PrunedScan
62 | with PrunedFilteredScan {
63 |
64 | private val API_PATH_STYLE_ACCESS = s"fs.s3a.path.style.access"
65 | private val SERVER_ENDPOINT = s"fs.s3a.endpoint"
66 | private val SERVER_REGION = s"fs.s3a.region"
67 |
68 | private val hadoopConfiguration = sqlContext.sparkContext.hadoopConfiguration
69 | private val pathStyleAccess = hadoopConfiguration.get(API_PATH_STYLE_ACCESS, "false") == "true"
70 | private val endpoint = hadoopConfiguration.get(SERVER_ENDPOINT, "https://s3.amazonaws.com")
71 | private val region = hadoopConfiguration.get(SERVER_REGION, "us-east-1")
72 | private val s3Client =
73 | AmazonS3ClientBuilder.standard()
74 | .withCredentials(Credentials.load(location, hadoopConfiguration))
75 | .withPathStyleAccessEnabled(pathStyleAccess)
76 | .withEndpointConfiguration(new EndpointConfiguration(endpoint, region))
77 | .build()
78 |
79 | private val logger = LoggerFactory.getLogger(SelectCSVRelation.getClass)
80 |
81 | override lazy val schema: StructType = Option(userSchema).getOrElse({
82 | // With no schema we return error.
83 | throw new RuntimeException(s"Schema cannot be empty")
84 | })
85 |
86 | private def getRows(prunedSchema: StructType, filters: Array[Filter]): Seq[Row] = {
87 | var records = new ListBuffer[Row]
88 | var req = new ListObjectsV2Request()
89 | var result = new ListObjectsV2Result()
90 | var s3URI = S3URI.toAmazonS3URI(location.getOrElse(""))
91 |
92 | req.withBucketName(s3URI.getBucket())
93 | req.withPrefix(s3URI.getKey().stripSuffix("*"))
94 | req.withMaxKeys(1000)
95 |
96 | val csvFormat = CSVFormat.DEFAULT
97 | .withHeader(prunedSchema.fields.map(x => x.name): _*)
98 | .withRecordSeparator("\n")
99 | .withDelimiter(params.getOrElse("delimiter", ",").charAt(0))
100 | .withQuote(params.getOrElse("quote", "\"").charAt(0))
101 | .withEscape(params.getOrElse(s"escape", "\"").charAt(0))
102 | .withCommentMarker(params.getOrElse(s"comment", "#").charAt(0))
103 |
104 | do {
105 | result = s3Client.listObjectsV2(req)
106 | asScalaBuffer(result.getObjectSummaries()).foreach(objectSummary => {
107 | val in = s3Client.selectObjectContent(
108 | Select.requestCSV(
109 | objectSummary.getBucketName(),
110 | objectSummary.getKey(),
111 | params,
112 | prunedSchema,
113 | filters,
114 | hadoopConfiguration)
115 | ).getPayload().getRecordsInputStream()
116 | var parser = CSVParser.parse(in, java.nio.charset.Charset.forName("UTF-8"), csvFormat)
117 | try {
118 | for (record <- parser.asScala) {
119 | records += Row.fromSeq(prunedSchema.fields.map(x => {
120 | TypeCast.castTo(record.get(x.name), x.dataType, x.nullable)
121 | }))
122 | }
123 | } catch {
124 | case NonFatal(e) => {
125 | logger.error(s"Exception while parsing ", e)
126 | }
127 | }
128 | parser.close()
129 | })
130 | req.setContinuationToken(result.getNextContinuationToken())
131 | } while (result.isTruncated())
132 | records.toList
133 | }
134 |
135 | override def toString: String = s"SelectCSVRelation()"
136 |
137 | private def tokenRDD(schema: StructType, filters: Array[Filter]): RDD[Row] = {
138 | sqlContext.sparkContext.makeRDD(getRows(schema, filters))
139 | }
140 |
141 | override def buildScan(): RDD[Row] = {
142 | tokenRDD(schema, null)
143 | }
144 |
145 | override def buildScan(columns: Array[String]): RDD[Row] = {
146 | tokenRDD(pruneSchema(schema, columns), null)
147 | }
148 |
149 | override def buildScan(columns: Array[String], filters: Array[Filter]): RDD[Row] = {
150 | tokenRDD(pruneSchema(schema, columns), filters)
151 | }
152 |
153 | private def pruneSchema(schema: StructType, columns: Array[String]): StructType = {
154 | val fieldMap = Map(schema.fields.map(x => x.name -> x): _*)
155 | new StructType(columns.map(name => fieldMap(name)))
156 | }
157 | }
158 |
--------------------------------------------------------------------------------
/src/main/scala/io/minio/spark/select/SelectCSVSource.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 MinIO, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.minio.spark.select
17 |
18 | // Java standard libraries
19 | import java.io.File
20 |
21 | // Spark internal libraries
22 | import org.apache.spark.sql.SQLContext
23 | import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider}
24 | import org.apache.spark.sql.types.StructType
25 |
26 | import org.apache.spark.sql.sources.DataSourceRegister
27 |
28 | class SelectCSVSource
29 | extends SchemaRelationProvider
30 | with DataSourceRegister {
31 |
32 | private def checkPath(parameters: Map[String, String]): String = {
33 | parameters.getOrElse("path", sys.error("'path' must be specified for CSV data."))
34 | }
35 |
36 | /**
37 | * Short alias for spark-select data source.
38 | */
39 | override def shortName(): String = "minioSelectCSV"
40 |
41 | override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectCSVRelation = {
42 | val path = checkPath(params)
43 | SelectCSVRelation(Some(path), params, schema)(sqlContext)
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/scala/io/minio/spark/select/SelectJSONRelation.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 MinIO, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.minio.spark.select
17 |
18 | import scala.collection.JavaConversions.asScalaBuffer
19 |
20 | import java.io.InputStreamReader
21 | import java.io.BufferedReader
22 |
23 | // Import all utilities
24 | import io.minio.spark.select.util._
25 |
26 | // Apache commons
27 | import org.apache.commons.csv.{CSVFormat, QuoteMode}
28 |
29 | // For AmazonS3 client
30 | import com.amazonaws.services.s3.AmazonS3
31 | import com.amazonaws.services.s3.AmazonS3URI
32 | import com.amazonaws.services.s3.AmazonS3ClientBuilder
33 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration
34 |
35 | import com.amazonaws.services.s3.model.ListObjectsV2Request
36 | import com.amazonaws.services.s3.model.ListObjectsV2Result
37 | import com.amazonaws.services.s3.model.S3ObjectSummary
38 |
39 | import org.apache.spark.rdd.RDD
40 | import org.apache.spark.sql.types._
41 | import org.apache.spark.sql.sources._
42 | import org.apache.spark.sql.catalyst.encoders.RowEncoder
43 | import org.apache.spark.sql.{DataFrame, Row, SQLContext}
44 |
45 | import scala.collection.mutable.{ListBuffer, ArrayBuffer}
46 |
47 | /**
48 |  * Relation class to download JSON records from S3 compatible storage using the Select API.
49 | */
50 | case class SelectJSONRelation protected[spark] (
51 | location: Option[String],
52 | params: Map[String, String],
53 | userSchema: StructType = null)(@transient val sqlContext: SQLContext)
54 | extends BaseRelation
55 | with TableScan
56 | with PrunedScan
57 | with PrunedFilteredScan {
58 |
59 | private val API_PATH_STYLE_ACCESS = s"fs.s3a.path.style.access"
60 | private val SERVER_ENDPOINT = s"fs.s3a.endpoint"
61 | private val SERVER_REGION = s"fs.s3a.region"
62 |
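  // The client is configured from the standard Hadoop "fs.s3a.*" settings above;
  // endpoint, region and path-style access fall back to AWS defaults when unset.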
63 | private val hadoopConfiguration = sqlContext.sparkContext.hadoopConfiguration
64 | private val pathStyleAccess = hadoopConfiguration.get(API_PATH_STYLE_ACCESS, "false") == "true"
65 | private val endpoint = hadoopConfiguration.get(SERVER_ENDPOINT, "https://s3.amazonaws.com")
66 | private val region = hadoopConfiguration.get(SERVER_REGION, "us-east-1")
67 | private val s3Client =
68 | AmazonS3ClientBuilder.standard()
69 | .withCredentials(Credentials.load(location, hadoopConfiguration))
70 | .withPathStyleAccessEnabled(pathStyleAccess)
71 | .withEndpointConfiguration(new EndpointConfiguration(endpoint, region))
72 | .build()
73 |
74 | override lazy val schema: StructType = Option(userSchema).getOrElse({
75 |     // A user-provided schema is required; fail if none was supplied.
76 | throw new RuntimeException(s"Schema cannot be empty")
77 | })
78 |
79 |
80 | private def getRows(schema: StructType, filters: Array[Filter]): Seq[Row] = {
81 | var records = new ListBuffer[Row]
82 | var req = new ListObjectsV2Request()
83 | var result = new ListObjectsV2Result()
84 | var s3URI = S3URI.toAmazonS3URI(location.getOrElse(""))
85 |
86 | req.withBucketName(s3URI.getBucket())
87 | req.withPrefix(s3URI.getKey().stripSuffix("*"))
88 | req.withMaxKeys(1000)
89 |
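    // Page through the listing (up to 1000 keys per request) and issue one
    // Select request per object, following the continuation token until done.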
90 | do {
91 | result = s3Client.listObjectsV2(req)
92 | asScalaBuffer(result.getObjectSummaries()).foreach(objectSummary => {
93 | val br = new BufferedReader(new InputStreamReader(
94 | s3Client.selectObjectContent(
95 | Select.requestJSON(
96 | objectSummary.getBucketName(),
97 | objectSummary.getKey(),
98 | params,
99 | schema,
100 | filters,
101 | hadoopConfiguration)
102 | ).getPayload().getRecordsInputStream()))
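        // Each line of the Select output is split on ',' and every value is cast
        // to the matching field type from the requested schema.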
103 | var line : String = null
104 | while ( {line = br.readLine(); line != null}) {
105 | var row = new Array[Any](schema.fields.length)
106 | var rowValues = line.split(",")
107 | var index = 0
108 | while (index < rowValues.length) {
109 | val field = schema.fields(index)
110 | row(index) = TypeCast.castTo(rowValues(index), field.dataType,
111 | field.nullable)
112 | index += 1
113 | }
114 | records += Row.fromSeq(row)
115 | }
116 | br.close()
117 | })
118 | req.setContinuationToken(result.getNextContinuationToken())
119 | } while (result.isTruncated())
120 | records.toList
121 | }
122 |
123 | override def toString: String = s"SelectJSONRelation()"
124 |
125 | private def tokenRDD(schema: StructType, filters: Array[Filter]): RDD[Row] = {
126 | sqlContext.sparkContext.makeRDD(getRows(schema, filters))
127 | }
128 |
129 | override def buildScan(): RDD[Row] = {
130 | tokenRDD(schema, null)
131 | }
132 |
133 | override def buildScan(columns: Array[String]): RDD[Row] = {
134 | tokenRDD(pruneSchema(schema, columns), null)
135 | }
136 |
137 | override def buildScan(columns: Array[String], filters: Array[Filter]): RDD[Row] = {
138 | tokenRDD(pruneSchema(schema, columns), filters)
139 | }
140 |
141 | private def pruneSchema(schema: StructType, columns: Array[String]): StructType = {
142 | val fieldMap = Map(schema.fields.map(x => x.name -> x): _*)
143 | new StructType(columns.map(name => fieldMap(name)))
144 | }
145 | }
146 |
--------------------------------------------------------------------------------
/src/main/scala/io/minio/spark/select/SelectJSONSource.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 MinIO, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.minio.spark.select
17 |
18 | // Java standard libraries
19 | import java.io.File
20 |
21 | // Spark internal libraries
22 | import org.apache.spark.sql.SQLContext
23 | import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider}
24 | import org.apache.spark.sql.types.StructType
25 |
26 | import org.apache.spark.sql.sources.DataSourceRegister
27 |
28 | class SelectJSONSource
29 | extends SchemaRelationProvider
30 | with DataSourceRegister {
31 |
32 | private def checkPath(parameters: Map[String, String]): String = {
33 | parameters.getOrElse("path", sys.error("'path' must be specified for JSON data."))
34 | }
35 |
36 | /**
37 | * Short alias for spark-select data source.
38 | */
39 | override def shortName(): String = "minioSelectJSON"
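  // Usage mirrors the CSV source; sketch assuming a SparkSession `spark`, a user
  // supplied schema, and placeholder names:
  //   spark.read.format("minioSelectJSON").schema(schema).load("s3://mybucket/people.json")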
40 |
41 | override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectJSONRelation = {
42 | val path = checkPath(params)
43 | SelectJSONRelation(Some(path), params, schema)(sqlContext)
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/scala/io/minio/spark/select/SelectParquetRelation.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 MinIO, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.minio.spark.select
17 |
18 | import scala.collection.JavaConversions.asScalaBuffer
19 |
20 | import java.io.InputStreamReader
21 | import java.io.BufferedReader
22 |
23 | // Import all utilities
24 | import io.minio.spark.select.util._
25 |
26 | // For AmazonS3 client
27 | import com.amazonaws.services.s3.AmazonS3
28 | import com.amazonaws.services.s3.AmazonS3URI
29 | import com.amazonaws.services.s3.AmazonS3ClientBuilder
30 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration
31 |
32 | import com.amazonaws.services.s3.model.ListObjectsV2Request
33 | import com.amazonaws.services.s3.model.ListObjectsV2Result
34 | import com.amazonaws.services.s3.model.S3ObjectSummary
35 |
36 | import org.apache.spark.rdd.RDD
37 | import org.apache.spark.sql.types._
38 | import org.apache.spark.sql.sources._
39 | import org.apache.spark.sql.catalyst.encoders.RowEncoder
40 | import org.apache.spark.sql.{DataFrame, Row, SQLContext}
41 |
42 | import scala.collection.mutable.{ListBuffer, ArrayBuffer}
43 |
44 | /**
45 | * Abstract relation class to download data from S3 compatible storage
46 | */
47 | case class SelectParquetRelation protected[spark] (
48 | location: Option[String],
49 | params: Map[String, String],
50 | userSchema: StructType = null)(@transient val sqlContext: SQLContext)
51 | extends BaseRelation
52 | with TableScan
53 | with PrunedScan
54 | with PrunedFilteredScan {
55 |
56 | private val API_PATH_STYLE_ACCESS = s"fs.s3a.path.style.access"
57 | private val SERVER_ENDPOINT = s"fs.s3a.endpoint"
58 | private val SERVER_REGION = s"fs.s3a.region"
59 |
60 | private val hadoopConfiguration = sqlContext.sparkContext.hadoopConfiguration
61 | private val pathStyleAccess = hadoopConfiguration.get(API_PATH_STYLE_ACCESS, "false") == "true"
62 | private val endpoint = hadoopConfiguration.get(SERVER_ENDPOINT, "https://s3.amazonaws.com")
63 | private val region = hadoopConfiguration.get(SERVER_REGION, "us-east-1")
64 | private val s3Client =
65 | AmazonS3ClientBuilder.standard()
66 | .withCredentials(Credentials.load(location, hadoopConfiguration))
67 | .withPathStyleAccessEnabled(pathStyleAccess)
68 | .withEndpointConfiguration(new EndpointConfiguration(endpoint, region))
69 | .build()
70 |
71 | override lazy val schema: StructType = Option(userSchema).getOrElse({
72 |     // A user-provided schema is required; fail if none was supplied.
73 | throw new RuntimeException(s"Schema cannot be empty")
74 | })
75 |
76 | private def getRows(schema: StructType, filters: Array[Filter]): Seq[Row] = {
77 | var records = new ListBuffer[Row]
78 | var req = new ListObjectsV2Request()
79 | var result = new ListObjectsV2Result()
80 | var s3URI = S3URI.toAmazonS3URI(location.getOrElse(""))
81 |
82 | req.withBucketName(s3URI.getBucket())
83 | req.withPrefix(s3URI.getKey().stripSuffix("*"))
84 | req.withMaxKeys(1000)
85 |
86 | do {
87 | result = s3Client.listObjectsV2(req)
88 | asScalaBuffer(result.getObjectSummaries()).foreach(objectSummary => {
89 | val br = new BufferedReader(new InputStreamReader(
90 | s3Client.selectObjectContent(
91 | Select.requestParquet(
92 | objectSummary.getBucketName(),
93 | objectSummary.getKey(),
94 | params,
95 | schema,
96 | filters,
97 | hadoopConfiguration)
98 | ).getPayload().getRecordsInputStream()))
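        // As with the JSON relation, each returned line is split on ',' and cast
        // field-by-field against the requested schema.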
99 | var line : String = null
100 | while ( {line = br.readLine(); line != null}) {
101 | var row = new Array[Any](schema.fields.length)
102 | var rowValues = line.split(",")
103 | var index = 0
104 | while (index < rowValues.length) {
105 | val field = schema.fields(index)
106 | row(index) = TypeCast.castTo(rowValues(index), field.dataType,
107 | field.nullable)
108 | index += 1
109 | }
110 | records += Row.fromSeq(row)
111 | }
112 | br.close()
113 | })
114 | req.setContinuationToken(result.getNextContinuationToken())
115 | } while (result.isTruncated())
116 | records.toList
117 | }
118 |
119 | override def toString: String = s"SelectParquetRelation()"
120 |
121 | private def tokenRDD(schema: StructType, filters: Array[Filter]): RDD[Row] = {
122 | sqlContext.sparkContext.makeRDD(getRows(schema, filters))
123 | }
124 |
125 | override def buildScan(): RDD[Row] = {
126 | tokenRDD(schema, null)
127 | }
128 |
129 | override def buildScan(columns: Array[String]): RDD[Row] = {
130 | tokenRDD(pruneSchema(schema, columns), null)
131 | }
132 |
133 | override def buildScan(columns: Array[String], filters: Array[Filter]): RDD[Row] = {
134 | tokenRDD(pruneSchema(schema, columns), filters)
135 | }
136 |
137 | private def pruneSchema(schema: StructType, columns: Array[String]): StructType = {
138 | val fieldMap = Map(schema.fields.map(x => x.name -> x): _*)
139 | new StructType(columns.map(name => fieldMap(name)))
140 | }
141 | }
142 |
--------------------------------------------------------------------------------
/src/main/scala/io/minio/spark/select/SelectParquetSource.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 MinIO, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.minio.spark.select
17 |
18 | // Java standard libraries
19 | import java.io.File
20 |
21 | // Spark internal libraries
22 | import org.apache.spark.sql.SQLContext
23 | import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider}
24 | import org.apache.spark.sql.types.StructType
25 |
26 | import org.apache.spark.sql.sources.DataSourceRegister
27 |
28 | class SelectParquetSource
29 | extends SchemaRelationProvider
30 | with DataSourceRegister {
31 |
32 | private def checkPath(parameters: Map[String, String]): String = {
33 | parameters.getOrElse("path", sys.error("'path' must be specified for Parquet data."))
34 | }
35 |
36 | /**
37 | * Short alias for spark-select data source.
38 | */
39 | override def shortName(): String = "minioSelectParquet"
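  // Usage mirrors the CSV and JSON sources; sketch assuming a SparkSession `spark`,
  // a user supplied schema, and placeholder names:
  //   spark.read.format("minioSelectParquet").schema(schema).load("s3://mybucket/people.parquet")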
40 |
41 | override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectParquetRelation = {
42 | val path = checkPath(params)
43 | SelectParquetRelation(Some(path), params, schema)(sqlContext)
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/scala/io/minio/spark/select/util/S3URI.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 MinIO, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.minio.spark.select.util
17 |
18 | import java.net.URI
19 | import com.amazonaws.services.s3.AmazonS3URI
20 |
21 | object S3URI {
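  /**
   * Normalizes a location string into an AmazonS3URI, rewriting the "s3a" and
   * "s3n" schemes to "s3". For example (path is a placeholder), "s3a://mybucket/data/"
   * is treated as "s3://mybucket/data/" before parsing.
   */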
22 | private[select] def toAmazonS3URI(
23 | location: String): AmazonS3URI = {
24 | val uri = new URI(location)
25 | val uriScheme = uri.getScheme
26 | uriScheme match {
27 | case "s3" =>
28 | new AmazonS3URI(uri)
29 | case "s3a" | "s3n" =>
30 | new AmazonS3URI(new URI("s3", uri.getUserInfo, uri.getHost, uri.getPort, uri.getPath,
31 | uri.getQuery, uri.getFragment))
32 | case other =>
33 |         throw new IllegalArgumentException(s"Unrecognized scheme $other; expected s3, s3a or s3n")
34 | }
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/scala/io/minio/spark/select/util/TypeCast.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 MinIO, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.minio.spark.select.util
17 |
18 | import java.math.BigDecimal
19 | import java.sql.{Date, Timestamp}
20 | import java.text.{SimpleDateFormat, NumberFormat}
21 | import java.util.Locale
22 |
23 | import org.apache.spark.sql.types._
24 |
25 | import scala.util.Try
26 |
27 | /**
28 | * Utility functions for type casting
29 | */
30 | object TypeCast {
31 |
32 | /**
33 |    * Casts a given string datum to the specified Spark SQL type.
34 |    * Complex types (ArrayType, MapType, StructType) are not currently supported.
35 |    *
36 |    * For string types, the result is simply the datum.
37 |    * For other nullable types, the result is null if the string datum is empty.
38 | *
39 | * @param datum string value
40 | * @param castType SparkSQL type
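   * @param nullable whether the target field accepts null values
   * @param treatEmptyValuesAsNulls when true, an empty datum becomes null even for
   *                                non-nullable fields
   * @param nullValue the string interpreted as null (defaults to the empty string)
   * @param dateFormatter optional formatter applied to DateType and TimestampType values
   * @note For example, castTo("5", IntegerType) yields 5, while castTo("", IntegerType)
   *       yields null because the empty datum matches the default nullValue.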
41 | */
42 | private[select] def castTo(
43 | datum: String,
44 | castType: DataType,
45 | nullable: Boolean = true,
46 | treatEmptyValuesAsNulls: Boolean = false,
47 | nullValue: String = "",
48 | dateFormatter: SimpleDateFormat = null): Any = {
49 | if (datum == nullValue &&
50 | nullable ||
51 | (treatEmptyValuesAsNulls && datum == "")){
52 | null
53 | } else {
54 | castType match {
55 | case _: ByteType => datum.toByte
56 | case _: ShortType => datum.toShort
57 | case _: IntegerType => datum.toInt
58 | case _: LongType => datum.toLong
59 | case _: FloatType => Try(datum.toFloat)
60 | .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).floatValue())
61 | case _: DoubleType => Try(datum.toDouble)
62 | .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).doubleValue())
63 | case _: BooleanType => datum.toBoolean
64 | case _: DecimalType => new BigDecimal(datum.replaceAll(",", ""))
65 | case _: TimestampType if dateFormatter != null =>
66 | new Timestamp(dateFormatter.parse(datum).getTime)
67 | case _: TimestampType => Timestamp.valueOf(datum)
68 | case _: DateType if dateFormatter != null =>
69 | new Date(dateFormatter.parse(datum).getTime)
70 | case _: DateType => Date.valueOf(datum)
71 | case _: StringType => datum
72 | case _ => throw new RuntimeException(s"Unsupported type: ${castType.typeName}")
73 | }
74 | }
75 | }
76 |
77 | /**
78 | * Helper method that converts string representation of a character to actual character.
79 |    * It handles some Java escape sequences and throws an exception if the given string is
80 |    * longer than one character.
81 | *
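   * For example, toChar("\\t") yields the tab character and toChar(";") yields ';',
   * while toChar("ab") throws an IllegalArgumentException.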
82 | */
83 | @throws[IllegalArgumentException]
84 | private[select] def toChar(str: String): Char = {
85 | if (str.charAt(0) == '\\') {
86 | str.charAt(1)
87 | match {
88 | case 't' => '\t'
89 | case 'r' => '\r'
90 | case 'b' => '\b'
91 | case 'f' => '\f'
92 | case '\"' => '\"' // In case user changes quote char and uses \" as delimiter in options
93 | case '\'' => '\''
94 | case 'u' if str == """\u0000""" => '\u0000'
95 | case _ =>
96 | throw new IllegalArgumentException(s"Unsupported special character for delimiter: $str")
97 | }
98 | } else if (str.length == 1) {
99 | str.charAt(0)
100 | } else {
101 | throw new IllegalArgumentException(s"Delimiter cannot be more than one character: $str")
102 | }
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/src/test/scala/io/minio/spark/select/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minio/spark-select/d8721b46e84639690b7382d4a4e0877996e64b3d/src/test/scala/io/minio/spark/select/.keep
--------------------------------------------------------------------------------
/version.sbt:
--------------------------------------------------------------------------------
1 | version in ThisBuild := "2.1"
2 |
--------------------------------------------------------------------------------