├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── examples ├── csv.py ├── csv.scala ├── json.py ├── json.scala ├── parquet.py ├── parquet.scala ├── people.csv ├── people.json └── people.parquet ├── project ├── assembly.sbt ├── build.properties └── plugins.sbt ├── scalastyle-config.xml ├── src ├── main │ ├── resources │ │ └── META-INF │ │ │ └── services │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── scala │ │ └── io │ │ └── minio │ │ └── spark │ │ └── select │ │ ├── Credentials.scala │ │ ├── FilterPushdown.scala │ │ ├── Select.scala │ │ ├── SelectCSVRelation.scala │ │ ├── SelectCSVSource.scala │ │ ├── SelectJSONRelation.scala │ │ ├── SelectJSONSource.scala │ │ ├── SelectParquetRelation.scala │ │ ├── SelectParquetSource.scala │ │ └── util │ │ ├── S3URI.scala │ │ └── TypeCast.scala └── test │ └── scala │ └── io │ └── minio │ └── spark │ └── select │ └── .keep └── version.sbt /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | 19 | .idea/workspace.xml 20 | logs 21 | project/project 22 | project/target 23 | target 24 | tmp 25 | .history 26 | dist 27 | .idea/ 28 | /*.iml 29 | /out 30 | /.idea_modules 31 | /.classpath 32 | /.project 33 | /RUNNING_PID 34 | /.settings 35 | projectFilesBackup/ 36 | */target 37 | metastore_db/ 38 | *.log 39 | *.class 40 | *.log 41 | *.zip 42 | *.gz 43 | *.jpeg 44 | *.jpg 45 | *.png 46 | *~ 47 | .settings/ 48 | .cache/ 49 | .history/ 50 | .lib/ 51 | dist/* 52 | target/ 53 | lib_managed/ 54 | src_managed/ 55 | project/boot/ 56 | project/plugins/project/ 57 | .project 58 | .classpath 59 | .cache 60 | .sbtserver.lock 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # MinIO Spark Select
 2 | MinIO Spark Select enables retrieving only the required data from an object using the S3 Select API.
 3 | 
 4 | ## Requirements
 5 | This library requires:
 6 | - Spark 2.3+
 7 | - Scala 2.11+
 8 | 
 9 | ## Features
10 | - S3 Select is supported with CSV, JSON and Parquet files using the `minioSelectCSV`, `minioSelectJSON` and `minioSelectParquet` values to specify the data format.
11 | - S3 Select is supported across multiple objects.
12 | - S3 Select supports querying SSE-C encrypted objects.
13 | 
14 | ### Limitations
15 | - Spark CSV and JSON options such as nanValue, positiveInf, negativeInf, and options related to corrupt records (for example, failfast and dropmalformed mode) are not supported.
16 | - Using commas (,) within decimals is not supported. For example, 10,000 is not supported but 10000 is.
17 | - The following filters are not pushed down to MinIO:
18 |   - Aggregate functions such as COUNT() and SUM().
19 |   - Filters that CAST() an attribute. For example, CAST(stringColumn as INT) = 1.
20 |   - Filters with an attribute that is an object or is complex. For example, intArray[1] = 1, objectColumn.objectNumber = 1.
21 |   - Filters for which the value is not a literal value. For example, intColumn1 = intColumn2.
22 | - Only the Select [Supported Data Types](https://docs.aws.amazon.com/AmazonS3/latest/dev/s3-glacier-select-sql-reference-data-types.html) are supported, with the documented limitations.
23 | 
24 | ### HowTo
25 | Include this package in your Spark applications using one of the following:
26 | 
27 | #### *spark-shell*, *pyspark*, or *spark-submit*
28 | ```
29 | > $SPARK_HOME/bin/spark-shell --packages io.minio:spark-select_2.11:2.1
30 | ```
31 | 
32 | #### *sbt*
33 | If you use the [sbt-spark-package plugin](http://github.com/databricks/sbt-spark-package), add the following to your sbt build file:
34 | ```
35 | spDependencies += "minio/spark-select:2.1"
36 | ```
37 | Otherwise,
38 | ```
39 | libraryDependencies += "io.minio" % "spark-select_2.11" % "2.1"
40 | ```
41 | 
42 | #### *Maven*
43 | In your `pom.xml`, add:
44 | ```xml
45 | <dependencies>
46 |   <!-- ... -->
47 |   <dependency>
48 |     <groupId>io.minio</groupId>
49 |     <artifactId>spark-select_2.11</artifactId>
50 |     <version>2.1</version>
51 |   </dependency>
52 | </dependencies>
53 | ```
54 | 
55 | #### *Source*
56 | 
57 | Set up all required environment variables.
58 | > NOTE: It is assumed that you have already installed Hadoop 2.8.5 and Spark 2.3.1 locally.
59 | ```
60 | export HADOOP_HOME=${HOME}/spark/hadoop-2.8.5/
61 | export PATH=${PATH}:${HADOOP_HOME}/bin
62 | export SPARK_DIST_CLASSPATH=$(hadoop classpath)
63 | 
64 | export SPARK_HOME=${HOME}/spark/spark-2.3.1-bin-without-hadoop/
65 | export PATH=${PATH}:${SPARK_HOME}/bin
66 | export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64/
67 | 
68 | git clone https://github.com/minio/spark-select
69 | sbt assembly
70 | spark-shell --jars target/scala-2.11/spark-select-assembly-2.1.jar
71 | ```
72 | 
73 | Once `spark-shell` has been invoked successfully, load one of the examples:
74 | ```
75 | scala> :load examples/csv.scala
76 | Loading examples/csv.scala...
77 | import org.apache.spark.sql._ 78 | import org.apache.spark.sql.types._ 79 | schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,IntegerType,false)) 80 | df: org.apache.spark.sql.DataFrame = [name: string, age: int] 81 | +-------+---+ 82 | | name|age| 83 | +-------+---+ 84 | |Michael| 31| 85 | | Andy| 30| 86 | | Justin| 19| 87 | +-------+---+ 88 | 89 | scala> 90 | ``` 91 | 92 | ### API 93 | 94 | #### *PySpark* 95 | ```py 96 | spark 97 | .read 98 | .format("minioSelectCSV") // "minioSelectJSON" for JSON or "minioSelectParquet" for Parquet 99 | .schema(...) // mandatory 100 | .options(...) // optional 101 | .load("s3://path/to/my/datafiles") 102 | ``` 103 | 104 | #### *R* 105 | ``` 106 | read.df("s3://path/to/my/datafiles", "minioSelectCSV", schema) 107 | ``` 108 | 109 | #### *Scala* 110 | ``` 111 | spark 112 | .read 113 | .format("minioSelectCSV") // "minioSelectJSON" for JSON or "minioSelectParquet" for Parquet 114 | .schema(...) // mandatory 115 | .options(...) // optional. Examples: 116 | // .options(Map("quote" -> "\'", "header" -> "true")) or 117 | // .option("quote", "\'").option("header", "true") 118 | .load("s3://path/to/my/datafiles") 119 | ``` 120 | 121 | #### *SQL* 122 | ``` 123 | CREATE TEMPORARY VIEW MyView (number INT, name STRING) USING minioSelectCSV OPTIONS (path "s3://path/to/my/datafiles") 124 | ``` 125 | 126 | ### Options 127 | The following options are available when using `minioSelectCSV` and `minioSelectJSON`. If not specified, default values are used. 128 | 129 | #### *Options with minioSelectCSV* 130 | | Option | Default | Usage | 131 | |---|---|---| 132 | | `compression` | "none" | Indicates whether compression is used. "gzip", "bzip2" are values supported besides "none". 133 | | `delimiter` | "," | Specifies the field delimiter. 134 | | `quote` | '"' | Specifies the quote character. Specifying an empty string is not supported and results in a malformed XML error. 135 | | `escape` | '"' | Specifies the quote escape character. 136 | | `header` | "true" | "false" specifies that there is no header. "true" specifies that a header is in the first line. Only headers in the first line are supported, and empty lines before a header are not supported. 137 | | `comment` | "#" | Specifies the comment character. 138 | 139 | #### *Options with minioSelectJSON* 140 | | Option | Default | Usage | 141 | |---|---|---| 142 | | `compression` | "none" | Indicates whether compression is used. "gzip", "bzip2" are values supported besides "none". 143 | | `multiline` | "false" | "false" specifies that the JSON is in Select LINES format, meaning that each line in the input data contains a single JSON object. "true" specifies that the JSON is in Select DOCUMENT format, meaning that a JSON object can span multiple lines in the input data. 144 | 145 | #### *Options with minioSelectParquet* 146 | There are no **options** needed with Parquet files. 147 | 148 | ### Full Examples 149 | 150 | #### *Scala* 151 | 152 | Schema with two columns for `CSV`. 
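Before running the full examples, the connector needs to know where the MinIO (or S3) endpoint is and how to authenticate. It reads the standard Hadoop S3A keys (`fs.s3a.endpoint`, `fs.s3a.path.style.access`, `fs.s3a.region`, `fs.s3a.access.key`, `fs.s3a.secret.key`) from the active `SparkSession`'s Hadoop configuration; if unset, the endpoint defaults to `https://s3.amazonaws.com` and the region to `us-east-1`. A minimal setup sketch; the endpoint and credential values below are illustrative placeholders, not defaults:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("spark-select example")
  .getOrCreate()

// These are the keys read by Credentials.scala and the Select*Relation classes;
// replace the placeholder values with your own MinIO server and access keys.
val hadoopConf = spark.sparkContext.hadoopConfiguration
hadoopConf.set("fs.s3a.endpoint", "http://127.0.0.1:9000")
hadoopConf.set("fs.s3a.path.style.access", "true")
hadoopConf.set("fs.s3a.access.key", "YOUR-ACCESS-KEY")
hadoopConf.set("fs.s3a.secret.key", "YOUR-SECRET-KEY")
```

With the endpoint and credentials configured, the two-column CSV example below runs unchanged.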
153 | ```scala 154 | import org.apache.spark.sql._ 155 | import org.apache.spark.sql.types._ 156 | 157 | object app { 158 | def main(args: Array[String]) { 159 | val schema = StructType( 160 | List( 161 | StructField("name", StringType, true), 162 | StructField("age", IntegerType, false) 163 | ) 164 | ) 165 | 166 | val df = spark 167 | .read 168 | .format("minioSelectCSV") 169 | .schema(schema) 170 | .load("s3://sjm-airlines/people.csv") 171 | 172 | println(df.show()) 173 | 174 | println(df.select("*").filter("age > 19").show()) 175 | 176 | } 177 | } 178 | ``` 179 | 180 | With custom schema for `JSON`. 181 | ```scala 182 | import org.apache.spark.sql._ 183 | import org.apache.spark.sql.types._ 184 | 185 | object app { 186 | def main(args: Array[String]) { 187 | val schema = StructType( 188 | List( 189 | StructField("name", StringType, true), 190 | StructField("age", IntegerType, false) 191 | ) 192 | ) 193 | 194 | val df = spark 195 | .read 196 | .format("minioSelectJSON") 197 | .schema(schema) 198 | .load("s3://sjm-airlines/people.json") 199 | 200 | println(df.show()) 201 | 202 | println(df.select("*").filter("age > 19").show()) 203 | 204 | } 205 | } 206 | ``` 207 | 208 | With custom schema for `Parquet`. 209 | ```scala 210 | import org.apache.spark.sql._ 211 | import org.apache.spark.sql.types._ 212 | 213 | object app { 214 | def main(args: Array[String]) { 215 | val schema = StructType( 216 | List( 217 | StructField("name", StringType, true), 218 | StructField("age", IntegerType, false) 219 | ) 220 | ) 221 | 222 | val df = spark 223 | .read 224 | .format("minioSelectParquet") 225 | .schema(schema) 226 | .load("s3://sjm-airlines/people.parquet") 227 | 228 | println(df.show()) 229 | 230 | println(df.select("*").filter("age > 19").show()) 231 | 232 | } 233 | } 234 | ``` 235 | 236 | #### *Python* 237 | 238 | Schema with two columns for `CSV`. 239 | ```py 240 | from pyspark.sql import * 241 | from pyspark.sql.types import * 242 | 243 | if __name__ == "__main__": 244 | # create SparkSession 245 | spark = SparkSession.builder \ 246 | .master("local") \ 247 | .appName("spark-select in python") \ 248 | .getOrCreate() 249 | 250 | # filtered schema 251 | st = StructType([ 252 | StructField("name", StringType(), True), 253 | StructField("age", IntegerType(), False), 254 | ]) 255 | 256 | df = spark \ 257 | .read \ 258 | .format('minioSelectCSV') \ 259 | .schema(st) \ 260 | .load("s3://testbucket/people.csv") 261 | 262 | # show all rows. 263 | df.show() 264 | 265 | # show only filtered rows. 
266 | df.select("*").filter("age > 19").show() 267 | ``` 268 | 269 | ``` 270 | > $SPARK_HOME/bin/spark-submit --packages io.minio:spark-select_2.11:2.1 271 | ``` 272 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-select" 2 | 3 | organization := "io.minio" 4 | 5 | scalaVersion := "2.11.12" 6 | 7 | crossScalaVersions := Seq("2.11.12") 8 | 9 | spName := "minio/spark-select" 10 | 11 | spAppendScalaVersion := true 12 | 13 | spIncludeMaven := true 14 | 15 | spIgnoreProvided := true 16 | 17 | sparkVersion := "2.3.1" 18 | 19 | val testSparkVersion = settingKey[String]("The version of Spark to test against.") 20 | 21 | testSparkVersion := sys.props.get("spark.testVersion").getOrElse(sparkVersion.value) 22 | 23 | // used spark components 24 | sparkComponents := Seq("sql") 25 | 26 | assemblyMergeStrategy in assembly := { 27 | case "META-INF/io.netty.versions.properties" => MergeStrategy.concat 28 | case x => 29 | val oldStrategy = (assemblyMergeStrategy in assembly).value 30 | oldStrategy(x) 31 | } 32 | 33 | // Dependent libraries 34 | libraryDependencies ++= Seq( 35 | "com.amazonaws" % "aws-java-sdk" % "1.11.434" exclude("com.fasterxml.jackson.core", "jackson-databind"), 36 | "org.apache.commons" % "commons-csv" % "1.7", 37 | "org.slf4j" % "slf4j-api" % "1.7.5" % "provided", 38 | "org.mockito" % "mockito-core" % "2.0.31-beta" 39 | ) 40 | 41 | libraryDependencies ++= Seq( 42 | "org.apache.spark" %% "spark-core" % testSparkVersion.value % "test" force(), 43 | "org.apache.spark" %% "spark-sql" % testSparkVersion.value % "test" force(), 44 | "org.scala-lang" % "scala-library" % scalaVersion.value % "compile" 45 | ) 46 | 47 | /** 48 | * release settings 49 | */ 50 | 51 | publishMavenStyle := true 52 | 53 | releaseCrossBuild := true 54 | 55 | licenses += ("Apache-2.0", url("http://www.apache.org/licenses/LICENSE-2.0")) 56 | 57 | releasePublishArtifactsAction := PgpKeys.publishSigned.value 58 | 59 | publishArtifact in Test := false 60 | 61 | pomIncludeRepository := { _ => false } 62 | 63 | publishTo := { 64 | val nexus = "https://oss.sonatype.org/" 65 | if (version.value.endsWith("SNAPSHOT")) 66 | Some("snapshots" at nexus + "content/repositories/snapshots") 67 | else 68 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 69 | } 70 | 71 | pomExtra := ( 72 | https://github.com/minio/spark-select 73 | 74 | scm:git:github.com/minio/spark-select 75 | scm:git:git@github.com:minio/spark-select 76 | github.com/minio/spark-select 77 | 78 | 79 | 80 | minio 81 | MinIO 82 | http://www.minio.io 83 | 84 | ) 85 | 86 | // Skip tests during assembly 87 | test in assembly := {} 88 | 89 | ScoverageSbtPlugin.ScoverageKeys.coverageHighlighting := { 90 | if (scalaBinaryVersion.value == "2.10") false 91 | else true 92 | } 93 | 94 | import ReleaseTransformations._ 95 | 96 | // Add publishing to spark packages as another step. 
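// (spPublish is provided by the sbt-spark-package plugin declared in
// project/plugins.sbt; it runs last, after the Sonatype publishing steps below.)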
97 | releaseProcess := Seq[ReleaseStep]( 98 | checkSnapshotDependencies, 99 | inquireVersions, 100 | runTest, 101 | setReleaseVersion, 102 | commitReleaseVersion, 103 | tagRelease, 104 | publishArtifacts, 105 | setNextVersion, 106 | commitNextVersion, 107 | pushChanges, 108 | releaseStepTask(spPublish) 109 | ) 110 | -------------------------------------------------------------------------------- /examples/csv.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import * 2 | from pyspark.sql.types import * 3 | 4 | if __name__ == "__main__": 5 | # create SparkSession 6 | spark = SparkSession.builder \ 7 | .master("local") \ 8 | .appName("spark-select in python") \ 9 | .getOrCreate() 10 | 11 | # filtered schema 12 | st = StructType([ 13 | StructField("name", StringType(), True), 14 | StructField("age", IntegerType(), False), 15 | ]) 16 | 17 | df = spark \ 18 | .read \ 19 | .format('minioSelectCSV') \ 20 | .schema(st) \ 21 | .load("s3://testbucket/people.csv") 22 | 23 | # show all rows. 24 | df.show() 25 | 26 | # show only filtered rows. 27 | df.select("*").filter("age > 19").show() 28 | -------------------------------------------------------------------------------- /examples/csv.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql._ 2 | import org.apache.spark.sql.types._ 3 | 4 | object app { 5 | def main(args: Array[String]) { 6 | val schema = StructType( 7 | List( 8 | StructField("name", StringType, true), 9 | StructField("age", IntegerType, false) 10 | ) 11 | ) 12 | 13 | val df = spark 14 | .read 15 | .format("minioSelectCSV") 16 | .schema(schema) 17 | .load("s3://sjm-airlines/people.csv") 18 | 19 | println(df.show()) 20 | 21 | println(df.select("*").filter("age > 19").show()) 22 | 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /examples/json.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import * 3 | 4 | if __name__ == "__main__": 5 | # create SparkSession 6 | spark = SparkSession.builder \ 7 | .master("local") \ 8 | .appName("spark-select in python") \ 9 | .getOrCreate() 10 | 11 | # filtered schema 12 | st = StructType([ 13 | StructField("name", StringType(), True), 14 | StructField("age", IntegerType(), False), 15 | ]) 16 | 17 | df = spark \ 18 | .read \ 19 | .format('minioSelectJSON') \ 20 | .schema(st) \ 21 | .load("s3://testbucket/people.json") 22 | 23 | # show all rows. 24 | df.show() 25 | 26 | # show only filtered rows. 
27 | df.select("*").filter("age > 19").show() 28 | -------------------------------------------------------------------------------- /examples/json.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql._ 2 | import org.apache.spark.sql.types._ 3 | 4 | object app { 5 | def main(args: Array[String]) { 6 | val schema = StructType( 7 | List( 8 | StructField("name", StringType, true), 9 | StructField("age", IntegerType, false) 10 | ) 11 | ) 12 | 13 | val df = spark 14 | .read 15 | .format("minioSelectJSON") 16 | .schema(schema) 17 | .load("s3://sjm-airlines/people.json") 18 | 19 | println(df.show()) 20 | 21 | println(df.select("*").filter("age > 19").show()) 22 | 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /examples/parquet.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import * 3 | 4 | if __name__ == "__main__": 5 | # create SparkSession 6 | spark = SparkSession.builder \ 7 | .master("local") \ 8 | .appName("spark-select in python") \ 9 | .getOrCreate() 10 | 11 | # filtered schema 12 | st = StructType([ 13 | StructField("name", StringType(), True), 14 | StructField("age", IntegerType(), False), 15 | ]) 16 | 17 | df = spark \ 18 | .read \ 19 | .format('minioSelectParquet') \ 20 | .schema(st) \ 21 | .load("s3://testbucket/people.parquet") 22 | 23 | # show all rows. 24 | df.show() 25 | 26 | # show only filtered rows. 27 | df.select("*").filter("age > 19").show() 28 | -------------------------------------------------------------------------------- /examples/parquet.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql._ 2 | import org.apache.spark.sql.types._ 3 | 4 | object app { 5 | def main(args: Array[String]) { 6 | val schema = StructType( 7 | List( 8 | StructField("name", StringType, true), 9 | StructField("age", IntegerType, false) 10 | ) 11 | ) 12 | 13 | val df = spark 14 | .read 15 | .format("minioSelectParquet") 16 | .schema(schema) 17 | .load("s3://sjm-airlines/people.parquet") 18 | 19 | println(df.show()) 20 | 21 | println(df.select("*").filter("age > 19").show()) 22 | 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /examples/people.csv: -------------------------------------------------------------------------------- 1 | name,age 2 | Michael,31 3 | Andy,30 4 | Justin,19 5 | -------------------------------------------------------------------------------- /examples/people.json: -------------------------------------------------------------------------------- 1 | {"name":"Michael", "age": 31} 2 | {"name":"Andy", "age": 30} 3 | {"name":"Justin", "age": 19} 4 | -------------------------------------------------------------------------------- /examples/people.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minio/spark-select/d8721b46e84639690b7382d4a4e0877996e64b3d/examples/people.parquet -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 2 | -------------------------------------------------------------------------------- /project/build.properties: 
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to the Apache Software Foundation (ASF) under one or more
 3 | # contributor license agreements. See the NOTICE file distributed with
 4 | # this work for additional information regarding copyright ownership.
 5 | # The ASF licenses this file to You under the Apache License, Version 2.0
 6 | # (the "License"); you may not use this file except in compliance with
 7 | # the License. You may obtain a copy of the License at
 8 | #
 9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | sbt.version=0.13.17
18 | 
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
 1 | scalaVersion := "2.10.6"
 2 | 
 3 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
 4 | 
 5 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"
 6 | 
 7 | resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/"
 8 | 
 9 | resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven"
10 | 
11 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "4.0.0")
12 | 
13 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0")
14 | 
15 | addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.4")
16 | 
17 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.6.0")
18 | 
19 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.1.0")
20 | 
21 | addSbtPlugin("org.scoverage" % "sbt-coveralls" % "1.0.0")
22 | 
23 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.10")
24 | 
25 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1")
26 | 
27 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3")
28 | 
--------------------------------------------------------------------------------
/scalastyle-config.xml:
--------------------------------------------------------------------------------
(Scalastyle standard configuration. The XML markup of this file did not survive extraction; only bare line numbers remain, so the rule definitions are omitted here. See scalastyle-config.xml in the repository for the full rule set.)
--------------------------------------------------------------------------------
/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
 1 | io.minio.spark.select.SelectCSVSource
 2 | io.minio.spark.select.SelectJSONSource
 3 | io.minio.spark.select.SelectParquetSource
 4 | 
--------------------------------------------------------------------------------
/src/main/scala/io/minio/spark/select/Credentials.scala:
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package io.minio.spark.select 17 | 18 | import java.net.URI 19 | 20 | // For BasicAWSCredentials 21 | import com.amazonaws.auth.AWSCredentials 22 | import com.amazonaws.auth.AWSCredentialsProvider 23 | import com.amazonaws.auth.BasicAWSCredentials 24 | import com.amazonaws.auth.BasicSessionCredentials 25 | import com.amazonaws.auth.DefaultAWSCredentialsProviderChain 26 | 27 | import org.apache.hadoop.conf.Configuration 28 | 29 | private[spark] object Credentials { 30 | private def staticCredentialsProvider(credentials: AWSCredentials): AWSCredentialsProvider = { 31 | new AWSCredentialsProvider { 32 | override def getCredentials: AWSCredentials = credentials 33 | override def refresh(): Unit = {} 34 | } 35 | } 36 | 37 | def load(location: Option[String], hadoopConfiguration: Configuration): AWSCredentialsProvider = { 38 | val uri = new URI(location.getOrElse("")) 39 | val uriScheme = uri.getScheme 40 | 41 | uriScheme match { 42 | case "s3" | "s3a" => 43 | // This matches what S3A does, with one exception: we don't 44 | // support anonymous credentials. First, try to parse from URI: 45 | Option(uri.getUserInfo).flatMap { userInfo => 46 | if (userInfo.contains(":")) { 47 | val Array(accessKey, secretKey) = userInfo.split(":") 48 | Some(staticCredentialsProvider(new BasicAWSCredentials(accessKey, secretKey))) 49 | } else { 50 | None 51 | } 52 | }.orElse { 53 | val accessKey = hadoopConfiguration.get(s"fs.s3a.access.key", null) 54 | val secretKey = hadoopConfiguration.get(s"fs.s3a.secret.key", null) 55 | val sessionToken = hadoopConfiguration.get(s"fs.s3a.session.token", null) 56 | if (accessKey != null && secretKey != null) { 57 | if (sessionToken != null) { 58 | Some(staticCredentialsProvider(new BasicSessionCredentials(accessKey, secretKey, sessionToken))) 59 | } else { 60 | Some(staticCredentialsProvider(new BasicAWSCredentials(accessKey, secretKey))) 61 | } 62 | } else { 63 | None 64 | } 65 | }.getOrElse { 66 | // Finally, fall back on the instance profile provider 67 | new DefaultAWSCredentialsProviderChain() 68 | } 69 | case other => 70 | throw new IllegalArgumentException(s"Unrecognized scheme $other; expected s3, or s3a") 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/FilterPushdown.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.minio.spark.select 18 | 19 | import java.sql.{Date, Timestamp} 20 | 21 | import org.apache.spark.sql.sources._ 22 | import org.apache.spark.sql.types._ 23 | 24 | /** 25 | * Helper methods for pushing filters into Select queries. 26 | */ 27 | private[spark] object FilterPushdown { 28 | /** 29 | * Build a SQL WHERE clause for the given filters. If a filter cannot be pushed down then no 30 | * condition will be added to the WHERE clause. If none of the filters can be pushed down then 31 | * an empty string will be returned. 32 | * 33 | * @param schema the schema of the table being queried 34 | * @param filters an array of filters, the conjunction of which is the filter condition for the 35 | * scan. 36 | */ 37 | def buildWhereClause(schema: StructType, filters: Seq[Filter]): String = { 38 | val filterExpressions = filters.flatMap(f => buildFilterExpression(schema, f)).mkString(" AND ") 39 | if (filterExpressions.isEmpty) "" else "WHERE " + filterExpressions 40 | } 41 | 42 | /** 43 | * Attempt to convert the given filter into a Select expression. Returns None if the expression 44 | * could not be converted. 45 | */ 46 | def buildFilterExpression(schema: StructType, filter: Filter): Option[String] = { 47 | def buildComparison(attr: String, value: Any, comparisonOp: String): Option[String] = { 48 | getTypeForAttribute(schema, attr).map { dataType => 49 | val sqlEscapedValue: String = dataType match { 50 | case StringType => s""""${value.toString.replace("'", "\\'\\'")}"""" 51 | case DateType => s""""${value.asInstanceOf[Date]}"""" 52 | case TimestampType => s""""${value.asInstanceOf[Timestamp]}"""" 53 | case _ => value.toString 54 | } 55 | s"s."+s""""$attr""""+s" $comparisonOp $sqlEscapedValue" 56 | } 57 | } 58 | 59 | filter match { 60 | case EqualTo(attr, value) => buildComparison(attr, value, "=") 61 | case LessThan(attr, value) => buildComparison(attr, value, "<") 62 | case GreaterThan(attr, value) => buildComparison(attr, value, ">") 63 | case LessThanOrEqual(attr, value) => buildComparison(attr, value, "<=") 64 | case GreaterThanOrEqual(attr, value) => buildComparison(attr, value, ">=") 65 | case _ => None 66 | } 67 | } 68 | 69 | /** 70 | * Use the given schema to look up the attribute's data type. Returns None if the attribute could 71 | * not be resolved. 
72 | */ 73 | private def getTypeForAttribute(schema: StructType, attribute: String): Option[DataType] = { 74 | if (schema.fieldNames.contains(attribute)) { 75 | Some(schema(attribute).dataType) 76 | } else { 77 | None 78 | } 79 | } 80 | 81 | def queryFromSchema(schema: StructType, filters: Array[Filter]): String = { 82 | var columnList = schema.fields.map(x => s"s."+s""""${x.name}"""").mkString(",") 83 | if (columnList.length == 0) { 84 | columnList = "*" 85 | } 86 | val whereClause = buildWhereClause(schema, filters) 87 | if (whereClause.length == 0) { 88 | s"select $columnList from S3Object s" 89 | } else { 90 | s"select $columnList from S3Object s $whereClause" 91 | } 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/Select.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package io.minio.spark.select 17 | 18 | import org.apache.hadoop.conf.Configuration 19 | 20 | // Select API 21 | import com.amazonaws.services.s3.model.JSONInput 22 | import com.amazonaws.services.s3.model.JSONType 23 | import com.amazonaws.services.s3.model.CSVInput 24 | import com.amazonaws.services.s3.model.CSVOutput 25 | import com.amazonaws.services.s3.model.ParquetInput 26 | import com.amazonaws.services.s3.model.CompressionType 27 | import com.amazonaws.services.s3.model.ExpressionType 28 | import com.amazonaws.services.s3.model.SSECustomerKey 29 | import com.amazonaws.services.s3.model.InputSerialization 30 | import com.amazonaws.services.s3.model.OutputSerialization 31 | import com.amazonaws.services.s3.model.SelectObjectContentRequest 32 | import com.amazonaws.services.s3.model.SelectObjectContentResult 33 | import com.amazonaws.services.s3.model.SelectObjectContentEvent 34 | import com.amazonaws.services.s3.model.SelectObjectContentEvent.RecordsEvent 35 | import com.amazonaws.services.s3.model.FileHeaderInfo 36 | 37 | import org.apache.spark.sql.types._ 38 | import org.apache.spark.sql.sources._ 39 | 40 | private[spark] object Select { 41 | private val SERVER_ENCRYPTION_ALGORITHM = s"fs.s3a.server-side-encryption-algorithm" 42 | private val SERVER_ENCRYPTION_KEY = s"fs.s3a.server-side-encryption.key" 43 | 44 | private def compressionType(params: Map[String, String]): CompressionType = { 45 | params.getOrElse("compression", "none") match { 46 | case "none" => CompressionType.NONE 47 | case "gzip" => CompressionType.GZIP 48 | case "bzip2" => CompressionType.BZIP2 49 | } 50 | } 51 | 52 | private def jsonType(params: Map[String, String]): JSONType = { 53 | params.getOrElse("multiline", "false") match { 54 | case "false" => JSONType.LINES 55 | case "true" => JSONType.DOCUMENT 56 | } 57 | } 58 | 59 | private def headerInfo(params: Map[String, String]): FileHeaderInfo = { 60 | params.getOrElse("header", "true") match { 61 | case "false" => 
FileHeaderInfo.NONE 62 | case "true" => FileHeaderInfo.USE 63 | } 64 | } 65 | 66 | private def sseCustomerKey(algo: String, key: String): SSECustomerKey = { 67 | algo match { 68 | case "SSE-C" => 69 | if (key != null) { 70 | new SSECustomerKey(key) 71 | } else { 72 | null 73 | } 74 | case other => 75 | throw new IllegalArgumentException(s"Unrecognized algorithm $algo; expected SSE-C") 76 | } 77 | } 78 | 79 | def requestParquet(bucket: String, key: String, params: Map[String, String], 80 | schema: StructType, filters: Array[Filter], 81 | hadoopConfiguration: Configuration): SelectObjectContentRequest = { 82 | 83 | new SelectObjectContentRequest() { request => 84 | request.setBucketName(bucket) 85 | request.setKey(key) 86 | request.setExpression(FilterPushdown.queryFromSchema(schema, filters)) 87 | request.setExpressionType(ExpressionType.SQL) 88 | val algo = hadoopConfiguration.get(SERVER_ENCRYPTION_ALGORITHM, null) 89 | if (algo != null) { 90 | request.withSSECustomerKey(sseCustomerKey(algo, 91 | hadoopConfiguration.get(SERVER_ENCRYPTION_KEY, null))) 92 | } 93 | 94 | val inputSerialization = new InputSerialization() 95 | val parquetInput = new ParquetInput() 96 | inputSerialization.setParquet(parquetInput) 97 | request.setInputSerialization(inputSerialization) 98 | 99 | val outputSerialization = new OutputSerialization() 100 | val csvOutput = new CSVOutput() 101 | outputSerialization.setCsv(csvOutput) 102 | request.setOutputSerialization(outputSerialization) 103 | } 104 | } 105 | 106 | def requestJSON(bucket: String, key: String, params: Map[String, String], 107 | schema: StructType, filters: Array[Filter], 108 | hadoopConfiguration: Configuration): SelectObjectContentRequest = { 109 | 110 | new SelectObjectContentRequest() { request => 111 | request.setBucketName(bucket) 112 | request.setKey(key) 113 | request.setExpression(FilterPushdown.queryFromSchema(schema, filters)) 114 | request.setExpressionType(ExpressionType.SQL) 115 | val algo = hadoopConfiguration.get(SERVER_ENCRYPTION_ALGORITHM, null) 116 | if (algo != null) { 117 | request.withSSECustomerKey(sseCustomerKey(algo, 118 | hadoopConfiguration.get(SERVER_ENCRYPTION_KEY, null))) 119 | } 120 | 121 | val inputSerialization = new InputSerialization() 122 | val jsonInput = new JSONInput() 123 | jsonInput.withType(jsonType(params)) 124 | inputSerialization.setJson(jsonInput) 125 | inputSerialization.setCompressionType(compressionType(params)) 126 | request.setInputSerialization(inputSerialization) 127 | 128 | val outputSerialization = new OutputSerialization() 129 | val csvOutput = new CSVOutput() 130 | outputSerialization.setCsv(csvOutput) 131 | request.setOutputSerialization(outputSerialization) 132 | } 133 | } 134 | 135 | 136 | def requestCSV(bucket: String, key: String, params: Map[String, String], 137 | schema: StructType, filters: Array[Filter], 138 | hadoopConfiguration: Configuration): SelectObjectContentRequest = { 139 | new SelectObjectContentRequest() { request => 140 | request.setBucketName(bucket) 141 | request.setKey(key) 142 | request.setExpression(FilterPushdown.queryFromSchema(schema, filters)) 143 | request.setExpressionType(ExpressionType.SQL) 144 | val algo = hadoopConfiguration.get(SERVER_ENCRYPTION_ALGORITHM, null) 145 | if (algo != null) { 146 | request.withSSECustomerKey(sseCustomerKey(algo, 147 | hadoopConfiguration.get(SERVER_ENCRYPTION_KEY, null))) 148 | } 149 | 150 | val inputSerialization = new InputSerialization() 151 | val csvInput = new CSVInput() 152 | csvInput.withFileHeaderInfo(headerInfo(params)) 153 
| csvInput.withRecordDelimiter('\n') 154 | csvInput.withQuoteCharacter(params.getOrElse(s"quote", "\"")) 155 | csvInput.withQuoteEscapeCharacter(params.getOrElse(s"escape", "\"")) 156 | csvInput.withComments(params.getOrElse(s"comment", "#")) 157 | csvInput.withFieldDelimiter(params.getOrElse(s"delimiter", ",")) 158 | inputSerialization.setCsv(csvInput) 159 | inputSerialization.setCompressionType(compressionType(params)) 160 | request.setInputSerialization(inputSerialization) 161 | 162 | val outputSerialization = new OutputSerialization() 163 | val csvOutput = new CSVOutput() 164 | csvOutput.withRecordDelimiter('\n') 165 | csvOutput.withFieldDelimiter(params.getOrElse("delimiter", ",")) 166 | outputSerialization.setCsv(csvOutput) 167 | request.setOutputSerialization(outputSerialization) 168 | } 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/SelectCSVRelation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package io.minio.spark.select 17 | 18 | import scala.collection.JavaConversions.asScalaBuffer 19 | import scala.collection.JavaConverters._ 20 | 21 | import scala.util.control.NonFatal 22 | 23 | import org.slf4j.LoggerFactory 24 | 25 | import java.io.InputStreamReader 26 | import java.io.BufferedReader 27 | 28 | // Import all utilities 29 | import io.minio.spark.select.util._ 30 | 31 | // Apache commons 32 | import org.apache.commons.csv._ 33 | 34 | // For AmazonS3 client 35 | import com.amazonaws.services.s3.AmazonS3 36 | import com.amazonaws.services.s3.AmazonS3URI 37 | import com.amazonaws.services.s3.AmazonS3ClientBuilder 38 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration 39 | 40 | import com.amazonaws.services.s3.model.ListObjectsV2Request 41 | import com.amazonaws.services.s3.model.ListObjectsV2Result 42 | import com.amazonaws.services.s3.model.S3ObjectSummary 43 | 44 | import org.apache.spark.rdd.RDD 45 | import org.apache.spark.sql.types._ 46 | import org.apache.spark.sql.sources._ 47 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 48 | import org.apache.spark.sql.{DataFrame, Row, SQLContext} 49 | 50 | import scala.collection.mutable.{ListBuffer, ArrayBuffer} 51 | 52 | /** 53 | * Abstract relation class to download data from S3 compatible storage 54 | */ 55 | case class SelectCSVRelation protected[spark] ( 56 | location: Option[String], 57 | params: Map[String, String], 58 | userSchema: StructType = null)(@transient val sqlContext: SQLContext) 59 | extends BaseRelation 60 | with TableScan 61 | with PrunedScan 62 | with PrunedFilteredScan { 63 | 64 | private val API_PATH_STYLE_ACCESS = s"fs.s3a.path.style.access" 65 | private val SERVER_ENDPOINT = s"fs.s3a.endpoint" 66 | private val SERVER_REGION = s"fs.s3a.region" 67 | 68 | private val hadoopConfiguration = 
sqlContext.sparkContext.hadoopConfiguration 69 | private val pathStyleAccess = hadoopConfiguration.get(API_PATH_STYLE_ACCESS, "false") == "true" 70 | private val endpoint = hadoopConfiguration.get(SERVER_ENDPOINT, "https://s3.amazonaws.com") 71 | private val region = hadoopConfiguration.get(SERVER_REGION, "us-east-1") 72 | private val s3Client = 73 | AmazonS3ClientBuilder.standard() 74 | .withCredentials(Credentials.load(location, hadoopConfiguration)) 75 | .withPathStyleAccessEnabled(pathStyleAccess) 76 | .withEndpointConfiguration(new EndpointConfiguration(endpoint, region)) 77 | .build() 78 | 79 | private val logger = LoggerFactory.getLogger(SelectCSVRelation.getClass) 80 | 81 | override lazy val schema: StructType = Option(userSchema).getOrElse({ 82 | // With no schema we return error. 83 | throw new RuntimeException(s"Schema cannot be empty") 84 | }) 85 | 86 | private def getRows(prunedSchema: StructType, filters: Array[Filter]): Seq[Row] = { 87 | var records = new ListBuffer[Row] 88 | var req = new ListObjectsV2Request() 89 | var result = new ListObjectsV2Result() 90 | var s3URI = S3URI.toAmazonS3URI(location.getOrElse("")) 91 | 92 | req.withBucketName(s3URI.getBucket()) 93 | req.withPrefix(s3URI.getKey().stripSuffix("*")) 94 | req.withMaxKeys(1000) 95 | 96 | val csvFormat = CSVFormat.DEFAULT 97 | .withHeader(prunedSchema.fields.map(x => x.name): _*) 98 | .withRecordSeparator("\n") 99 | .withDelimiter(params.getOrElse("delimiter", ",").charAt(0)) 100 | .withQuote(params.getOrElse("quote", "\"").charAt(0)) 101 | .withEscape(params.getOrElse(s"escape", "\"").charAt(0)) 102 | .withCommentMarker(params.getOrElse(s"comment", "#").charAt(0)) 103 | 104 | do { 105 | result = s3Client.listObjectsV2(req) 106 | asScalaBuffer(result.getObjectSummaries()).foreach(objectSummary => { 107 | val in = s3Client.selectObjectContent( 108 | Select.requestCSV( 109 | objectSummary.getBucketName(), 110 | objectSummary.getKey(), 111 | params, 112 | prunedSchema, 113 | filters, 114 | hadoopConfiguration) 115 | ).getPayload().getRecordsInputStream() 116 | var parser = CSVParser.parse(in, java.nio.charset.Charset.forName("UTF-8"), csvFormat) 117 | try { 118 | for (record <- parser.asScala) { 119 | records += Row.fromSeq(prunedSchema.fields.map(x => { 120 | TypeCast.castTo(record.get(x.name), x.dataType, x.nullable) 121 | })) 122 | } 123 | } catch { 124 | case NonFatal(e) => { 125 | logger.error(s"Exception while parsing ", e) 126 | } 127 | } 128 | parser.close() 129 | }) 130 | req.setContinuationToken(result.getNextContinuationToken()) 131 | } while (result.isTruncated()) 132 | records.toList 133 | } 134 | 135 | override def toString: String = s"SelectCSVRelation()" 136 | 137 | private def tokenRDD(schema: StructType, filters: Array[Filter]): RDD[Row] = { 138 | sqlContext.sparkContext.makeRDD(getRows(schema, filters)) 139 | } 140 | 141 | override def buildScan(): RDD[Row] = { 142 | tokenRDD(schema, null) 143 | } 144 | 145 | override def buildScan(columns: Array[String]): RDD[Row] = { 146 | tokenRDD(pruneSchema(schema, columns), null) 147 | } 148 | 149 | override def buildScan(columns: Array[String], filters: Array[Filter]): RDD[Row] = { 150 | tokenRDD(pruneSchema(schema, columns), filters) 151 | } 152 | 153 | private def pruneSchema(schema: StructType, columns: Array[String]): StructType = { 154 | val fieldMap = Map(schema.fields.map(x => x.name -> x): _*) 155 | new StructType(columns.map(name => fieldMap(name))) 156 | } 157 | } 158 | 
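The relation above is where column pruning and filter pushdown meet: `buildScan(columns, filters)` prunes the user schema down to the requested columns and passes the remaining filters to `FilterPushdown.queryFromSchema`, which builds the S3 Select SQL placed in each `SelectObjectContentRequest`. A rough sketch of the expression produced for the README example; note that `FilterPushdown` is `private[spark]`, so this is illustrative rather than a public API:

```scala
import org.apache.spark.sql.sources.{Filter, GreaterThan}
import org.apache.spark.sql.types._

// The same two-column schema used in the examples.
val schema = StructType(List(
  StructField("name", StringType, true),
  StructField("age", IntegerType, false)))

// Spark hands df.filter("age > 19") to the relation as GreaterThan("age", 19).
val query = FilterPushdown.queryFromSchema(schema, Array[Filter](GreaterThan("age", 19)))
// query == """select s."name",s."age" from S3Object s WHERE s."age" > 19"""
```

Filters that cannot be converted (for example casts or column-to-column comparisons) make `buildFilterExpression` return `None`; they are dropped from the WHERE clause and left for Spark to evaluate on the returned rows.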
-------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/SelectCSVSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package io.minio.spark.select 17 | 18 | // Java standard libraries 19 | import java.io.File 20 | 21 | // Spark internal libraries 22 | import org.apache.spark.sql.SQLContext 23 | import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} 24 | import org.apache.spark.sql.types.StructType 25 | 26 | import org.apache.spark.sql.sources.DataSourceRegister 27 | 28 | class SelectCSVSource 29 | extends SchemaRelationProvider 30 | with DataSourceRegister { 31 | 32 | private def checkPath(parameters: Map[String, String]): String = { 33 | parameters.getOrElse("path", sys.error("'path' must be specified for CSV data.")) 34 | } 35 | 36 | /** 37 | * Short alias for spark-select data source. 38 | */ 39 | override def shortName(): String = "minioSelectCSV" 40 | 41 | override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectCSVRelation = { 42 | val path = checkPath(params) 43 | SelectCSVRelation(Some(path), params, schema)(sqlContext) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/SelectJSONRelation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package io.minio.spark.select 17 | 18 | import scala.collection.JavaConversions.asScalaBuffer 19 | 20 | import java.io.InputStreamReader 21 | import java.io.BufferedReader 22 | 23 | // Import all utilities 24 | import io.minio.spark.select.util._ 25 | 26 | // Apache commons 27 | import org.apache.commons.csv.{CSVFormat, QuoteMode} 28 | 29 | // For AmazonS3 client 30 | import com.amazonaws.services.s3.AmazonS3 31 | import com.amazonaws.services.s3.AmazonS3URI 32 | import com.amazonaws.services.s3.AmazonS3ClientBuilder 33 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration 34 | 35 | import com.amazonaws.services.s3.model.ListObjectsV2Request 36 | import com.amazonaws.services.s3.model.ListObjectsV2Result 37 | import com.amazonaws.services.s3.model.S3ObjectSummary 38 | 39 | import org.apache.spark.rdd.RDD 40 | import org.apache.spark.sql.types._ 41 | import org.apache.spark.sql.sources._ 42 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 43 | import org.apache.spark.sql.{DataFrame, Row, SQLContext} 44 | 45 | import scala.collection.mutable.{ListBuffer, ArrayBuffer} 46 | 47 | /** 48 | * Abstract relation class to download data from S3 compatible storage 49 | */ 50 | case class SelectJSONRelation protected[spark] ( 51 | location: Option[String], 52 | params: Map[String, String], 53 | userSchema: StructType = null)(@transient val sqlContext: SQLContext) 54 | extends BaseRelation 55 | with TableScan 56 | with PrunedScan 57 | with PrunedFilteredScan { 58 | 59 | private val API_PATH_STYLE_ACCESS = s"fs.s3a.path.style.access" 60 | private val SERVER_ENDPOINT = s"fs.s3a.endpoint" 61 | private val SERVER_REGION = s"fs.s3a.region" 62 | 63 | private val hadoopConfiguration = sqlContext.sparkContext.hadoopConfiguration 64 | private val pathStyleAccess = hadoopConfiguration.get(API_PATH_STYLE_ACCESS, "false") == "true" 65 | private val endpoint = hadoopConfiguration.get(SERVER_ENDPOINT, "https://s3.amazonaws.com") 66 | private val region = hadoopConfiguration.get(SERVER_REGION, "us-east-1") 67 | private val s3Client = 68 | AmazonS3ClientBuilder.standard() 69 | .withCredentials(Credentials.load(location, hadoopConfiguration)) 70 | .withPathStyleAccessEnabled(pathStyleAccess) 71 | .withEndpointConfiguration(new EndpointConfiguration(endpoint, region)) 72 | .build() 73 | 74 | override lazy val schema: StructType = Option(userSchema).getOrElse({ 75 | // With no schema we return error. 
76 | throw new RuntimeException(s"Schema cannot be empty") 77 | }) 78 | 79 | 80 | private def getRows(schema: StructType, filters: Array[Filter]): Seq[Row] = { 81 | var records = new ListBuffer[Row] 82 | var req = new ListObjectsV2Request() 83 | var result = new ListObjectsV2Result() 84 | var s3URI = S3URI.toAmazonS3URI(location.getOrElse("")) 85 | 86 | req.withBucketName(s3URI.getBucket()) 87 | req.withPrefix(s3URI.getKey().stripSuffix("*")) 88 | req.withMaxKeys(1000) 89 | 90 | do { 91 | result = s3Client.listObjectsV2(req) 92 | asScalaBuffer(result.getObjectSummaries()).foreach(objectSummary => { 93 | val br = new BufferedReader(new InputStreamReader( 94 | s3Client.selectObjectContent( 95 | Select.requestJSON( 96 | objectSummary.getBucketName(), 97 | objectSummary.getKey(), 98 | params, 99 | schema, 100 | filters, 101 | hadoopConfiguration) 102 | ).getPayload().getRecordsInputStream())) 103 | var line : String = null 104 | while ( {line = br.readLine(); line != null}) { 105 | var row = new Array[Any](schema.fields.length) 106 | var rowValues = line.split(",") 107 | var index = 0 108 | while (index < rowValues.length) { 109 | val field = schema.fields(index) 110 | row(index) = TypeCast.castTo(rowValues(index), field.dataType, 111 | field.nullable) 112 | index += 1 113 | } 114 | records += Row.fromSeq(row) 115 | } 116 | br.close() 117 | }) 118 | req.setContinuationToken(result.getNextContinuationToken()) 119 | } while (result.isTruncated()) 120 | records.toList 121 | } 122 | 123 | override def toString: String = s"SelectJSONRelation()" 124 | 125 | private def tokenRDD(schema: StructType, filters: Array[Filter]): RDD[Row] = { 126 | sqlContext.sparkContext.makeRDD(getRows(schema, filters)) 127 | } 128 | 129 | override def buildScan(): RDD[Row] = { 130 | tokenRDD(schema, null) 131 | } 132 | 133 | override def buildScan(columns: Array[String]): RDD[Row] = { 134 | tokenRDD(pruneSchema(schema, columns), null) 135 | } 136 | 137 | override def buildScan(columns: Array[String], filters: Array[Filter]): RDD[Row] = { 138 | tokenRDD(pruneSchema(schema, columns), filters) 139 | } 140 | 141 | private def pruneSchema(schema: StructType, columns: Array[String]): StructType = { 142 | val fieldMap = Map(schema.fields.map(x => x.name -> x): _*) 143 | new StructType(columns.map(name => fieldMap(name))) 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/SelectJSONSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package io.minio.spark.select 17 | 18 | // Java standard libraries 19 | import java.io.File 20 | 21 | // Spark internal libraries 22 | import org.apache.spark.sql.SQLContext 23 | import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} 24 | import org.apache.spark.sql.types.StructType 25 | 26 | import org.apache.spark.sql.sources.DataSourceRegister 27 | 28 | class SelectJSONSource 29 | extends SchemaRelationProvider 30 | with DataSourceRegister { 31 | 32 | private def checkPath(parameters: Map[String, String]): String = { 33 | parameters.getOrElse("path", sys.error("'path' must be specified for JSON data.")) 34 | } 35 | 36 | /** 37 | * Short alias for spark-select data source. 38 | */ 39 | override def shortName(): String = "minioSelectJSON" 40 | 41 | override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectJSONRelation = { 42 | val path = checkPath(params) 43 | SelectJSONRelation(Some(path), params, schema)(sqlContext) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/SelectParquetRelation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package io.minio.spark.select 17 | 18 | import scala.collection.JavaConversions.asScalaBuffer 19 | 20 | import java.io.InputStreamReader 21 | import java.io.BufferedReader 22 | 23 | // Import all utilities 24 | import io.minio.spark.select.util._ 25 | 26 | // For AmazonS3 client 27 | import com.amazonaws.services.s3.AmazonS3 28 | import com.amazonaws.services.s3.AmazonS3URI 29 | import com.amazonaws.services.s3.AmazonS3ClientBuilder 30 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration 31 | 32 | import com.amazonaws.services.s3.model.ListObjectsV2Request 33 | import com.amazonaws.services.s3.model.ListObjectsV2Result 34 | import com.amazonaws.services.s3.model.S3ObjectSummary 35 | 36 | import org.apache.spark.rdd.RDD 37 | import org.apache.spark.sql.types._ 38 | import org.apache.spark.sql.sources._ 39 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 40 | import org.apache.spark.sql.{DataFrame, Row, SQLContext} 41 | 42 | import scala.collection.mutable.{ListBuffer, ArrayBuffer} 43 | 44 | /** 45 | * Abstract relation class to download data from S3 compatible storage 46 | */ 47 | case class SelectParquetRelation protected[spark] ( 48 | location: Option[String], 49 | params: Map[String, String], 50 | userSchema: StructType = null)(@transient val sqlContext: SQLContext) 51 | extends BaseRelation 52 | with TableScan 53 | with PrunedScan 54 | with PrunedFilteredScan { 55 | 56 | private val API_PATH_STYLE_ACCESS = s"fs.s3a.path.style.access" 57 | private val SERVER_ENDPOINT = s"fs.s3a.endpoint" 58 | private val SERVER_REGION = s"fs.s3a.region" 59 | 60 | private val hadoopConfiguration = sqlContext.sparkContext.hadoopConfiguration 61 | private val pathStyleAccess = hadoopConfiguration.get(API_PATH_STYLE_ACCESS, "false") == "true" 62 | private val endpoint = hadoopConfiguration.get(SERVER_ENDPOINT, "https://s3.amazonaws.com") 63 | private val region = hadoopConfiguration.get(SERVER_REGION, "us-east-1") 64 | private val s3Client = 65 | AmazonS3ClientBuilder.standard() 66 | .withCredentials(Credentials.load(location, hadoopConfiguration)) 67 | .withPathStyleAccessEnabled(pathStyleAccess) 68 | .withEndpointConfiguration(new EndpointConfiguration(endpoint, region)) 69 | .build() 70 | 71 | override lazy val schema: StructType = Option(userSchema).getOrElse({ 72 | // With no schema we return error. 
73 | throw new RuntimeException(s"Schema cannot be empty") 74 | }) 75 | 76 | private def getRows(schema: StructType, filters: Array[Filter]): Seq[Row] = { 77 | var records = new ListBuffer[Row] 78 | var req = new ListObjectsV2Request() 79 | var result = new ListObjectsV2Result() 80 | var s3URI = S3URI.toAmazonS3URI(location.getOrElse("")) 81 | 82 | req.withBucketName(s3URI.getBucket()) 83 | req.withPrefix(s3URI.getKey().stripSuffix("*")) 84 | req.withMaxKeys(1000) 85 | 86 | do { 87 | result = s3Client.listObjectsV2(req) 88 | asScalaBuffer(result.getObjectSummaries()).foreach(objectSummary => { 89 | val br = new BufferedReader(new InputStreamReader( 90 | s3Client.selectObjectContent( 91 | Select.requestParquet( 92 | objectSummary.getBucketName(), 93 | objectSummary.getKey(), 94 | params, 95 | schema, 96 | filters, 97 | hadoopConfiguration) 98 | ).getPayload().getRecordsInputStream())) 99 | var line : String = null 100 | while ( {line = br.readLine(); line != null}) { 101 | var row = new Array[Any](schema.fields.length) 102 | var rowValues = line.split(",") 103 | var index = 0 104 | while (index < rowValues.length) { 105 | val field = schema.fields(index) 106 | row(index) = TypeCast.castTo(rowValues(index), field.dataType, 107 | field.nullable) 108 | index += 1 109 | } 110 | records += Row.fromSeq(row) 111 | } 112 | br.close() 113 | }) 114 | req.setContinuationToken(result.getNextContinuationToken()) 115 | } while (result.isTruncated()) 116 | records.toList 117 | } 118 | 119 | override def toString: String = s"SelectParquetRelation()" 120 | 121 | private def tokenRDD(schema: StructType, filters: Array[Filter]): RDD[Row] = { 122 | sqlContext.sparkContext.makeRDD(getRows(schema, filters)) 123 | } 124 | 125 | override def buildScan(): RDD[Row] = { 126 | tokenRDD(schema, null) 127 | } 128 | 129 | override def buildScan(columns: Array[String]): RDD[Row] = { 130 | tokenRDD(pruneSchema(schema, columns), null) 131 | } 132 | 133 | override def buildScan(columns: Array[String], filters: Array[Filter]): RDD[Row] = { 134 | tokenRDD(pruneSchema(schema, columns), filters) 135 | } 136 | 137 | private def pruneSchema(schema: StructType, columns: Array[String]): StructType = { 138 | val fieldMap = Map(schema.fields.map(x => x.name -> x): _*) 139 | new StructType(columns.map(name => fieldMap(name))) 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/SelectParquetSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package io.minio.spark.select 17 | 18 | // Java standard libraries 19 | import java.io.File 20 | 21 | // Spark internal libraries 22 | import org.apache.spark.sql.SQLContext 23 | import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} 24 | import org.apache.spark.sql.types.StructType 25 | 26 | import org.apache.spark.sql.sources.DataSourceRegister 27 | 28 | class SelectParquetSource 29 | extends SchemaRelationProvider 30 | with DataSourceRegister { 31 | 32 | private def checkPath(parameters: Map[String, String]): String = { 33 | parameters.getOrElse("path", sys.error("'path' must be specified for Parquet data.")) 34 | } 35 | 36 | /** 37 | * Short alias for spark-select data source. 38 | */ 39 | override def shortName(): String = "minioSelectParquet" 40 | 41 | override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectParquetRelation = { 42 | val path = checkPath(params) 43 | SelectParquetRelation(Some(path), params, schema)(sqlContext) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/util/S3URI.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package io.minio.spark.select.util 17 | 18 | import java.net.URI 19 | import com.amazonaws.services.s3.AmazonS3URI 20 | 21 | object S3URI { 22 | private[select] def toAmazonS3URI( 23 | location: String): AmazonS3URI = { 24 | val uri = new URI(location) 25 | val uriScheme = uri.getScheme 26 | uriScheme match { 27 | case "s3" => 28 | new AmazonS3URI(uri) 29 | case "s3a" | "s3n" => 30 | new AmazonS3URI(new URI("s3", uri.getUserInfo, uri.getHost, uri.getPort, uri.getPath, 31 | uri.getQuery, uri.getFragment)) 32 | case other => 33 | throw new IllegalArgumentException(s"Unrecognized scheme $other; expected s3, or s3a or s3n") 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/io/minio/spark/select/util/TypeCast.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 MinIO, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */
16 | package io.minio.spark.select.util
17 |
18 | import java.math.BigDecimal
19 | import java.sql.{Date, Timestamp}
20 | import java.text.{SimpleDateFormat, NumberFormat}
21 | import java.util.Locale
22 |
23 | import org.apache.spark.sql.types._
24 |
25 | import scala.util.Try
26 |
27 | /**
28 | * Utility functions for type casting
29 | */
30 | object TypeCast {
31 |
32 | /**
33 | * Casts the given string datum to the specified type.
34 | * Currently we do not support complex types (ArrayType, MapType, StructType).
35 | *
36 | * For string types, this is simply the datum.
37 | * For other nullable types, this is null if the string datum is empty; otherwise the datum is parsed into the target type.
38 | *
39 | * @param datum string value
40 | * @param castType SparkSQL type
41 | */
42 | private[select] def castTo(
43 | datum: String,
44 | castType: DataType,
45 | nullable: Boolean = true,
46 | treatEmptyValuesAsNulls: Boolean = false,
47 | nullValue: String = "",
48 | dateFormatter: SimpleDateFormat = null): Any = {
49 | if ((datum == nullValue &&
50 | nullable) ||
51 | (treatEmptyValuesAsNulls && datum == "")) {
52 | null
53 | } else {
54 | castType match {
55 | case _: ByteType => datum.toByte
56 | case _: ShortType => datum.toShort
57 | case _: IntegerType => datum.toInt
58 | case _: LongType => datum.toLong
59 | case _: FloatType => Try(datum.toFloat)
60 | .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).floatValue())
61 | case _: DoubleType => Try(datum.toDouble)
62 | .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).doubleValue())
63 | case _: BooleanType => datum.toBoolean
64 | case _: DecimalType => new BigDecimal(datum.replaceAll(",", ""))
65 | case _: TimestampType if dateFormatter != null =>
66 | new Timestamp(dateFormatter.parse(datum).getTime)
67 | case _: TimestampType => Timestamp.valueOf(datum)
68 | case _: DateType if dateFormatter != null =>
69 | new Date(dateFormatter.parse(datum).getTime)
70 | case _: DateType => Date.valueOf(datum)
71 | case _: StringType => datum
72 | case _ => throw new RuntimeException(s"Unsupported type: ${castType.typeName}")
73 | }
74 | }
75 | }
76 |
77 | /**
78 | * Helper method that converts the string representation of a character to the actual character.
79 | * It handles some Java escape sequences and throws an exception if the given string is longer than one
80 | * character.
81 | * 82 | */ 83 | @throws[IllegalArgumentException] 84 | private[select] def toChar(str: String): Char = { 85 | if (str.charAt(0) == '\\') { 86 | str.charAt(1) 87 | match { 88 | case 't' => '\t' 89 | case 'r' => '\r' 90 | case 'b' => '\b' 91 | case 'f' => '\f' 92 | case '\"' => '\"' // In case user changes quote char and uses \" as delimiter in options 93 | case '\'' => '\'' 94 | case 'u' if str == """\u0000""" => '\u0000' 95 | case _ => 96 | throw new IllegalArgumentException(s"Unsupported special character for delimiter: $str") 97 | } 98 | } else if (str.length == 1) { 99 | str.charAt(0) 100 | } else { 101 | throw new IllegalArgumentException(s"Delimiter cannot be more than one character: $str") 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/test/scala/io/minio/spark/select/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minio/spark-select/d8721b46e84639690b7382d4a4e0877996e64b3d/src/test/scala/io/minio/spark/select/.keep -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "2.1" 2 | --------------------------------------------------------------------------------
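The relations above lean on the two package-private helpers in util. The sketch below (not part of the repository; the UtilSketch object and all values are illustrative assumptions) shows their behaviour under the defaults seen in the source. It is declared in the io.minio.spark.select package because castTo, toChar and toAmazonS3URI are private[select].

// Hypothetical sketch exercising S3URI and TypeCast; not shipped with the project.
package io.minio.spark.select

import java.text.SimpleDateFormat

import org.apache.spark.sql.types.{DateType, IntegerType, StringType}

import io.minio.spark.select.util.{S3URI, TypeCast}

object UtilSketch {
  def main(args: Array[String]): Unit = {
    // s3a:// and s3n:// locations are normalized to the s3:// form before parsing.
    val uri = S3URI.toAmazonS3URI("s3a://example-bucket/data/people.csv")
    println(uri.getBucket) // example-bucket
    println(uri.getKey)    // data/people.csv

    // Non-empty datums are parsed into the target type; an empty datum maps to null
    // for nullable fields because nullValue defaults to "".
    println(TypeCast.castTo("42", IntegerType))               // 42
    println(TypeCast.castTo("", StringType, nullable = true)) // null
    println(TypeCast.castTo("2019-01-31", DateType,
      dateFormatter = new SimpleDateFormat("yyyy-MM-dd")))    // 2019-01-31

    // Escaped delimiter strings resolve to single characters.
    println(TypeCast.toChar("\\t") == '\t')                   // true
  }
}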