├── .gitignore
├── LICENSE
├── README.md
├── build.sbt
├── build
│   ├── sbt
│   └── sbt-launch-lib.bash
├── project
│   ├── build.properties
│   └── plugins.sbt
└── src
    ├── main
    │   └── scala
    │       └── com
    │           └── springml
    │               └── spark
    │                   └── sftp
    │                       ├── DatasetRelation.scala
    │                       ├── DefaultSource.scala
    │                       ├── DeleteTempFileShutdownHook.scala
    │                       ├── constants.scala
    │                       └── util
    │                           └── Utils.scala
    └── test
        ├── resources
        │   ├── books.orc
        │   ├── books.xml
        │   ├── custom-delimiter.csv
        │   ├── people.json
        │   ├── plaintext.txt
        │   ├── sample.csv
        │   ├── sample_quoted_multiline.csv
        │   ├── users.avro
        │   └── users.parquet
        └── scala
            └── com
                └── springml
                    └── spark
                        └── sftp
                            ├── CustomSchemaTest.scala
                            └── TestDatasetRelation.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 | *.pyc
4 | sbt/*.jar
5 |
6 | # sbt specific
7 | .cache/
8 | .history/
9 | .lib/
10 | dist/*
11 | target/
12 | lib_managed/
13 | src_managed/
14 | project/boot/
15 | project/plugins/project/
16 | /bin/
17 |
18 | .cache-main
19 | .classpath
20 | .project
21 | .settings/
22 | .cache-tests
23 |
24 | # intellij
25 | .idea
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License, Version 2.0
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
13 |
14 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
15 |
16 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
17 |
18 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
19 |
20 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
21 |
22 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
23 |
24 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
25 |
26 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
27 |
28 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
29 |
30 | 2. Grant of Copyright License.
31 |
32 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
33 |
34 | 3. Grant of Patent License.
35 |
36 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
37 |
38 | 4. Redistribution.
39 |
40 | You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
41 |
42 | You must give any other recipients of the Work or Derivative Works a copy of this License; and
43 | You must cause any modified files to carry prominent notices stating that You changed the files; and
44 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
45 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
46 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
47 |
48 | 5. Submission of Contributions.
49 |
50 | Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
51 |
52 | 6. Trademarks.
53 |
54 | This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
55 |
56 | 7. Disclaimer of Warranty.
57 |
58 | Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
59 |
60 | 8. Limitation of Liability.
61 |
62 | In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
63 |
64 | 9. Accepting Warranty or Additional Liability.
65 |
66 | While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
67 |
68 | END OF TERMS AND CONDITIONS
69 |
70 | APPENDIX: How to apply the Apache License to your work
71 |
72 | To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.
73 |
74 | Copyright [yyyy] [name of copyright owner]
75 |
76 | Licensed under the Apache License, Version 2.0 (the "License");
77 | you may not use this file except in compliance with the License.
78 | You may obtain a copy of the License at
79 |
80 | http://www.apache.org/licenses/LICENSE-2.0
81 |
82 | Unless required by applicable law or agreed to in writing, software
83 | distributed under the License is distributed on an "AS IS" BASIS,
84 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
85 | See the License for the specific language governing permissions and
86 | limitations under the License.
87 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark SFTP Connector Library
2 |
3 | A library for constructing dataframes by downloading files from an SFTP server, and for writing dataframes back to an SFTP server
4 |
5 | ## Requirements
6 |
7 | This library requires Spark 2.x.
8 |
9 | For Spark 1.x support, please check [spark1.x](https://github.com/springml/spark-sftp/tree/spark1.x) branch.
10 |
11 | ## Linking
12 | You can link against this library in your program in the following ways:
13 |
14 | ### Maven Dependency
15 | ```
16 | <dependency>
17 |     <groupId>com.springml</groupId>
18 |     <artifactId>spark-sftp_2.11</artifactId>
19 |     <version>1.1.3</version>
20 | </dependency>
21 |
22 | ```
23 |
24 | ### SBT Dependency
25 | ```
26 | libraryDependencies += "com.springml" % "spark-sftp_2.11" % "1.1.3"
27 | ```
28 |
29 |
30 | ## Using with Spark shell
31 | This package can be added to Spark using the `--packages` command line option. For example, to include it when starting the spark shell:
32 |
33 | ```
34 | $ bin/spark-shell --packages com.springml:spark-sftp_2.11:1.1.3
35 | ```
36 |
37 | ## Features
38 | This package can be used to construct a Spark dataframe by downloading files from an SFTP server.
39 |
40 | It can also be used to write a Spark dataframe as a csv, json, or avro file to an SFTP server.
41 |
42 | This library requires the following options:
43 | * `path`: Location of the file on the SFTP server to be used for dataframe construction
44 | * `username`: SFTP Server Username.
45 | * `password`: (Optional) SFTP Server Password.
46 | * `pem`: (Optional) Location of PEM file. Either pem or password has to be specified
47 | * `pemPassphrase`: (Optional) Passphrase for PEM file.
48 | * `host`: SFTP Host.
49 | * `port`: (Optional) Port in which SFTP server is running. Default value 22.
50 | * `fileType`: Type of the file. Supported types are csv, txt, json, avro, parquet, xml and orc
51 | * `inferSchema`: (Optional) Infer the schema from the file content. Applicable only for csv fileType
52 | * `header`: (Optional) Applicable only for csv fileType. Whether the first row of the CSV file is a header. Default is true.
53 | * `delimiter`: (Optional) Set the field delimiter. Applicable only for csv fileType. Default is comma.
54 | * `quote`: (Optional) Set the quote character. Applicable only for csv fileType. Default is ".
55 | * `escape`: (Optional) Set the escape character. Applicable only for csv fileType. Default is \.
56 | * `multiLine`: (Optional) Set the multiline. Applicable only for csv fileType. Default is false.
57 | * `codec`: (Optional) Applicable only for csv fileType. Compression codec to use when saving to file. Should be the fully qualified name of a class implementing org.apache.hadoop.io.compress.CompressionCodec or one of case-insensitive shorten names (bzip2, gzip, lz4, and snappy). Defaults to no compression when a codec is not specified.
58 |
59 | ### Scala API
60 | ```scala
61 |
62 | // Construct Spark dataframe using file in FTP server
63 | val df = spark.read.
64 | format("com.springml.spark.sftp").
65 | option("host", "SFTP_HOST").
66 | option("username", "SFTP_USER").
67 | option("password", "****").
68 | option("fileType", "csv").
69 | option("delimiter", ";").
70 | option("quote", "\"").
71 | option("escape", "\\").
72 | option("multiLine", "true").
73 | option("inferSchema", "true").
74 | load("/ftp/files/sample.csv")
75 |
76 | // Write dataframe as CSV file to FTP server
77 | df.write.
78 | format("com.springml.spark.sftp").
79 | option("host", "SFTP_HOST").
80 | option("username", "SFTP_USER").
81 | option("password", "****").
82 | option("fileType", "csv").
83 | option("delimiter", ";").
84 | option("codec", "bzip2").
85 | save("/ftp/files/sample.csv")
86 |
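// Write dataframe as a Parquet file to FTP server
// (a sketch: "parquet" is among the supported fileTypes; host, credentials and path are placeholders)
df.write.
  format("com.springml.spark.sftp").
  option("host", "SFTP_HOST").
  option("username", "SFTP_USER").
  option("password", "****").
  option("fileType", "parquet").
  save("/ftp/files/sample.parquet")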
87 |
88 | // Construct spark dataframe using text file in FTP server
89 | val df = spark.read.
90 | format("com.springml.spark.sftp").
91 | option("host", "SFTP_HOST").
92 | option("username", "SFTP_USER").
93 | option("password", "****").
94 | option("fileType", "txt").
95 | load("config")
96 |
97 | // Construct spark dataframe using xml file in FTP server
98 | val df = spark.read.
99 | format("com.springml.spark.sftp").
100 | option("host", "SFTP_HOST").
101 | option("username", "SFTP_USER").
102 | option("password", "*****").
103 | option("fileType", "xml").
104 | option("rowTag", "YEAR").load("myxml.xml")
105 |
106 | // Write dataframe as XML file to FTP server
107 |
108 | df.write.format("com.springml.spark.sftp").
109 | option("host", "SFTP_HOST").
110 | option("username", "SFTP_USER").
111 | option("password", "*****").
112 | option("fileType", "xml").
113 | option("rootTag", "YTD").
114 | option("rowTag", "YEAR").save("myxmlOut.xml.gz")
115 |
116 | ```
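
The examples above authenticate with a password. Key-based login uses the `pem` (and, for protected keys, `pemPassphrase`) options instead; a minimal sketch with placeholder host, user and key path:

```scala
// Construct a dataframe using PEM (private key) authentication instead of a password
val df = spark.read.
  format("com.springml.spark.sftp").
  option("host", "SFTP_HOST").
  option("username", "SFTP_USER").
  option("pem", "/home/user/mypem.pem").
  option("pemPassphrase", "****").
  option("fileType", "json").
  load("/ftp/files/sample.json")
```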
117 |
118 |
119 | ### Java API
120 | ```java
121 | // Construct Spark dataframe using file in FTP server
122 | Dataset<Row> df = spark.read().
123 | format("com.springml.spark.sftp").
124 | option("host", "SFTP_HOST").
125 | option("username", "SFTP_USER").
126 | option("password", "****").
127 | option("fileType", "json").
128 | load("/ftp/files/sample.json");
129 |
130 | // Write dataframe as CSV file to FTP server
131 | df.write().
132 | format("com.springml.spark.sftp").
133 | option("host", "SFTP_HOST").
134 | option("username", "SFTP_USER").
135 | option("password", "****").
136 | option("fileType", "json").
137 | save("/ftp/files/sample.json");
138 | ```
139 |
140 | ### R API
141 | Spark 2.0+:
142 | ```r
143 |
144 | if (nchar(Sys.getenv("SPARK_HOME")) < 1) {
145 | Sys.setenv(SPARK_HOME = "/home/spark")
146 | }
147 | library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))
148 | sparkR.session(master = "local[*]", sparkConfig = list(spark.driver.memory = "2g"))
149 |
150 | # Construct Spark dataframe using avro file in FTP server
151 | df <- read.df(path="/ftp/files/sample.avro",
152 | source="com.springml.spark.sftp",
153 | host="SFTP_HOST",
154 | username="SFTP_USER",
155 | pem="/home/user/mypem.pem",
156 | fileType="avro")
157 |
158 | # Write dataframe as avro file to FTP server
159 | write.df(df,
160 | path="/ftp/files/sample.avro",
161 | source="com.springml.spark.sftp",
162 | host="SFTP_HOST",
163 | username="SFTP_USER",
164 | pem="/home/user/mypem.pem",
165 | fileType="avro")
166 | ```
167 |
168 | ### Note
169 | 1. SFTP files are fetched and written using [jsch](http://www.jcraft.com/jsch/). The transfer runs as a single process.
170 | 2. Files from the SFTP server are downloaded to a temporary location and deleted only during Spark shutdown. The temporary locations can be overridden as sketched below.
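
If the default temporary directory does not suit your environment, the `tempLocation` and `hdfsTempLocation` options (read by the connector's `DefaultSource`) let you point the copy step elsewhere. A minimal sketch with placeholder paths:

```scala
// Download via a custom local temp dir and a custom HDFS staging dir
// (both paths below are placeholders)
val df = spark.read.
  format("com.springml.spark.sftp").
  option("host", "SFTP_HOST").
  option("username", "SFTP_USER").
  option("password", "****").
  option("fileType", "csv").
  option("tempLocation", "/data/sftp-staging").
  option("hdfsTempLocation", "/tmp/sftp-staging").
  load("/ftp/files/sample.csv")
```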
171 |
172 |
173 | ## Building From Source
174 | This library is built with [SBT](http://www.scala-sbt.org/0.13/docs/Command-Line-Reference.html), which is automatically downloaded by the included shell script. To build a JAR file simply run `build/sbt package` from the project root.
175 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | name := "spark-sftp"
2 |
3 | organization := "com.springml"
4 |
5 | scalaVersion := "2.11.8"
6 |
7 | sparkVersion := "2.3.0"
8 |
9 | spName := "springml/spark-sftp"
10 |
11 | version := "1.1.4"
12 |
13 | // Dependent libraries
14 | libraryDependencies ++= Seq(
15 | "com.springml" % "sftp.client" % "1.0.3",
16 | "org.mockito" % "mockito-core" % "2.0.31-beta",
17 | "com.databricks" % "spark-xml_2.11" % "0.4.1"
18 | )
19 |
20 | // used spark components
21 | sparkComponents += "sql"
22 |
23 | // Repositories
24 | resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven"
25 |
26 | // Spark packages
27 | spDependencies += "com.databricks/spark-avro_2.11:3.2.0"
28 |
29 | // Test dependencies
30 | libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.1" % "test"
31 | libraryDependencies += "org.apache.avro" % "avro-mapred" % "1.7.7" % "test" exclude("org.mortbay.jetty", "servlet-api")
32 | libraryDependencies += "org.apache.spark" %% "spark-hive" % sparkVersion.value % "test"
33 |
34 | spIgnoreProvided := true
35 | // licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0"))
36 |
37 | credentials += Credentials(Path.userHome / ".ivy2" / ".credentials")
38 |
39 | publishTo := {
40 | val nexus = "https://oss.sonatype.org/"
41 | if (version.value.endsWith("SNAPSHOT"))
42 | Some("snapshots" at nexus + "content/repositories/snapshots")
43 | else
44 | Some("releases" at nexus + "service/local/staging/deploy/maven2")
45 | }
46 |
47 | pomExtra := (
48 |   <url>https://github.com/springml/spark-sftp</url>
49 |   <licenses>
50 |     <license>
51 |       <name>Apache License, Version 2.0</name>
52 |       <url>http://www.apache.org/licenses/LICENSE-2.0.html</url>
53 |       <distribution>repo</distribution>
54 |     </license>
55 |   </licenses>
56 |   <scm>
57 |     <connection>scm:git:github.com/springml/spark-sftp</connection>
58 |     <developerConnection>scm:git:git@github.com:springml/spark-sftp</developerConnection>
59 |     <url>github.com/springml/spark-sftp</url>
60 |   </scm>
61 |   <developers>
62 |     <developer>
63 |       <id>springml</id>
64 |       <name>Springml</name>
65 |       <url>http://www.springml.com</url>
66 |     </developer>
67 |   </developers>)
68 |
--------------------------------------------------------------------------------
/build/sbt:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # When creating new tests for Spark SQL Hive, the HADOOP_CLASSPATH must contain the hive jars so
4 | # that we can run Hive to generate the golden answer. This is not required for normal development
5 | # or testing.
6 | for i in $HIVE_HOME/lib/*
7 | do HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$i
8 | done
9 | export HADOOP_CLASSPATH
10 |
11 | realpath () {
12 | (
13 | TARGET_FILE=$1
14 |
15 | cd $(dirname $TARGET_FILE)
16 | TARGET_FILE=$(basename $TARGET_FILE)
17 |
18 | COUNT=0
19 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ]
20 | do
21 | TARGET_FILE=$(readlink $TARGET_FILE)
22 | cd $(dirname $TARGET_FILE)
23 | TARGET_FILE=$(basename $TARGET_FILE)
24 | COUNT=$(($COUNT + 1))
25 | done
26 |
27 | echo $(pwd -P)/$TARGET_FILE
28 | )
29 | }
30 |
31 | . $(dirname $(realpath $0))/sbt-launch-lib.bash
32 |
33 |
34 | declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy"
35 | declare -r sbt_opts_file=".sbtopts"
36 | declare -r etc_sbt_opts_file="/etc/sbt/sbtopts"
37 |
38 | usage() {
39 | cat <<EOM
46 | -sbt-dir <path>    path to global settings/plugins directory (default: ~/.sbt)
47 | -sbt-boot <path>   path to shared boot directory (default: ~/.sbt/boot in 0.11 series)
48 | -ivy <path>        path to local Ivy repository (default: ~/.ivy2)
49 | -mem <integer>     set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem))
50 | -no-share use all local caches; no sharing
51 | -no-global uses global caches, but does not use global ~/.sbt directory.
52 | -jvm-debug <port>  Turn on JVM debugging, open at the given port.
53 | -batch Disable interactive mode
54 | # sbt version (default: from project/build.properties if present, else latest release)
55 | -sbt-version <version>  use the specified version of sbt
56 | -sbt-jar <path>         use the specified jar as the sbt launcher
57 | -sbt-rc use an RC version of sbt
58 | -sbt-snapshot use a snapshot version of sbt
59 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version))
60 | -java-home <path>       alternate JAVA_HOME
61 | # jvm options and output control
62 | JAVA_OPTS environment variable, if unset uses "$java_opts"
63 | SBT_OPTS environment variable, if unset uses "$default_sbt_opts"
64 | .sbtopts if this file exists in the current directory, it is
65 | prepended to the runner args
66 | /etc/sbt/sbtopts if this file exists, it is prepended to the runner args
67 | -Dkey=val pass -Dkey=val directly to the java runtime
68 | -J-X pass option -X directly to the java runtime
69 | (-J is stripped)
70 | -S-X add -X to sbt's scalacOptions (-J is stripped)
71 | -PmavenProfiles Enable a maven profile for the build.
72 | In the case of duplicated or conflicting options, the order above
73 | shows precedence: JAVA_OPTS lowest, command line options highest.
74 | EOM
75 | }
76 |
77 | process_my_args () {
78 | while [[ $# -gt 0 ]]; do
79 | case "$1" in
80 | -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;;
81 | -no-share) addJava "$noshare_opts" && shift ;;
82 | -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;;
83 | -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;;
84 | -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;;
85 | -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;;
86 | -batch) exec </dev/null && shift ;;
--------------------------------------------------------------------------------
/build/sbt-launch-lib.bash:
--------------------------------------------------------------------------------
29 | echoerr () {
30 | echo 1>&2 "$@"
31 | }
32 | vlog () {
33 | [[ $verbose || $debug ]] && echoerr "$@"
34 | }
35 | dlog () {
36 | [[ $debug ]] && echoerr "$@"
37 | }
38 |
39 | acquire_sbt_jar () {
40 | SBT_VERSION=`awk -F "=" '/sbt\\.version/ {print $2}' ./project/build.properties`
41 | URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
42 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
43 | JAR=build/sbt-launch-${SBT_VERSION}.jar
44 |
45 | sbt_jar=$JAR
46 |
47 | if [[ ! -f "$sbt_jar" ]]; then
48 | # Download sbt launch jar if it hasn't been downloaded yet
49 | if [ ! -f ${JAR} ]; then
50 | # Download
51 | printf "Attempting to fetch sbt\n"
52 | JAR_DL=${JAR}.part
53 | if hash curl 2>/dev/null; then
54 | (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}
55 | elif hash wget 2>/dev/null; then
56 | (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
57 | else
58 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
59 | exit -1
60 | fi
61 | fi
62 | if [ ! -f ${JAR} ]; then
63 | # We failed to download
64 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n"
65 | exit -1
66 | fi
67 | printf "Launching sbt from ${JAR}\n"
68 | fi
69 | }
70 |
71 | execRunner () {
72 | # print the arguments one to a line, quoting any containing spaces
73 | [[ $verbose || $debug ]] && echo "# Executing command line:" && {
74 | for arg; do
75 | if printf "%s\n" "$arg" | grep -q ' '; then
76 | printf "\"%s\"\n" "$arg"
77 | else
78 | printf "%s\n" "$arg"
79 | fi
80 | done
81 | echo ""
82 | }
83 |
84 | exec "$@"
85 | }
86 |
87 | addJava () {
88 | dlog "[addJava] arg = '$1'"
89 | java_args=( "${java_args[@]}" "$1" )
90 | }
91 |
92 | enableProfile () {
93 | dlog "[enableProfile] arg = '$1'"
94 | maven_profiles=( "${maven_profiles[@]}" "$1" )
95 | export SBT_MAVEN_PROFILES="${maven_profiles[@]}"
96 | }
97 |
98 | addSbt () {
99 | dlog "[addSbt] arg = '$1'"
100 | sbt_commands=( "${sbt_commands[@]}" "$1" )
101 | }
102 | addResidual () {
103 | dlog "[residual] arg = '$1'"
104 | residual_args=( "${residual_args[@]}" "$1" )
105 | }
106 | addDebugger () {
107 | addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1"
108 | }
109 |
110 | # a ham-fisted attempt to move some memory settings in concert
111 | # so they need not be dicked around with individually.
112 | get_mem_opts () {
113 | local mem=${1:-2048}
114 | local perm=$(( $mem / 4 ))
115 | (( $perm > 256 )) || perm=256
116 | (( $perm < 4096 )) || perm=4096
117 | local codecache=$(( $perm / 2 ))
118 |
119 | echo "-Xms${mem}m -Xmx${mem}m -XX:MaxPermSize=${perm}m -XX:ReservedCodeCacheSize=${codecache}m"
120 | }
121 |
122 | require_arg () {
123 | local type="$1"
124 | local opt="$2"
125 | local arg="$3"
126 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then
127 | die "$opt requires <$type> argument"
128 | fi
129 | }
130 |
131 | is_function_defined() {
132 | declare -f "$1" > /dev/null
133 | }
134 |
135 | process_args () {
136 | while [[ $# -gt 0 ]]; do
137 | case "$1" in
138 | -h|-help) usage; exit 1 ;;
139 | -v|-verbose) verbose=1 && shift ;;
140 | -d|-debug) debug=1 && shift ;;
141 |
142 | -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;;
143 | -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;;
144 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;;
145 | -batch) exec </dev/null && shift ;;
--------------------------------------------------------------------------------
/src/main/scala/com/springml/spark/sftp/DatasetRelation.scala:
--------------------------------------------------------------------------------
39 | case "avro" => dataframeReader.avro(fileLocation)
40 | case "txt" => dataframeReader.format("text").load(fileLocation)
41 | case "xml" => dataframeReader.format(constants.xmlClass)
42 | .option(constants.xmlRowTag, rowTag)
43 | .load(fileLocation)
44 | case "csv" => dataframeReader.
45 | option("header", header).
46 | option("delimiter", delimiter).
47 | option("quote", quote).
48 | option("escape", escape).
49 | option("multiLine", multiLine).
50 | option("inferSchema", inferSchema).
51 | csv(fileLocation)
52 | case _ => dataframeReader.format(fileType).load(fileLocation)
53 | }
54 | df
55 | }
56 |
57 | override def schema: StructType = {
58 | df.schema
59 | }
60 |
61 | override def buildScan(): RDD[Row] = {
62 | df.rdd
63 | }
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/src/main/scala/com/springml/spark/sftp/DefaultSource.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2015 springml
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.springml.spark.sftp
17 |
18 | import java.io.File
19 | import java.util.UUID
20 |
21 | import com.springml.sftp.client.SFTPClient
22 | import com.springml.spark.sftp.util.Utils.ImplicitDataFrameWriter
23 |
24 | import org.apache.commons.io.FilenameUtils
25 | import org.apache.hadoop.fs.Path
26 | import org.apache.log4j.Logger
27 | import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
28 | import org.apache.spark.sql.types.StructType
29 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
30 |
31 | /**
32 | * Datasource to construct dataframe from a sftp url
33 | */
34 | class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {
35 | @transient val logger = Logger.getLogger(classOf[DefaultSource])
36 |
37 | /**
38 | * Copy the file from SFTP to local location and then create dataframe using local file
39 | */
40 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]):BaseRelation = {
41 | createRelation(sqlContext, parameters, null)
42 | }
43 |
44 | /**
45 | * Copy the file from SFTP to local location and then create dataframe using local file
46 | */
47 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType) = {
48 | val username = parameters.get("username")
49 | val password = parameters.get("password")
50 | val pemFileLocation = parameters.get("pem")
51 | val pemPassphrase = parameters.get("pemPassphrase")
52 | val host = parameters.getOrElse("host", sys.error("SFTP Host has to be provided using 'host' option"))
53 | val port = parameters.get("port")
54 | val path = parameters.getOrElse("path", sys.error("'path' must be specified"))
55 | val fileType = parameters.getOrElse("fileType", sys.error("File type has to be provided using 'fileType' option"))
56 | val inferSchema = parameters.get("inferSchema")
57 | val header = parameters.getOrElse("header", "true")
58 | val delimiter = parameters.getOrElse("delimiter", ",")
59 | val quote = parameters.getOrElse("quote", "\"")
60 | val escape = parameters.getOrElse("escape", "\\")
61 | val multiLine = parameters.getOrElse("multiLine", "false")
62 | val createDF = parameters.getOrElse("createDF", "true")
63 | val copyLatest = parameters.getOrElse("copyLatest", "false")
64 | val tempFolder = parameters.getOrElse("tempLocation", System.getProperty("java.io.tmpdir"))
65 | val hdfsTemp = parameters.getOrElse("hdfsTempLocation", tempFolder)
66 | val cryptoKey = parameters.getOrElse("cryptoKey", null)
67 | val cryptoAlgorithm = parameters.getOrElse("cryptoAlgorithm", "AES")
68 | val rowTag = parameters.getOrElse(constants.xmlRowTag, null)
69 |
70 | val supportedFileTypes = List("csv", "json", "avro", "parquet", "txt", "xml","orc")
71 | if (!supportedFileTypes.contains(fileType)) {
72 | sys.error("fileType " + fileType + " not supported. Supported file types are " + supportedFileTypes)
73 | }
74 |
75 | val inferSchemaFlag = if (inferSchema != null && inferSchema.isDefined) {
76 | inferSchema.get
77 | } else {
78 | "false"
79 | }
80 |
81 | val sftpClient = getSFTPClient(username, password, pemFileLocation, pemPassphrase, host, port,
82 | cryptoKey, cryptoAlgorithm)
83 | val copiedFileLocation = copy(sftpClient, path, tempFolder, copyLatest.toBoolean)
84 | val fileLocation = copyToHdfs(sqlContext, copiedFileLocation, hdfsTemp)
85 |
86 | if (!createDF.toBoolean) {
87 | logger.info("Returning an empty dataframe after copying files...")
88 | createReturnRelation(sqlContext, schema)
89 | } else {
90 | DatasetRelation(fileLocation, fileType, inferSchemaFlag, header, delimiter, quote, escape, multiLine, rowTag, schema,
91 | sqlContext)
92 | }
93 | }
94 |
95 | override def createRelation(
96 | sqlContext: SQLContext,
97 | mode: SaveMode,
98 | parameters: Map[String, String],
99 | data: DataFrame): BaseRelation = {
100 |
101 | val username = parameters.get("username")
102 | val password = parameters.get("password")
103 | val pemFileLocation = parameters.get("pem")
104 | val pemPassphrase = parameters.get("pemPassphrase")
105 | val host = parameters.getOrElse("host", sys.error("SFTP Host has to be provided using 'host' option"))
106 | val port = parameters.get("port")
107 | val path = parameters.getOrElse("path", sys.error("'path' must be specified"))
108 | val fileType = parameters.getOrElse("fileType", sys.error("File type has to be provided using 'fileType' option"))
109 | val header = parameters.getOrElse("header", "true")
110 | val copyLatest = parameters.getOrElse("copyLatest", "false")
111 | val tmpFolder = parameters.getOrElse("tempLocation", System.getProperty("java.io.tmpdir"))
112 | val hdfsTemp = parameters.getOrElse("hdfsTempLocation", tmpFolder)
113 | val cryptoKey = parameters.getOrElse("cryptoKey", null)
114 | val cryptoAlgorithm = parameters.getOrElse("cryptoAlgorithm", "AES")
115 | val delimiter = parameters.getOrElse("delimiter", ",")
116 | val quote = parameters.getOrElse("quote", "\"")
117 | val escape = parameters.getOrElse("escape", "\\")
118 | val multiLine = parameters.getOrElse("multiLine", "false")
119 | val codec = parameters.getOrElse("codec", null)
120 | val rowTag = parameters.getOrElse(constants.xmlRowTag, null)
121 | val rootTag = parameters.getOrElse(constants.xmlRootTag, null)
122 |
123 | val supportedFileTypes = List("csv", "json", "avro", "parquet", "txt", "xml","orc")
124 | if (!supportedFileTypes.contains(fileType)) {
125 | sys.error("fileType " + fileType + " not supported. Supported file types are " + supportedFileTypes)
126 | }
127 |
128 | val sftpClient = getSFTPClient(username, password, pemFileLocation, pemPassphrase, host, port,
129 | cryptoKey, cryptoAlgorithm)
130 | val tempFile = writeToTemp(sqlContext, data, hdfsTemp, tmpFolder, fileType, header, delimiter, quote, escape, multiLine, codec, rowTag, rootTag)
131 |
132 | upload(tempFile, path, sftpClient)
133 | return createReturnRelation(data)
134 | }
135 | private def copyToHdfs(sqlContext: SQLContext, fileLocation : String,
136 | hdfsTemp : String): String = {
137 | val hadoopConf = sqlContext.sparkContext.hadoopConfiguration
138 | val hdfsPath = new Path(fileLocation)
139 | val fs = hdfsPath.getFileSystem(hadoopConf)
140 | if ("hdfs".equalsIgnoreCase(fs.getScheme)) {
141 | fs.copyFromLocalFile(new Path(fileLocation), new Path(hdfsTemp))
142 | val filePath = hdfsTemp + "/" + hdfsPath.getName
143 | fs.deleteOnExit(new Path(filePath))
144 | return filePath
145 | } else {
146 | return fileLocation
147 | }
148 | }
149 |
150 | private def copyFromHdfs(sqlContext: SQLContext, hdfsTemp : String,
151 | fileLocation : String): String = {
152 | val hadoopConf = sqlContext.sparkContext.hadoopConfiguration
153 | val hdfsPath = new Path(hdfsTemp)
154 | val fs = hdfsPath.getFileSystem(hadoopConf)
155 | if ("hdfs".equalsIgnoreCase(fs.getScheme)) {
156 | fs.copyToLocalFile(new Path(hdfsTemp), new Path(fileLocation))
157 | fs.deleteOnExit(new Path(hdfsTemp))
158 | return fileLocation
159 | } else {
160 | return hdfsTemp
161 | }
162 | }
163 |
164 | private def upload(source: String, target: String, sftpClient: SFTPClient) {
165 | logger.info("Copying " + source + " to " + target)
166 | sftpClient.copyToFTP(source, target)
167 | }
168 |
169 | private def getSFTPClient(
170 | username: Option[String],
171 | password: Option[String],
172 | pemFileLocation: Option[String],
173 | pemPassphrase: Option[String],
174 | host: String,
175 | port: Option[String],
176 | cryptoKey : String,
177 | cryptoAlgorithm : String) : SFTPClient = {
178 |
179 | val sftpPort = if (port != null && port.isDefined) {
180 | port.get.toInt
181 | } else {
182 | 22
183 | }
184 |
185 | val cryptoEnabled = cryptoKey != null
186 |
187 | if (cryptoEnabled) {
188 | new SFTPClient(getValue(pemFileLocation), getValue(pemPassphrase), getValue(username),
189 | getValue(password),
190 | host, sftpPort, cryptoEnabled, cryptoKey, cryptoAlgorithm)
191 | } else {
192 | new SFTPClient(getValue(pemFileLocation), getValue(pemPassphrase), getValue(username),
193 | getValue(password), host, sftpPort)
194 | }
195 | }
196 |
197 | private def createReturnRelation(data: DataFrame): BaseRelation = {
198 | createReturnRelation(data.sqlContext, data.schema)
199 | }
200 |
201 | private def createReturnRelation(sqlContextVar: SQLContext, schemaVar: StructType): BaseRelation = {
202 | new BaseRelation {
203 | override def sqlContext: SQLContext = sqlContextVar
204 | override def schema: StructType = schemaVar
205 | }
206 | }
207 |
208 | private def copy(sftpClient: SFTPClient, source: String,
209 | tempFolder: String, latest: Boolean): String = {
210 | var copiedFilePath: String = null
211 | try {
212 | val target = tempFolder + File.separator + FilenameUtils.getName(source)
213 | copiedFilePath = target
214 | if (latest) {
215 | copiedFilePath = sftpClient.copyLatest(source, tempFolder)
216 | } else {
217 | logger.info("Copying " + source + " to " + target)
218 | copiedFilePath = sftpClient.copy(source, target)
219 | }
220 |
221 | copiedFilePath
222 | } finally {
223 | addShutdownHook(copiedFilePath)
224 | }
225 | }
226 |
227 | private def getValue(param: Option[String]): String = {
228 | if (param != null && param.isDefined) {
229 | param.get
230 | } else {
231 | null
232 | }
233 | }
234 |
235 | private def writeToTemp(sqlContext: SQLContext, df: DataFrame,
236 | hdfsTemp: String, tempFolder: String, fileType: String, header: String,
237 | delimiter: String, quote: String, escape: String, multiLine: String, codec: String, rowTag: String, rootTag: String) : String = {
238 | val randomSuffix = "spark_sftp_connection_temp_" + UUID.randomUUID
239 | val hdfsTempLocation = hdfsTemp + File.separator + randomSuffix
240 | val localTempLocation = tempFolder + File.separator + randomSuffix
241 |
242 | addShutdownHook(localTempLocation)
243 |
244 | fileType match {
245 |
246 | case "xml" => df.coalesce(1).write.format(constants.xmlClass)
247 | .option(constants.xmlRowTag, rowTag)
248 | .option(constants.xmlRootTag, rootTag).save(hdfsTempLocation)
249 | case "csv" => df.coalesce(1).
250 | write.
251 | option("header", header).
252 | option("delimiter", delimiter).
253 | option("quote", quote).
254 | option("escape", escape).
255 | option("multiLine", multiLine).
256 | optionNoNull("codec", Option(codec)).
257 | csv(hdfsTempLocation)
258 | case "txt" => df.coalesce(1).write.text(hdfsTempLocation)
259 | case "avro" => df.coalesce(1).write.format("com.databricks.spark.avro").save(hdfsTempLocation)
260 | case _ => df.coalesce(1).write.format(fileType).save(hdfsTempLocation)
261 | }
262 |
263 | copyFromHdfs(sqlContext, hdfsTempLocation, localTempLocation)
264 | copiedFile(localTempLocation)
265 | }
266 |
267 | private def addShutdownHook(tempLocation: String) {
268 | logger.debug("Adding hook for file " + tempLocation)
269 | val hook = new DeleteTempFileShutdownHook(tempLocation)
270 | Runtime.getRuntime.addShutdownHook(hook)
271 | }
272 |
273 | private def copiedFile(tempFileLocation: String) : String = {
274 | val baseTemp = new File(tempFileLocation)
275 | val files = baseTemp.listFiles().filter { x =>
276 | (!x.isDirectory()
277 | && !x.getName.contains("SUCCESS")
278 | && !x.isHidden()
279 | && !x.getName.contains(".crc")
280 | && !x.getName.contains("_committed_")
281 | && !x.getName.contains("_started_")
282 | )
283 | }
284 | files(0).getAbsolutePath
285 | }
286 | }
287 |
--------------------------------------------------------------------------------
/src/main/scala/com/springml/spark/sftp/DeleteTempFileShutdownHook.scala:
--------------------------------------------------------------------------------
1 | package com.springml.spark.sftp
2 |
3 | import org.apache.commons.io.FileUtils
4 | import java.io.File
5 | import org.apache.log4j.Logger
6 |
7 | /**
8 | * Deletes the temp file created while copying, when Spark shuts down
9 | */
10 | class DeleteTempFileShutdownHook(
11 | fileLocation: String) extends Thread {
12 |
13 | private val logger = Logger.getLogger(classOf[DeleteTempFileShutdownHook])
14 |
15 | override def run(): Unit = {
16 | logger.info("Deleting " + fileLocation )
17 | FileUtils.deleteQuietly(new File(fileLocation))
18 | }
19 | }
--------------------------------------------------------------------------------
/src/main/scala/com/springml/spark/sftp/constants.scala:
--------------------------------------------------------------------------------
1 | package com.springml.spark.sftp
2 |
3 | /**
4 | * Created by bagopalan on 9/16/18.
5 | */
6 | object constants {
7 |
8 | val xmlClass: String = "com.databricks.spark.xml"
9 | val xmlRowTag: String = "rowTag"
10 | val xmlRootTag: String = "rootTag"
11 |
12 | }
13 |
--------------------------------------------------------------------------------
/src/main/scala/com/springml/spark/sftp/util/Utils.scala:
--------------------------------------------------------------------------------
1 | package com.springml.spark.sftp.util
2 |
3 | import org.apache.spark.sql.DataFrameWriter
4 |
5 |
6 | object Utils {
7 |
8 |
9 | /**
10 | * [[DataFrameWriter]] implicits
11 | */
12 | implicit class ImplicitDataFrameWriter[T](dataFrameWriter: DataFrameWriter[T]) {
13 |
14 | /**
15 | * Adds an output option for the underlying data source if the option has a value.
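* For example, `writer.optionNoNull("codec", Some("bzip2"))` behaves like
* `writer.option("codec", "bzip2")`, while `writer.optionNoNull("codec", None)`
* returns the writer unchanged.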
16 | */
17 | def optionNoNull(key: String, optionValue: Option[String]): DataFrameWriter[T] = {
18 | optionValue match {
19 | case Some(_) => dataFrameWriter.option(key, optionValue.get)
20 | case None => dataFrameWriter
21 | }
22 | }
23 | }
24 |
25 | }
--------------------------------------------------------------------------------
/src/test/resources/books.orc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/egen/spark-sftp/090917547001574afa93cddaf2a022151a3f4260/src/test/resources/books.orc
--------------------------------------------------------------------------------
/src/test/resources/books.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Gambardella, Matthew
5 | XML Developer's Guide
6 | Computer
7 | 44.95
8 | 2000-10-01
9 |
10 |
11 |
12 | An in-depth look at creating applications
13 | with XML.This manual describes Oracle XML DB, and how you can use it to store, generate, manipulate, manage,
14 | and query XML data in the database.
15 |
16 |
17 | After introducing you to the heart of Oracle XML DB, namely the XMLType framework and Oracle XML DB repository,
18 | the manual provides a brief introduction to design criteria to consider when planning your Oracle XML DB
19 | application. It provides examples of how and where you can use Oracle XML DB.
20 |
21 |
22 | The manual then describes ways you can store and retrieve XML data using Oracle XML DB, APIs for manipulating
23 | XMLType data, and ways you can view, generate, transform, and search on existing XML data. The remainder of
24 | the manual discusses how to use Oracle XML DB repository, including versioning and security,
25 | how to access and manipulate repository resources using protocols, SQL, PL/SQL, or Java, and how to manage
26 | your Oracle XML DB application using Oracle Enterprise Manager. It also introduces you to XML messaging and
27 | Oracle Streams Advanced Queuing XMLType support.
28 |
29 | Ralls, Kim
30 | Midnight Rain
31 | Fantasy
32 | 5.95
33 | 2000-12-16
34 | A former architect battles corporate zombies,
35 | an evil sorceress, and her own childhood to become queen
36 | of the world.
37 |
38 |
39 | Corets, Eva
40 | Maeve Ascendant
41 | Fantasy
42 | 5.95
43 | 2000-11-17
44 | After the collapse of a nanotechnology
45 | society in England, the young survivors lay the
46 | foundation for a new society.
47 |
48 |
49 | Corets, Eva
50 | Oberon's Legacy
51 | Fantasy
52 | 5.95
53 | 2001-03-10
54 | In post-apocalypse England, the mysterious
55 | agent known only as Oberon helps to create a new life
56 | for the inhabitants of London. Sequel to Maeve
57 | Ascendant.
58 |
59 |
60 | Corets, Eva
61 | The Sundered Grail
62 | Fantasy
63 | 5.95
64 | 2001-09-10
65 | The two daughters of Maeve, half-sisters,
66 | battle one another for control of England. Sequel to
67 | Oberon's Legacy.
68 |
69 |
70 | Randall, Cynthia
71 | Lover Birds
72 | Romance
73 | 4.95
74 | 2000-09-02
75 | When Carla meets Paul at an ornithology
76 | conference, tempers fly as feathers get ruffled.
77 |
78 |
79 | Thurman, Paula
80 | Splish Splash
81 | Romance
82 | 4.95
83 | 2000-11-02
84 | A deep sea diver finds true love twenty
85 | thousand leagues beneath the sea.
86 |
87 |
88 | Knorr, Stefan
89 | Creepy Crawlies
90 | Horror
91 | 4.95
92 | 2000-12-06
93 | An anthology of horror stories about roaches,
94 | centipedes, scorpions and other insects.
95 |
96 |
97 | Kress, Peter
98 | Paradox Lost
99 | Science Fiction
100 | 6.95
101 | 2000-11-02
102 | After an inadvertant trip through a Heisenberg
103 | Uncertainty Device, James Salway discovers the problems
104 | of being quantum.
105 |
106 |
107 | O'Brien, Tim
108 | Microsoft .NET: The Programming Bible
109 | Computer
110 | 36.95
111 | 2000-12-09
112 | Microsoft's .NET initiative is explored in
113 | detail in this deep programmer's reference.
114 |
115 |
116 | O'Brien, Tim
117 | MSXML3: A Comprehensive Guide
118 | Computer
119 | 36.95
120 | 2000-12-01
121 | The Microsoft MSXML3 parser is covered in
122 | detail, with attention to XML DOM interfaces, XSLT processing,
123 | SAX and more.
124 |
125 |
126 | Galos, Mike
127 | Visual Studio 7: A Comprehensive Guide
128 | Computer
129 | 49.95
130 | 2001-04-16
131 | Microsoft Visual Studio 7 is explored in depth,
132 | looking at how Visual Basic, Visual C++, C#, and ASP+ are
133 | integrated into a comprehensive development
134 | environment.
135 |
136 |
--------------------------------------------------------------------------------
/src/test/resources/custom-delimiter.csv:
--------------------------------------------------------------------------------
1 | ProposalId;OpportunityId;Clicks;Impressions
2 | 103;006B0000002ndnuIAA;30;133
3 | 101;006B0000002ndnkIAA;12;73
4 | 102;006B0000002ndnpIAA;20;97
5 |
--------------------------------------------------------------------------------
/src/test/resources/people.json:
--------------------------------------------------------------------------------
1 | {"name":"Michael"}
2 | {"name":"Andy", "age":30}
3 | {"name":"Justin", "age":19}
4 |
--------------------------------------------------------------------------------
/src/test/resources/plaintext.txt:
--------------------------------------------------------------------------------
1 | adam
2 | Emily
3 | sundar
--------------------------------------------------------------------------------
/src/test/resources/sample.csv:
--------------------------------------------------------------------------------
1 | ProposalId,OpportunityId,Clicks,Impressions
2 | 103,006B0000002ndnuIAA,30,133
3 | 101,006B0000002ndnkIAA,12,73
4 | 102,006B0000002ndnpIAA,20,97
5 |
--------------------------------------------------------------------------------
/src/test/resources/sample_quoted_multiline.csv:
--------------------------------------------------------------------------------
1 | ProposalId,OpportunityId,Clicks,Impressions,Message
2 | 103,006B0000002ndnuIAA,30,133,"test
3 | multiline \"here we have a quote\"
4 | message",
5 | 101,006B0000002ndnkIAA,12,73,"regular message"
6 | 102,006B0000002ndnpIAA,20,97,"regular message"
7 |
--------------------------------------------------------------------------------
/src/test/resources/users.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/egen/spark-sftp/090917547001574afa93cddaf2a022151a3f4260/src/test/resources/users.avro
--------------------------------------------------------------------------------
/src/test/resources/users.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/egen/spark-sftp/090917547001574afa93cddaf2a022151a3f4260/src/test/resources/users.parquet
--------------------------------------------------------------------------------
/src/test/scala/com/springml/spark/sftp/CustomSchemaTest.scala:
--------------------------------------------------------------------------------
1 | package com.springml.spark.sftp
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, _}
5 | import org.scalatest.{BeforeAndAfterEach, FunSuite}
6 |
7 | /**
8 | * Tests for creating dataframe using custom schema
9 | */
10 | class CustomSchemaTest extends FunSuite with BeforeAndAfterEach {
11 | var ss: SparkSession = _
12 |
13 | val csvTypesMap = Map("ProposalId" -> IntegerType,
14 | "OpportunityId" -> StringType,
15 | "Clicks" -> LongType,
16 | "Impressions" -> LongType
17 | )
18 |
19 | val jsonTypesMap = Map("name" -> StringType,
20 | "age" -> IntegerType
21 | )
22 |
23 | override def beforeEach() {
24 | ss = SparkSession.builder().master("local").appName("Custom Schema Test").getOrCreate()
25 | }
26 |
27 | private def validateTypes(field : StructField, typeMap : Map[String, DataType]) = {
28 | val expectedType = typeMap(field.name)
29 | assert(expectedType == field.dataType)
30 | }
31 |
32 | private def columnArray(typeMap : Map[String, DataType]) : Array[StructField] = {
33 | val columns = typeMap.map(x => new StructField(x._1, x._2, true))
34 |
35 | val columnStruct = new Array[StructField](typeMap.size)
36 | columns.copyToArray(columnStruct)
37 |
38 | columnStruct
39 | }
40 |
41 | test ("Read CSV with custom schema") {
42 | val columnStruct = columnArray(csvTypesMap)
43 | val expectedSchema = StructType(columnStruct)
44 |
45 | val fileLocation = getClass.getResource("/sample.csv").getPath
46 | val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext)
47 | val rdd = dsr.buildScan()
48 |
49 | assert(dsr.schema.fields.length == columnStruct.length)
50 | dsr.schema.fields.foreach(s => validateTypes(s, csvTypesMap))
51 | }
52 |
53 | test ("Read Json with custom schema") {
54 | val columnStruct = columnArray(jsonTypesMap)
55 | val expectedSchema = StructType(columnStruct)
56 |
57 | val fileLocation = getClass.getResource("/people.json").getPath
58 | val dsr = DatasetRelation(fileLocation, "json", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext)
59 | val rdd = dsr.buildScan()
60 |
61 | assert(dsr.schema.fields.length == columnStruct.length)
62 | dsr.schema.fields.foreach(s => validateTypes(s, jsonTypesMap))
63 | }
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/src/test/scala/com/springml/spark/sftp/TestDatasetRelation.scala:
--------------------------------------------------------------------------------
1 | package com.springml.spark.sftp
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.scalatest.{BeforeAndAfterEach, FunSuite}
5 |
6 | /**
7 | * Simple unit test for basic testing on different formats of file
8 | */
9 | class TestDatasetRelation extends FunSuite with BeforeAndAfterEach {
10 | var ss: SparkSession = _
11 |
12 | override def beforeEach() {
13 | ss = SparkSession.builder().master("local").enableHiveSupport().appName("Test Dataset Relation").getOrCreate()
14 | }
15 |
16 | test ("Read CSV") {
17 | val fileLocation = getClass.getResource("/sample.csv").getPath
18 | val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext)
19 | val rdd = dsr.buildScan()
20 | assert(3 == rdd.count())
21 | }
22 |
23 | test ("Read CSV using custom delimiter") {
24 | val fileLocation = getClass.getResource("/custom-delimiter.csv").getPath
25 | val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ";", "\"", "\\", "false", null, null, ss.sqlContext)
26 | val rdd = dsr.buildScan()
27 | assert(3 == rdd.count())
28 | }
29 |
30 | test ("Read multiline CSV using custom quote and escape") {
31 | val fileLocation = getClass.getResource("/sample_quoted_multiline.csv").getPath
32 | val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\", "true", null, null, ss.sqlContext)
33 | val rdd = dsr.buildScan()
34 | assert(3 == rdd.count())
35 | }
36 |
37 |
38 | test ("Read JSON") {
39 | val fileLocation = getClass.getResource("/people.json").getPath
40 | val dsr = DatasetRelation(fileLocation, "json", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext)
41 | val rdd = dsr.buildScan()
42 | assert(3 == rdd.count())
43 | }
44 |
45 | test ("Read AVRO") {
46 | val fileLocation = getClass.getResource("/users.avro").getPath
47 | val dsr = DatasetRelation(fileLocation, "avro", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext)
48 | val rdd = dsr.buildScan()
49 | assert(2 == rdd.count())
50 | }
51 |
52 | test ("Read parquet") {
53 | val fileLocation = getClass.getResource("/users.parquet").getPath
54 | val dsr = DatasetRelation(fileLocation, "parquet", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext)
55 | val rdd = dsr.buildScan()
56 | assert(2 == rdd.count())
57 | }
58 |
59 | test ("Read text file") {
60 | val fileLocation = getClass.getResource("/plaintext.txt").getPath
61 | val dsr = DatasetRelation(fileLocation, "txt", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext)
62 | val rdd = dsr.buildScan()
63 | assert(3 == rdd.count())
64 | }
65 |
66 | test ("Read xml file") {
67 | val fileLocation = getClass.getResource("/books.xml").getPath
68 | val dsr = DatasetRelation(fileLocation, "xml", "false", "true", ",", "\"", "\\", "false", "book", null, ss.sqlContext)
69 | val rdd = dsr.buildScan()
70 | assert(12 == rdd.count())
71 | }
72 | test ("Read orc file") {
73 | val fileLocation = getClass.getResource("/books.orc").getPath
74 | val dsr = DatasetRelation(fileLocation, "orc", "false", "true", ",", "\"", "\\", "false", "book", null, ss.sqlContext)
75 | val rdd = dsr.buildScan()
76 | assert(12 == rdd.count())
77 | }
78 | }
79 |
--------------------------------------------------------------------------------