├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── build ├── sbt └── sbt-launch-lib.bash ├── project ├── build.properties └── plugins.sbt └── src ├── main └── scala │ └── com │ └── springml │ └── spark │ └── sftp │ ├── DatasetRelation.scala │ ├── DefaultSource.scala │ ├── DeleteTempFileShutdownHook.scala │ ├── constants.scala │ └── util │ └── Utils.scala └── test ├── resources ├── books.orc ├── books.xml ├── custom-delimiter.csv ├── people.json ├── plaintext.txt ├── sample.csv ├── sample_quoted_multiline.csv ├── users.avro └── users.parquet └── scala └── com └── springml └── spark └── sftp ├── CustomSchemaTest.scala └── TestDatasetRelation.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | *.pyc 4 | sbt/*.jar 5 | 6 | # sbt specific 7 | .cache/ 8 | .history/ 9 | .lib/ 10 | dist/* 11 | target/ 12 | lib_managed/ 13 | src_managed/ 14 | project/boot/ 15 | project/plugins/project/ 16 | /bin/ 17 | 18 | .cache-main 19 | .classpath 20 | .project 21 | .settings/ 22 | .cache-tests 23 | 24 | # intellij 25 | .idea 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License, Version 2.0 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 13 | 14 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 15 | 16 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 17 | 18 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 19 | 20 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 21 | 22 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 23 | 24 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
25 | 26 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 27 | 28 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 29 | 30 | 2. Grant of Copyright License. 31 | 32 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 33 | 34 | 3. Grant of Patent License. 35 | 36 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 37 | 38 | 4. Redistribution. 
39 | 40 | You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 41 | 42 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 43 | You must cause any modified files to carry prominent notices stating that You changed the files; and 44 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 45 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 46 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 47 | 48 | 5. Submission of Contributions. 49 | 50 | Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 51 | 52 | 6. Trademarks. 53 | 54 | This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 55 | 56 | 7. Disclaimer of Warranty. 57 | 58 | Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 59 | 60 | 8. Limitation of Liability. 
61 | 62 | In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 63 | 64 | 9. Accepting Warranty or Additional Liability. 65 | 66 | While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 67 | 68 | END OF TERMS AND CONDITIONS 69 | 70 | APPENDIX: How to apply the Apache License to your work 71 | 72 | To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 73 | 74 | Copyright [yyyy] [name of copyright owner] 75 | 76 | Licensed under the Apache License, Version 2.0 (the "License"); 77 | you may not use this file except in compliance with the License. 78 | You may obtain a copy of the License at 79 | 80 | http://www.apache.org/licenses/LICENSE-2.0 81 | 82 | Unless required by applicable law or agreed to in writing, software 83 | distributed under the License is distributed on an "AS IS" BASIS, 84 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 85 | See the License for the specific language governing permissions and 86 | limitations under the License. 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark SFTP Connector Library 2 | 3 | A library for constructing dataframes by downloading files from an SFTP server and for writing dataframes back to an SFTP server 4 | 5 | ## Requirements 6 | 7 | This library requires Spark 2.x. 8 | 9 | For Spark 1.x support, please check the [spark1.x](https://github.com/springml/spark-sftp/tree/spark1.x) branch. 10 | 11 | ## Linking 12 | You can link against this library in your program in the following ways: 13 | 14 | ### Maven Dependency 15 | ``` 16 | <dependency> 17 | <groupId>com.springml</groupId> 18 | <artifactId>spark-sftp_2.11</artifactId> 19 | <version>1.1.3</version> 20 | </dependency> 21 | 22 | ``` 23 | 24 | ### SBT Dependency 25 | ``` 26 | libraryDependencies += "com.springml" % "spark-sftp_2.11" % "1.1.3" 27 | ``` 28 | 29 | 30 | ## Using with Spark shell 31 | This package can be added to Spark using the `--packages` command line option.
For example, to include it when starting the spark shell: 32 | 33 | ``` 34 | $ bin/spark-shell --packages com.springml:spark-sftp_2.11:1.1.3 35 | ``` 36 | 37 | ## Features 38 | This package can be used to construct a Spark dataframe by downloading files from an SFTP server. 39 | 40 | This package can also be used to write a Spark dataframe as a csv, json, avro, parquet, txt, xml or orc file to an SFTP server. 41 | 42 | This library requires the following options: 43 | * `path`: Path of the file on the SFTP server to be used for dataframe construction 44 | * `username`: SFTP Server Username. 45 | * `password`: (Optional) SFTP Server Password. 46 | * `pem`: (Optional) Location of the PEM file. Either pem or password has to be specified. 47 | * `pemPassphrase`: (Optional) Passphrase for the PEM file. 48 | * `host`: SFTP Host. 49 | * `port`: (Optional) Port on which the SFTP server is running. Default value is 22. 50 | * `fileType`: Type of the file. Supported types are csv, txt, json, avro, parquet, xml and orc. 51 | * `inferSchema`: (Optional) Infer the schema from the file content. Currently applicable only for the csv fileType. 52 | * `header`: (Optional) Applicable only for the csv fileType. Whether the first row of the CSV file is a header. Default is true. 53 | * `delimiter`: (Optional) Set the field delimiter. Applicable only for the csv fileType. Default is comma. 54 | * `quote`: (Optional) Set the quote character. Applicable only for the csv fileType. Default is ". 55 | * `escape`: (Optional) Set the escape character. Applicable only for the csv fileType. Default is \. 56 | * `multiLine`: (Optional) Allow records that span multiple lines. Applicable only for the csv fileType. Default is false. 57 | * `codec`: (Optional) Applicable only for the csv fileType. Compression codec to use when saving to file. Should be the fully qualified name of a class implementing org.apache.hadoop.io.compress.CompressionCodec or one of the case-insensitive shortened names (bzip2, gzip, lz4, and snappy). Defaults to no compression when a codec is not specified. 58 | 59 | ### Scala API 60 | ```scala 61 | 62 | // Construct a Spark dataframe using a csv file on the SFTP server 63 | val df = spark.read. 64 | format("com.springml.spark.sftp"). 65 | option("host", "SFTP_HOST"). 66 | option("username", "SFTP_USER"). 67 | option("password", "****"). 68 | option("fileType", "csv"). 69 | option("delimiter", ";"). 70 | option("quote", "\""). 71 | option("escape", "\\"). 72 | option("multiLine", "true"). 73 | option("inferSchema", "true"). 74 | load("/ftp/files/sample.csv") 75 | 76 | // Write dataframe as a CSV file to the SFTP server 77 | df.write. 78 | format("com.springml.spark.sftp"). 79 | option("host", "SFTP_HOST"). 80 | option("username", "SFTP_USER"). 81 | option("password", "****"). 82 | option("fileType", "csv"). 83 | option("delimiter", ";"). 84 | option("codec", "bzip2"). 85 | save("/ftp/files/sample.csv") 86 | 87 | 88 | // Construct a Spark dataframe using a text file on the SFTP server 89 | val df = spark.read. 90 | format("com.springml.spark.sftp"). 91 | option("host", "SFTP_HOST"). 92 | option("username", "SFTP_USER"). 93 | option("password", "****"). 94 | option("fileType", "txt"). 95 | load("config") 96 | 97 | // Construct a Spark dataframe using an xml file on the SFTP server 98 | val df = spark.read. 99 | format("com.springml.spark.sftp"). 100 | option("host", "SFTP_HOST"). 101 | option("username", "SFTP_USER"). 102 | option("password", "*****"). 103 | option("fileType", "xml"). 104 | option("rowTag", "YEAR").load("myxml.xml") 105 | 106 | // Write dataframe as an XML file to the SFTP server 107 | 108 | df.write.format("com.springml.spark.sftp"). 109 | option("host", "SFTP_HOST").
110 | option("username", "SFTP_USER"). 111 | option("password", "*****"). 112 | option("fileType", "xml"). 113 | option("rootTag", "YTD"). 114 | option("rowTag", "YEAR").save("myxmlOut.xml.gz") 115 | 116 | ``` 117 | 118 | 119 | ### Java API 120 | ```java 121 | // Construct Spark dataframe using file in FTP server 122 | DataFrame df = spark.read(). 123 | format("com.springml.spark.sftp"). 124 | option("host", "SFTP_HOST"). 125 | option("username", "SFTP_USER"). 126 | option("password", "****"). 127 | option("fileType", "json"). 128 | load("/ftp/files/sample.json") 129 | 130 | // Write dataframe as CSV file to FTP server 131 | df.write(). 132 | format("com.springml.spark.sftp"). 133 | option("host", "SFTP_HOST"). 134 | option("username", "SFTP_USER"). 135 | option("password", "****"). 136 | option("fileType", "json"). 137 | save("/ftp/files/sample.json"); 138 | ``` 139 | 140 | ### R API 141 | Spark 1.5+: 142 | ```r 143 | 144 | if (nchar(Sys.getenv("SPARK_HOME")) < 1) { 145 | Sys.setenv(SPARK_HOME = "/home/spark") 146 | } 147 | library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"))) 148 | sparkR.session(master = "local[*]", sparkConfig = list(spark.driver.memory = "2g")) 149 | 150 | # Construct Spark dataframe using avro file in FTP server 151 | df <- read.df(path="/ftp/files/sample.avro", 152 | source="com.springml.spark.sftp", 153 | host="SFTP_HOST", 154 | username="SFTP_USER", 155 | pem="/home/user/mypem.pem", 156 | fileType="avro") 157 | 158 | # Write dataframe as avro file to FTP server 159 | write.df(df, 160 | path="/ftp/files/sample.avro", 161 | source="com.springml.spark.sftp", 162 | host="SFTP_HOST", 163 | username="SFTP_USER", 164 | pem="/home/user/mypem.pem", 165 | fileType="avro") 166 | ``` 167 | 168 | ### Note 169 | 1. SFTP files are fetched and written using [jsch](http://www.jcraft.com/jsch/). It will be executed as a single process 170 | 2. Files from SFTP server will be downloaded to temp location and it will be deleted only during spark shutdown 171 | 172 | 173 | ## Building From Source 174 | This library is built with [SBT](http://www.scala-sbt.org/0.13/docs/Command-Line-Reference.html), which is automatically downloaded by the included shell script. To build a JAR file simply run `build/sbt package` from the project root. 
175 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-sftp" 2 | 3 | organization := "com.springml" 4 | 5 | scalaVersion := "2.11.8" 6 | 7 | sparkVersion := "2.3.0" 8 | 9 | spName := "springml/spark-sftp" 10 | 11 | version := "1.1.4" 12 | 13 | // Dependent libraries 14 | libraryDependencies ++= Seq( 15 | "com.springml" % "sftp.client" % "1.0.3", 16 | "org.mockito" % "mockito-core" % "2.0.31-beta", 17 | "com.databricks" % "spark-xml_2.11" % "0.4.1" 18 | ) 19 | 20 | // used spark components 21 | sparkComponents += "sql" 22 | 23 | // Repositories 24 | resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven" 25 | 26 | // Spark packages 27 | spDependencies += "com.databricks/spark-avro_2.11:3.2.0" 28 | 29 | // Test dependencies 30 | libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.1" % "test" 31 | libraryDependencies += "org.apache.avro" % "avro-mapred" % "1.7.7" % "test" exclude("org.mortbay.jetty", "servlet-api") 32 | libraryDependencies += "org.apache.spark" %% "spark-hive" % sparkVersion.value % "test" 33 | 34 | spIgnoreProvided := true 35 | // licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0")) 36 | 37 | credentials += Credentials(Path.userHome / ".ivy2" / ".credentials") 38 | 39 | publishTo := { 40 | val nexus = "https://oss.sonatype.org/" 41 | if (version.value.endsWith("SNAPSHOT")) 42 | Some("snapshots" at nexus + "content/repositories/snapshots") 43 | else 44 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 45 | } 46 | 47 | pomExtra := ( 48 | https://github.com/springml/spark-sftp 49 | 50 | 51 | Apache License, Verision 2.0 52 | http://www.apache.org/licenses/LICENSE-2.0.html 53 | repo 54 | 55 | 56 | 57 | scm:git:github.com/springml/spark-sftp 58 | scm:git:git@github.com:springml/spark-sftp 59 | github.com/springml/spark-sftp 60 | 61 | 62 | 63 | springml 64 | Springml 65 | http://www.springml.com 66 | 67 | ) 68 | -------------------------------------------------------------------------------- /build/sbt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # When creating new tests for Spark SQL Hive, the HADOOP_CLASSPATH must contain the hive jars so 4 | # that we can run Hive to generate the golden answer. This is not required for normal development 5 | # or testing. 6 | for i in $HIVE_HOME/lib/* 7 | do HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$i 8 | done 9 | export HADOOP_CLASSPATH 10 | 11 | realpath () { 12 | ( 13 | TARGET_FILE=$1 14 | 15 | cd $(dirname $TARGET_FILE) 16 | TARGET_FILE=$(basename $TARGET_FILE) 17 | 18 | COUNT=0 19 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] 20 | do 21 | TARGET_FILE=$(readlink $TARGET_FILE) 22 | cd $(dirname $TARGET_FILE) 23 | TARGET_FILE=$(basename $TARGET_FILE) 24 | COUNT=$(($COUNT + 1)) 25 | done 26 | 27 | echo $(pwd -P)/$TARGET_FILE 28 | ) 29 | } 30 | 31 | . 
$(dirname $(realpath $0))/sbt-launch-lib.bash 32 | 33 | 34 | declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" 35 | declare -r sbt_opts_file=".sbtopts" 36 | declare -r etc_sbt_opts_file="/etc/sbt/sbtopts" 37 | 38 | usage() { 39 | cat < path to global settings/plugins directory (default: ~/.sbt) 47 | -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) 48 | -ivy path to local Ivy repository (default: ~/.ivy2) 49 | -mem set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 50 | -no-share use all local caches; no sharing 51 | -no-global uses global caches, but does not use global ~/.sbt directory. 52 | -jvm-debug Turn on JVM debugging, open at the given port. 53 | -batch Disable interactive mode 54 | # sbt version (default: from project/build.properties if present, else latest release) 55 | -sbt-version use the specified version of sbt 56 | -sbt-jar use the specified jar as the sbt launcher 57 | -sbt-rc use an RC version of sbt 58 | -sbt-snapshot use a snapshot version of sbt 59 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 60 | -java-home alternate JAVA_HOME 61 | # jvm options and output control 62 | JAVA_OPTS environment variable, if unset uses "$java_opts" 63 | SBT_OPTS environment variable, if unset uses "$default_sbt_opts" 64 | .sbtopts if this file exists in the current directory, it is 65 | prepended to the runner args 66 | /etc/sbt/sbtopts if this file exists, it is prepended to the runner args 67 | -Dkey=val pass -Dkey=val directly to the java runtime 68 | -J-X pass option -X directly to the java runtime 69 | (-J is stripped) 70 | -S-X add -X to sbt's scalacOptions (-J is stripped) 71 | -PmavenProfiles Enable a maven profile for the build. 72 | In the case of duplicated or conflicting options, the order above 73 | shows precedence: JAVA_OPTS lowest, command line options highest. 74 | EOM 75 | } 76 | 77 | process_my_args () { 78 | while [[ $# -gt 0 ]]; do 79 | case "$1" in 80 | -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; 81 | -no-share) addJava "$noshare_opts" && shift ;; 82 | -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; 83 | -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; 84 | -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; 85 | -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; 86 | -batch) exec &2 "$@" 31 | } 32 | vlog () { 33 | [[ $verbose || $debug ]] && echoerr "$@" 34 | } 35 | dlog () { 36 | [[ $debug ]] && echoerr "$@" 37 | } 38 | 39 | acquire_sbt_jar () { 40 | SBT_VERSION=`awk -F "=" '/sbt\\.version/ {print $2}' ./project/build.properties` 41 | URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 42 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 43 | JAR=build/sbt-launch-${SBT_VERSION}.jar 44 | 45 | sbt_jar=$JAR 46 | 47 | if [[ ! -f "$sbt_jar" ]]; then 48 | # Download sbt launch jar if it hasn't been downloaded yet 49 | if [ ! 
-f ${JAR} ]; then 50 | # Download 51 | printf "Attempting to fetch sbt\n" 52 | JAR_DL=${JAR}.part 53 | if hash curl 2>/dev/null; then 54 | (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR} 55 | elif hash wget 2>/dev/null; then 56 | (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} 57 | else 58 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" 59 | exit -1 60 | fi 61 | fi 62 | if [ ! -f ${JAR} ]; then 63 | # We failed to download 64 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" 65 | exit -1 66 | fi 67 | printf "Launching sbt from ${JAR}\n" 68 | fi 69 | } 70 | 71 | execRunner () { 72 | # print the arguments one to a line, quoting any containing spaces 73 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 74 | for arg; do 75 | if printf "%s\n" "$arg" | grep -q ' '; then 76 | printf "\"%s\"\n" "$arg" 77 | else 78 | printf "%s\n" "$arg" 79 | fi 80 | done 81 | echo "" 82 | } 83 | 84 | exec "$@" 85 | } 86 | 87 | addJava () { 88 | dlog "[addJava] arg = '$1'" 89 | java_args=( "${java_args[@]}" "$1" ) 90 | } 91 | 92 | enableProfile () { 93 | dlog "[enableProfile] arg = '$1'" 94 | maven_profiles=( "${maven_profiles[@]}" "$1" ) 95 | export SBT_MAVEN_PROFILES="${maven_profiles[@]}" 96 | } 97 | 98 | addSbt () { 99 | dlog "[addSbt] arg = '$1'" 100 | sbt_commands=( "${sbt_commands[@]}" "$1" ) 101 | } 102 | addResidual () { 103 | dlog "[residual] arg = '$1'" 104 | residual_args=( "${residual_args[@]}" "$1" ) 105 | } 106 | addDebugger () { 107 | addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1" 108 | } 109 | 110 | # a ham-fisted attempt to move some memory settings in concert 111 | # so they need not be dicked around with individually. 112 | get_mem_opts () { 113 | local mem=${1:-2048} 114 | local perm=$(( $mem / 4 )) 115 | (( $perm > 256 )) || perm=256 116 | (( $perm < 4096 )) || perm=4096 117 | local codecache=$(( $perm / 2 )) 118 | 119 | echo "-Xms${mem}m -Xmx${mem}m -XX:MaxPermSize=${perm}m -XX:ReservedCodeCacheSize=${codecache}m" 120 | } 121 | 122 | require_arg () { 123 | local type="$1" 124 | local opt="$2" 125 | local arg="$3" 126 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 127 | die "$opt requires <$type> argument" 128 | fi 129 | } 130 | 131 | is_function_defined() { 132 | declare -f "$1" > /dev/null 133 | } 134 | 135 | process_args () { 136 | while [[ $# -gt 0 ]]; do 137 | case "$1" in 138 | -h|-help) usage; exit 1 ;; 139 | -v|-verbose) verbose=1 && shift ;; 140 | -d|-debug) debug=1 && shift ;; 141 | 142 | -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; 143 | -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;; 144 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; 145 | -batch) exec dataframeReader.avro(fileLocation) 40 | case "txt" => dataframeReader.format("text").load(fileLocation) 41 | case "xml" => dataframeReader.format(constants.xmlClass) 42 | .option(constants.xmlRowTag, rowTag) 43 | .load(fileLocation) 44 | case "csv" => dataframeReader. 45 | option("header", header). 46 | option("delimiter", delimiter). 47 | option("quote", quote). 48 | option("escape", escape). 49 | option("multiLine", multiLine). 50 | option("inferSchema", inferSchema). 
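        // These options are passed straight through to Spark's built-in CSV reader,
        // so they behave exactly like the corresponding spark.read.csv options.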
51 | csv(fileLocation) 52 | case _ => dataframeReader.format(fileType).load(fileLocation) 53 | } 54 | df 55 | } 56 | 57 | override def schema: StructType = { 58 | df.schema 59 | } 60 | 61 | override def buildScan(): RDD[Row] = { 62 | df.rdd 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/main/scala/com/springml/spark/sftp/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 springml 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.springml.spark.sftp 17 | 18 | import java.io.File 19 | import java.util.UUID 20 | 21 | import com.springml.sftp.client.SFTPClient 22 | import com.springml.spark.sftp.util.Utils.ImplicitDataFrameWriter 23 | 24 | import org.apache.commons.io.FilenameUtils 25 | import org.apache.hadoop.fs.Path 26 | import org.apache.log4j.Logger 27 | import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider} 28 | import org.apache.spark.sql.types.StructType 29 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} 30 | 31 | /** 32 | * Datasource to construct dataframe from a sftp url 33 | */ 34 | class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { 35 | @transient val logger = Logger.getLogger(classOf[DefaultSource]) 36 | 37 | /** 38 | * Copy the file from SFTP to local location and then create dataframe using local file 39 | */ 40 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]):BaseRelation = { 41 | createRelation(sqlContext, parameters, null) 42 | } 43 | 44 | /** 45 | * Copy the file from SFTP to local location and then create dataframe using local file 46 | */ 47 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType) = { 48 | val username = parameters.get("username") 49 | val password = parameters.get("password") 50 | val pemFileLocation = parameters.get("pem") 51 | val pemPassphrase = parameters.get("pemPassphrase") 52 | val host = parameters.getOrElse("host", sys.error("SFTP Host has to be provided using 'host' option")) 53 | val port = parameters.get("port") 54 | val path = parameters.getOrElse("path", sys.error("'path' must be specified")) 55 | val fileType = parameters.getOrElse("fileType", sys.error("File type has to be provided using 'fileType' option")) 56 | val inferSchema = parameters.get("inferSchema") 57 | val header = parameters.getOrElse("header", "true") 58 | val delimiter = parameters.getOrElse("delimiter", ",") 59 | val quote = parameters.getOrElse("quote", "\"") 60 | val escape = parameters.getOrElse("escape", "\\") 61 | val multiLine = parameters.getOrElse("multiLine", "false") 62 | val createDF = parameters.getOrElse("createDF", "true") 63 | val copyLatest = parameters.getOrElse("copyLatest", "false") 64 | val tempFolder = 
parameters.getOrElse("tempLocation", System.getProperty("java.io.tmpdir")) 65 | val hdfsTemp = parameters.getOrElse("hdfsTempLocation", tempFolder) 66 | val cryptoKey = parameters.getOrElse("cryptoKey", null) 67 | val cryptoAlgorithm = parameters.getOrElse("cryptoAlgorithm", "AES") 68 | val rowTag = parameters.getOrElse(constants.xmlRowTag, null) 69 | 70 | val supportedFileTypes = List("csv", "json", "avro", "parquet", "txt", "xml","orc") 71 | if (!supportedFileTypes.contains(fileType)) { 72 | sys.error("fileType " + fileType + " not supported. Supported file types are " + supportedFileTypes) 73 | } 74 | 75 | val inferSchemaFlag = if (inferSchema != null && inferSchema.isDefined) { 76 | inferSchema.get 77 | } else { 78 | "false" 79 | } 80 | 81 | val sftpClient = getSFTPClient(username, password, pemFileLocation, pemPassphrase, host, port, 82 | cryptoKey, cryptoAlgorithm) 83 | val copiedFileLocation = copy(sftpClient, path, tempFolder, copyLatest.toBoolean) 84 | val fileLocation = copyToHdfs(sqlContext, copiedFileLocation, hdfsTemp) 85 | 86 | if (!createDF.toBoolean) { 87 | logger.info("Returning an empty dataframe after copying files...") 88 | createReturnRelation(sqlContext, schema) 89 | } else { 90 | DatasetRelation(fileLocation, fileType, inferSchemaFlag, header, delimiter, quote, escape, multiLine, rowTag, schema, 91 | sqlContext) 92 | } 93 | } 94 | 95 | override def createRelation( 96 | sqlContext: SQLContext, 97 | mode: SaveMode, 98 | parameters: Map[String, String], 99 | data: DataFrame): BaseRelation = { 100 | 101 | val username = parameters.get("username") 102 | val password = parameters.get("password") 103 | val pemFileLocation = parameters.get("pem") 104 | val pemPassphrase = parameters.get("pemPassphrase") 105 | val host = parameters.getOrElse("host", sys.error("SFTP Host has to be provided using 'host' option")) 106 | val port = parameters.get("port") 107 | val path = parameters.getOrElse("path", sys.error("'path' must be specified")) 108 | val fileType = parameters.getOrElse("fileType", sys.error("File type has to be provided using 'fileType' option")) 109 | val header = parameters.getOrElse("header", "true") 110 | val copyLatest = parameters.getOrElse("copyLatest", "false") 111 | val tmpFolder = parameters.getOrElse("tempLocation", System.getProperty("java.io.tmpdir")) 112 | val hdfsTemp = parameters.getOrElse("hdfsTempLocation", tmpFolder) 113 | val cryptoKey = parameters.getOrElse("cryptoKey", null) 114 | val cryptoAlgorithm = parameters.getOrElse("cryptoAlgorithm", "AES") 115 | val delimiter = parameters.getOrElse("delimiter", ",") 116 | val quote = parameters.getOrElse("quote", "\"") 117 | val escape = parameters.getOrElse("escape", "\\") 118 | val multiLine = parameters.getOrElse("multiLine", "false") 119 | val codec = parameters.getOrElse("codec", null) 120 | val rowTag = parameters.getOrElse(constants.xmlRowTag, null) 121 | val rootTag = parameters.getOrElse(constants.xmlRootTag, null) 122 | 123 | val supportedFileTypes = List("csv", "json", "avro", "parquet", "txt", "xml","orc") 124 | if (!supportedFileTypes.contains(fileType)) { 125 | sys.error("fileType " + fileType + " not supported. 
Supported file types are " + supportedFileTypes) 126 | } 127 | 128 | val sftpClient = getSFTPClient(username, password, pemFileLocation, pemPassphrase, host, port, 129 | cryptoKey, cryptoAlgorithm) 130 | val tempFile = writeToTemp(sqlContext, data, hdfsTemp, tmpFolder, fileType, header, delimiter, quote, escape, multiLine, codec, rowTag, rootTag) 131 | 132 | upload(tempFile, path, sftpClient) 133 | return createReturnRelation(data) 134 | } 135 | private def copyToHdfs(sqlContext: SQLContext, fileLocation : String, 136 | hdfsTemp : String): String = { 137 | val hadoopConf = sqlContext.sparkContext.hadoopConfiguration 138 | val hdfsPath = new Path(fileLocation) 139 | val fs = hdfsPath.getFileSystem(hadoopConf) 140 | if ("hdfs".equalsIgnoreCase(fs.getScheme)) { 141 | fs.copyFromLocalFile(new Path(fileLocation), new Path(hdfsTemp)) 142 | val filePath = hdfsTemp + "/" + hdfsPath.getName 143 | fs.deleteOnExit(new Path(filePath)) 144 | return filePath 145 | } else { 146 | return fileLocation 147 | } 148 | } 149 | 150 | private def copyFromHdfs(sqlContext: SQLContext, hdfsTemp : String, 151 | fileLocation : String): String = { 152 | val hadoopConf = sqlContext.sparkContext.hadoopConfiguration 153 | val hdfsPath = new Path(hdfsTemp) 154 | val fs = hdfsPath.getFileSystem(hadoopConf) 155 | if ("hdfs".equalsIgnoreCase(fs.getScheme)) { 156 | fs.copyToLocalFile(new Path(hdfsTemp), new Path(fileLocation)) 157 | fs.deleteOnExit(new Path(hdfsTemp)) 158 | return fileLocation 159 | } else { 160 | return hdfsTemp 161 | } 162 | } 163 | 164 | private def upload(source: String, target: String, sftpClient: SFTPClient) { 165 | logger.info("Copying " + source + " to " + target) 166 | sftpClient.copyToFTP(source, target) 167 | } 168 | 169 | private def getSFTPClient( 170 | username: Option[String], 171 | password: Option[String], 172 | pemFileLocation: Option[String], 173 | pemPassphrase: Option[String], 174 | host: String, 175 | port: Option[String], 176 | cryptoKey : String, 177 | cryptoAlgorithm : String) : SFTPClient = { 178 | 179 | val sftpPort = if (port != null && port.isDefined) { 180 | port.get.toInt 181 | } else { 182 | 22 183 | } 184 | 185 | val cryptoEnabled = cryptoKey != null 186 | 187 | if (cryptoEnabled) { 188 | new SFTPClient(getValue(pemFileLocation), getValue(pemPassphrase), getValue(username), 189 | getValue(password), 190 | host, sftpPort, cryptoEnabled, cryptoKey, cryptoAlgorithm) 191 | } else { 192 | new SFTPClient(getValue(pemFileLocation), getValue(pemPassphrase), getValue(username), 193 | getValue(password), host, sftpPort) 194 | } 195 | } 196 | 197 | private def createReturnRelation(data: DataFrame): BaseRelation = { 198 | createReturnRelation(data.sqlContext, data.schema) 199 | } 200 | 201 | private def createReturnRelation(sqlContextVar: SQLContext, schemaVar: StructType): BaseRelation = { 202 | new BaseRelation { 203 | override def sqlContext: SQLContext = sqlContextVar 204 | override def schema: StructType = schemaVar 205 | } 206 | } 207 | 208 | private def copy(sftpClient: SFTPClient, source: String, 209 | tempFolder: String, latest: Boolean): String = { 210 | var copiedFilePath: String = null 211 | try { 212 | val target = tempFolder + File.separator + FilenameUtils.getName(source) 213 | copiedFilePath = target 214 | if (latest) { 215 | copiedFilePath = sftpClient.copyLatest(source, tempFolder) 216 | } else { 217 | logger.info("Copying " + source + " to " + target) 218 | copiedFilePath = sftpClient.copy(source, target) 219 | } 220 | 221 | copiedFilePath 222 | } finally { 223 | 
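      // Register the copied file (if any) for deletion at JVM shutdown, whether or not the copy above succeeded.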
addShutdownHook(copiedFilePath) 224 | } 225 | } 226 | 227 | private def getValue(param: Option[String]): String = { 228 | if (param != null && param.isDefined) { 229 | param.get 230 | } else { 231 | null 232 | } 233 | } 234 | 235 | private def writeToTemp(sqlContext: SQLContext, df: DataFrame, 236 | hdfsTemp: String, tempFolder: String, fileType: String, header: String, 237 | delimiter: String, quote: String, escape: String, multiLine: String, codec: String, rowTag: String, rootTag: String) : String = { 238 | val randomSuffix = "spark_sftp_connection_temp_" + UUID.randomUUID 239 | val hdfsTempLocation = hdfsTemp + File.separator + randomSuffix 240 | val localTempLocation = tempFolder + File.separator + randomSuffix 241 | 242 | addShutdownHook(localTempLocation) 243 | 244 | fileType match { 245 | 246 | case "xml" => df.coalesce(1).write.format(constants.xmlClass) 247 | .option(constants.xmlRowTag, rowTag) 248 | .option(constants.xmlRootTag, rootTag).save(hdfsTempLocation) 249 | case "csv" => df.coalesce(1). 250 | write. 251 | option("header", header). 252 | option("delimiter", delimiter). 253 | option("quote", quote). 254 | option("escape", escape). 255 | option("multiLine", multiLine). 256 | optionNoNull("codec", Option(codec)). 257 | csv(hdfsTempLocation) 258 | case "txt" => df.coalesce(1).write.text(hdfsTempLocation) 259 | case "avro" => df.coalesce(1).write.format("com.databricks.spark.avro").save(hdfsTempLocation) 260 | case _ => df.coalesce(1).write.format(fileType).save(hdfsTempLocation) 261 | } 262 | 263 | copyFromHdfs(sqlContext, hdfsTempLocation, localTempLocation) 264 | copiedFile(localTempLocation) 265 | } 266 | 267 | private def addShutdownHook(tempLocation: String) { 268 | logger.debug("Adding hook for file " + tempLocation) 269 | val hook = new DeleteTempFileShutdownHook(tempLocation) 270 | Runtime.getRuntime.addShutdownHook(hook) 271 | } 272 | 273 | private def copiedFile(tempFileLocation: String) : String = { 274 | val baseTemp = new File(tempFileLocation) 275 | val files = baseTemp.listFiles().filter { x => 276 | (!x.isDirectory() 277 | && !x.getName.contains("SUCCESS") 278 | && !x.isHidden() 279 | && !x.getName.contains(".crc") 280 | && !x.getName.contains("_committed_") 281 | && !x.getName.contains("_started_") 282 | ) 283 | } 284 | files(0).getAbsolutePath 285 | } 286 | } 287 | -------------------------------------------------------------------------------- /src/main/scala/com/springml/spark/sftp/DeleteTempFileShutdownHook.scala: -------------------------------------------------------------------------------- 1 | package com.springml.spark.sftp 2 | 3 | import org.apache.commons.io.FileUtils 4 | import java.io.File 5 | import org.apache.log4j.Logger 6 | 7 | /** 8 | * Delete the temp file created during spark shutdown 9 | */ 10 | class DeleteTempFileShutdownHook( 11 | fileLocation: String) extends Thread { 12 | 13 | private val logger = Logger.getLogger(classOf[DatasetRelation]) 14 | 15 | override def run(): Unit = { 16 | logger.info("Deleting " + fileLocation ) 17 | FileUtils.deleteQuietly(new File(fileLocation)) 18 | } 19 | } -------------------------------------------------------------------------------- /src/main/scala/com/springml/spark/sftp/constants.scala: -------------------------------------------------------------------------------- 1 | package com.springml.spark.sftp 2 | 3 | /** 4 | * Created by bagopalan on 9/16/18. 
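 * Shared constants for the spark-xml integration: the data source class name and the rowTag/rootTag option keys.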
5 | */ 6 | object constants { 7 | 8 | val xmlClass: String = "com.databricks.spark.xml" 9 | val xmlRowTag: String = "rowTag" 10 | val xmlRootTag: String = "rootTag" 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/scala/com/springml/spark/sftp/util/Utils.scala: -------------------------------------------------------------------------------- 1 | package com.springml.spark.sftp.util 2 | 3 | import org.apache.spark.sql.DataFrameWriter 4 | 5 | 6 | object Utils { 7 | 8 | 9 | /** 10 | * [[DataFrameWriter]] implicits 11 | */ 12 | implicit class ImplicitDataFrameWriter[T](dataFrameWriter: DataFrameWriter[T]) { 13 | 14 | /** 15 | * Adds an output option for the underlying data source if the option has a value. 16 | */ 17 | def optionNoNull(key: String, optionValue: Option[String]): DataFrameWriter[T] = { 18 | optionValue match { 19 | case Some(_) => dataFrameWriter.option(key, optionValue.get) 20 | case None => dataFrameWriter 21 | } 22 | } 23 | } 24 | 25 | } -------------------------------------------------------------------------------- /src/test/resources/books.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/egen/spark-sftp/090917547001574afa93cddaf2a022151a3f4260/src/test/resources/books.orc -------------------------------------------------------------------------------- /src/test/resources/books.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Gambardella, Matthew 5 | XML Developer's Guide 6 | Computer 7 | 44.95 8 | 2000-10-01 9 | 10 | 11 | 12 | An in-depth look at creating applications 13 | with XML.This manual describes Oracle XML DB, and how you can use it to store, generate, manipulate, manage, 14 | and query XML data in the database. 15 | 16 | 17 | After introducing you to the heart of Oracle XML DB, namely the XMLType framework and Oracle XML DB repository, 18 | the manual provides a brief introduction to design criteria to consider when planning your Oracle XML DB 19 | application. It provides examples of how and where you can use Oracle XML DB. 20 | 21 | 22 | The manual then describes ways you can store and retrieve XML data using Oracle XML DB, APIs for manipulating 23 | XMLType data, and ways you can view, generate, transform, and search on existing XML data. The remainder of 24 | the manual discusses how to use Oracle XML DB repository, including versioning and security, 25 | how to access and manipulate repository resources using protocols, SQL, PL/SQL, or Java, and how to manage 26 | your Oracle XML DB application using Oracle Enterprise Manager. It also introduces you to XML messaging and 27 | Oracle Streams Advanced Queuing XMLType support. 28 | 29 | Ralls, Kim 30 | Midnight Rain 31 | Fantasy 32 | 5.95 33 | 2000-12-16 34 | A former architect battles corporate zombies, 35 | an evil sorceress, and her own childhood to become queen 36 | of the world. 37 | 38 | 39 | Corets, Eva 40 | Maeve Ascendant 41 | Fantasy 42 | 5.95 43 | 2000-11-17 44 | After the collapse of a nanotechnology 45 | society in England, the young survivors lay the 46 | foundation for a new society. 47 | 48 | 49 | Corets, Eva 50 | Oberon's Legacy 51 | Fantasy 52 | 5.95 53 | 2001-03-10 54 | In post-apocalypse England, the mysterious 55 | agent known only as Oberon helps to create a new life 56 | for the inhabitants of London. Sequel to Maeve 57 | Ascendant. 
58 | 59 | 60 | Corets, Eva 61 | The Sundered Grail 62 | Fantasy 63 | 5.95 64 | 2001-09-10 65 | The two daughters of Maeve, half-sisters, 66 | battle one another for control of England. Sequel to 67 | Oberon's Legacy. 68 | 69 | 70 | Randall, Cynthia 71 | Lover Birds 72 | Romance 73 | 4.95 74 | 2000-09-02 75 | When Carla meets Paul at an ornithology 76 | conference, tempers fly as feathers get ruffled. 77 | 78 | 79 | Thurman, Paula 80 | Splish Splash 81 | Romance 82 | 4.95 83 | 2000-11-02 84 | A deep sea diver finds true love twenty 85 | thousand leagues beneath the sea. 86 | 87 | 88 | Knorr, Stefan 89 | Creepy Crawlies 90 | Horror 91 | 4.95 92 | 2000-12-06 93 | An anthology of horror stories about roaches, 94 | centipedes, scorpions and other insects. 95 | 96 | 97 | Kress, Peter 98 | Paradox Lost 99 | Science Fiction 100 | 6.95 101 | 2000-11-02 102 | After an inadvertant trip through a Heisenberg 103 | Uncertainty Device, James Salway discovers the problems 104 | of being quantum. 105 | 106 | 107 | O'Brien, Tim 108 | Microsoft .NET: The Programming Bible 109 | Computer 110 | 36.95 111 | 2000-12-09 112 | Microsoft's .NET initiative is explored in 113 | detail in this deep programmer's reference. 114 | 115 | 116 | O'Brien, Tim 117 | MSXML3: A Comprehensive Guide 118 | Computer 119 | 36.95 120 | 2000-12-01 121 | The Microsoft MSXML3 parser is covered in 122 | detail, with attention to XML DOM interfaces, XSLT processing, 123 | SAX and more. 124 | 125 | 126 | Galos, Mike 127 | Visual Studio 7: A Comprehensive Guide 128 | Computer 129 | 49.95 130 | 2001-04-16 131 | Microsoft Visual Studio 7 is explored in depth, 132 | looking at how Visual Basic, Visual C++, C#, and ASP+ are 133 | integrated into a comprehensive development 134 | environment. 135 | 136 | -------------------------------------------------------------------------------- /src/test/resources/custom-delimiter.csv: -------------------------------------------------------------------------------- 1 | ProposalId;OpportunityId;Clicks;Impressions 2 | 103;006B0000002ndnuIAA;30;133 3 | 101;006B0000002ndnkIAA;12;73 4 | 102;006B0000002ndnpIAA;20;97 5 | -------------------------------------------------------------------------------- /src/test/resources/people.json: -------------------------------------------------------------------------------- 1 | {"name":"Michael"} 2 | {"name":"Andy", "age":30} 3 | {"name":"Justin", "age":19} 4 | -------------------------------------------------------------------------------- /src/test/resources/plaintext.txt: -------------------------------------------------------------------------------- 1 | adam 2 | Emily 3 | sundar -------------------------------------------------------------------------------- /src/test/resources/sample.csv: -------------------------------------------------------------------------------- 1 | ProposalId,OpportunityId,Clicks,Impressions 2 | 103,006B0000002ndnuIAA,30,133 3 | 101,006B0000002ndnkIAA,12,73 4 | 102,006B0000002ndnpIAA,20,97 5 | -------------------------------------------------------------------------------- /src/test/resources/sample_quoted_multiline.csv: -------------------------------------------------------------------------------- 1 | ProposalId,OpportunityId,Clicks,Impressions,Message 2 | 103,006B0000002ndnuIAA,30,133,"test 3 | multiline \"here we have a quote\" 4 | message", 5 | 101,006B0000002ndnkIAA,12,73,"regular message" 6 | 102,006B0000002ndnpIAA,20,97,"regular message" 7 | -------------------------------------------------------------------------------- 
/src/test/resources/users.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/egen/spark-sftp/090917547001574afa93cddaf2a022151a3f4260/src/test/resources/users.avro -------------------------------------------------------------------------------- /src/test/resources/users.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/egen/spark-sftp/090917547001574afa93cddaf2a022151a3f4260/src/test/resources/users.parquet -------------------------------------------------------------------------------- /src/test/scala/com/springml/spark/sftp/CustomSchemaTest.scala: -------------------------------------------------------------------------------- 1 | package com.springml.spark.sftp 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, _} 5 | import org.scalatest.{BeforeAndAfterEach, FunSuite} 6 | 7 | /** 8 | * Tests for creating dataframe using custom schema 9 | */ 10 | class CustomSchemaTest extends FunSuite with BeforeAndAfterEach { 11 | var ss: SparkSession = _ 12 | 13 | val csvTypesMap = Map("ProposalId" -> IntegerType, 14 | "OpportunityId" -> StringType, 15 | "Clicks" -> LongType, 16 | "Impressions" -> LongType 17 | ) 18 | 19 | val jsonTypesMap = Map("name" -> StringType, 20 | "age" -> IntegerType 21 | ) 22 | 23 | override def beforeEach() { 24 | ss = SparkSession.builder().master("local").appName("Custom Schema Test").getOrCreate() 25 | } 26 | 27 | private def validateTypes(field : StructField, typeMap : Map[String, DataType]) = { 28 | val expectedType = typeMap(field.name) 29 | assert(expectedType == field.dataType) 30 | } 31 | 32 | private def columnArray(typeMap : Map[String, DataType]) : Array[StructField] = { 33 | val columns = typeMap.map(x => new StructField(x._1, x._2, true)) 34 | 35 | val columnStruct = new Array[StructField](typeMap.size) 36 | columns.copyToArray(columnStruct) 37 | 38 | columnStruct 39 | } 40 | 41 | test ("Read CSV with custom schema") { 42 | val columnStruct = columnArray(csvTypesMap) 43 | val expectedSchema = StructType(columnStruct) 44 | 45 | val fileLocation = getClass.getResource("/sample.csv").getPath 46 | val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext) 47 | val rdd = dsr.buildScan() 48 | 49 | assert(dsr.schema.fields.length == columnStruct.length) 50 | dsr.schema.fields.foreach(s => validateTypes(s, csvTypesMap)) 51 | } 52 | 53 | test ("Read Json with custom schema") { 54 | val columnStruct = columnArray(jsonTypesMap) 55 | val expectedSchema = StructType(columnStruct) 56 | 57 | val fileLocation = getClass.getResource("/people.json").getPath 58 | val dsr = DatasetRelation(fileLocation, "json", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext) 59 | val rdd = dsr.buildScan() 60 | 61 | assert(dsr.schema.fields.length == columnStruct.length) 62 | dsr.schema.fields.foreach(s => validateTypes(s, jsonTypesMap)) 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/test/scala/com/springml/spark/sftp/TestDatasetRelation.scala: -------------------------------------------------------------------------------- 1 | package com.springml.spark.sftp 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.scalatest.{BeforeAndAfterEach, FunSuite} 5 | 6 | /** 7 | * Simple unit test for basic testing on different
formats of file 8 | */ 9 | class TestDatasetRelation extends FunSuite with BeforeAndAfterEach { 10 | var ss: SparkSession = _ 11 | 12 | override def beforeEach() { 13 | ss = SparkSession.builder().master("local").enableHiveSupport().appName("Test Dataset Relation").getOrCreate() 14 | } 15 | 16 | test ("Read CSV") { 17 | val fileLocation = getClass.getResource("/sample.csv").getPath 18 | val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext) 19 | val rdd = dsr.buildScan() 20 | assert(3 == rdd.count()) 21 | } 22 | 23 | test ("Read CSV using custom delimiter") { 24 | val fileLocation = getClass.getResource("/custom-delimiter.csv").getPath 25 | val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ";", "\"", "\\", "false", null, null, ss.sqlContext) 26 | val rdd = dsr.buildScan() 27 | assert(3 == rdd.count()) 28 | } 29 | 30 | test ("Read multiline CSV using custom quote and escape") { 31 | val fileLocation = getClass.getResource("/sample_quoted_multiline.csv").getPath 32 | val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\", "true", null, null, ss.sqlContext) 33 | val rdd = dsr.buildScan() 34 | assert(3 == rdd.count()) 35 | } 36 | 37 | 38 | test ("Read JSON") { 39 | val fileLocation = getClass.getResource("/people.json").getPath 40 | val dsr = DatasetRelation(fileLocation, "json", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext) 41 | val rdd = dsr.buildScan() 42 | assert(3 == rdd.count()) 43 | } 44 | 45 | test ("Read AVRO") { 46 | val fileLocation = getClass.getResource("/users.avro").getPath 47 | val dsr = DatasetRelation(fileLocation, "avro", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext) 48 | val rdd = dsr.buildScan() 49 | assert(2 == rdd.count()) 50 | } 51 | 52 | test ("Read parquet") { 53 | val fileLocation = getClass.getResource("/users.parquet").getPath 54 | val dsr = DatasetRelation(fileLocation, "parquet", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext) 55 | val rdd = dsr.buildScan() 56 | assert(2 == rdd.count()) 57 | } 58 | 59 | test ("Read text file") { 60 | val fileLocation = getClass.getResource("/plaintext.txt").getPath 61 | val dsr = DatasetRelation(fileLocation, "txt", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext) 62 | val rdd = dsr.buildScan() 63 | assert(3 == rdd.count()) 64 | } 65 | 66 | test ("Read xml file") { 67 | val fileLocation = getClass.getResource("/books.xml").getPath 68 | val dsr = DatasetRelation(fileLocation, "xml", "false", "true", ",", "\"", "\\", "false", "book", null, ss.sqlContext) 69 | val rdd = dsr.buildScan() 70 | assert(12 == rdd.count()) 71 | } 72 | test ("Read orc file") { 73 | val fileLocation = getClass.getResource("/books.orc").getPath 74 | val dsr = DatasetRelation(fileLocation, "orc", "false", "true", ",", "\"", "\\", "false", "book", null, ss.sqlContext) 75 | val rdd = dsr.buildScan() 76 | assert(12 == rdd.count()) 77 | } 78 | } 79 | --------------------------------------------------------------------------------