├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── build ├── sbt └── sbt-launch-lib.bash ├── project ├── build.properties └── plugins.sbt └── src ├── main └── scala │ └── com │ └── springml │ └── spark │ └── sftp │ ├── DatasetRelation.scala │ ├── DefaultSource.scala │ ├── DeleteTempFileShutdownHook.scala │ ├── constants.scala │ └── util │ └── Utils.scala └── test ├── resources ├── books.orc ├── books.xml ├── custom-delimiter.csv ├── people.json ├── plaintext.txt ├── sample.csv ├── sample_quoted_multiline.csv ├── users.avro └── users.parquet └── scala └── com └── springml └── spark └── sftp ├── CustomSchemaTest.scala └── TestDatasetRelation.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | *.pyc 4 | sbt/*.jar 5 | 6 | # sbt specific 7 | .cache/ 8 | .history/ 9 | .lib/ 10 | dist/* 11 | target/ 12 | lib_managed/ 13 | src_managed/ 14 | project/boot/ 15 | project/plugins/project/ 16 | /bin/ 17 | 18 | .cache-main 19 | .classpath 20 | .project 21 | .settings/ 22 | .cache-tests 23 | 24 | # intellij 25 | .idea 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License, Version 2.0 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 13 | 14 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 15 | 16 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 17 | 18 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 19 | 20 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 21 | 22 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 23 | 24 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
25 | 26 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 27 | 28 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 29 | 30 | 2. Grant of Copyright License. 31 | 32 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 33 | 34 | 3. Grant of Patent License. 35 | 36 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 37 | 38 | 4. Redistribution. 
39 | 40 | You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 41 | 42 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 43 | You must cause any modified files to carry prominent notices stating that You changed the files; and 44 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 45 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 46 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 47 | 48 | 5. Submission of Contributions. 49 | 50 | Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 51 | 52 | 6. Trademarks. 53 | 54 | This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 55 | 56 | 7. Disclaimer of Warranty. 57 | 58 | Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 59 | 60 | 8. Limitation of Liability. 
61 | 62 | In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 63 | 64 | 9. Accepting Warranty or Additional Liability. 65 | 66 | While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 67 | 68 | END OF TERMS AND CONDITIONS 69 | 70 | APPENDIX: How to apply the Apache License to your work 71 | 72 | To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 73 | 74 | Copyright [yyyy] [name of copyright owner] 75 | 76 | Licensed under the Apache License, Version 2.0 (the "License"); 77 | you may not use this file except in compliance with the License. 78 | You may obtain a copy of the License at 79 | 80 | http://www.apache.org/licenses/LICENSE-2.0 81 | 82 | Unless required by applicable law or agreed to in writing, software 83 | distributed under the License is distributed on an "AS IS" BASIS, 84 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 85 | See the License for the specific language governing permissions and 86 | limitations under the License. 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark SFTP Connector Library 2 | 3 | A library for constructing dataframes by downloading files from an SFTP server and for writing dataframes back to an SFTP server 4 | 5 | ## Requirements 6 | 7 | This library requires Spark 2.x. 8 | 9 | For Spark 1.x support, please check the [spark1.x](https://github.com/springml/spark-sftp/tree/spark1.x) branch. 10 | 11 | ## Linking 12 | You can link against this library in your program in the following ways: 13 | 14 | ### Maven Dependency 15 | ``` 16 | <dependency> 17 | <groupId>com.springml</groupId> 18 | <artifactId>spark-sftp_2.11</artifactId> 19 | <version>1.1.3</version> 20 | </dependency> 21 | 22 | ``` 23 | 24 | ### SBT Dependency 25 | ``` 26 | libraryDependencies += "com.springml" % "spark-sftp_2.11" % "1.1.3" 27 | ``` 28 | 29 | 30 | ## Using with Spark shell 31 | This package can be added to Spark using the `--packages` command line option.
For example, to include it when starting the spark shell: 32 | 33 | ``` 34 | $ bin/spark-shell --packages com.springml:spark-sftp_2.11:1.1.3 35 | ``` 36 | 37 | ## Features 38 | This package can be used to construct a Spark dataframe by downloading files from an SFTP server. 39 | 40 | This package can also be used to write a Spark dataframe as a csv, json, avro, parquet, txt, xml or orc file to an SFTP server. 41 | 42 | This library requires the following options: 43 | * `path`: Path of the file on the SFTP server to be used for dataframe construction 44 | * `username`: SFTP Server Username. 45 | * `password`: (Optional) SFTP Server Password. 46 | * `pem`: (Optional) Location of the PEM file. Either pem or password has to be specified. 47 | * `pemPassphrase`: (Optional) Passphrase for the PEM file. 48 | * `host`: SFTP Host. 49 | * `port`: (Optional) Port on which the SFTP server is running. Default value is 22. 50 | * `fileType`: Type of the file. Supported types are csv, txt, json, avro, parquet, xml and orc. 51 | * `inferSchema`: (Optional) Infer the schema from the file content. Currently applicable only for the csv fileType. 52 | * `header`: (Optional) Applicable only for the csv fileType. Whether the first row of the CSV file is a header. Default is true. 53 | * `delimiter`: (Optional) Set the field delimiter. Applicable only for the csv fileType. Default is comma. 54 | * `quote`: (Optional) Set the quote character. Applicable only for the csv fileType. Default is ". 55 | * `escape`: (Optional) Set the escape character. Applicable only for the csv fileType. Default is \. 56 | * `multiLine`: (Optional) Allow records that span multiple lines. Applicable only for the csv fileType. Default is false. 57 | * `codec`: (Optional) Applicable only for the csv fileType. Compression codec to use when saving to file. Should be the fully qualified name of a class implementing org.apache.hadoop.io.compress.CompressionCodec or one of the case-insensitive shortened names (bzip2, gzip, lz4, and snappy). Defaults to no compression when a codec is not specified. 58 | 59 | ### Scala API 60 | ```scala 61 | 62 | // Construct a Spark dataframe using a csv file on the SFTP server 63 | val df = spark.read. 64 | format("com.springml.spark.sftp"). 65 | option("host", "SFTP_HOST"). 66 | option("username", "SFTP_USER"). 67 | option("password", "****"). 68 | option("fileType", "csv"). 69 | option("delimiter", ";"). 70 | option("quote", "\""). 71 | option("escape", "\\"). 72 | option("multiLine", "true"). 73 | option("inferSchema", "true"). 74 | load("/ftp/files/sample.csv") 75 | 76 | // Write dataframe as a CSV file to the SFTP server 77 | df.write. 78 | format("com.springml.spark.sftp"). 79 | option("host", "SFTP_HOST"). 80 | option("username", "SFTP_USER"). 81 | option("password", "****"). 82 | option("fileType", "csv"). 83 | option("delimiter", ";"). 84 | option("codec", "bzip2"). 85 | save("/ftp/files/sample.csv") 86 | 87 | 88 | // Construct a Spark dataframe using a text file on the SFTP server 89 | val df = spark.read. 90 | format("com.springml.spark.sftp"). 91 | option("host", "SFTP_HOST"). 92 | option("username", "SFTP_USER"). 93 | option("password", "****"). 94 | option("fileType", "txt"). 95 | load("config") 96 | 97 | // Construct a Spark dataframe using an xml file on the SFTP server 98 | val df = spark.read. 99 | format("com.springml.spark.sftp"). 100 | option("host", "SFTP_HOST"). 101 | option("username", "SFTP_USER"). 102 | option("password", "*****"). 103 | option("fileType", "xml"). 104 | option("rowTag", "YEAR").load("myxml.xml") 105 | 106 | // Write dataframe as an XML file to the SFTP server 107 | 108 | df.write.format("com.springml.spark.sftp"). 109 | option("host", "SFTP_HOST").
110 | option("username", "SFTP_USER"). 111 | option("password", "*****"). 112 | option("fileType", "xml"). 113 | option("rootTag", "YTD"). 114 | option("rowTag", "YEAR").save("myxmlOut.xml.gz") 115 | 116 | ``` 117 | 118 | 119 | ### Java API 120 | ```java 121 | // Construct Spark dataframe using file in FTP server 122 | DataFrame df = spark.read(). 123 | format("com.springml.spark.sftp"). 124 | option("host", "SFTP_HOST"). 125 | option("username", "SFTP_USER"). 126 | option("password", "****"). 127 | option("fileType", "json"). 128 | load("/ftp/files/sample.json") 129 | 130 | // Write dataframe as CSV file to FTP server 131 | df.write(). 132 | format("com.springml.spark.sftp"). 133 | option("host", "SFTP_HOST"). 134 | option("username", "SFTP_USER"). 135 | option("password", "****"). 136 | option("fileType", "json"). 137 | save("/ftp/files/sample.json"); 138 | ``` 139 | 140 | ### R API 141 | Spark 1.5+: 142 | ```r 143 | 144 | if (nchar(Sys.getenv("SPARK_HOME")) < 1) { 145 | Sys.setenv(SPARK_HOME = "/home/spark") 146 | } 147 | library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"))) 148 | sparkR.session(master = "local[*]", sparkConfig = list(spark.driver.memory = "2g")) 149 | 150 | # Construct Spark dataframe using avro file in FTP server 151 | df <- read.df(path="/ftp/files/sample.avro", 152 | source="com.springml.spark.sftp", 153 | host="SFTP_HOST", 154 | username="SFTP_USER", 155 | pem="/home/user/mypem.pem", 156 | fileType="avro") 157 | 158 | # Write dataframe as avro file to FTP server 159 | write.df(df, 160 | path="/ftp/files/sample.avro", 161 | source="com.springml.spark.sftp", 162 | host="SFTP_HOST", 163 | username="SFTP_USER", 164 | pem="/home/user/mypem.pem", 165 | fileType="avro") 166 | ``` 167 | 168 | ### Note 169 | 1. SFTP files are fetched and written using [jsch](http://www.jcraft.com/jsch/). It will be executed as a single process 170 | 2. Files from SFTP server will be downloaded to temp location and it will be deleted only during spark shutdown 171 | 172 | 173 | ## Building From Source 174 | This library is built with [SBT](http://www.scala-sbt.org/0.13/docs/Command-Line-Reference.html), which is automatically downloaded by the included shell script. To build a JAR file simply run `build/sbt package` from the project root. 
175 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-sftp" 2 | 3 | organization := "com.springml" 4 | 5 | scalaVersion := "2.11.8" 6 | 7 | sparkVersion := "2.3.0" 8 | 9 | spName := "springml/spark-sftp" 10 | 11 | version := "1.1.4" 12 | 13 | // Dependent libraries 14 | libraryDependencies ++= Seq( 15 | "com.springml" % "sftp.client" % "1.0.3", 16 | "org.mockito" % "mockito-core" % "2.0.31-beta", 17 | "com.databricks" % "spark-xml_2.11" % "0.4.1" 18 | ) 19 | 20 | // used spark components 21 | sparkComponents += "sql" 22 | 23 | // Repositories 24 | resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven" 25 | 26 | // Spark packages 27 | spDependencies += "com.databricks/spark-avro_2.11:3.2.0" 28 | 29 | // Test dependencies 30 | libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.1" % "test" 31 | libraryDependencies += "org.apache.avro" % "avro-mapred" % "1.7.7" % "test" exclude("org.mortbay.jetty", "servlet-api") 32 | libraryDependencies += "org.apache.spark" %% "spark-hive" % sparkVersion.value % "test" 33 | 34 | spIgnoreProvided := true 35 | // licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0")) 36 | 37 | credentials += Credentials(Path.userHome / ".ivy2" / ".credentials") 38 | 39 | publishTo := { 40 | val nexus = "https://oss.sonatype.org/" 41 | if (version.value.endsWith("SNAPSHOT")) 42 | Some("snapshots" at nexus + "content/repositories/snapshots") 43 | else 44 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 45 | } 46 | 47 | pomExtra := ( 48 | https://github.com/springml/spark-sftp 49 | 50 | 51 | Apache License, Verision 2.0 52 | http://www.apache.org/licenses/LICENSE-2.0.html 53 | repo 54 | 55 | 56 | 57 | scm:git:github.com/springml/spark-sftp 58 | scm:git:git@github.com:springml/spark-sftp 59 | github.com/springml/spark-sftp 60 | 61 | 62 | 63 | springml 64 | Springml 65 | http://www.springml.com 66 | 67 | ) 68 | -------------------------------------------------------------------------------- /build/sbt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # When creating new tests for Spark SQL Hive, the HADOOP_CLASSPATH must contain the hive jars so 4 | # that we can run Hive to generate the golden answer. This is not required for normal development 5 | # or testing. 6 | for i in $HIVE_HOME/lib/* 7 | do HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$i 8 | done 9 | export HADOOP_CLASSPATH 10 | 11 | realpath () { 12 | ( 13 | TARGET_FILE=$1 14 | 15 | cd $(dirname $TARGET_FILE) 16 | TARGET_FILE=$(basename $TARGET_FILE) 17 | 18 | COUNT=0 19 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] 20 | do 21 | TARGET_FILE=$(readlink $TARGET_FILE) 22 | cd $(dirname $TARGET_FILE) 23 | TARGET_FILE=$(basename $TARGET_FILE) 24 | COUNT=$(($COUNT + 1)) 25 | done 26 | 27 | echo $(pwd -P)/$TARGET_FILE 28 | ) 29 | } 30 | 31 | . 
$(dirname $(realpath $0))/sbt-launch-lib.bash 32 | 33 | 34 | declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" 35 | declare -r sbt_opts_file=".sbtopts" 36 | declare -r etc_sbt_opts_file="/etc/sbt/sbtopts" 37 | 38 | usage() { 39 | cat < path to global settings/plugins directory (default: ~/.sbt) 47 | -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) 48 | -ivy path to local Ivy repository (default: ~/.ivy2) 49 | -mem set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 50 | -no-share use all local caches; no sharing 51 | -no-global uses global caches, but does not use global ~/.sbt directory. 52 | -jvm-debug Turn on JVM debugging, open at the given port. 53 | -batch Disable interactive mode 54 | # sbt version (default: from project/build.properties if present, else latest release) 55 | -sbt-version use the specified version of sbt 56 | -sbt-jar use the specified jar as the sbt launcher 57 | -sbt-rc use an RC version of sbt 58 | -sbt-snapshot use a snapshot version of sbt 59 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 60 | -java-home alternate JAVA_HOME 61 | # jvm options and output control 62 | JAVA_OPTS environment variable, if unset uses "$java_opts" 63 | SBT_OPTS environment variable, if unset uses "$default_sbt_opts" 64 | .sbtopts if this file exists in the current directory, it is 65 | prepended to the runner args 66 | /etc/sbt/sbtopts if this file exists, it is prepended to the runner args 67 | -Dkey=val pass -Dkey=val directly to the java runtime 68 | -J-X pass option -X directly to the java runtime 69 | (-J is stripped) 70 | -S-X add -X to sbt's scalacOptions (-J is stripped) 71 | -PmavenProfiles Enable a maven profile for the build. 72 | In the case of duplicated or conflicting options, the order above 73 | shows precedence: JAVA_OPTS lowest, command line options highest. 74 | EOM 75 | } 76 | 77 | process_my_args () { 78 | while [[ $# -gt 0 ]]; do 79 | case "$1" in 80 | -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; 81 | -no-share) addJava "$noshare_opts" && shift ;; 82 | -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; 83 | -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; 84 | -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; 85 | -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; 86 | -batch) exec &2 "$@" 31 | } 32 | vlog () { 33 | [[ $verbose || $debug ]] && echoerr "$@" 34 | } 35 | dlog () { 36 | [[ $debug ]] && echoerr "$@" 37 | } 38 | 39 | acquire_sbt_jar () { 40 | SBT_VERSION=`awk -F "=" '/sbt\\.version/ {print $2}' ./project/build.properties` 41 | URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 42 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 43 | JAR=build/sbt-launch-${SBT_VERSION}.jar 44 | 45 | sbt_jar=$JAR 46 | 47 | if [[ ! -f "$sbt_jar" ]]; then 48 | # Download sbt launch jar if it hasn't been downloaded yet 49 | if [ ! 
-f ${JAR} ]; then 50 | # Download 51 | printf "Attempting to fetch sbt\n" 52 | JAR_DL=${JAR}.part 53 | if hash curl 2>/dev/null; then 54 | (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR} 55 | elif hash wget 2>/dev/null; then 56 | (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} 57 | else 58 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" 59 | exit -1 60 | fi 61 | fi 62 | if [ ! -f ${JAR} ]; then 63 | # We failed to download 64 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" 65 | exit -1 66 | fi 67 | printf "Launching sbt from ${JAR}\n" 68 | fi 69 | } 70 | 71 | execRunner () { 72 | # print the arguments one to a line, quoting any containing spaces 73 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 74 | for arg; do 75 | if printf "%s\n" "$arg" | grep -q ' '; then 76 | printf "\"%s\"\n" "$arg" 77 | else 78 | printf "%s\n" "$arg" 79 | fi 80 | done 81 | echo "" 82 | } 83 | 84 | exec "$@" 85 | } 86 | 87 | addJava () { 88 | dlog "[addJava] arg = '$1'" 89 | java_args=( "${java_args[@]}" "$1" ) 90 | } 91 | 92 | enableProfile () { 93 | dlog "[enableProfile] arg = '$1'" 94 | maven_profiles=( "${maven_profiles[@]}" "$1" ) 95 | export SBT_MAVEN_PROFILES="${maven_profiles[@]}" 96 | } 97 | 98 | addSbt () { 99 | dlog "[addSbt] arg = '$1'" 100 | sbt_commands=( "${sbt_commands[@]}" "$1" ) 101 | } 102 | addResidual () { 103 | dlog "[residual] arg = '$1'" 104 | residual_args=( "${residual_args[@]}" "$1" ) 105 | } 106 | addDebugger () { 107 | addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1" 108 | } 109 | 110 | # a ham-fisted attempt to move some memory settings in concert 111 | # so they need not be dicked around with individually. 112 | get_mem_opts () { 113 | local mem=${1:-2048} 114 | local perm=$(( $mem / 4 )) 115 | (( $perm > 256 )) || perm=256 116 | (( $perm < 4096 )) || perm=4096 117 | local codecache=$(( $perm / 2 )) 118 | 119 | echo "-Xms${mem}m -Xmx${mem}m -XX:MaxPermSize=${perm}m -XX:ReservedCodeCacheSize=${codecache}m" 120 | } 121 | 122 | require_arg () { 123 | local type="$1" 124 | local opt="$2" 125 | local arg="$3" 126 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 127 | die "$opt requires <$type> argument" 128 | fi 129 | } 130 | 131 | is_function_defined() { 132 | declare -f "$1" > /dev/null 133 | } 134 | 135 | process_args () { 136 | while [[ $# -gt 0 ]]; do 137 | case "$1" in 138 | -h|-help) usage; exit 1 ;; 139 | -v|-verbose) verbose=1 && shift ;; 140 | -d|-debug) debug=1 && shift ;; 141 | 142 | -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; 143 | -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;; 144 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; 145 | -batch) exec dataframeReader.avro(fileLocation) 40 | case "txt" => dataframeReader.format("text").load(fileLocation) 41 | case "xml" => dataframeReader.format(constants.xmlClass) 42 | .option(constants.xmlRowTag, rowTag) 43 | .load(fileLocation) 44 | case "csv" => dataframeReader. 45 | option("header", header). 46 | option("delimiter", delimiter). 47 | option("quote", quote). 48 | option("escape", escape). 49 | option("multiLine", multiLine). 50 | option("inferSchema", inferSchema). 
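        // These options are passed straight through to Spark's built-in CSV reader,
        // so they behave exactly like the corresponding spark.read.csv options.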
51 | csv(fileLocation) 52 | case _ => dataframeReader.format(fileType).load(fileLocation) 53 | } 54 | df 55 | } 56 | 57 | override def schema: StructType = { 58 | df.schema 59 | } 60 | 61 | override def buildScan(): RDD[Row] = { 62 | df.rdd 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/main/scala/com/springml/spark/sftp/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 springml 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.springml.spark.sftp 17 | 18 | import java.io.File 19 | import java.util.UUID 20 | 21 | import com.springml.sftp.client.SFTPClient 22 | import com.springml.spark.sftp.util.Utils.ImplicitDataFrameWriter 23 | 24 | import org.apache.commons.io.FilenameUtils 25 | import org.apache.hadoop.fs.Path 26 | import org.apache.log4j.Logger 27 | import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider} 28 | import org.apache.spark.sql.types.StructType 29 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} 30 | 31 | /** 32 | * Datasource to construct dataframe from a sftp url 33 | */ 34 | class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { 35 | @transient val logger = Logger.getLogger(classOf[DefaultSource]) 36 | 37 | /** 38 | * Copy the file from SFTP to local location and then create dataframe using local file 39 | */ 40 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]):BaseRelation = { 41 | createRelation(sqlContext, parameters, null) 42 | } 43 | 44 | /** 45 | * Copy the file from SFTP to local location and then create dataframe using local file 46 | */ 47 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType) = { 48 | val username = parameters.get("username") 49 | val password = parameters.get("password") 50 | val pemFileLocation = parameters.get("pem") 51 | val pemPassphrase = parameters.get("pemPassphrase") 52 | val host = parameters.getOrElse("host", sys.error("SFTP Host has to be provided using 'host' option")) 53 | val port = parameters.get("port") 54 | val path = parameters.getOrElse("path", sys.error("'path' must be specified")) 55 | val fileType = parameters.getOrElse("fileType", sys.error("File type has to be provided using 'fileType' option")) 56 | val inferSchema = parameters.get("inferSchema") 57 | val header = parameters.getOrElse("header", "true") 58 | val delimiter = parameters.getOrElse("delimiter", ",") 59 | val quote = parameters.getOrElse("quote", "\"") 60 | val escape = parameters.getOrElse("escape", "\\") 61 | val multiLine = parameters.getOrElse("multiLine", "false") 62 | val createDF = parameters.getOrElse("createDF", "true") 63 | val copyLatest = parameters.getOrElse("copyLatest", "false") 64 | val tempFolder = 
parameters.getOrElse("tempLocation", System.getProperty("java.io.tmpdir")) 65 | val hdfsTemp = parameters.getOrElse("hdfsTempLocation", tempFolder) 66 | val cryptoKey = parameters.getOrElse("cryptoKey", null) 67 | val cryptoAlgorithm = parameters.getOrElse("cryptoAlgorithm", "AES") 68 | val rowTag = parameters.getOrElse(constants.xmlRowTag, null) 69 | 70 | val supportedFileTypes = List("csv", "json", "avro", "parquet", "txt", "xml","orc") 71 | if (!supportedFileTypes.contains(fileType)) { 72 | sys.error("fileType " + fileType + " not supported. Supported file types are " + supportedFileTypes) 73 | } 74 | 75 | val inferSchemaFlag = if (inferSchema != null && inferSchema.isDefined) { 76 | inferSchema.get 77 | } else { 78 | "false" 79 | } 80 | 81 | val sftpClient = getSFTPClient(username, password, pemFileLocation, pemPassphrase, host, port, 82 | cryptoKey, cryptoAlgorithm) 83 | val copiedFileLocation = copy(sftpClient, path, tempFolder, copyLatest.toBoolean) 84 | val fileLocation = copyToHdfs(sqlContext, copiedFileLocation, hdfsTemp) 85 | 86 | if (!createDF.toBoolean) { 87 | logger.info("Returning an empty dataframe after copying files...") 88 | createReturnRelation(sqlContext, schema) 89 | } else { 90 | DatasetRelation(fileLocation, fileType, inferSchemaFlag, header, delimiter, quote, escape, multiLine, rowTag, schema, 91 | sqlContext) 92 | } 93 | } 94 | 95 | override def createRelation( 96 | sqlContext: SQLContext, 97 | mode: SaveMode, 98 | parameters: Map[String, String], 99 | data: DataFrame): BaseRelation = { 100 | 101 | val username = parameters.get("username") 102 | val password = parameters.get("password") 103 | val pemFileLocation = parameters.get("pem") 104 | val pemPassphrase = parameters.get("pemPassphrase") 105 | val host = parameters.getOrElse("host", sys.error("SFTP Host has to be provided using 'host' option")) 106 | val port = parameters.get("port") 107 | val path = parameters.getOrElse("path", sys.error("'path' must be specified")) 108 | val fileType = parameters.getOrElse("fileType", sys.error("File type has to be provided using 'fileType' option")) 109 | val header = parameters.getOrElse("header", "true") 110 | val copyLatest = parameters.getOrElse("copyLatest", "false") 111 | val tmpFolder = parameters.getOrElse("tempLocation", System.getProperty("java.io.tmpdir")) 112 | val hdfsTemp = parameters.getOrElse("hdfsTempLocation", tmpFolder) 113 | val cryptoKey = parameters.getOrElse("cryptoKey", null) 114 | val cryptoAlgorithm = parameters.getOrElse("cryptoAlgorithm", "AES") 115 | val delimiter = parameters.getOrElse("delimiter", ",") 116 | val quote = parameters.getOrElse("quote", "\"") 117 | val escape = parameters.getOrElse("escape", "\\") 118 | val multiLine = parameters.getOrElse("multiLine", "false") 119 | val codec = parameters.getOrElse("codec", null) 120 | val rowTag = parameters.getOrElse(constants.xmlRowTag, null) 121 | val rootTag = parameters.getOrElse(constants.xmlRootTag, null) 122 | 123 | val supportedFileTypes = List("csv", "json", "avro", "parquet", "txt", "xml","orc") 124 | if (!supportedFileTypes.contains(fileType)) { 125 | sys.error("fileType " + fileType + " not supported. 
Supported file types are " + supportedFileTypes) 126 | } 127 | 128 | val sftpClient = getSFTPClient(username, password, pemFileLocation, pemPassphrase, host, port, 129 | cryptoKey, cryptoAlgorithm) 130 | val tempFile = writeToTemp(sqlContext, data, hdfsTemp, tmpFolder, fileType, header, delimiter, quote, escape, multiLine, codec, rowTag, rootTag) 131 | 132 | upload(tempFile, path, sftpClient) 133 | return createReturnRelation(data) 134 | } 135 | private def copyToHdfs(sqlContext: SQLContext, fileLocation : String, 136 | hdfsTemp : String): String = { 137 | val hadoopConf = sqlContext.sparkContext.hadoopConfiguration 138 | val hdfsPath = new Path(fileLocation) 139 | val fs = hdfsPath.getFileSystem(hadoopConf) 140 | if ("hdfs".equalsIgnoreCase(fs.getScheme)) { 141 | fs.copyFromLocalFile(new Path(fileLocation), new Path(hdfsTemp)) 142 | val filePath = hdfsTemp + "/" + hdfsPath.getName 143 | fs.deleteOnExit(new Path(filePath)) 144 | return filePath 145 | } else { 146 | return fileLocation 147 | } 148 | } 149 | 150 | private def copyFromHdfs(sqlContext: SQLContext, hdfsTemp : String, 151 | fileLocation : String): String = { 152 | val hadoopConf = sqlContext.sparkContext.hadoopConfiguration 153 | val hdfsPath = new Path(hdfsTemp) 154 | val fs = hdfsPath.getFileSystem(hadoopConf) 155 | if ("hdfs".equalsIgnoreCase(fs.getScheme)) { 156 | fs.copyToLocalFile(new Path(hdfsTemp), new Path(fileLocation)) 157 | fs.deleteOnExit(new Path(hdfsTemp)) 158 | return fileLocation 159 | } else { 160 | return hdfsTemp 161 | } 162 | } 163 | 164 | private def upload(source: String, target: String, sftpClient: SFTPClient) { 165 | logger.info("Copying " + source + " to " + target) 166 | sftpClient.copyToFTP(source, target) 167 | } 168 | 169 | private def getSFTPClient( 170 | username: Option[String], 171 | password: Option[String], 172 | pemFileLocation: Option[String], 173 | pemPassphrase: Option[String], 174 | host: String, 175 | port: Option[String], 176 | cryptoKey : String, 177 | cryptoAlgorithm : String) : SFTPClient = { 178 | 179 | val sftpPort = if (port != null && port.isDefined) { 180 | port.get.toInt 181 | } else { 182 | 22 183 | } 184 | 185 | val cryptoEnabled = cryptoKey != null 186 | 187 | if (cryptoEnabled) { 188 | new SFTPClient(getValue(pemFileLocation), getValue(pemPassphrase), getValue(username), 189 | getValue(password), 190 | host, sftpPort, cryptoEnabled, cryptoKey, cryptoAlgorithm) 191 | } else { 192 | new SFTPClient(getValue(pemFileLocation), getValue(pemPassphrase), getValue(username), 193 | getValue(password), host, sftpPort) 194 | } 195 | } 196 | 197 | private def createReturnRelation(data: DataFrame): BaseRelation = { 198 | createReturnRelation(data.sqlContext, data.schema) 199 | } 200 | 201 | private def createReturnRelation(sqlContextVar: SQLContext, schemaVar: StructType): BaseRelation = { 202 | new BaseRelation { 203 | override def sqlContext: SQLContext = sqlContextVar 204 | override def schema: StructType = schemaVar 205 | } 206 | } 207 | 208 | private def copy(sftpClient: SFTPClient, source: String, 209 | tempFolder: String, latest: Boolean): String = { 210 | var copiedFilePath: String = null 211 | try { 212 | val target = tempFolder + File.separator + FilenameUtils.getName(source) 213 | copiedFilePath = target 214 | if (latest) { 215 | copiedFilePath = sftpClient.copyLatest(source, tempFolder) 216 | } else { 217 | logger.info("Copying " + source + " to " + target) 218 | copiedFilePath = sftpClient.copy(source, target) 219 | } 220 | 221 | copiedFilePath 222 | } finally { 223 | 
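      // Register the copied file (if any) for deletion at JVM shutdown, whether or not the copy above succeeded.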
addShutdownHook(copiedFilePath) 224 | } 225 | } 226 | 227 | private def getValue(param: Option[String]): String = { 228 | if (param != null && param.isDefined) { 229 | param.get 230 | } else { 231 | null 232 | } 233 | } 234 | 235 | private def writeToTemp(sqlContext: SQLContext, df: DataFrame, 236 | hdfsTemp: String, tempFolder: String, fileType: String, header: String, 237 | delimiter: String, quote: String, escape: String, multiLine: String, codec: String, rowTag: String, rootTag: String) : String = { 238 | val randomSuffix = "spark_sftp_connection_temp_" + UUID.randomUUID 239 | val hdfsTempLocation = hdfsTemp + File.separator + randomSuffix 240 | val localTempLocation = tempFolder + File.separator + randomSuffix 241 | 242 | addShutdownHook(localTempLocation) 243 | 244 | fileType match { 245 | 246 | case "xml" => df.coalesce(1).write.format(constants.xmlClass) 247 | .option(constants.xmlRowTag, rowTag) 248 | .option(constants.xmlRootTag, rootTag).save(hdfsTempLocation) 249 | case "csv" => df.coalesce(1). 250 | write. 251 | option("header", header). 252 | option("delimiter", delimiter). 253 | option("quote", quote). 254 | option("escape", escape). 255 | option("multiLine", multiLine). 256 | optionNoNull("codec", Option(codec)). 257 | csv(hdfsTempLocation) 258 | case "txt" => df.coalesce(1).write.text(hdfsTempLocation) 259 | case "avro" => df.coalesce(1).write.format("com.databricks.spark.avro").save(hdfsTempLocation) 260 | case _ => df.coalesce(1).write.format(fileType).save(hdfsTempLocation) 261 | } 262 | 263 | copyFromHdfs(sqlContext, hdfsTempLocation, localTempLocation) 264 | copiedFile(localTempLocation) 265 | } 266 | 267 | private def addShutdownHook(tempLocation: String) { 268 | logger.debug("Adding hook for file " + tempLocation) 269 | val hook = new DeleteTempFileShutdownHook(tempLocation) 270 | Runtime.getRuntime.addShutdownHook(hook) 271 | } 272 | 273 | private def copiedFile(tempFileLocation: String) : String = { 274 | val baseTemp = new File(tempFileLocation) 275 | val files = baseTemp.listFiles().filter { x => 276 | (!x.isDirectory() 277 | && !x.getName.contains("SUCCESS") 278 | && !x.isHidden() 279 | && !x.getName.contains(".crc") 280 | && !x.getName.contains("_committed_") 281 | && !x.getName.contains("_started_") 282 | ) 283 | } 284 | files(0).getAbsolutePath 285 | } 286 | } 287 | -------------------------------------------------------------------------------- /src/main/scala/com/springml/spark/sftp/DeleteTempFileShutdownHook.scala: -------------------------------------------------------------------------------- 1 | package com.springml.spark.sftp 2 | 3 | import org.apache.commons.io.FileUtils 4 | import java.io.File 5 | import org.apache.log4j.Logger 6 | 7 | /** 8 | * Delete the temp file created during spark shutdown 9 | */ 10 | class DeleteTempFileShutdownHook( 11 | fileLocation: String) extends Thread { 12 | 13 | private val logger = Logger.getLogger(classOf[DatasetRelation]) 14 | 15 | override def run(): Unit = { 16 | logger.info("Deleting " + fileLocation ) 17 | FileUtils.deleteQuietly(new File(fileLocation)) 18 | } 19 | } -------------------------------------------------------------------------------- /src/main/scala/com/springml/spark/sftp/constants.scala: -------------------------------------------------------------------------------- 1 | package com.springml.spark.sftp 2 | 3 | /** 4 | * Created by bagopalan on 9/16/18. 
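 * Shared constants for the spark-xml integration: the data source class name and the rowTag/rootTag option keys.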
5 | */ 6 | object constants { 7 | 8 | val xmlClass: String = "com.databricks.spark.xml" 9 | val xmlRowTag: String = "rowTag" 10 | val xmlRootTag: String = "rootTag" 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/scala/com/springml/spark/sftp/util/Utils.scala: -------------------------------------------------------------------------------- 1 | package com.springml.spark.sftp.util 2 | 3 | import org.apache.spark.sql.DataFrameWriter 4 | 5 | 6 | object Utils { 7 | 8 | 9 | /** 10 | * [[DataFrameWriter]] implicits 11 | */ 12 | implicit class ImplicitDataFrameWriter[T](dataFrameWriter: DataFrameWriter[T]) { 13 | 14 | /** 15 | * Adds an output option for the underlying data source if the option has a value. 16 | */ 17 | def optionNoNull(key: String, optionValue: Option[String]): DataFrameWriter[T] = { 18 | optionValue match { 19 | case Some(_) => dataFrameWriter.option(key, optionValue.get) 20 | case None => dataFrameWriter 21 | } 22 | } 23 | } 24 | 25 | } -------------------------------------------------------------------------------- /src/test/resources/books.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/egen/spark-sftp/090917547001574afa93cddaf2a022151a3f4260/src/test/resources/books.orc -------------------------------------------------------------------------------- /src/test/resources/books.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Gambardella, Matthew 5 | XML Developer's Guide 6 | Computer 7 | 44.95 8 | 2000-10-01 9 | 10 | 11 | 12 | An in-depth look at creating applications 13 | with XML.This manual describes Oracle XML DB, and how you can use it to store, generate, manipulate, manage, 14 | and query XML data in the database. 15 | 16 | 17 | After introducing you to the heart of Oracle XML DB, namely the XMLType framework and Oracle XML DB repository, 18 | the manual provides a brief introduction to design criteria to consider when planning your Oracle XML DB 19 | application. It provides examples of how and where you can use Oracle XML DB. 20 | 21 | 22 | The manual then describes ways you can store and retrieve XML data using Oracle XML DB, APIs for manipulating 23 | XMLType data, and ways you can view, generate, transform, and search on existing XML data. The remainder of 24 | the manual discusses how to use Oracle XML DB repository, including versioning and security, 25 | how to access and manipulate repository resources using protocols, SQL, PL/SQL, or Java, and how to manage 26 | your Oracle XML DB application using Oracle Enterprise Manager. It also introduces you to XML messaging and 27 | Oracle Streams Advanced Queuing XMLType support. 28 | 29 | Ralls, Kim 30 | Midnight Rain 31 | Fantasy 32 | 5.95 33 | 2000-12-16 34 | A former architect battles corporate zombies, 35 | an evil sorceress, and her own childhood to become queen 36 | of the world. 37 | 38 | 39 | Corets, Eva 40 | Maeve Ascendant 41 | Fantasy 42 | 5.95 43 | 2000-11-17 44 | After the collapse of a nanotechnology 45 | society in England, the young survivors lay the 46 | foundation for a new society. 47 | 48 | 49 | Corets, Eva 50 | Oberon's Legacy 51 | Fantasy 52 | 5.95 53 | 2001-03-10 54 | In post-apocalypse England, the mysterious 55 | agent known only as Oberon helps to create a new life 56 | for the inhabitants of London. Sequel to Maeve 57 | Ascendant. 
58 | 59 | 60 | Corets, Eva 61 | The Sundered Grail 62 | Fantasy 63 | 5.95 64 | 2001-09-10 65 | The two daughters of Maeve, half-sisters, 66 | battle one another for control of England. Sequel to 67 | Oberon's Legacy. 68 | 69 | 70 | Randall, Cynthia 71 | Lover Birds 72 | Romance 73 | 4.95 74 | 2000-09-02 75 | When Carla meets Paul at an ornithology 76 | conference, tempers fly as feathers get ruffled. 77 | 78 | 79 | Thurman, Paula 80 | Splish Splash 81 | Romance 82 | 4.95 83 | 2000-11-02 84 | A deep sea diver finds true love twenty 85 | thousand leagues beneath the sea. 86 | 87 | 88 | Knorr, Stefan 89 | Creepy Crawlies 90 | Horror 91 | 4.95 92 | 2000-12-06 93 | An anthology of horror stories about roaches, 94 | centipedes, scorpions and other insects. 95 | 96 | 97 | Kress, Peter 98 | Paradox Lost 99 | Science Fiction 100 | 6.95 101 | 2000-11-02 102 | After an inadvertant trip through a Heisenberg 103 | Uncertainty Device, James Salway discovers the problems 104 | of being quantum. 105 | 106 | 107 | O'Brien, Tim 108 | Microsoft .NET: The Programming Bible 109 | Computer 110 | 36.95 111 | 2000-12-09 112 | Microsoft's .NET initiative is explored in 113 | detail in this deep programmer's reference. 114 | 115 | 116 | O'Brien, Tim 117 | MSXML3: A Comprehensive Guide 118 | Computer 119 | 36.95 120 | 2000-12-01 121 | The Microsoft MSXML3 parser is covered in 122 | detail, with attention to XML DOM interfaces, XSLT processing, 123 | SAX and more. 124 | 125 | 126 | Galos, Mike 127 | Visual Studio 7: A Comprehensive Guide 128 | Computer 129 | 49.95 130 | 2001-04-16 131 | Microsoft Visual Studio 7 is explored in depth, 132 | looking at how Visual Basic, Visual C++, C#, and ASP+ are 133 | integrated into a comprehensive development 134 | environment. 135 | 136 | -------------------------------------------------------------------------------- /src/test/resources/custom-delimiter.csv: -------------------------------------------------------------------------------- 1 | ProposalId;OpportunityId;Clicks;Impressions 2 | 103;006B0000002ndnuIAA;30;133 3 | 101;006B0000002ndnkIAA;12;73 4 | 102;006B0000002ndnpIAA;20;97 5 | -------------------------------------------------------------------------------- /src/test/resources/people.json: -------------------------------------------------------------------------------- 1 | {"name":"Michael"} 2 | {"name":"Andy", "age":30} 3 | {"name":"Justin", "age":19} 4 | -------------------------------------------------------------------------------- /src/test/resources/plaintext.txt: -------------------------------------------------------------------------------- 1 | adam 2 | Emily 3 | sundar -------------------------------------------------------------------------------- /src/test/resources/sample.csv: -------------------------------------------------------------------------------- 1 | ProposalId,OpportunityId,Clicks,Impressions 2 | 103,006B0000002ndnuIAA,30,133 3 | 101,006B0000002ndnkIAA,12,73 4 | 102,006B0000002ndnpIAA,20,97 5 | -------------------------------------------------------------------------------- /src/test/resources/sample_quoted_multiline.csv: -------------------------------------------------------------------------------- 1 | ProposalId,OpportunityId,Clicks,Impressions,Message 2 | 103,006B0000002ndnuIAA,30,133,"test 3 | multiline \"here we have a quote\" 4 | message", 5 | 101,006B0000002ndnkIAA,12,73,"regular message" 6 | 102,006B0000002ndnpIAA,20,97,"regular message" 7 | -------------------------------------------------------------------------------- 
/src/test/resources/users.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/egen/spark-sftp/090917547001574afa93cddaf2a022151a3f4260/src/test/resources/users.avro -------------------------------------------------------------------------------- /src/test/resources/users.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/egen/spark-sftp/090917547001574afa93cddaf2a022151a3f4260/src/test/resources/users.parquet -------------------------------------------------------------------------------- /src/test/scala/com/springml/spark/sftp/CustomSchemaTest.scala: -------------------------------------------------------------------------------- 1 | package com.springml.spark.sftp 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, _} 5 | import org.scalatest.{BeforeAndAfterEach, FunSuite} 6 | 7 | /** 8 | * Tests for creating dataframe using custom schema 9 | */ 10 | class CustomSchemaTest extends FunSuite with BeforeAndAfterEach { 11 | var ss: SparkSession = _ 12 | 13 | val csvTypesMap = Map("ProposalId" -> IntegerType, 14 | "OpportunityId" -> StringType, 15 | "Clicks" -> LongType, 16 | "Impressions" -> LongType 17 | ) 18 | 19 | val jsonTypesMap = Map("name" -> StringType, 20 | "age" -> IntegerType 21 | ) 22 | 23 | override def beforeEach() { 24 | ss = SparkSession.builder().master("local").appName("Custom Schema Test").getOrCreate() 25 | } 26 | 27 | private def validateTypes(field : StructField, typeMap : Map[String, DataType]) = { 28 | val expectedType = typeMap(field.name) 29 | assert(expectedType == field.dataType) 30 | } 31 | 32 | private def columnArray(typeMap : Map[String, DataType]) : Array[StructField] = { 33 | val columns = typeMap.map(x => new StructField(x._1, x._2, true)) 34 | 35 | val columnStruct = new Array[StructField](typeMap.size) 36 | columns.copyToArray(columnStruct) 37 | 38 | columnStruct 39 | } 40 | 41 | test ("Read CSV with custom schema") { 42 | val columnStruct = columnArray(csvTypesMap) 43 | val expectedSchema = StructType(columnStruct) 44 | 45 | val fileLocation = getClass.getResource("/sample.csv").getPath 46 | val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext) 47 | val rdd = dsr.buildScan() 48 | 49 | assert(dsr.schema.fields.length == columnStruct.length) 50 | dsr.schema.fields.foreach(s => validateTypes(s, csvTypesMap)) 51 | } 52 | 53 | test ("Read Json with custom schema") { 54 | val columnStruct = columnArray(jsonTypesMap) 55 | val expectedSchema = StructType(columnStruct) 56 | 57 | val fileLocation = getClass.getResource("/people.json").getPath 58 | val dsr = DatasetRelation(fileLocation, "json", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext) 59 | val rdd = dsr.buildScan() 60 | 61 | assert(dsr.schema.fields.length == columnStruct.length) 62 | dsr.schema.fields.foreach(s => validateTypes(s, jsonTypesMap)) 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/test/scala/com/springml/spark/sftp/TestDatasetRelation.scala: -------------------------------------------------------------------------------- 1 | package com.springml.spark.sftp 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.scalatest.{BeforeAndAfterEach, FunSuite} 5 | 6 | /** 7 | * Simple unit test for basic testing on different
formats of file 8 | */ 9 | class TestDatasetRelation extends FunSuite with BeforeAndAfterEach { 10 | var ss: SparkSession = _ 11 | 12 | override def beforeEach() { 13 | ss = SparkSession.builder().master("local").enableHiveSupport().appName("Test Dataset Relation").getOrCreate() 14 | } 15 | 16 | test ("Read CSV") { 17 | val fileLocation = getClass.getResource("/sample.csv").getPath 18 | val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext) 19 | val rdd = dsr.buildScan() 20 | assert(3 == rdd.count()) 21 | } 22 | 23 | test ("Read CSV using custom delimiter") { 24 | val fileLocation = getClass.getResource("/custom-delimiter.csv").getPath 25 | val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ";", "\"", "\\", "false", null, null, ss.sqlContext) 26 | val rdd = dsr.buildScan() 27 | assert(3 == rdd.count()) 28 | } 29 | 30 | test ("Read multiline CSV using custom quote and escape") { 31 | val fileLocation = getClass.getResource("/sample_quoted_multiline.csv").getPath 32 | val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\", "true", null, null, ss.sqlContext) 33 | val rdd = dsr.buildScan() 34 | assert(3 == rdd.count()) 35 | } 36 | 37 | 38 | test ("Read JSON") { 39 | val fileLocation = getClass.getResource("/people.json").getPath 40 | val dsr = DatasetRelation(fileLocation, "json", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext) 41 | val rdd = dsr.buildScan() 42 | assert(3 == rdd.count()) 43 | } 44 | 45 | test ("Read AVRO") { 46 | val fileLocation = getClass.getResource("/users.avro").getPath 47 | val dsr = DatasetRelation(fileLocation, "avro", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext) 48 | val rdd = dsr.buildScan() 49 | assert(2 == rdd.count()) 50 | } 51 | 52 | test ("Read parquet") { 53 | val fileLocation = getClass.getResource("/users.parquet").getPath 54 | val dsr = DatasetRelation(fileLocation, "parquet", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext) 55 | val rdd = dsr.buildScan() 56 | assert(2 == rdd.count()) 57 | } 58 | 59 | test ("Read text file") { 60 | val fileLocation = getClass.getResource("/plaintext.txt").getPath 61 | val dsr = DatasetRelation(fileLocation, "txt", "false", "true", ",", "\"", "\\", "false", null, null, ss.sqlContext) 62 | val rdd = dsr.buildScan() 63 | assert(3 == rdd.count()) 64 | } 65 | 66 | test ("Read xml file") { 67 | val fileLocation = getClass.getResource("/books.xml").getPath 68 | val dsr = DatasetRelation(fileLocation, "xml", "false", "true", ",", "\"", "\\", "false", "book", null, ss.sqlContext) 69 | val rdd = dsr.buildScan() 70 | assert(12 == rdd.count()) 71 | } 72 | test ("Read orc file") { 73 | val fileLocation = getClass.getResource("/books.orc").getPath 74 | val dsr = DatasetRelation(fileLocation, "orc", "false", "true", ",", "\"", "\\", "false", "book", null, ss.sqlContext) 75 | val rdd = dsr.buildScan() 76 | assert(12 == rdd.count()) 77 | } 78 | } 79 | --------------------------------------------------------------------------------