├── inst ├── staticdocs │ └── index.r ├── scala │ ├── sparklyr-1.6.1.md5 │ ├── logging.scala │ ├── utils.scala │ ├── backend.scala │ ├── handler.scala │ └── serializer.scala ├── java │ └── sparkapi-1.6.1.jar └── tools │ └── compile-scala.R ├── tests ├── testthat │ ├── test-invoke.R │ └── test-config.R └── testthat.R ├── .gitignore ├── R ├── globals.R ├── magrittr.R ├── dataframe.R ├── invoke.R ├── hive.R ├── version.R ├── extensions.R ├── compile.R ├── jobj.R ├── connection.R ├── deserialize.R ├── serialize.R └── shell.R ├── .Rbuildignore ├── README.md ├── man ├── pipe.Rd ├── spark_web.Rd ├── connection_is_open.Rd ├── print_jobj.Rd ├── hive_context.Rd ├── spark_compile.Rd ├── spark_context.Rd ├── java_context.Rd ├── spark_dataframe.Rd ├── spark_dependency.Rd ├── spark_jobj.Rd ├── spark_connection.Rd ├── spark_log.Rd ├── spark_version.Rd ├── invoke.Rd ├── connection_config.Rd ├── invoke_method.Rd ├── register_extension.Rd └── start_shell.Rd ├── sparkapi.Rproj ├── DESCRIPTION ├── configure.R ├── NAMESPACE └── LICENSE /inst/staticdocs/index.r: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /inst/scala/sparklyr-1.6.1.md5: -------------------------------------------------------------------------------- 1 | 54904ecd4f11cae8eaeb0fad3928d5ed -------------------------------------------------------------------------------- /tests/testthat/test-invoke.R: -------------------------------------------------------------------------------- 1 | 2 | library(testthat) 3 | 4 | context("Invoke") 5 | 6 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(sparkapi) 3 | 4 | test_check("sparkapi") 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .DS_Store 6 | configure 7 | -------------------------------------------------------------------------------- /inst/java/sparkapi-1.6.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkapi/HEAD/inst/java/sparkapi-1.6.1.jar -------------------------------------------------------------------------------- /R/globals.R: -------------------------------------------------------------------------------- 1 | 2 | .globals <- new.env(parent = emptyenv()) 3 | .globals$extension_packages <- character() 4 | 5 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | README.Rmd 4 | ^configure$ 5 | ^configure\.win$ 6 | ^configure\.R$ 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sparkapi 2 | 3 | This project was merged back to [sparklyr](http://spark.rstudio.com). Please visit http://spark.rstudio.com/extensions.html for information. 
4 | -------------------------------------------------------------------------------- /inst/tools/compile-scala.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | spark_home <- file.path(rappdirs::app_dir("spark", "rstudio")$cache(), "spark-1.6.1-bin-hadoop2.6") 4 | spark_compile("sparkapi", spark_home = spark_home) 5 | -------------------------------------------------------------------------------- /R/magrittr.R: -------------------------------------------------------------------------------- 1 | 2 | #' Pipe operator 3 | #' 4 | #' See \code{\link[magrittr]{\%>\%}} for more details. 5 | #' 6 | #' @name %>% 7 | #' @rdname pipe 8 | #' @keywords internal 9 | #' @export 10 | #' @importFrom magrittr %>% 11 | #' @usage lhs \%>\% rhs 12 | NULL 13 | 14 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/magrittr.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \description{ 10 | See \code{\link[magrittr]{\%>\%}} for more details. 11 | } 12 | \keyword{internal} 13 | 14 | -------------------------------------------------------------------------------- /man/spark_web.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{spark_web} 4 | \alias{spark_web} 5 | \title{Open the Spark web interface} 6 | \usage{ 7 | spark_web(sc, ...) 8 | } 9 | \arguments{ 10 | \item{sc}{\code{spark_connection}} 11 | 12 | \item{...}{Unused (reserved for future use)} 13 | } 14 | \description{ 15 | Open the Spark web interface 16 | } 17 | 18 | -------------------------------------------------------------------------------- /man/connection_is_open.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{connection_is_open} 4 | \alias{connection_is_open} 5 | \title{Check whether the connection is open} 6 | \usage{ 7 | connection_is_open(sc) 8 | } 9 | \arguments{ 10 | \item{sc}{\code{spark_connection}} 11 | } 12 | \description{ 13 | Check whether the connection is open 14 | } 15 | \keyword{internal} 16 | 17 | -------------------------------------------------------------------------------- /sparkapi.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: No 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /man/print_jobj.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/jobj.R 3 | \name{print_jobj} 4 | \alias{print_jobj} 5 | \title{Generic method for print jobj for a connection type} 6 
| \usage{ 7 | print_jobj(sc, jobj, ...) 8 | } 9 | \arguments{ 10 | \item{sc}{\code{spark_connection} (used for type dispatch)} 11 | 12 | \item{jobj}{Object to print} 13 | } 14 | \description{ 15 | Generic method for print jobj for a connection type 16 | } 17 | \keyword{internal} 18 | 19 | -------------------------------------------------------------------------------- /man/hive_context.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{hive_context} 4 | \alias{hive_context} 5 | \title{Get the HiveContext associated with a connection} 6 | \usage{ 7 | hive_context(sc) 8 | } 9 | \arguments{ 10 | \item{sc}{Connection to get HiveContext from} 11 | } 12 | \value{ 13 | Reference to HiveContext 14 | } 15 | \description{ 16 | Get the HiveContext \code{spark_jobj} associated with a 17 | \code{spark_connection} 18 | } 19 | 20 | -------------------------------------------------------------------------------- /man/spark_compile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/compile.R 3 | \name{spark_compile} 4 | \alias{spark_compile} 5 | \title{Compiles Scala sources and packages them into a JAR file} 6 | \usage{ 7 | spark_compile(name, spark_home) 8 | } 9 | \arguments{ 10 | \item{name}{The name of the target jar} 11 | 12 | \item{spark_home}{Path to the Spark installation to compile against} 13 | } 14 | \description{ 15 | Compiles Scala sources and packages them into a JAR file 16 | } 17 | \keyword{internal} 18 | 19 | -------------------------------------------------------------------------------- /man/spark_context.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{spark_context} 4 | \alias{spark_context} 5 | \title{Get the SparkContext associated with a connection} 6 | \usage{ 7 | spark_context(sc) 8 | } 9 | \arguments{ 10 | \item{sc}{Connection to get SparkContext from} 11 | } 12 | \value{ 13 | Reference to SparkContext 14 | } 15 | \description{ 16 | Get the SparkContext \code{spark_jobj} associated with a 17 | \code{spark_connection} 18 | } 19 | 20 | -------------------------------------------------------------------------------- /man/java_context.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{java_context} 4 | \alias{java_context} 5 | \title{Get the JavaSparkContext associated with a connection} 6 | \usage{ 7 | java_context(sc) 8 | } 9 | \arguments{ 10 | \item{sc}{Connection to get JavaSparkContext from} 11 | } 12 | \value{ 13 | Reference to JavaSparkContext 14 | } 15 | \description{ 16 | Get the JavaSparkContext \code{spark_jobj} associated with a 17 | \code{spark_connection} 18 | } 19 | 20 | -------------------------------------------------------------------------------- /man/spark_dataframe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataframe.R 3 | \name{spark_dataframe} 4 | \alias{spark_dataframe} 5 | \title{Get the Spark DataFrame associated with an object} 6 | \usage{ 7 | spark_dataframe(x, ...) 
8 | } 9 | \arguments{ 10 | \item{x}{Object to get DataFrame from} 11 | 12 | \item{...}{Reserved for future use} 13 | } 14 | \value{ 15 | Reference to DataFrame 16 | } 17 | \description{ 18 | S3 method to get the Spark DataFrame associated with objects of 19 | various types. 20 | } 21 | 22 | -------------------------------------------------------------------------------- /man/spark_dependency.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extensions.R 3 | \name{spark_dependency} 4 | \alias{spark_dependency} 5 | \title{Define a Spark dependency} 6 | \usage{ 7 | spark_dependency(jars = NULL, packages = NULL) 8 | } 9 | \arguments{ 10 | \item{jars}{Character vector of full paths to JAR files} 11 | 12 | \item{packages}{Character vector of Spark package names} 13 | } 14 | \value{ 15 | An object of type `spark_dependency` 16 | } 17 | \description{ 18 | Define a Spark dependency consisting of a set of custom JARs and Spark packages. 19 | } 20 | 21 | -------------------------------------------------------------------------------- /man/spark_jobj.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/jobj.R 3 | \name{spark_jobj} 4 | \alias{spark_jobj} 5 | \title{Get the spark_jobj associated with an object} 6 | \usage{ 7 | spark_jobj(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{Object to extract jobj from} 11 | 12 | \item{...}{Reserved for future use} 13 | } 14 | \value{ 15 | A \code{spark_jobj} object that can be passed to 16 | \code{\link{invoke}}. 17 | } 18 | \description{ 19 | S3 method to get the spark_jobj associated with objects of 20 | various types. 21 | } 22 | \seealso{ 23 | \code{\link{invoke}} 24 | } 25 | 26 | -------------------------------------------------------------------------------- /man/spark_connection.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{spark_connection} 4 | \alias{spark_connection} 5 | \title{Get the spark_connection associated with an object} 6 | \usage{ 7 | spark_connection(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{Object to extract connection from} 11 | 12 | \item{...}{Reserved for future use} 13 | } 14 | \value{ 15 | A \code{spark_connection} object that can be passed to 16 | \code{\link{invoke_new}} and \code{\link{invoke_static}}. 17 | } 18 | \description{ 19 | S3 method to get the spark_connection associated with objects of 20 | various types. 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/spark_log.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{spark_log} 4 | \alias{spark_log} 5 | \title{Retrieves entries from the Spark log} 6 | \usage{ 7 | spark_log(sc, n = 100, ...) 8 | } 9 | \arguments{ 10 | \item{sc}{\code{spark_connection}} 11 | 12 | \item{n}{Max number of log entries to retrieve (pass NULL to retrieve 13 | all lines of the log)} 14 | 15 | \item{...}{Unused (reserved for future use)} 16 | } 17 | \value{ 18 | Character vector with last \code{n} lines of the Spark log 19 | or for \code{spark_log_file} the full path to the log file. 
20 | } 21 | \description{ 22 | Retrieves entries from the Spark log 23 | } 24 | 25 | -------------------------------------------------------------------------------- /man/spark_version.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/version.R 3 | \name{spark_version} 4 | \alias{spark_version} 5 | \alias{spark_version_from_home} 6 | \title{Version of Spark for a connection} 7 | \usage{ 8 | spark_version(sc) 9 | 10 | spark_version_from_home(spark_home, default = NULL) 11 | } 12 | \arguments{ 13 | \item{sc}{\code{spark_connection}} 14 | 15 | \item{spark_home}{Path to SPARK_HOME} 16 | 17 | \item{default}{The version to use as default} 18 | } 19 | \value{ 20 | A \code{\link{numeric_version}} object 21 | } 22 | \description{ 23 | Version of Spark for a connection 24 | 25 | Version of Spark for a SPARK_HOME directory 26 | } 27 | 28 | -------------------------------------------------------------------------------- /R/dataframe.R: -------------------------------------------------------------------------------- 1 | #' Get the Spark DataFrame associated with an object 2 | #' 3 | #' S3 method to get the Spark DataFrame associated with objects of 4 | #' various types. 5 | #' 6 | #' @param x Object to get DataFrame from 7 | #' @param ... Reserved for future use 8 | #' @return Reference to DataFrame 9 | #' 10 | #' @export 11 | spark_dataframe <- function(x, ...) { 12 | UseMethod("spark_dataframe") 13 | } 14 | 15 | #' @export 16 | spark_dataframe.default <- function(x, ...) { 17 | stop("Unable to retrieve a Spark DataFrame from object of class ", 18 | paste(class(x), collapse = " "), call. = FALSE) 19 | } 20 | 21 | #' @export 22 | spark_dataframe.spark_jobj <- function(x, ...) { 23 | x 24 | } 25 | 26 | -------------------------------------------------------------------------------- /inst/scala/logging.scala: -------------------------------------------------------------------------------- 1 | package sparkapi 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.Calendar 5 | import java.util.Date 6 | 7 | object Logging { 8 | def getDate() : String = { 9 | val now = Calendar.getInstance().getTime() 10 | val logFormat = new SimpleDateFormat("yy/MM/dd HH:mm:ss") 11 | return logFormat.format(now) 12 | } 13 | 14 | def logError(message: String) = { 15 | System.err.println(getDate() + " ERROR " + message) 16 | } 17 | 18 | def logError(message: String, e: Exception) = { 19 | System.err.println(getDate() + " ERROR " + message + ": " + e.toString) 20 | } 21 | 22 | def logWarning(message: String) = { 23 | System.err.println(getDate() + " WARN " + message) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /man/invoke.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/invoke.R 3 | \name{invoke} 4 | \alias{invoke} 5 | \alias{invoke_new} 6 | \alias{invoke_static} 7 | \title{Execute a method on a remote Java object} 8 | \usage{ 9 | invoke(jobj, method, ...) 10 | 11 | invoke_static(sc, class, method, ...) 12 | 13 | invoke_new(sc, class, ...) 
14 | } 15 | \arguments{ 16 | \item{jobj}{Java object to execute method on.} 17 | 18 | \item{method}{Name of method to execute.} 19 | 20 | \item{...}{Arguments to pass to the method or constructor} 21 | 22 | \item{sc}{\code{spark_connection} to execute on.} 23 | 24 | \item{class}{Class to execute static method on.} 25 | } 26 | \description{ 27 | Execute a method on a remote Java object 28 | } 29 | 30 | -------------------------------------------------------------------------------- /man/connection_config.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{connection_config} 4 | \alias{connection_config} 5 | \title{Read configuration values for a connection} 6 | \usage{ 7 | connection_config(sc, prefix, not_prefix = list()) 8 | } 9 | \arguments{ 10 | \item{sc}{\code{spark_connection}} 11 | 12 | \item{prefix}{Prefix to read parameters for 13 | (e.g. \code{spark.context.}, \code{spark.sql.}, etc.)} 14 | 15 | \item{not_prefix}{Prefix to not include.} 16 | } 17 | \value{ 18 | Named list of config parameters (note that if a prefix was 19 | specified then the names will not include the prefix) 20 | } 21 | \description{ 22 | Read configuration values for a connection 23 | } 24 | 25 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: sparkapi 2 | Type: Package 3 | Title: Spark API Interface 4 | Version: 0.3.22 5 | Authors@R: c( 6 | person("Javier", "Luraschi", email = "javier@rstudio.com", role = c("aut", "cre")), 7 | person(family = "The Apache Software Foundation", role = c("aut", "cph")), 8 | person("Kevin", "Ushey", role = "aut", email = "kevin@rstudio.com"), 9 | person("JJ", "Allaire", role = "aut", email = "jj@rstudio.com"), 10 | person(family = "RStudio", role = c("cph"))) 11 | Imports: 12 | utils, 13 | magrittr, 14 | withr 15 | Suggests: 16 | testthat 17 | Description: Low-level socket-based interface for calling the Spark API via the 18 | RBackend server included in Spark. 19 | License: file LICENSE 20 | Encoding: UTF-8 21 | LazyData: true 22 | RoxygenNote: 5.0.1 23 | -------------------------------------------------------------------------------- /man/invoke_method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/invoke.R 3 | \name{invoke_method} 4 | \alias{invoke_method} 5 | \title{Generic call interface for spark shell} 6 | \usage{ 7 | invoke_method(sc, static, object, method, ...) 8 | } 9 | \arguments{ 10 | \item{sc}{\code{spark_connection}} 11 | 12 | \item{static}{Is this a static method call (including a constructor). 
If so 13 | then the \code{object} parameter should be the name of a class (otherwise 14 | it should be a spark_jobj instance).} 15 | 16 | \item{object}{Object instance or name of class (for \code{static})} 17 | 18 | \item{method}{Name of method} 19 | 20 | \item{...}{Call parameters} 21 | } 22 | \description{ 23 | Generic call interface for spark shell 24 | } 25 | \keyword{internal} 26 | 27 | -------------------------------------------------------------------------------- /inst/scala/utils.scala: -------------------------------------------------------------------------------- 1 | package sparkapi 2 | 3 | import java.io._ 4 | import java.io.File 5 | import java.util.Arrays 6 | 7 | import org.apache.spark.{SparkEnv, SparkException} 8 | 9 | object Utils { 10 | var rPackages: Option[String] = None 11 | 12 | /** 13 | * Return a nice string representation of the exception. It will call "printStackTrace" to 14 | * recursively generate the stack trace including the exception and its causes. 15 | */ 16 | def exceptionString(e: Throwable): String = { 17 | if (e == null) { 18 | "" 19 | } else { 20 | // Use e.printStackTrace here because e.getStackTrace doesn't include the cause 21 | val stringWriter = new StringWriter() 22 | e.printStackTrace(new PrintWriter(stringWriter)) 23 | stringWriter.toString 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /man/register_extension.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extensions.R 3 | \name{register_extension} 4 | \alias{register_extension} 5 | \alias{registered_extensions} 6 | \title{Register a package that implements a sparkapi extension} 7 | \usage{ 8 | register_extension(package) 9 | 10 | registered_extensions() 11 | } 12 | \arguments{ 13 | \item{package}{Name of package to register} 14 | } 15 | \description{ 16 | Registering an extension package will result in the package being 17 | automatically scanned for spark dependencies when a connection 18 | to Spark is initiated (e.g. via \code{start_shell}). 19 | 20 | Enumerate all registered extension packages 21 | } 22 | \note{ 23 | Extensions are typically registered when packages are 24 | loaded onto the search path (i.e. in the \code{.onLoad} 25 | function). 
26 | } 27 | 28 | -------------------------------------------------------------------------------- /tests/testthat/test-config.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | 3 | context("Config") 4 | 5 | mock_spark_config <- function(master, config = list()) { 6 | list( 7 | master = master, 8 | config = config 9 | ) 10 | } 11 | 12 | test_that("connection_config can retrieve correct prefixes", { 13 | sc <- mock_spark_config(master = "local", config = list( 14 | "spark.session.value1" = "1", 15 | "spark.session.value2" = "2" 16 | )) 17 | 18 | params <- connection_config(sc, "spark.session.") 19 | 20 | expect_true(length(params) == 2) 21 | }) 22 | 23 | test_that("connection_config can filter out prefixes", { 24 | sc <- mock_spark_config(master = "local", config = list( 25 | "spark.sql.value" = "ok", 26 | "spark.value" = "not ok" 27 | )) 28 | 29 | params <- connection_config(sc, "spark.", c("spark.sql.", 30 | "spark.session.")) 31 | 32 | expect_true(length(params) == 1) 33 | }) 34 | -------------------------------------------------------------------------------- /configure.R: -------------------------------------------------------------------------------- 1 | compile_jars <- function() { 2 | verbose <- !is.na(Sys.getenv("NOT_CRAN", unset = NA)) 3 | 4 | # skip on Travis 5 | if (!is.na(Sys.getenv("TRAVIS", unset = NA))) { 6 | if (verbose) 7 | message("** skipping Scala compilation on Travis") 8 | return(FALSE) 9 | } 10 | 11 | # skip if no 'scalac' available 12 | if (!nzchar(Sys.which("scalac"))) { 13 | if (verbose) 14 | message("** skipping Scala compilation: 'scalac' not on PATH") 15 | return(FALSE) 16 | } 17 | 18 | # skip if no 'jar' available 19 | if (!nzchar(Sys.which("jar"))) { 20 | if (verbose) 21 | message("** skipping Scala compilation: 'jar' not on PATH") 22 | return(FALSE) 23 | } 24 | 25 | source("R/version.R") 26 | source("R/compile.R") 27 | 28 | tryCatch( 29 | source("inst/tools/compile-scala.R"), 30 | error = function(e) { 31 | if (nzchar(e$message)) { 32 | message(e$message) 33 | } 34 | } 35 | ) 36 | 37 | } 38 | 39 | invisible(compile_jars()) 40 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(connection_is_open,spark_shell_connection) 4 | S3method(invoke_method,spark_shell_connection) 5 | S3method(print,spark_jobj) 6 | S3method(print,spark_log) 7 | S3method(print,spark_web_url) 8 | S3method(print_jobj,spark_shell_connection) 9 | S3method(spark_connection,default) 10 | S3method(spark_connection,spark_connection) 11 | S3method(spark_connection,spark_jobj) 12 | S3method(spark_dataframe,default) 13 | S3method(spark_dataframe,spark_jobj) 14 | S3method(spark_jobj,default) 15 | S3method(spark_jobj,spark_jobj) 16 | S3method(spark_log,default) 17 | S3method(spark_log,spark_shell_connection) 18 | S3method(spark_web,default) 19 | S3method(spark_web,spark_shell_connection) 20 | export("%>%") 21 | export(connection_config) 22 | export(connection_is_open) 23 | export(hive_context) 24 | export(invoke) 25 | export(invoke_method) 26 | export(invoke_new) 27 | export(invoke_static) 28 | export(java_context) 29 | export(print_jobj) 30 | export(register_extension) 31 | export(registered_extensions) 32 | export(spark_compile) 33 | export(spark_connection) 34 | export(spark_context) 35 | export(spark_dataframe) 36 | export(spark_dependency) 37 | 
export(spark_jobj) 38 | export(spark_log) 39 | export(spark_version) 40 | export(spark_version_from_home) 41 | export(spark_web) 42 | export(start_shell) 43 | export(stop_shell) 44 | import(digest) 45 | import(rprojroot) 46 | importFrom(magrittr,"%>%") 47 | -------------------------------------------------------------------------------- /man/start_shell.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/shell.R 3 | \name{start_shell} 4 | \alias{start_shell} 5 | \alias{stop_shell} 6 | \title{Start the Spark R Shell} 7 | \usage{ 8 | start_shell(master, spark_home = Sys.getenv("SPARK_HOME"), 9 | spark_version = NULL, app_name = "sparkapi", config = list(), 10 | extensions = sparkapi::registered_extensions(), jars = NULL, 11 | packages = NULL, environment = NULL, shell_args = NULL) 12 | 13 | stop_shell(sc) 14 | } 15 | \arguments{ 16 | \item{master}{Spark cluster url to connect to. Use \code{"local"} to connect to a local 17 | instance of Spark} 18 | 19 | \item{spark_home}{Spark home directory (defaults to SPARK_HOME environment variable)} 20 | 21 | \item{spark_version}{Spark version, if not specified, version taken from SPARK_HOME} 22 | 23 | \item{app_name}{Application name to be used while running in the Spark cluster} 24 | 25 | \item{config}{Named character vector of spark.* options} 26 | 27 | \item{extensions}{Extension packages to include dependencies for 28 | (see \code{\link{spark_dependency}}).} 29 | 30 | \item{jars}{Paths to Jar files to include} 31 | 32 | \item{packages}{Spark packages to include} 33 | 34 | \item{environment}{Environment variables to set} 35 | 36 | \item{shell_args}{Additional command line arguments for spark_shell} 37 | 38 | \item{sc}{\code{spark_connection}} 39 | } 40 | \value{ 41 | \code{spark_connection} object 42 | } 43 | \description{ 44 | Start the Spark R Shell 45 | 46 | Stop the Spark R Shell 47 | } 48 | 49 | -------------------------------------------------------------------------------- /R/invoke.R: -------------------------------------------------------------------------------- 1 | 2 | #' Execute a method on a remote Java object 3 | #' 4 | #' @param sc \code{spark_connection} to execute on. 5 | #' @param jobj Java object to execute method on. 6 | #' @param class Class to execute static method on. 7 | #' @param method Name of method to execute. 8 | #' @param ... Arguments to pass to the method or constructor 9 | #' 10 | #' @export 11 | invoke <- function (jobj, method, ...) 12 | { 13 | invoke_method(spark_connection(jobj), 14 | FALSE, 15 | jobj, 16 | method, 17 | ...) 18 | } 19 | 20 | 21 | #' @name invoke 22 | #' @export 23 | invoke_static <- function (sc, class, method, ...) 24 | { 25 | invoke_method(sc, 26 | TRUE, 27 | class, 28 | method, 29 | ...) 30 | } 31 | 32 | 33 | #' @name invoke 34 | #' @export 35 | invoke_new <- function(sc, class, ...) 36 | { 37 | invoke_method(sc, 38 | TRUE, 39 | class, 40 | "", 41 | ...) 42 | } 43 | 44 | #' Generic call interface for spark shell 45 | #' 46 | #' @param sc \code{spark_connection} 47 | #' @param static Is this a static method call (including a constructor). If so 48 | #' then the \code{object} parameter should be the name of a class (otherwise 49 | #' it should be a spark_jobj instance). 50 | #' @param object Object instance or name of class (for \code{static}) 51 | #' @param method Name of method 52 | #' @param ... 
Call parameters 53 | #' 54 | #' @keywords internal 55 | #' 56 | #' @export 57 | invoke_method <- function(sc, static, object, method, ...) { 58 | UseMethod("invoke_method") 59 | } 60 | 61 | -------------------------------------------------------------------------------- /R/hive.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | create_hive_context <- function(sc) { 4 | if (spark_version(sc) >= "2.0.0") 5 | create_hive_context_v2(sc) 6 | else 7 | create_hive_context_v1(sc) 8 | } 9 | 10 | create_hive_context_v2 <- function(sc) { 11 | 12 | # SparkSession.builder().enableHiveSupport() 13 | builder <- invoke_static( 14 | sc, 15 | "org.apache.spark.sql.SparkSession", 16 | "builder" 17 | ) 18 | 19 | builder <- invoke( 20 | builder, 21 | "enableHiveSupport" 22 | ) 23 | 24 | session <- invoke( 25 | builder, 26 | "getOrCreate" 27 | ) 28 | 29 | # get config object 30 | conf <- invoke(session, "conf") 31 | 32 | # apply spark.sql. params 33 | params <- connection_config(sc, "spark.sql.") 34 | apply_config(params, conf, "set", "spark.sql.") 35 | 36 | # return session as hive context 37 | session 38 | } 39 | 40 | create_hive_context_v1 <- function(sc) { 41 | 42 | # get spark_context 43 | ctx <- spark_context(sc) 44 | 45 | # attempt to create hive_context 46 | hive_context <- tryCatch({ 47 | invoke_new( 48 | sc, 49 | "org.apache.spark.sql.hive.HiveContext", 50 | ctx 51 | )}, 52 | error = function(e) { 53 | NULL 54 | } 55 | ) 56 | 57 | # if we failed then create a SqlContext instead 58 | if (is.null(hive_context)) { 59 | 60 | warning("Failed to create Hive context, falling back to SQL. Some operations, ", 61 | "like window-functions, will not work") 62 | 63 | jsc <- invoke_static( 64 | sc, 65 | "org.apache.spark.api.java.JavaSparkContext", 66 | "fromSparkContext", 67 | ctx 68 | ) 69 | 70 | hive_context <- invoke_static( 71 | sc, 72 | "org.apache.spark.sql.api.r.SQLUtils", 73 | "createSQLContext", 74 | jsc 75 | ) 76 | } 77 | 78 | # apply configuration 79 | params <- connection_config(sc, "spark.sql.") 80 | apply_config(params, hive_context, "setConf", "spark.sql.") 81 | 82 | # return hive_context 83 | hive_context 84 | } 85 | 86 | apply_config <- function(params, object, method, prefix) { 87 | lapply(names(params), function(paramName) { 88 | configValue <- params[[paramName]] 89 | if (is.logical(configValue)) { 90 | configValue <- if (configValue) "true" else "false" 91 | } 92 | else { 93 | configValue <- as.character(configValue) 94 | } 95 | 96 | invoke( 97 | object, 98 | method, 99 | paste0(prefix, paramName), 100 | configValue 101 | ) 102 | }) 103 | } 104 | -------------------------------------------------------------------------------- /R/version.R: -------------------------------------------------------------------------------- 1 | spark_version_clean <- function(version) { 2 | gsub("([0-9]+\\.?)[^0-9\\.](.*)","\\1", version) 3 | } 4 | 5 | #' Version of Spark for a connection 6 | #' 7 | #' @param sc \code{spark_connection} 8 | #' 9 | #' @return A \code{\link{numeric_version}} object 10 | #' 11 | #' @export 12 | spark_version <- function(sc) { 13 | # get the version 14 | version <- invoke(spark_context(sc), "version") 15 | 16 | # Get rid of -preview and other suffix variations 17 | version <- spark_version_clean(version) 18 | 19 | # return numeric version 20 | numeric_version(version) 21 | } 22 | 23 | spark_version_from_home_version <- function() { 24 | version <- Sys.getenv("SPARK_HOME_VERSION") 25 | if (nchar(version) <= 0) NULL else version 26 | } 27 | 28 
| #' Version of Spark for a SPARK_HOME directory 29 | #' 30 | #' @param spark_home Path to SPARK_HOME 31 | #' @param default The version to use as default 32 | #' 33 | #' @rdname spark_version 34 | #' 35 | #' @export 36 | spark_version_from_home <- function(spark_home, default = NULL) { 37 | versionAttempts <- list( 38 | useReleaseFile = function() { 39 | versionedFile <- file.path(spark_home, "RELEASE") 40 | if (file.exists(versionedFile)) { 41 | releaseContents <- readLines(versionedFile) 42 | 43 | if (!is.null(releaseContents) && length(releaseContents) > 0) { 44 | gsub("Spark | built.*", "", releaseContents[[1]]) 45 | } 46 | } 47 | }, 48 | useAssemblies = function() { 49 | candidateVersions <- list( 50 | list(path = "lib", pattern = "spark-assembly-([0-9\\.]*)-hadoop.[0-9\\.]*\\.jar"), 51 | list(path = "yarn", pattern = "spark-([0-9\\.]*)-preview-yarn-shuffle\\.jar") 52 | ) 53 | 54 | candidateFiles <- lapply(candidateVersions, function(e) { 55 | c(e, 56 | list( 57 | files = list.files( 58 | file.path(spark_home, e$path), 59 | pattern = e$pattern 60 | ) 61 | ) 62 | ) 63 | }) 64 | 65 | filteredCandidates <- Filter(function(f) length(f$files) > 0, candidateFiles) 66 | if (length(filteredCandidates) > 0) { 67 | valid <- filteredCandidates[[1]] 68 | e <- regexec(valid$pattern, valid$files[[1]]) 69 | match <- regmatches(valid$files[[1]], e) 70 | if (length(match) > 0 && length(match[[1]]) > 1) { 71 | return(match[[1]][[2]]) 72 | } 73 | } 74 | }, 75 | useEnvironmentVariable = function() { 76 | spark_version_from_home_version() 77 | }, 78 | useDefault = function() { 79 | default 80 | } 81 | ) 82 | 83 | for (versionAttempt in versionAttempts) { 84 | result <- versionAttempt() 85 | if (length(result) > 0) 86 | return(spark_version_clean(result)) 87 | } 88 | 89 | stop( 90 | "Failed to detect version from SPARK_HOME or SPARK_HOME_VERSION. ", 91 | "Try passing the spark_version explicitly.") 92 | } 93 | -------------------------------------------------------------------------------- /R/extensions.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Register a package that implements a sparkapi extension 4 | #' 5 | #' Registering an extension package will result in the package being 6 | #' automatically scanned for spark dependencies when a connection 7 | #' to Spark is initiated (e.g. via \code{start_shell}). 8 | #' 9 | #' @param package Name of package to register 10 | #' 11 | #' @note Extensions are typically registered when packages are 12 | #' loaded onto the search path (i.e. in the \code{.onLoad} 13 | #' function). 14 | #' 15 | #' @export 16 | register_extension <- function(package) { 17 | .globals$extension_packages <- c(.globals$extension_packages, package) 18 | } 19 | 20 | #' Enumerate all registered extension packages 21 | 22 | #' @rdname register_extension 23 | #' @export 24 | registered_extensions <- function() { 25 | .globals$extension_packages 26 | } 27 | 28 | 29 | #' Define a Spark dependency 30 | #' 31 | #' Define a Spark dependency consisting of a set of custom JARs and Spark packages. 
32 | #' 33 | #' @param jars Character vector of full paths to JAR files 34 | #' @param packages Character vector of Spark package names 35 | #' 36 | #' @return An object of type `spark_dependency` 37 | #' 38 | #' @export 39 | spark_dependency <- function(jars = NULL, packages = NULL) { 40 | structure(class = "spark_dependency", list( 41 | jars = jars, 42 | packages = packages 43 | )) 44 | } 45 | 46 | spark_dependencies_from_extensions <- function(spark_version, scala_version, extensions) { 47 | 48 | jars <- character() 49 | packages <- character() 50 | 51 | lapply(extensions, function(extension) { 52 | dependencies <- spark_dependencies_from_extension(spark_version, scala_version, extension) 53 | lapply(dependencies, function(dependency) { 54 | jars <<- c(jars, dependency$jars) 55 | packages <<- c(packages, dependency$packages) 56 | }) 57 | }) 58 | 59 | list( 60 | jars = jars, 61 | packages = packages 62 | ) 63 | } 64 | 65 | spark_dependencies_from_extension <- function(spark_version, scala_version, extension) { 66 | 67 | # attempt to find the function 68 | spark_dependencies <- tryCatch({ 69 | get("spark_dependencies", asNamespace(extension), inherits = FALSE) 70 | }, 71 | error = function(e) { 72 | stop("spark_dependencies function not found within ", 73 | "extension package ", extension, call. = FALSE) 74 | } 75 | ) 76 | 77 | # reduce the spark_version to just major and minor versions 78 | spark_version <- package_version(spark_version) 79 | spark_version <- paste(spark_version$major, spark_version$minor, sep = '.') 80 | spark_version <- numeric_version(spark_version) 81 | 82 | # call the function 83 | dependency <- spark_dependencies(spark_version = spark_version, 84 | scala_version = scala_version) 85 | 86 | # if it's just a single dependency then wrap it in a list 87 | if (inherits(dependency, "spark_dependency")) 88 | dependency <- list(dependency) 89 | 90 | # return it 91 | dependency 92 | } 93 | 94 | -------------------------------------------------------------------------------- /R/compile.R: -------------------------------------------------------------------------------- 1 | #' Compiles Scala sources and packages them into a JAR file 2 | #' 3 | #' @export 4 | #' @param name The name of the target jar 5 | #' @param spark_home Path to the Spark installation to compile against 6 | #' 7 | #' @import rprojroot 8 | #' @import digest 9 | #' 10 | #' @keywords internal 11 | spark_compile <- function(name, spark_home) { 12 | spark_version <- spark_version_from_home(spark_home) 13 | version_numeric <- gsub("[-_a-zA-Z]", "", spark_version) 14 | version_sufix <- gsub("\\.|[-_a-zA-Z]", "", spark_version) 15 | jar_name <- paste0(name, "-", version_numeric, ".jar") 16 | 17 | root <- rprojroot::find_package_root_file() 18 | 19 | jar_path <- file.path(root, "inst", "java", jar_name) 20 | scala_files <- lapply( 21 | Filter( 22 | function(e) { 23 | # if filename has version only include version being built 24 | if (grepl(".*_\\d+\\.scala", e)) { 25 | grepl(version_sufix, e) 26 | } 27 | else { 28 | grepl(".*\\.scala$", e) 29 | } 30 | }, 31 | list.files(file.path(root, "inst", "scala")) 32 | ), 33 | function(e) file.path(root, "inst", "scala", e) 34 | ) 35 | scala_files_digest <- file.path(root, paste0( 36 | "inst/scala/sparklyr-", version_numeric, ".md5" 37 | )) 38 | 39 | scala_files_contents <- paste(lapply(scala_files, function(e) readLines(e))) 40 | scala_files_contents_path <- tempfile() 41 | scala_files_contents_file <- file(scala_files_contents_path, "w") 42 | writeLines(scala_files_contents, scala_files_contents_file) 43 | 
close(scala_files_contents_file) 44 | 45 | # Bail if files haven't changed 46 | md5 <- tools::md5sum(scala_files_contents_path) 47 | if (file.exists(scala_files_digest) && file.exists(jar_path)) { 48 | contents <- readChar(scala_files_digest, file.info(scala_files_digest)$size, TRUE) 49 | if (identical(contents, md5[[scala_files_contents_path]])) { 50 | return() 51 | } 52 | } 53 | 54 | message("** building '", jar_name, "' ...") 55 | 56 | cat(md5, file = scala_files_digest) 57 | 58 | execute <- function(...) { 59 | cmd <- paste(...) 60 | message("*** ", cmd) 61 | system(cmd) 62 | } 63 | 64 | if (!nzchar(Sys.which("scalac"))) 65 | stop("failed to discover 'scalac' on the PATH") 66 | 67 | if (!nzchar(Sys.which("jar"))) 68 | stop("failed to discover 'jar' on the PATH") 69 | 70 | # Work in temporary directory (as temporary class files 71 | # will be generated in there) 72 | dir <- file.path(tempdir(), paste0(name, "-", version_sufix, "-scala-compile")) 73 | if (!file.exists(dir)) 74 | if (!dir.create(dir)) 75 | stop("Failed to create '", dir, "'") 76 | owd <- setwd(dir) 77 | 78 | # list jars in the installation folder 79 | candidates <- c("jars", "lib") 80 | jars <- NULL 81 | for (candidate in candidates) { 82 | jars <- list.files( 83 | file.path(spark_home, candidate), 84 | full.names = TRUE, 85 | pattern = "jar$" 86 | ) 87 | 88 | if (length(jars)) 89 | break 90 | } 91 | 92 | if (!length(jars)) 93 | stop("failed to discover Spark jars") 94 | 95 | # construct classpath 96 | CLASSPATH <- paste(jars, collapse = .Platform$path.sep) 97 | 98 | # ensure 'inst/java' exists 99 | inst_java_path <- file.path(root, "inst/java") 100 | if (!file.exists(inst_java_path)) 101 | if (!dir.create(inst_java_path, recursive = TRUE)) 102 | stop("failed to create directory '", inst_java_path, "'") 103 | 104 | # call 'scalac' compiler 105 | classpath <- Sys.getenv("CLASSPATH") 106 | 107 | # set CLASSPATH environment variable rather than passing 108 | # in on command line (mostly aesthetic) 109 | Sys.setenv(CLASSPATH = CLASSPATH) 110 | execute("scalac", paste(shQuote(scala_files), collapse = " ")) 111 | Sys.setenv(CLASSPATH = classpath) 112 | 113 | # call 'jar' to create our jar 114 | class_files <- file.path(name, list.files(name, pattern = "class$")) 115 | execute("jar cf", jar_path, paste(shQuote(class_files), collapse = " ")) 116 | 117 | # double-check existence of jar 118 | if (file.exists(jar_path)) { 119 | message("*** ", basename(jar_path), " successfully created.") 120 | } else { 121 | stop("*** failed to create ", jar_name) 122 | } 123 | 124 | setwd(owd) 125 | } 126 | -------------------------------------------------------------------------------- /inst/scala/backend.scala: -------------------------------------------------------------------------------- 1 | package sparkapi 2 | 3 | import java.io.{DataOutputStream, File, FileOutputStream, IOException} 4 | import java.net.{InetAddress, InetSocketAddress, ServerSocket} 5 | import java.util.concurrent.TimeUnit 6 | 7 | import io.netty.bootstrap.ServerBootstrap 8 | import io.netty.channel.{ChannelFuture, ChannelInitializer, EventLoopGroup} 9 | import io.netty.channel.nio.NioEventLoopGroup 10 | import io.netty.channel.socket.SocketChannel 11 | import io.netty.channel.socket.nio.NioServerSocketChannel 12 | import io.netty.handler.codec.LengthFieldBasedFrameDecoder 13 | import io.netty.handler.codec.bytes.{ByteArrayDecoder, ByteArrayEncoder} 14 | 15 | import org.apache.spark.SparkConf 16 | 17 | import sparkapi.Logging._ 18 | 19 | class Backend { 20 | 21 
| private[this] var channelFuture: ChannelFuture = null 22 | private[this] var bootstrap: ServerBootstrap = null 23 | private[this] var bossGroup: EventLoopGroup = null 24 | 25 | def init(): Int = { 26 | val conf = new SparkConf() 27 | bossGroup = new NioEventLoopGroup(conf.getInt("sparkapi.backend.threads", 2)) 28 | val workerGroup = bossGroup 29 | val handler = new Handler(this) 30 | 31 | bootstrap = new ServerBootstrap() 32 | .group(bossGroup, workerGroup) 33 | .channel(classOf[NioServerSocketChannel]) 34 | 35 | bootstrap.childHandler(new ChannelInitializer[SocketChannel]() { 36 | def initChannel(ch: SocketChannel): Unit = { 37 | ch.pipeline() 38 | .addLast("encoder", new ByteArrayEncoder()) 39 | .addLast("frameDecoder", 40 | new LengthFieldBasedFrameDecoder(Integer.MAX_VALUE, 0, 4, 0, 4)) 41 | .addLast("decoder", new ByteArrayDecoder()) 42 | .addLast("handler", handler) 43 | } 44 | }) 45 | 46 | channelFuture = bootstrap.bind(new InetSocketAddress("localhost", 0)) 47 | channelFuture.syncUninterruptibly() 48 | channelFuture.channel().localAddress().asInstanceOf[InetSocketAddress].getPort() 49 | } 50 | 51 | def run(): Unit = { 52 | channelFuture.channel.closeFuture().syncUninterruptibly() 53 | } 54 | 55 | def close(): Unit = { 56 | if (channelFuture != null) { 57 | // close is a local operation and should finish within milliseconds; timeout just to be safe 58 | channelFuture.channel().close().awaitUninterruptibly(10, TimeUnit.SECONDS) 59 | channelFuture = null 60 | } 61 | if (bootstrap != null && bootstrap.group() != null) { 62 | bootstrap.group().shutdownGracefully() 63 | } 64 | if (bootstrap != null && bootstrap.childGroup() != null) { 65 | bootstrap.childGroup().shutdownGracefully() 66 | } 67 | bootstrap = null 68 | } 69 | 70 | } 71 | 72 | object Backend { 73 | def main(args: Array[String]): Unit = { 74 | if (args.length < 1) { 75 | System.err.println("Usage: Backend <path to output file>") 76 | System.exit(-1) 77 | } 78 | 79 | val backend = new Backend() 80 | try { 81 | // bind to random port 82 | val boundPort = backend.init() 83 | val serverSocket = new ServerSocket(0, 1, InetAddress.getByName("localhost")) 84 | val listenPort = serverSocket.getLocalPort() 85 | 86 | // tell the R process via temporary file 87 | val path = args(0) 88 | val f = new File(path + ".tmp") 89 | val dos = new DataOutputStream(new FileOutputStream(f)) 90 | dos.writeInt(boundPort) 91 | dos.writeInt(listenPort) 92 | Serializer.writeString(dos, Utils.rPackages.getOrElse("")) 93 | dos.close() 94 | f.renameTo(new File(path)) 95 | 96 | // wait for the R client socket to close, then exit 97 | new Thread("wait for socket to close") { 98 | setDaemon(true) 99 | override def run(): Unit = { 100 | // any uncaught exception will also shut down the JVM 101 | val buf = new Array[Byte](1024) 102 | // shut down the JVM if R does not connect back in 10 seconds 103 | serverSocket.setSoTimeout(10000) 104 | try { 105 | val inSocket = serverSocket.accept() 106 | serverSocket.close() 107 | // wait for the end of the socket, closed when the R process dies 108 | inSocket.getInputStream().read(buf) 109 | } finally { 110 | backend.close() 111 | System.exit(0) 112 | } 113 | } 114 | }.start() 115 | 116 | backend.run() 117 | } catch { 118 | case e: IOException => 119 | logError("Server shutting down: failed with exception ", e) 120 | backend.close() 121 | System.exit(1) 122 | } 123 | System.exit(0) 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /R/jobj.R: -------------------------------------------------------------------------------- 
1 | # Imported from: 2 | # https://raw.githubusercontent.com/apache/spark/branch-1.6/R/pkg/R/jobj.R 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # References to objects that exist on the JVM backend 21 | # are maintained using the jobj. 22 | 23 | 24 | #' Get the spark_jobj associated with an object 25 | #' 26 | #' S3 method to get the spark_jobj associated with objects of 27 | #' various types. 28 | #' 29 | #' @param x Object to extract jobj from 30 | #' @param ... Reserved for future use 31 | #' @return A \code{spark_jobj} object that can be passed to 32 | #' \code{\link{invoke}}. 33 | #' 34 | #' @seealso \code{\link{invoke}} 35 | #' 36 | #' @export 37 | spark_jobj <- function(x, ...) { 38 | UseMethod("spark_jobj") 39 | } 40 | 41 | 42 | #' @export 43 | spark_jobj.default <- function(x, ...) { 44 | stop("Unable to retrieve a spark_jobj from object of class ", 45 | paste(class(x), collapse = " "), call. = FALSE) 46 | } 47 | 48 | #' @export 49 | spark_jobj.spark_jobj <- function(x, ...) { 50 | x 51 | } 52 | 53 | #' @export 54 | print.spark_jobj <- function(x, ...) { 55 | print_jobj(spark_connection(x), x, ...) 56 | } 57 | 58 | #' Generic method for print jobj for a connection type 59 | #' 60 | #' @param sc \code{spark_connection} (used for type dispatch) 61 | #' @param jobj Object to print 62 | #' 63 | #' @keywords internal 64 | #' 65 | #' @export 66 | print_jobj <- function(sc, jobj, ...) { 67 | UseMethod("print_jobj") 68 | } 69 | 70 | 71 | # Maintain a reference count of Java object references 72 | # This allows us to GC the Java object when it is safe 73 | .validJobjs <- new.env(parent = emptyenv()) 74 | 75 | # List of object ids to be removed 76 | .toRemoveJobjs <- new.env(parent = emptyenv()) 77 | 78 | # Check if jobj was created with the current SparkContext 79 | isValidJobj <- function(jobj) { 80 | TRUE 81 | } 82 | 83 | getJobj <- function(objId) { 84 | newObj <- jobj_create(objId) 85 | if (exists(objId, .validJobjs)) { 86 | .validJobjs[[objId]] <- .validJobjs[[objId]] + 1 87 | } else { 88 | .validJobjs[[objId]] <- 1 89 | } 90 | newObj 91 | } 92 | 93 | # Handler for a Java object that exists on the backend. 94 | jobj_create <- function(objId) { 95 | if (!is.character(objId)) { 96 | stop("object id must be a character") 97 | } 98 | # NOTE: We need a new env for a jobj as we can only register 99 | # finalizers for environments or external reference pointers. 
100 | obj <- structure(new.env(parent = emptyenv()), class = "spark_jobj") 101 | obj$id <- objId 102 | 103 | # Register a finalizer to remove the Java object when this reference 104 | # is garbage collected in R 105 | reg.finalizer(obj, cleanup.jobj) 106 | obj 107 | } 108 | 109 | jobj_info <- function(jobj) { 110 | if (!inherits(jobj, "spark_jobj")) 111 | stop("'jobj_info' called on non-jobj") 112 | 113 | class <- NULL 114 | repr <- NULL 115 | 116 | tryCatch({ 117 | class <- invoke(jobj, "getClass") 118 | if (inherits(class, "spark_jobj")) 119 | class <- invoke(class, "toString") 120 | }, error = function(e) { 121 | }) 122 | tryCatch({ 123 | repr <- invoke(jobj, "toString") 124 | }, error = function(e) { 125 | }) 126 | list( 127 | class = class, 128 | repr = repr 129 | ) 130 | } 131 | 132 | jobj_inspect <- function(jobj) { 133 | print(jobj) 134 | if (!connection_is_open(spark_connection(jobj))) 135 | return(jobj) 136 | 137 | class <- invoke(jobj, "getClass") 138 | 139 | cat("Fields:\n") 140 | fields <- invoke(class, "getDeclaredFields") 141 | lapply(fields, function(field) { print(field) }) 142 | 143 | cat("Methods:\n") 144 | methods <- invoke(class, "getDeclaredMethods") 145 | lapply(methods, function(method) { print(method) }) 146 | 147 | jobj 148 | } 149 | 150 | cleanup.jobj <- function(jobj) { 151 | if (isValidJobj(jobj)) { 152 | objId <- jobj$id 153 | # If we don't know anything about this jobj, ignore it 154 | if (exists(objId, envir = .validJobjs)) { 155 | .validJobjs[[objId]] <- .validJobjs[[objId]] - 1 156 | 157 | if (.validJobjs[[objId]] == 0) { 158 | rm(list = objId, envir = .validJobjs) 159 | # NOTE: We cannot call removeJObject here as the finalizer may be run 160 | # in the middle of another RPC. Thus we queue up this object Id to be removed 161 | # and then run all the removeJObject when the next RPC is called. 
162 | .toRemoveJobjs[[objId]] <- 1 163 | } 164 | } 165 | } 166 | } 167 | 168 | clearJobjs <- function() { 169 | valid <- ls(.validJobjs) 170 | rm(list = valid, envir = .validJobjs) 171 | 172 | removeList <- ls(.toRemoveJobjs) 173 | rm(list = removeList, envir = .toRemoveJobjs) 174 | } 175 | 176 | -------------------------------------------------------------------------------- /R/connection.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #' Get the SparkContext associated with a connection 5 | #' 6 | #' Get the SparkContext \code{spark_jobj} associated with a 7 | #' \code{spark_connection} 8 | #' 9 | #' @param sc Connection to get SparkContext from 10 | #' 11 | #' @return Reference to SparkContext 12 | #' @export 13 | spark_context <- function(sc) { 14 | sc$spark_context 15 | } 16 | 17 | #' Get the JavaSparkContext associated with a connection 18 | #' 19 | #' Get the JavaSparkContext \code{spark_jobj} associated with a 20 | #' \code{spark_connection} 21 | #' 22 | #' @param sc Connection to get JavaSparkContext from 23 | #' 24 | #' @return Reference to JavaSparkContext 25 | #' @export 26 | java_context <- function(sc) { 27 | sc$java_context 28 | } 29 | 30 | #' Get the HiveContext associated with a connection 31 | #' 32 | #' Get the HiveContext \code{spark_jobj} associated with a 33 | #' \code{spark_connection} 34 | #' 35 | #' @param sc Connection to get HiveContext from 36 | #' 37 | #' @return Reference to HiveContext 38 | #' @export 39 | hive_context <- function(sc) { 40 | sc$hive_context 41 | } 42 | 43 | 44 | #' Get the spark_connection associated with an object 45 | #' 46 | #' S3 method to get the spark_connection associated with objects of 47 | #' various types. 48 | #' 49 | #' @param x Object to extract connection from 50 | #' @param ... Reserved for future use 51 | #' @return A \code{spark_connection} object that can be passed to 52 | #' \code{\link{invoke_new}} and \code{\link{invoke_static}}. 53 | #' 54 | #' @export 55 | spark_connection <- function(x, ...) { 56 | UseMethod("spark_connection") 57 | } 58 | 59 | #' @export 60 | spark_connection.default <- function(x, ...) { 61 | stop("Unable to retrieve a spark_connection from object of class ", 62 | paste(class(x), collapse = " "), call. = FALSE) 63 | } 64 | 65 | #' @export 66 | spark_connection.spark_connection <- function(x, ...) { 67 | x 68 | } 69 | 70 | #' @export 71 | spark_connection.spark_jobj <- function(x, ...) { 72 | x$connection 73 | } 74 | 75 | #' Check whether the connection is open 76 | #' 77 | #' @param sc \code{spark_connection} 78 | #' 79 | #' @keywords internal 80 | #' 81 | #' @export 82 | connection_is_open <- function(sc) { 83 | UseMethod("connection_is_open") 84 | } 85 | 86 | #' Read configuration values for a connection 87 | #' 88 | #' @param sc \code{spark_connection} 89 | #' @param prefix Prefix to read parameters for 90 | #' (e.g. \code{spark.context.}, \code{spark.sql.}, etc.) 91 | #' @param not_prefix Prefix to not include. 
92 | #' 93 | #' @return Named list of config parameters (note that if a prefix was 94 | #' specified then the names will not include the prefix) 95 | #' 96 | #' @export 97 | connection_config <- function(sc, prefix, not_prefix = list()) { 98 | 99 | config <- sc$config 100 | master <- sc$master 101 | isLocal <- spark_master_is_local(master) 102 | 103 | configNames <- Filter(function(e) { 104 | found <- is.null(prefix) || 105 | (substring(e, 1, nchar(prefix)) == prefix) 106 | 107 | if (grepl("\\.local$", e) && !isLocal) 108 | found <- FALSE 109 | 110 | if (grepl("\\.remote$", e) && isLocal) 111 | found <- FALSE 112 | 113 | found 114 | }, names(config)) 115 | 116 | lapply(not_prefix, function(notPrefix) { 117 | configNames <<- Filter(function(e) { 118 | substring(e, 1, nchar(notPrefix)) != notPrefix 119 | }, configNames) 120 | }) 121 | 122 | paramsNames <- lapply(configNames, function(configName) { 123 | paramName <- substr(configName, nchar(prefix) + 1, nchar(configName)) 124 | paramName <- sub("(\\.local$)|(\\.remote$)", "", paramName, perl = TRUE) 125 | 126 | paramName 127 | }) 128 | 129 | params <- lapply(configNames, function(configName) { 130 | config[[configName]] 131 | }) 132 | 133 | names(params) <- paramsNames 134 | params 135 | } 136 | 137 | spark_master_is_local <- function(master) { 138 | grepl("^local(\\[[0-9\\*]*\\])?$", master, perl = TRUE) 139 | } 140 | 141 | 142 | #' Retrieves entries from the Spark log 143 | #' 144 | #' @param sc \code{spark_connection} 145 | #' @param n Max number of log entries to retrieve (pass NULL to retrieve 146 | #' all lines of the log) 147 | #' @param ... Unused (reserved for future use) 148 | #' 149 | #' @return Character vector with last \code{n} lines of the Spark log 150 | #' or for \code{spark_log_file} the full path to the log file. 151 | #' 152 | #' @export 153 | spark_log <- function(sc, n = 100, ...) { 154 | UseMethod("spark_log") 155 | } 156 | 157 | #' @export 158 | spark_log.default <- function(sc, n = 100, ...) { 159 | stop("Invalid class passed to spark_log") 160 | } 161 | 162 | #' @export 163 | print.spark_log <- function(x, ...) { 164 | cat(x, sep = "\n") 165 | cat("\n") 166 | } 167 | 168 | #' Open the Spark web interface 169 | #' 170 | #' @inheritParams spark_log 171 | #' 172 | #' @export 173 | spark_web <- function(sc, ...) { 174 | UseMethod("spark_web") 175 | } 176 | 177 | #' @export 178 | spark_web.default <- function(sc, ...) { 179 | stop("Invalid class passed to spark_web") 180 | } 181 | 182 | 183 | #' @export 184 | print.spark_web_url <- function(x, ...) 
{ 185 | utils::browseURL(x) 186 | } 187 | 188 | initialize_connection <- function(sc) { 189 | 190 | # create the spark config 191 | conf <- invoke_new(sc, "org.apache.spark.SparkConf") 192 | conf <- invoke(conf, "setAppName", sc$app_name) 193 | conf <- invoke(conf, "setMaster", sc$master) 194 | conf <- invoke(conf, "setSparkHome", sc$spark_home) 195 | 196 | context_config <- connection_config(sc, "spark.", c("spark.sql.")) 197 | apply_config(context_config, conf, "set", "spark.") 198 | 199 | # create the spark context and assign the connection to it 200 | sc$spark_context <- invoke_new( 201 | sc, 202 | "org.apache.spark.SparkContext", 203 | conf 204 | ) 205 | sc$spark_context$connection <- sc 206 | 207 | # create the java spark context and assign the connection to it 208 | sc$java_context <- invoke_new( 209 | sc, 210 | "org.apache.spark.api.java.JavaSparkContext", 211 | sc$spark_context 212 | ) 213 | sc$java_context$connection <- sc 214 | 215 | # create the hive context and assign the connection to it 216 | sc$hive_context <- create_hive_context(sc) 217 | sc$hive_context$connection <- sc 218 | 219 | # return the modified connection 220 | sc 221 | } 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | -------------------------------------------------------------------------------- /R/deserialize.R: -------------------------------------------------------------------------------- 1 | # Imported from: 2 | # https://raw.githubusercontent.com/apache/spark/branch-1.6/R/pkg/R/deserialize.R 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Utility functions to deserialize objects from Java. 
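#
# Illustrative sketch (not part of the upstream SparkR code; guarded so it
# never runs): the wire format consumed by the readers below is a one-byte
# type tag followed by big-endian payloads, mirroring readType()/readInt().
if (FALSE) {
  con <- rawConnection(raw(0), "r+")
  writeBin(charToRaw("i"), con)                   # type tag for an Int
  writeBin(42L, con, endian = "big")              # 4-byte big-endian payload
  seek(con, 0)
  rawToChar(readBin(con, "raw", n = 1L))          # "i"  -- what readType() sees
  readBin(con, integer(), n = 1, endian = "big")  # 42   -- what readInt() reads
  close(con)
}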
21 | 22 | # nolint start 23 | # Type mapping from Java to R 24 | # 25 | # void -> NULL 26 | # Int -> integer 27 | # String -> character 28 | # Boolean -> logical 29 | # Float -> double 30 | # Double -> double 31 | # Long -> double 32 | # Array[Byte] -> raw 33 | # Date -> Date 34 | # Time -> POSIXct 35 | # 36 | # Array[T] -> list() 37 | # Object -> jobj 38 | # 39 | # nolint end 40 | 41 | readObject <- function(con) { 42 | # Read type first 43 | type <- readType(con) 44 | readTypedObject(con, type) 45 | } 46 | 47 | readTypedObject <- function(con, type) { 48 | switch (type, 49 | "i" = readInt(con), 50 | "c" = readString(con), 51 | "b" = readBoolean(con), 52 | "d" = readDouble(con), 53 | "r" = readRaw(con), 54 | "D" = readDate(con), 55 | "t" = readTime(con), 56 | "a" = readArray(con), 57 | "l" = readList(con), 58 | "e" = readEnv(con), 59 | "s" = readStruct(con), 60 | "n" = NULL, 61 | "j" = getJobj(readString(con)), 62 | stop(paste("Unsupported type for deserialization", type))) 63 | } 64 | 65 | readString <- function(con) { 66 | stringLen <- readInt(con) 67 | raw <- readBin(con, raw(), stringLen, endian = "big") 68 | string <- rawToChar(raw) 69 | Encoding(string) <- "UTF-8" 70 | string 71 | } 72 | 73 | readInt <- function(con, n = 1) { 74 | readBin(con, integer(), n = n, endian = "big") 75 | } 76 | 77 | readDouble <- function(con, n = 1) { 78 | readBin(con, double(), n = n, endian = "big") 79 | } 80 | 81 | readBoolean <- function(con, n = 1) { 82 | as.logical(readInt(con, n = n)) 83 | } 84 | 85 | readType <- function(con) { 86 | rawToChar(readBin(con, "raw", n = 1L)) 87 | } 88 | 89 | readDate <- function(con) { 90 | as.Date(readString(con)) 91 | } 92 | 93 | readTime <- function(con, n = 1) { 94 | t <- readDouble(con, n) 95 | as.POSIXct(t, origin = "1970-01-01") 96 | } 97 | 98 | readArray <- function(con) { 99 | type <- readType(con) 100 | len <- readInt(con) 101 | 102 | # short-circuit for reading arrays of double, int, logical 103 | if (type == "d") { 104 | return(readDouble(con, n = len)) 105 | } else if (type == "i") { 106 | return(readInt(con, n = len)) 107 | } else if (type == "b") { 108 | return(readBoolean(con, n = len)) 109 | } 110 | 111 | if (len > 0) { 112 | l <- vector("list", len) 113 | for (i in 1:len) { 114 | l[[i]] <- readTypedObject(con, type) 115 | } 116 | l 117 | } else { 118 | list() 119 | } 120 | } 121 | 122 | # Read a list. Types of each element may be different. 123 | # Null objects are read as NA. 
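# For example (illustrative): a JVM-side list containing a String, an Int and
# a null is read back into R as list("a", 1L, NA).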
124 | readList <- function(con) { 125 | len <- readInt(con) 126 | if (len > 0) { 127 | l <- vector("list", len) 128 | for (i in 1:len) { 129 | elem <- readObject(con) 130 | if (is.null(elem)) { 131 | elem <- NA 132 | } 133 | l[[i]] <- elem 134 | } 135 | l 136 | } else { 137 | list() 138 | } 139 | } 140 | 141 | readEnv <- function(con) { 142 | env <- new.env() 143 | len <- readInt(con) 144 | if (len > 0) { 145 | for (i in 1:len) { 146 | key <- readString(con) 147 | value <- readObject(con) 148 | env[[key]] <- value 149 | } 150 | } 151 | env 152 | } 153 | 154 | # Convert a named list to struct so that 155 | # SerDe won't confuse between a normal named list and struct 156 | listToStruct <- function(list) { 157 | stopifnot(class(list) == "list") 158 | stopifnot(!is.null(names(list))) 159 | class(list) <- "struct" 160 | list 161 | } 162 | 163 | # Read a field of StructType from DataFrame 164 | # into a named list in R whose class is "struct" 165 | readStruct <- function(con) { 166 | names <- readObject(con) 167 | fields <- readObject(con) 168 | names(fields) <- names 169 | listToStruct(fields) 170 | } 171 | 172 | readRaw <- function(con) { 173 | dataLen <- readInt(con) 174 | readBin(con, raw(), as.integer(dataLen), endian = "big") 175 | } 176 | 177 | readRawLen <- function(con, dataLen) { 178 | readBin(con, raw(), as.integer(dataLen), endian = "big") 179 | } 180 | 181 | readDeserialize <- function(con) { 182 | # We have two cases that are possible - In one, the entire partition is 183 | # encoded as a byte array, so we have only one value to read. If so just 184 | # return firstData 185 | dataLen <- readInt(con) 186 | firstData <- unserialize( 187 | readBin(con, raw(), as.integer(dataLen), endian = "big")) 188 | 189 | # Else, read things into a list 190 | dataLen <- readInt(con) 191 | if (length(dataLen) > 0 && dataLen > 0) { 192 | data <- list(firstData) 193 | while (length(dataLen) > 0 && dataLen > 0) { 194 | data[[length(data) + 1L]] <- unserialize( 195 | readBin(con, raw(), as.integer(dataLen), endian = "big")) 196 | dataLen <- readInt(con) 197 | } 198 | unlist(data, recursive = FALSE) 199 | } else { 200 | firstData 201 | } 202 | } 203 | 204 | readMultipleObjects <- function(inputCon) { 205 | # readMultipleObjects will read multiple continuous objects from 206 | # a DataOutputStream. There is no preceding field telling the count 207 | # of the objects, so the number of objects varies, we try to read 208 | # all objects in a loop until the end of the stream. 209 | data <- list() 210 | while (TRUE) { 211 | # If reaching the end of the stream, type returned should be "". 212 | type <- readType(inputCon) 213 | if (type == "") { 214 | break 215 | } 216 | data[[length(data) + 1L]] <- readTypedObject(inputCon, type) 217 | } 218 | data # this is a list of named lists now 219 | } 220 | 221 | readRowList <- function(obj) { 222 | # readRowList is meant for use inside an lapply. As a result, it is 223 | # necessary to open a standalone connection for the row and consume 224 | # the numCols bytes inside the read function in order to correctly 225 | # deserialize the row. 
226 | rawObj <- rawConnection(obj, "r+") 227 | on.exit(close(rawObj)) 228 | readObject(rawObj) 229 | } 230 | -------------------------------------------------------------------------------- /R/serialize.R: -------------------------------------------------------------------------------- 1 | # Imported from: 2 | # https://raw.githubusercontent.com/apache/spark/branch-1.6/R/pkg/R/serialize.R 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Utility functions to serialize R objects so they can be read in Java. 21 | 22 | # nolint start 23 | # Type mapping from R to Java 24 | # 25 | # NULL -> Void 26 | # integer -> Int 27 | # character -> String 28 | # logical -> Boolean 29 | # double, numeric -> Double 30 | # raw -> Array[Byte] 31 | # Date -> Date 32 | # POSIXct,POSIXlt -> Time 33 | # 34 | # list[T] -> Array[T], where T is one of above mentioned types 35 | # environment -> Map[String, T], where T is a native type 36 | # jobj -> Object, where jobj is an object created in the backend 37 | # nolint end 38 | 39 | getSerdeType <- function(object) { 40 | type <- class(object)[[1]] 41 | if (type != "list") { 42 | type 43 | } else { 44 | # Check if all elements are of same type 45 | elemType <- unique(sapply(object, function(elem) { getSerdeType(elem) })) 46 | if (length(elemType) <= 1) { 47 | "array" 48 | } else { 49 | "list" 50 | } 51 | } 52 | } 53 | 54 | writeObject <- function(con, object, writeType = TRUE) { 55 | # NOTE: In R vectors have same type as objects. So we don't support 56 | # passing in vectors as arrays and instead require arrays to be passed 57 | # as lists. 
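  # For example (illustrative): to send an Array[Int] to the JVM, pass
  # list(1L, 2L, 3L) rather than c(1L, 2L, 3L); multi-element atomic vectors
  # are not serialized as arrays here.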
58 | type <- class(object)[[1]] # class of POSIXlt is c("POSIXlt", "POSIXt") 59 | # Checking types is needed here, since 'is.na' only handles atomic vectors, 60 | # lists and pairlists 61 | if (type %in% c("integer", "character", "logical", "double", "numeric")) { 62 | if (is.na(object)) { 63 | object <- NULL 64 | type <- "NULL" 65 | } 66 | } 67 | 68 | serdeType <- getSerdeType(object) 69 | if (writeType) { 70 | writeType(con, serdeType) 71 | } 72 | switch(serdeType, 73 | NULL = writeVoid(con), 74 | integer = writeInt(con, object), 75 | character = writeString(con, object), 76 | logical = writeBoolean(con, object), 77 | double = writeDouble(con, object), 78 | numeric = writeDouble(con, object), 79 | raw = writeRaw(con, object), 80 | array = writeArray(con, object), 81 | list = writeList(con, object), 82 | struct = writeList(con, object), 83 | spark_jobj = writeJobj(con, object), 84 | environment = writeEnv(con, object), 85 | Date = writeDate(con, object), 86 | POSIXlt = writeTime(con, object), 87 | POSIXct = writeTime(con, object), 88 | factor = writeFactor(con, object), 89 | stop(paste("Unsupported type for serialization", type))) 90 | } 91 | 92 | writeVoid <- function(con) { 93 | # no value for NULL 94 | } 95 | 96 | writeJobj <- function(con, value) { 97 | if (!isValidJobj(value)) { 98 | stop("invalid jobj ", value$id) 99 | } 100 | writeString(con, value$id) 101 | } 102 | 103 | writeString <- function(con, value) { 104 | utfVal <- enc2utf8(value) 105 | writeInt(con, as.integer(nchar(utfVal, type = "bytes") + 1)) 106 | writeBin(utfVal, con, endian = "big", useBytes = TRUE) 107 | } 108 | 109 | writeInt <- function(con, value) { 110 | writeBin(as.integer(value), con, endian = "big") 111 | } 112 | 113 | writeDouble <- function(con, value) { 114 | writeBin(value, con, endian = "big") 115 | } 116 | 117 | writeBoolean <- function(con, value) { 118 | # TRUE becomes 1, FALSE becomes 0 119 | writeInt(con, as.integer(value)) 120 | } 121 | 122 | writeRawSerialize <- function(outputCon, batch) { 123 | outputSer <- serialize(batch, ascii = FALSE, connection = NULL) 124 | writeRaw(outputCon, outputSer) 125 | } 126 | 127 | writeRowSerialize <- function(outputCon, rows) { 128 | invisible(lapply(rows, function(r) { 129 | bytes <- serializeRow(r) 130 | writeRaw(outputCon, bytes) 131 | })) 132 | } 133 | 134 | serializeRow <- function(row) { 135 | rawObj <- rawConnection(raw(0), "wb") 136 | on.exit(close(rawObj)) 137 | writeList(rawObj, row) 138 | rawConnectionValue(rawObj) 139 | } 140 | 141 | writeRaw <- function(con, batch) { 142 | writeInt(con, length(batch)) 143 | writeBin(batch, con, endian = "big") 144 | } 145 | 146 | writeType <- function(con, class) { 147 | type <- switch(class, 148 | NULL = "n", 149 | integer = "i", 150 | character = "c", 151 | logical = "b", 152 | double = "d", 153 | numeric = "d", 154 | raw = "r", 155 | array = "a", 156 | list = "l", 157 | struct = "s", 158 | spark_jobj = "j", 159 | environment = "e", 160 | Date = "D", 161 | POSIXlt = "t", 162 | POSIXct = "t", 163 | factor = "c", 164 | stop(paste("Unsupported type for serialization", class))) 165 | writeBin(charToRaw(type), con) 166 | } 167 | 168 | # Used to pass arrays where all the elements are of the same type 169 | writeArray <- function(con, arr) { 170 | # TODO: Empty lists are given type "character" right now. 171 | # This may not work if the Java side expects array of any other type. 
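  # In practice this means writeArray(con, list()) emits a zero-length
  # character array: a "c" type tag followed by a length of 0.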
172 | if (length(arr) == 0) { 173 | elemType <- class("somestring") 174 | } else { 175 | elemType <- getSerdeType(arr[[1]]) 176 | } 177 | 178 | writeType(con, elemType) 179 | writeInt(con, length(arr)) 180 | 181 | if (length(arr) > 0) { 182 | for (a in arr) { 183 | writeObject(con, a, FALSE) 184 | } 185 | } 186 | } 187 | 188 | # Used to pass arrays where the elements can be of different types 189 | writeList <- function(con, list) { 190 | writeInt(con, length(list)) 191 | for (elem in list) { 192 | writeObject(con, elem) 193 | } 194 | } 195 | 196 | # Used to pass in hash maps required on Java side. 197 | writeEnv <- function(con, env) { 198 | len <- length(env) 199 | 200 | writeInt(con, len) 201 | if (len > 0) { 202 | writeArray(con, as.list(ls(env))) 203 | vals <- lapply(ls(env), function(x) { env[[x]] }) 204 | writeList(con, as.list(vals)) 205 | } 206 | } 207 | 208 | writeDate <- function(con, date) { 209 | writeString(con, as.character(date)) 210 | } 211 | 212 | writeTime <- function(con, time) { 213 | writeDouble(con, as.double(time)) 214 | } 215 | 216 | writeFactor <- function(con, factor) { 217 | writeString(con, as.character(factor)) 218 | } 219 | 220 | # Used to serialize in a list of objects where each 221 | # object can be of a different type. Serialization format is 222 | # <object type> <object> for each object 223 | writeArgs <- function(con, args) { 224 | if (length(args) > 0) { 225 | for (a in args) { 226 | writeObject(con, a) 227 | } 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /inst/scala/handler.scala: -------------------------------------------------------------------------------- 1 | package sparkapi 2 | 3 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} 4 | 5 | import scala.collection.mutable.HashMap 6 | import scala.language.existentials 7 | 8 | import io.netty.channel.{ChannelHandlerContext, SimpleChannelInboundHandler} 9 | import io.netty.channel.ChannelHandler.Sharable 10 | 11 | import sparkapi.Logging._ 12 | import sparkapi.Serializer._ 13 | 14 | @Sharable 15 | class Handler(server: Backend) 16 | extends SimpleChannelInboundHandler[Array[Byte]] { 17 | 18 | override def channelRead0(ctx: ChannelHandlerContext, msg: Array[Byte]): Unit = { 19 | val bis = new ByteArrayInputStream(msg) 20 | val dis = new DataInputStream(bis) 21 | 22 | val bos = new ByteArrayOutputStream() 23 | val dos = new DataOutputStream(bos) 24 | 25 | // First bit is isStatic 26 | val isStatic = readBoolean(dis) 27 | val objId = readString(dis) 28 | val methodName = readString(dis) 29 | val numArgs = readInt(dis) 30 | 31 | if (objId == "Handler") { 32 | methodName match { 33 | // This function is for test purposes only 34 | case "echo" => 35 | val args = readArgs(numArgs, dis) 36 | assert(numArgs == 1) 37 | 38 | writeInt(dos, 0) 39 | writeObject(dos, args(0)) 40 | case "stopBackend" => 41 | writeInt(dos, 0) 42 | writeType(dos, "void") 43 | server.close() 44 | case "rm" => 45 | try { 46 | val t = readObjectType(dis) 47 | assert(t == 'c') 48 | val objToRemove = readString(dis) 49 | JVMObjectTracker.remove(objToRemove) 50 | writeInt(dos, 0) 51 | writeObject(dos, null) 52 | } catch { 53 | case e: Exception => 54 | logError(s"Removing $objId failed", e) 55 | writeInt(dos, -1) 56 | writeString(dos, s"Removing $objId failed: ${e.getMessage}") 57 | } 58 | case _ => 59 | dos.writeInt(-1) 60 | writeString(dos, s"Error: unknown method $methodName") 61 | } 62 | } else { 63 | handleMethodCall(isStatic, objId, methodName, numArgs,
dis, dos) 64 | } 65 | 66 | val reply = bos.toByteArray 67 | ctx.write(reply) 68 | } 69 | 70 | override def channelReadComplete(ctx: ChannelHandlerContext): Unit = { 71 | ctx.flush() 72 | } 73 | 74 | override def exceptionCaught(ctx: ChannelHandlerContext, cause: Throwable): Unit = { 75 | // Close the connection when an exception is raised. 76 | cause.printStackTrace() 77 | ctx.close() 78 | } 79 | 80 | def handleMethodCall( 81 | isStatic: Boolean, 82 | objId: String, 83 | methodName: String, 84 | numArgs: Int, 85 | dis: DataInputStream, 86 | dos: DataOutputStream): Unit = { 87 | var obj: Object = null 88 | try { 89 | val cls = if (isStatic) { 90 | Class.forName(objId) 91 | } else { 92 | JVMObjectTracker.get(objId) match { 93 | case None => throw new IllegalArgumentException("Object not found " + objId) 94 | case Some(o) => 95 | obj = o 96 | o.getClass 97 | } 98 | } 99 | 100 | val args = readArgs(numArgs, dis) 101 | 102 | val methods = cls.getMethods 103 | val selectedMethods = methods.filter(m => m.getName == methodName) 104 | if (selectedMethods.length > 0) { 105 | val index = findMatchedSignature( 106 | selectedMethods.map(_.getParameterTypes), 107 | args) 108 | 109 | if (index.isEmpty) { 110 | logWarning(s"cannot find matching method ${cls}.$methodName. " 111 | + s"Candidates are:") 112 | selectedMethods.foreach { method => 113 | logWarning(s"$methodName(${method.getParameterTypes.mkString(",")})") 114 | } 115 | throw new Exception(s"No matched method found for $cls.$methodName") 116 | } 117 | 118 | val ret = selectedMethods(index.get).invoke(obj, args : _*) 119 | 120 | // Write status bit 121 | writeInt(dos, 0) 122 | writeObject(dos, ret.asInstanceOf[AnyRef]) 123 | } else if (methodName == "") { 124 | // methodName should be "" for constructor 125 | val ctors = cls.getConstructors 126 | val index = findMatchedSignature( 127 | ctors.map(_.getParameterTypes), 128 | args) 129 | 130 | if (index.isEmpty) { 131 | logWarning(s"cannot find matching constructor for ${cls}. " 132 | + s"Candidates are:") 133 | ctors.foreach { ctor => 134 | logWarning(s"$cls(${ctor.getParameterTypes.mkString(",")})") 135 | } 136 | throw new Exception(s"No matched constructor found for $cls") 137 | } 138 | 139 | val obj = ctors(index.get).newInstance(args : _*) 140 | 141 | writeInt(dos, 0) 142 | writeObject(dos, obj.asInstanceOf[AnyRef]) 143 | } else { 144 | throw new IllegalArgumentException("invalid method " + methodName + " for object " + objId) 145 | } 146 | } catch { 147 | case e: Exception => 148 | logError(s"$methodName on $objId failed") 149 | writeInt(dos, -1) 150 | // Writing the error message of the cause for the exception. This will be returned 151 | // to user in the R process. 152 | writeString(dos, Utils.exceptionString(e.getCause)) 153 | } 154 | } 155 | 156 | // Read a number of arguments from the data input stream 157 | def readArgs(numArgs: Int, dis: DataInputStream): Array[java.lang.Object] = { 158 | (0 until numArgs).map { _ => 159 | readObject(dis) 160 | }.toArray 161 | } 162 | 163 | // Find a matching method signature in an array of signatures of constructors 164 | // or methods of the same name according to the passed arguments. Arguments 165 | // may be converted in order to match a signature. 166 | // 167 | // Note that in Java reflection, constructors and normal methods are of different 168 | // classes, and share no parent class that provides methods for reflection uses. 169 | // There is no unified way to handle them in this function. 
So an array of signatures 170 | // is passed in instead of an array of candidate constructors or methods. 171 | // 172 | // Returns an Option[Int] which is the index of the matched signature in the array. 173 | def findMatchedSignature( 174 | parameterTypesOfMethods: Array[Array[Class[_]]], 175 | args: Array[Object]): Option[Int] = { 176 | val numArgs = args.length 177 | 178 | for (index <- 0 until parameterTypesOfMethods.length) { 179 | val parameterTypes = parameterTypesOfMethods(index) 180 | 181 | if (parameterTypes.length == numArgs) { 182 | var argMatched = true 183 | var i = 0 184 | while (i < numArgs && argMatched) { 185 | val parameterType = parameterTypes(i) 186 | 187 | if (parameterType == classOf[Seq[Any]] && args(i).getClass.isArray) { 188 | // The case that the parameter type is a Scala Seq and the argument 189 | // is a Java array is considered matching. The array will be converted 190 | // to a Seq later if this method is matched. 191 | } else { 192 | var parameterWrapperType = parameterType 193 | 194 | // Convert native parameters to Object types as args is Array[Object] here 195 | if (parameterType.isPrimitive) { 196 | parameterWrapperType = parameterType match { 197 | case java.lang.Integer.TYPE => classOf[java.lang.Integer] 198 | case java.lang.Long.TYPE => classOf[java.lang.Integer] 199 | case java.lang.Double.TYPE => classOf[java.lang.Double] 200 | case java.lang.Boolean.TYPE => classOf[java.lang.Boolean] 201 | case _ => parameterType 202 | } 203 | } 204 | if ((parameterType.isPrimitive || args(i) != null) && 205 | !parameterWrapperType.isInstance(args(i))) { 206 | argMatched = false 207 | } 208 | } 209 | 210 | i = i + 1 211 | } 212 | 213 | if (argMatched) { 214 | // Convert args if needed 215 | val parameterTypes = parameterTypesOfMethods(index) 216 | 217 | (0 until numArgs).map { i => 218 | if (parameterTypes(i) == classOf[Seq[Any]] && args(i).getClass.isArray) { 219 | // Convert a Java array to scala Seq 220 | args(i) = args(i).asInstanceOf[Array[_]].toSeq 221 | } 222 | } 223 | 224 | return Some(index) 225 | } 226 | } 227 | } 228 | None 229 | } 230 | } 231 | 232 | /** 233 | * Helper singleton that tracks JVM objects returned to R. 234 | * This is useful for referencing these objects in RPC calls. 235 | */ 236 | object JVMObjectTracker { 237 | 238 | private[this] val objMap = new HashMap[String, Object] 239 | 240 | private[this] var objCounter: Int = 0 241 | 242 | def getObject(id: String): Object = { 243 | objMap(id) 244 | } 245 | 246 | def get(id: String): Option[Object] = { 247 | objMap.get(id) 248 | } 249 | 250 | def put(obj: Object): String = { 251 | val objId = objCounter.toString 252 | objCounter = objCounter + 1 253 | objMap.put(objId, obj) 254 | objId 255 | } 256 | 257 | def remove(id: String): Option[Object] = { 258 | objMap.remove(id) 259 | } 260 | } 261 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /R/shell.R: -------------------------------------------------------------------------------- 1 | #' Start the Spark R Shell 2 | #' 3 | #' @param master Spark cluster URL to connect to. Use \code{"local"} to connect to a local 4 | #' instance of Spark 5 | #' @param spark_home Spark home directory (defaults to SPARK_HOME environment variable) 6 | #' @param spark_version Spark version; if not specified, the version is taken from SPARK_HOME 7 | #' @param app_name Application name to be used while running in the Spark cluster 8 | #' @param config Named character vector of spark.* options 9 | #' @param jars Paths to Jar files to include 10 | #' @param packages Spark packages to include 11 | #' @param extensions Extension packages to include dependencies for 12 | #' (see \code{\link{spark_dependency}}). 13 | #' @param environment Environment variables to set 14 | #' @param shell_args Additional command line arguments for spark_shell 15 | #' @param sc \code{spark_connection} 16 | #' 17 | #' @return \code{spark_connection} object 18 | #' 19 | #' @export 20 | start_shell <- function(master, 21 | spark_home = Sys.getenv("SPARK_HOME"), 22 | spark_version = NULL, 23 | app_name = "sparkapi", 24 | config = list(), 25 | extensions = sparkapi::registered_extensions(), 26 | jars = NULL, 27 | packages = NULL, 28 | environment = NULL, 29 | shell_args = NULL) { 30 | # read app jar through config; this allows "sparkr-shell" to test the sparkr backend 31 | app_jar <- spark_config_value(config, "sparkapi.app.jar", NULL) 32 | if (is.null(app_jar)) { 33 | app_jar <- shQuote(normalizePath(system.file(file.path("java", "sparkapi-1.6.1.jar"), package = "sparkapi"), 34 | mustWork = FALSE)) 35 | shell_args <- c(shell_args, "--class", "sparkapi.Backend") 36 | } 37 | 38 | # validate and normalize spark_home 39 | if (!nzchar(spark_home)) 40 | stop("No spark_home specified (defaults to SPARK_HOME environment variable).") 41 | if (!dir.exists(spark_home)) 42 | stop("SPARK_HOME directory '", spark_home, "' not found") 43 | spark_home <- normalizePath(spark_home) 44 | 45 | # set SPARK_HOME into child process environment 46 | if (is.null(environment)) 47 | environment <- list() 48 | environment$SPARK_HOME <- spark_home 49 | 50 | # provide empty config if necessary 51 | if (is.null(config)) 52 | config <- list() 53 | 54 | # determine path to spark_submit 55 | spark_submit <- switch(.Platform$OS.type, 56 | unix = "spark-submit", 57 | windows = "spark-submit.cmd" 58 | ) 59 | spark_submit_path <- normalizePath(file.path(spark_home, "bin", spark_submit)) 60 | 61 | # resolve extensions 62 | spark_version <- numeric_version( 63 | ifelse(is.null(spark_version), 64 | spark_version_from_home(spark_home), 65 | gsub("[-_a-zA-Z]", "", spark_version) 66 | ) 67 | ) 68 | scala_version <- numeric_version("2.10") 69 | extensions <- spark_dependencies_from_extensions(spark_version, scala_version, extensions) 70 | 71 | # combine passed jars and packages with extensions 72 | jars <- normalizePath(unique(c(jars, extensions$jars))) 73 |
packages <- unique(c(packages, extensions$packages)) 74 | 75 | # add jars to arguments 76 | if (length(jars) > 0) { 77 | shell_args <- c(shell_args, "--jars", paste(shQuote(jars), collapse=",")) 78 | } 79 | 80 | # add packages to arguments 81 | if (length(packages) > 0) { 82 | shell_args <- c(shell_args, "--packages", paste(shQuote(packages), collapse=",")) 83 | } 84 | 85 | # add sparkr-shell to args 86 | shell_args <- c(shell_args, app_jar) 87 | 88 | # create temporary file for shell ports output and add it to the args 89 | shell_output_path <- spark_config_value(config, 90 | "sparkapi.ports.file", 91 | normalizePath(tempfile(fileext = ".out"), 92 | mustWork = FALSE)) 93 | 94 | on.exit(unlink(shell_output_path)) 95 | shell_args <- c(shell_args, shell_output_path) 96 | 97 | # create temp file for stdout and stderr 98 | output_file <- tempfile(fileext = "_spark.log") 99 | error_file <- tempfile(fileext = "_spark.err") 100 | 101 | # start the shell (w/ specified additional environment variables) 102 | env <- unlist(environment) 103 | withr::with_envvar(env, { 104 | if (.Platform$OS.type == "windows") { 105 | shell(paste( 106 | spark_submit_path, 107 | paste(shell_args, collapse = " "), 108 | ">", 109 | output_file, 110 | "2>", 111 | error_file 112 | ), 113 | wait = FALSE) 114 | } 115 | else { 116 | system2(spark_submit_path, 117 | args = shell_args, 118 | stdout = output_file, 119 | stderr = output_file, 120 | wait = FALSE) 121 | } 122 | }) 123 | 124 | # wait for the shell output file 125 | waitSeconds <- spark_config_value(config, "sparkapi.ports.wait.seconds", 100) 126 | if (!wait_file_exists(shell_output_path, waitSeconds)) { 127 | stop(paste( 128 | "Failed to launch Spark shell. Ports file does not exist.\n", 129 | " Path: ", spark_submit_path, "\n", 130 | " Parameters: ", paste(shell_args, collapse = ", "), "\n", 131 | " \n", 132 | paste(readLines(output_file), collapse = "\n"), 133 | if (file.exists(error_file)) paste(readLines(error_file), collapse = "\n") else "", 134 | sep = "")) 135 | } 136 | 137 | # read the shell output file 138 | shell_file <- read_shell_file(shell_output_path) 139 | 140 | # bind to the monitor and backend ports 141 | tryCatch({ 142 | monitor <- socketConnection(port = shell_file$monitorPort) 143 | }, error = function(err) { 144 | stop("Failed to open connection to monitor") 145 | }) 146 | 147 | tryCatch({ 148 | backend <- socketConnection(host = "localhost", 149 | port = shell_file$backendPort, 150 | server = FALSE, 151 | blocking = TRUE, 152 | open = "wb", 153 | timeout = 6000) 154 | }, error = function(err) { 155 | stop("Failed to open connection to backend") 156 | }) 157 | 158 | # create the shell connection 159 | sc <- structure(class = c("spark_connection", "spark_shell_connection"), list( 160 | # spark_connection 161 | master = master, 162 | spark_home = spark_home, 163 | app_name = app_name, 164 | config = config, 165 | # spark_shell_connection 166 | backend = backend, 167 | monitor = monitor, 168 | output_file = output_file 169 | )) 170 | 171 | # stop shell on R exit 172 | reg.finalizer(baseenv(), function(x) { 173 | if (connection_is_open(sc)) { 174 | stop_shell(sc) 175 | } 176 | }, onexit = TRUE) 177 | 178 | # initialize and return the connection 179 | initialize_connection(sc) 180 | } 181 | 182 | 183 | #' Stop the Spark R Shell 184 | #' 185 | #' @rdname start_shell 186 | #' 187 | #' @export 188 | stop_shell <- function(sc) { 189 | invoke_method(sc, 190 | FALSE, 191 | "Handler", 192 | "stopBackend") 193 | 194 | close(sc$backend) 195 | 
close(sc$monitor) 196 | } 197 | 198 | #' @export 199 | connection_is_open.spark_shell_connection <- function(sc) { 200 | bothOpen <- FALSE 201 | if (!identical(sc, NULL)) { 202 | tryCatch({ 203 | bothOpen <- isOpen(sc$backend) && isOpen(sc$monitor) 204 | }, error = function(e) { 205 | }) 206 | } 207 | bothOpen 208 | } 209 | 210 | #' @export 211 | spark_log.spark_shell_connection <- function(sc, n = 100, ...) { 212 | log <- file(sc$output_file) 213 | lines <- readLines(log) 214 | close(log) 215 | 216 | if (!is.null(n)) 217 | linesLog <- utils::tail(lines, n = n) 218 | else 219 | linesLog <- lines 220 | attr(linesLog, "class") <- "spark_log" 221 | 222 | linesLog 223 | } 224 | 225 | #' @export 226 | spark_web.spark_shell_connection <- function(sc, ...) { 227 | lines <- spark_log(sc, n = 200) 228 | 229 | uiLine <- grep("Started SparkUI at ", lines, perl=TRUE, value=TRUE) 230 | if (length(uiLine) > 0) { 231 | matches <- regexpr("http://.*", uiLine, perl=TRUE) 232 | match <- regmatches(uiLine, matches) 233 | if (length(match) > 0) { 234 | return(structure(match, class = "spark_web_url")) 235 | } 236 | } 237 | 238 | uiLine <- grep(".*Bound SparkUI to.*", lines, perl=TRUE, value=TRUE) 239 | if (length(uiLine) > 0) { 240 | matches <- regexec(".*Bound SparkUI to.*and started at (http.*)", uiLine, perl=TRUE) 241 | match <- regmatches(uiLine, matches) 242 | if (length(match) > 0 && length(match[[1]]) > 1) { 243 | return(structure(match[[1]][[2]], class = "spark_web_url")) 244 | } 245 | } 246 | 247 | warning("Spark UI URL not found in logs, attempting to guess.") 248 | structure("http://localhost:4040", class = "spark_web_url") 249 | } 250 | 251 | #' @export 252 | invoke_method.spark_shell_connection <- function(sc, static, object, method, ...) 253 | { 254 | if (is.null(sc)) { 255 | stop("The connection is no longer valid.") 256 | } 257 | 258 | # if the object is a jobj then get its id 259 | if (inherits(object, "spark_jobj")) 260 | object <- object$id 261 | 262 | rc <- rawConnection(raw(), "r+") 263 | writeBoolean(rc, static) 264 | writeString(rc, object) 265 | writeString(rc, method) 266 | 267 | args <- list(...) 268 | writeInt(rc, length(args)) 269 | writeArgs(rc, args) 270 | bytes <- rawConnectionValue(rc) 271 | close(rc) 272 | 273 | rc <- rawConnection(raw(0), "r+") 274 | writeInt(rc, length(bytes)) 275 | writeBin(bytes, rc) 276 | con <- rawConnectionValue(rc) 277 | close(rc) 278 | 279 | backend <- sc$backend 280 | writeBin(con, backend) 281 | 282 | returnStatus <- readInt(backend) 283 | if (length(returnStatus) == 0) 284 | stop("No status is returned. Spark R backend might have failed.") 285 | if (returnStatus != 0) { 286 | # get error message from backend and report to R 287 | msg <- readString(backend) 288 | if (nzchar(msg)) 289 | stop(msg, call. = FALSE) 290 | else { 291 | # read the spark log 292 | msg <- read_spark_log_error(sc) 293 | stop(msg, call. = FALSE) 294 | } 295 | } 296 | 297 | object <- readObject(backend) 298 | attach_connection(object, sc) 299 | } 300 | 301 | #' @export 302 | print_jobj.spark_shell_connection <- function(sc, jobj, ...)
{ 303 | if (connection_is_open(sc)) { 304 | info <- jobj_info(jobj) 305 | fmt <- "<jobj[%s]>\n  %s\n  %s\n" 306 | cat(sprintf(fmt, jobj$id, info$class, info$repr)) 307 | } else { 308 | fmt <- "<jobj[%s]>\n  <detached>" 309 | cat(sprintf(fmt, jobj$id)) 310 | } 311 | } 312 | 313 | 314 | attach_connection <- function(jobj, connection) { 315 | 316 | if (inherits(jobj, "spark_jobj")) { 317 | jobj$connection <- connection 318 | } 319 | else if (is.list(jobj) || inherits(jobj, "struct")) { 320 | jobj <- lapply(jobj, function(e) { 321 | attach_connection(e, connection) 322 | }) 323 | } 324 | else if (is.environment(jobj)) { 325 | jobj <- eapply(jobj, function(e) { 326 | attach_connection(e, connection) 327 | }) 328 | } 329 | 330 | jobj 331 | } 332 | 333 | 334 | read_shell_file <- function(shell_file) { 335 | 336 | shellOutputFile <- file(shell_file, open = "rb") 337 | backendPort <- readInt(shellOutputFile) 338 | monitorPort <- readInt(shellOutputFile) 339 | rLibraryPath <- readString(shellOutputFile) 340 | close(shellOutputFile) 341 | 342 | success <- length(backendPort) > 0 && backendPort > 0 && 343 | length(monitorPort) > 0 && monitorPort > 0 && 344 | length(rLibraryPath) == 1 345 | 346 | if (!success) 347 | stop("Invalid values found in shell output") 348 | 349 | list( 350 | backendPort = backendPort, 351 | monitorPort = monitorPort, 352 | rLibraryPath = rLibraryPath 353 | ) 354 | } 355 | 356 | 357 | wait_file_exists <- function(filename, seconds) { 358 | retries <- seconds * 10 359 | while (!file.exists(filename) && retries >= 0) { 360 | retries <- retries - 1 361 | Sys.sleep(0.1) 362 | } 363 | 364 | file.exists(filename) 365 | } 366 | 367 | read_spark_log_error <- function(sc) { 368 | # if no error message was reported, return information 369 | # from the Spark logs instead: all entries that share 370 | # the most recent timestamp 371 | msg <- "failed to invoke spark command (unknown reason)" 372 | try(silent = TRUE, { 373 | log <- sc$output_file 374 | splat <- strsplit(log, "\\s+", perl = TRUE) 375 | n <- length(splat) 376 | timestamp <- splat[[n]][[2]] 377 | regex <- paste("\\b", timestamp, "\\b", sep = "") 378 | entries <- grep(regex, log, perl = TRUE, value = TRUE) 379 | pasted <- paste(entries, collapse = "\n") 380 | msg <- paste("failed to invoke spark command", pasted, sep = "\n") 381 | }) 382 | msg 383 | } 384 | 385 | spark_config_value <- function(config, name, default = NULL) { 386 | if (is.null(config[[name]])) default else config[[name]] 387 | } 388 | -------------------------------------------------------------------------------- /inst/scala/serializer.scala: -------------------------------------------------------------------------------- 1 | package sparkapi 2 | 3 | import java.io.{DataInputStream, DataOutputStream} 4 | import java.nio.charset.StandardCharsets 5 | import java.sql.{Date, Time, Timestamp} 6 | 7 | import scala.collection.JavaConverters._ 8 | import scala.collection.mutable.WrappedArray 9 | 10 | object Serializer { 11 | type ReadObject = (DataInputStream, Char) => Object 12 | type WriteObject = (DataOutputStream, Object) => Boolean 13 | 14 | var sqlSerDe: (ReadObject, WriteObject) = _ 15 | 16 | def registerSqlSerDe(sqlSerDe: (ReadObject, WriteObject)): Unit = { 17 | this.sqlSerDe = sqlSerDe 18 | } 19 | 20 | // Type mapping from R to Java 21 | // 22 | // NULL -> void 23 | // integer -> Int 24 | // character -> String 25 | // logical -> Boolean 26 | // double, numeric -> Double 27 | // raw -> Array[Byte] 28 | // Date -> Date 29 | // POSIXlt/POSIXct -> Time 30 | // 31 | // list[T] -> Array[T], where T
is one of above mentioned types 32 | // environment -> Map[String, T], where T is a native type 33 | // jobj -> Object, where jobj is an object created in the backend 34 | 35 | def readObjectType(dis: DataInputStream): Char = { 36 | dis.readByte().toChar 37 | } 38 | 39 | def readObject(dis: DataInputStream): Object = { 40 | val dataType = readObjectType(dis) 41 | readTypedObject(dis, dataType) 42 | } 43 | 44 | def readTypedObject( 45 | dis: DataInputStream, 46 | dataType: Char): Object = { 47 | dataType match { 48 | case 'n' => null 49 | case 'i' => new java.lang.Integer(readInt(dis)) 50 | case 'd' => new java.lang.Double(readDouble(dis)) 51 | case 'b' => new java.lang.Boolean(readBoolean(dis)) 52 | case 'c' => readString(dis) 53 | case 'e' => readMap(dis) 54 | case 'r' => readBytes(dis) 55 | case 'a' => readArray(dis) 56 | case 'l' => readList(dis) 57 | case 'D' => readDate(dis) 58 | case 't' => readTime(dis) 59 | case 'j' => JVMObjectTracker.getObject(readString(dis)) 60 | case _ => 61 | if (sqlSerDe == null || sqlSerDe._1 == null) { 62 | throw new IllegalArgumentException (s"Invalid type $dataType") 63 | } else { 64 | val obj = (sqlSerDe._1)(dis, dataType) 65 | if (obj == null) { 66 | throw new IllegalArgumentException (s"Invalid type $dataType") 67 | } else { 68 | obj 69 | } 70 | } 71 | } 72 | } 73 | 74 | def readBytes(in: DataInputStream): Array[Byte] = { 75 | val len = readInt(in) 76 | val out = new Array[Byte](len) 77 | val bytesRead = in.readFully(out) 78 | out 79 | } 80 | 81 | def readInt(in: DataInputStream): Int = { 82 | in.readInt() 83 | } 84 | 85 | def readDouble(in: DataInputStream): Double = { 86 | in.readDouble() 87 | } 88 | 89 | def readStringBytes(in: DataInputStream, len: Int): String = { 90 | val bytes = new Array[Byte](len) 91 | in.readFully(bytes) 92 | assert(bytes(len - 1) == 0) 93 | val str = new String(bytes.dropRight(1), StandardCharsets.UTF_8) 94 | str 95 | } 96 | 97 | def readString(in: DataInputStream): String = { 98 | val len = in.readInt() 99 | readStringBytes(in, len) 100 | } 101 | 102 | def readBoolean(in: DataInputStream): Boolean = { 103 | val intVal = in.readInt() 104 | if (intVal == 0) false else true 105 | } 106 | 107 | def readDate(in: DataInputStream): Date = { 108 | Date.valueOf(readString(in)) 109 | } 110 | 111 | def readTime(in: DataInputStream): Timestamp = { 112 | val seconds = in.readDouble() 113 | val sec = Math.floor(seconds).toLong 114 | val t = new Timestamp(sec * 1000L) 115 | t.setNanos(((seconds - sec) * 1e9).toInt) 116 | t 117 | } 118 | 119 | def readBytesArr(in: DataInputStream): Array[Array[Byte]] = { 120 | val len = readInt(in) 121 | (0 until len).map(_ => readBytes(in)).toArray 122 | } 123 | 124 | def readIntArr(in: DataInputStream): Array[Int] = { 125 | val len = readInt(in) 126 | (0 until len).map(_ => readInt(in)).toArray 127 | } 128 | 129 | def readDoubleArr(in: DataInputStream): Array[Double] = { 130 | val len = readInt(in) 131 | (0 until len).map(_ => readDouble(in)).toArray 132 | } 133 | 134 | def readBooleanArr(in: DataInputStream): Array[Boolean] = { 135 | val len = readInt(in) 136 | (0 until len).map(_ => readBoolean(in)).toArray 137 | } 138 | 139 | def readStringArr(in: DataInputStream): Array[String] = { 140 | val len = readInt(in) 141 | (0 until len).map(_ => readString(in)).toArray 142 | } 143 | 144 | // All elements of an array must be of the same type 145 | def readArray(dis: DataInputStream): Array[_] = { 146 | val arrType = readObjectType(dis) 147 | arrType match { 148 | case 'i' => readIntArr(dis) 149 | case 'c' => 
readStringArr(dis) 150 | case 'd' => readDoubleArr(dis) 151 | case 'b' => readBooleanArr(dis) 152 | case 'j' => readStringArr(dis).map(x => JVMObjectTracker.getObject(x)) 153 | case 'r' => readBytesArr(dis) 154 | case 'a' => 155 | val len = readInt(dis) 156 | (0 until len).map(_ => readArray(dis)).toArray 157 | case 'l' => 158 | val len = readInt(dis) 159 | (0 until len).map(_ => readList(dis)).toArray 160 | case _ => 161 | if (sqlSerDe == null || sqlSerDe._1 == null) { 162 | throw new IllegalArgumentException (s"Invalid array type $arrType") 163 | } else { 164 | val len = readInt(dis) 165 | (0 until len).map { _ => 166 | val obj = (sqlSerDe._1)(dis, arrType) 167 | if (obj == null) { 168 | throw new IllegalArgumentException (s"Invalid array type $arrType") 169 | } else { 170 | obj 171 | } 172 | }.toArray 173 | } 174 | } 175 | } 176 | 177 | // Each element of a list can be of different type. They are all represented 178 | // as Object on JVM side 179 | def readList(dis: DataInputStream): Array[Object] = { 180 | val len = readInt(dis) 181 | (0 until len).map(_ => readObject(dis)).toArray 182 | } 183 | 184 | def readMap(in: DataInputStream): java.util.Map[Object, Object] = { 185 | val len = readInt(in) 186 | if (len > 0) { 187 | // Keys is an array of String 188 | val keys = readArray(in).asInstanceOf[Array[Object]] 189 | val values = readList(in) 190 | 191 | keys.zip(values).toMap.asJava 192 | } else { 193 | new java.util.HashMap[Object, Object]() 194 | } 195 | } 196 | 197 | // Methods to write out data from Java to R 198 | // 199 | // Type mapping from Java to R 200 | // 201 | // void -> NULL 202 | // Int -> integer 203 | // String -> character 204 | // Boolean -> logical 205 | // Float -> double 206 | // Double -> double 207 | // Decimal -> double 208 | // Long -> double 209 | // Array[Byte] -> raw 210 | // Date -> Date 211 | // Time -> POSIXct 212 | // 213 | // Array[T] -> list() 214 | // Object -> jobj 215 | 216 | def writeType(dos: DataOutputStream, typeStr: String): Unit = { 217 | typeStr match { 218 | case "void" => dos.writeByte('n') 219 | case "character" => dos.writeByte('c') 220 | case "double" => dos.writeByte('d') 221 | case "integer" => dos.writeByte('i') 222 | case "logical" => dos.writeByte('b') 223 | case "date" => dos.writeByte('D') 224 | case "time" => dos.writeByte('t') 225 | case "raw" => dos.writeByte('r') 226 | // Array of primitive types 227 | case "array" => dos.writeByte('a') 228 | // Array of objects 229 | case "list" => dos.writeByte('l') 230 | case "map" => dos.writeByte('e') 231 | case "jobj" => dos.writeByte('j') 232 | case _ => throw new IllegalArgumentException(s"Invalid type $typeStr") 233 | } 234 | } 235 | 236 | private def writeKeyValue(dos: DataOutputStream, key: Object, value: Object): Unit = { 237 | if (key == null) { 238 | throw new IllegalArgumentException("Key in map can't be null.") 239 | } else if (!key.isInstanceOf[String]) { 240 | throw new IllegalArgumentException(s"Invalid map key type: ${key.getClass.getName}") 241 | } 242 | 243 | writeString(dos, key.asInstanceOf[String]) 244 | writeObject(dos, value) 245 | } 246 | 247 | def writeObject(dos: DataOutputStream, obj: Object): Unit = { 248 | if (obj == null) { 249 | writeType(dos, "void") 250 | } else { 251 | // Convert ArrayType collected from DataFrame to Java array 252 | // Collected data of ArrayType from a DataFrame is observed to be of 253 | // type "scala.collection.mutable.WrappedArray" 254 | val value = 255 | if (obj.isInstanceOf[WrappedArray[_]]) { 256 | 
obj.asInstanceOf[WrappedArray[_]].toArray 257 | } else { 258 | obj 259 | } 260 | 261 | value match { 262 | case v: java.lang.Character => 263 | writeType(dos, "character") 264 | writeString(dos, v.toString) 265 | case v: java.lang.String => 266 | writeType(dos, "character") 267 | writeString(dos, v) 268 | case v: java.lang.Long => 269 | writeType(dos, "double") 270 | writeDouble(dos, v.toDouble) 271 | case v: java.lang.Float => 272 | writeType(dos, "double") 273 | writeDouble(dos, v.toDouble) 274 | case v: java.math.BigDecimal => 275 | writeType(dos, "double") 276 | writeDouble(dos, scala.math.BigDecimal(v).toDouble) 277 | case v: java.lang.Double => 278 | writeType(dos, "double") 279 | writeDouble(dos, v) 280 | case v: java.lang.Byte => 281 | writeType(dos, "integer") 282 | writeInt(dos, v.toInt) 283 | case v: java.lang.Short => 284 | writeType(dos, "integer") 285 | writeInt(dos, v.toInt) 286 | case v: java.lang.Integer => 287 | writeType(dos, "integer") 288 | writeInt(dos, v) 289 | case v: java.lang.Boolean => 290 | writeType(dos, "logical") 291 | writeBoolean(dos, v) 292 | case v: java.sql.Date => 293 | writeType(dos, "date") 294 | writeDate(dos, v) 295 | case v: java.sql.Time => 296 | writeType(dos, "time") 297 | writeTime(dos, v) 298 | case v: java.sql.Timestamp => 299 | writeType(dos, "time") 300 | writeTime(dos, v) 301 | 302 | // Handle arrays 303 | 304 | // Array of primitive types 305 | 306 | // Special handling for byte array 307 | case v: Array[Byte] => 308 | writeType(dos, "raw") 309 | writeBytes(dos, v) 310 | 311 | case v: Array[Char] => 312 | writeType(dos, "array") 313 | writeStringArr(dos, v.map(_.toString)) 314 | case v: Array[Short] => 315 | writeType(dos, "array") 316 | writeIntArr(dos, v.map(_.toInt)) 317 | case v: Array[Int] => 318 | writeType(dos, "array") 319 | writeIntArr(dos, v) 320 | case v: Array[Long] => 321 | writeType(dos, "array") 322 | writeDoubleArr(dos, v.map(_.toDouble)) 323 | case v: Array[Float] => 324 | writeType(dos, "array") 325 | writeDoubleArr(dos, v.map(_.toDouble)) 326 | case v: Array[Double] => 327 | writeType(dos, "array") 328 | writeDoubleArr(dos, v) 329 | case v: Array[Boolean] => 330 | writeType(dos, "array") 331 | writeBooleanArr(dos, v) 332 | 333 | // Array of objects, null objects use "void" type 334 | case v: Array[Object] => 335 | writeType(dos, "list") 336 | writeInt(dos, v.length) 337 | v.foreach(elem => writeObject(dos, elem)) 338 | 339 | // Handle Properties 340 | // This must be above the case java.util.Map below. 
341 | // (Properties implements Map and will be serialized as map otherwise) 342 | case v: java.util.Properties => 343 | writeType(dos, "jobj") 344 | writeJObj(dos, value) 345 | 346 | // Handle map 347 | case v: java.util.Map[_, _] => 348 | writeType(dos, "map") 349 | writeInt(dos, v.size) 350 | val iter = v.entrySet.iterator 351 | while(iter.hasNext) { 352 | val entry = iter.next 353 | val key = entry.getKey 354 | val value = entry.getValue 355 | 356 | writeKeyValue(dos, key.asInstanceOf[Object], value.asInstanceOf[Object]) 357 | } 358 | case v: scala.collection.Map[_, _] => 359 | writeType(dos, "map") 360 | writeInt(dos, v.size) 361 | v.foreach { case (key, value) => 362 | writeKeyValue(dos, key.asInstanceOf[Object], value.asInstanceOf[Object]) 363 | } 364 | 365 | case _ => 366 | if (sqlSerDe == null || sqlSerDe._2 == null || !(sqlSerDe._2)(dos, value)) { 367 | writeType(dos, "jobj") 368 | writeJObj(dos, value) 369 | } 370 | } 371 | } 372 | } 373 | 374 | def writeInt(out: DataOutputStream, value: Int): Unit = { 375 | out.writeInt(value) 376 | } 377 | 378 | def writeDouble(out: DataOutputStream, value: Double): Unit = { 379 | out.writeDouble(value) 380 | } 381 | 382 | def writeBoolean(out: DataOutputStream, value: Boolean): Unit = { 383 | val intValue = if (value) 1 else 0 384 | out.writeInt(intValue) 385 | } 386 | 387 | def writeDate(out: DataOutputStream, value: Date): Unit = { 388 | writeString(out, value.toString) 389 | } 390 | 391 | def writeTime(out: DataOutputStream, value: Time): Unit = { 392 | out.writeDouble(value.getTime.toDouble / 1000.0) 393 | } 394 | 395 | def writeTime(out: DataOutputStream, value: Timestamp): Unit = { 396 | out.writeDouble((value.getTime / 1000).toDouble + value.getNanos.toDouble / 1e9) 397 | } 398 | 399 | def writeString(out: DataOutputStream, value: String): Unit = { 400 | val utf8 = value.getBytes(StandardCharsets.UTF_8) 401 | val len = utf8.length 402 | out.writeInt(len) 403 | out.write(utf8, 0, len) 404 | } 405 | 406 | def writeBytes(out: DataOutputStream, value: Array[Byte]): Unit = { 407 | out.writeInt(value.length) 408 | out.write(value) 409 | } 410 | 411 | def writeJObj(out: DataOutputStream, value: Object): Unit = { 412 | val objId = JVMObjectTracker.put(value) 413 | writeString(out, objId) 414 | } 415 | 416 | def writeIntArr(out: DataOutputStream, value: Array[Int]): Unit = { 417 | writeType(out, "integer") 418 | out.writeInt(value.length) 419 | value.foreach(v => out.writeInt(v)) 420 | } 421 | 422 | def writeDoubleArr(out: DataOutputStream, value: Array[Double]): Unit = { 423 | writeType(out, "double") 424 | out.writeInt(value.length) 425 | value.foreach(v => out.writeDouble(v)) 426 | } 427 | 428 | def writeBooleanArr(out: DataOutputStream, value: Array[Boolean]): Unit = { 429 | writeType(out, "logical") 430 | out.writeInt(value.length) 431 | value.foreach(v => writeBoolean(out, v)) 432 | } 433 | 434 | def writeStringArr(out: DataOutputStream, value: Array[String]): Unit = { 435 | writeType(out, "character") 436 | out.writeInt(value.length) 437 | value.foreach(v => writeString(out, v)) 438 | } 439 | 440 | } 441 | 442 | object SerializationFormats { 443 | val BYTE = "byte" 444 | val STRING = "string" 445 | val ROW = "row" 446 | } 447 | --------------------------------------------------------------------------------