├── inst ├── staticdocs │ └── index.r ├── scala │ ├── sparklyr-1.6.1.md5 │ ├── logging.scala │ ├── utils.scala │ ├── backend.scala │ ├── handler.scala │ └── serializer.scala ├── java │ └── sparkapi-1.6.1.jar └── tools │ └── compile-scala.R ├── tests ├── testthat │ ├── test-invoke.R │ └── test-config.R └── testthat.R ├── .gitignore ├── R ├── globals.R ├── magrittr.R ├── dataframe.R ├── invoke.R ├── hive.R ├── version.R ├── extensions.R ├── compile.R ├── jobj.R ├── connection.R ├── deserialize.R ├── serialize.R └── shell.R ├── .Rbuildignore ├── README.md ├── man ├── pipe.Rd ├── spark_web.Rd ├── connection_is_open.Rd ├── print_jobj.Rd ├── hive_context.Rd ├── spark_compile.Rd ├── spark_context.Rd ├── java_context.Rd ├── spark_dataframe.Rd ├── spark_dependency.Rd ├── spark_jobj.Rd ├── spark_connection.Rd ├── spark_log.Rd ├── spark_version.Rd ├── invoke.Rd ├── connection_config.Rd ├── invoke_method.Rd ├── register_extension.Rd └── start_shell.Rd ├── sparkapi.Rproj ├── DESCRIPTION ├── configure.R ├── NAMESPACE └── LICENSE /inst/staticdocs/index.r: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /inst/scala/sparklyr-1.6.1.md5: -------------------------------------------------------------------------------- 1 | 54904ecd4f11cae8eaeb0fad3928d5ed -------------------------------------------------------------------------------- /tests/testthat/test-invoke.R: -------------------------------------------------------------------------------- 1 | 2 | library(testthat) 3 | 4 | context("Invoke") 5 | 6 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(sparkapi) 3 | 4 | test_check("sparkapi") 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .DS_Store 6 | configure 7 | -------------------------------------------------------------------------------- /inst/java/sparkapi-1.6.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkapi/HEAD/inst/java/sparkapi-1.6.1.jar -------------------------------------------------------------------------------- /R/globals.R: -------------------------------------------------------------------------------- 1 | 2 | .globals <- new.env(parent = emptyenv()) 3 | .globals$extension_packages <- character() 4 | 5 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | README.Rmd 4 | ^configure$ 5 | ^configure\.win$ 6 | ^configure\.R$ 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sparkapi 2 | 3 | This project was merged back to [sparklyr](http://spark.rstudio.com). Please visit http://spark.rstudio.com/extensions.html for information. 
4 | -------------------------------------------------------------------------------- /inst/tools/compile-scala.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | spark_home <- file.path(rappdirs::app_dir("spark", "rstudio")$cache(), "spark-1.6.1-bin-hadoop2.6") 4 | spark_compile("sparkapi", spark_home = spark_home) 5 | -------------------------------------------------------------------------------- /R/magrittr.R: -------------------------------------------------------------------------------- 1 | 2 | #' Pipe operator 3 | #' 4 | #' See \code{\link[magrittr]{\%>\%}} for more details. 5 | #' 6 | #' @name %>% 7 | #' @rdname pipe 8 | #' @keywords internal 9 | #' @export 10 | #' @importFrom magrittr %>% 11 | #' @usage lhs \%>\% rhs 12 | NULL 13 | 14 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/magrittr.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \description{ 10 | See \code{\link[magrittr]{\%>\%}} for more details. 11 | } 12 | \keyword{internal} 13 | 14 | -------------------------------------------------------------------------------- /man/spark_web.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{spark_web} 4 | \alias{spark_web} 5 | \title{Open the Spark web interface} 6 | \usage{ 7 | spark_web(sc, ...) 8 | } 9 | \arguments{ 10 | \item{sc}{\code{spark_connection}} 11 | 12 | \item{...}{Unused (reserved for future use)} 13 | } 14 | \description{ 15 | Open the Spark web interface 16 | } 17 | 18 | -------------------------------------------------------------------------------- /man/connection_is_open.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{connection_is_open} 4 | \alias{connection_is_open} 5 | \title{Check whether the connection is open} 6 | \usage{ 7 | connection_is_open(sc) 8 | } 9 | \arguments{ 10 | \item{sc}{\code{spark_connection}} 11 | } 12 | \description{ 13 | Check whether the connection is open 14 | } 15 | \keyword{internal} 16 | 17 | -------------------------------------------------------------------------------- /sparkapi.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: No 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /man/print_jobj.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/jobj.R 3 | \name{print_jobj} 4 | \alias{print_jobj} 5 | \title{Generic method for print jobj for a connection type} 6 
| \usage{ 7 | print_jobj(sc, jobj, ...) 8 | } 9 | \arguments{ 10 | \item{sc}{\code{spark_connection} (used for type dispatch)} 11 | 12 | \item{jobj}{Object to print} 13 | } 14 | \description{ 15 | Generic method for print jobj for a connection type 16 | } 17 | \keyword{internal} 18 | 19 | -------------------------------------------------------------------------------- /man/hive_context.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{hive_context} 4 | \alias{hive_context} 5 | \title{Get the HiveContext associated with a connection} 6 | \usage{ 7 | hive_context(sc) 8 | } 9 | \arguments{ 10 | \item{sc}{Connection to get HiveContext from} 11 | } 12 | \value{ 13 | Reference to HiveContext 14 | } 15 | \description{ 16 | Get the HiveContext \code{spark_jobj} associated with a 17 | \code{spark_connection} 18 | } 19 | 20 | -------------------------------------------------------------------------------- /man/spark_compile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/compile.R 3 | \name{spark_compile} 4 | \alias{spark_compile} 5 | \title{Compiles Scala sources and packages them into a JAR file} 6 | \usage{ 7 | spark_compile(name, spark_home) 8 | } 9 | \arguments{ 10 | \item{name}{The name of the target jar} 11 | 12 | \item{spark_home}{Path to the Spark installation to compile against} 13 | } 14 | \description{ 15 | Compiles Scala sources and packages them into a JAR file 16 | } 17 | \keyword{internal} 18 | 19 | -------------------------------------------------------------------------------- /man/spark_context.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{spark_context} 4 | \alias{spark_context} 5 | \title{Get the SparkContext associated with a connection} 6 | \usage{ 7 | spark_context(sc) 8 | } 9 | \arguments{ 10 | \item{sc}{Connection to get SparkContext from} 11 | } 12 | \value{ 13 | Reference to SparkContext 14 | } 15 | \description{ 16 | Get the SparkContext \code{spark_jobj} associated with a 17 | \code{spark_connection} 18 | } 19 | 20 | -------------------------------------------------------------------------------- /man/java_context.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{java_context} 4 | \alias{java_context} 5 | \title{Get the JavaSparkContext associated with a connection} 6 | \usage{ 7 | java_context(sc) 8 | } 9 | \arguments{ 10 | \item{sc}{Connection to get JavaSparkContext from} 11 | } 12 | \value{ 13 | Reference to JavaSparkContext 14 | } 15 | \description{ 16 | Get the JavaSparkContext \code{spark_jobj} associated with a 17 | \code{spark_connection} 18 | } 19 | 20 | -------------------------------------------------------------------------------- /man/spark_dataframe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataframe.R 3 | \name{spark_dataframe} 4 | \alias{spark_dataframe} 5 | \title{Get the Spark DataFrame associated with an object} 6 | \usage{ 7 | spark_dataframe(x, ...) 
8 | } 9 | \arguments{ 10 | \item{x}{Object to get DataFrame from} 11 | 12 | \item{...}{Reserved for future use} 13 | } 14 | \value{ 15 | Reference to DataFrame 16 | } 17 | \description{ 18 | S3 method to get the Spark DataFrame associated with objects of 19 | various types. 20 | } 21 | 22 | -------------------------------------------------------------------------------- /man/spark_dependency.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extensions.R 3 | \name{spark_dependency} 4 | \alias{spark_dependency} 5 | \title{Define a Spark dependency} 6 | \usage{ 7 | spark_dependency(jars = NULL, packages = NULL) 8 | } 9 | \arguments{ 10 | \item{jars}{Character vector of full paths to JAR files} 11 | 12 | \item{packages}{Character vector of Spark package names} 13 | } 14 | \value{ 15 | An object of type `spark_dependency` 16 | } 17 | \description{ 18 | Define a Spark dependency consisting of a set of custom JARs and Spark packages. 19 | } 20 | 21 | -------------------------------------------------------------------------------- /man/spark_jobj.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/jobj.R 3 | \name{spark_jobj} 4 | \alias{spark_jobj} 5 | \title{Get the spark_jobj associated with an object} 6 | \usage{ 7 | spark_jobj(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{Object to extract jobj from} 11 | 12 | \item{...}{Reserved for future use} 13 | } 14 | \value{ 15 | A \code{spark_jobj} object that can be passed to 16 | \code{\link{invoke}}. 17 | } 18 | \description{ 19 | S3 method to get the spark_jobj associated with objects of 20 | various types. 21 | } 22 | \seealso{ 23 | \code{\link{invoke}} 24 | } 25 | 26 | -------------------------------------------------------------------------------- /man/spark_connection.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{spark_connection} 4 | \alias{spark_connection} 5 | \title{Get the spark_connection associated with an object} 6 | \usage{ 7 | spark_connection(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{Object to extract connection from} 11 | 12 | \item{...}{Reserved for future use} 13 | } 14 | \value{ 15 | A \code{spark_connection} object that can be passed to 16 | \code{\link{invoke_new}} and \code{\link{invoke_static}}. 17 | } 18 | \description{ 19 | S3 method to get the spark_connection associated with objects of 20 | various types. 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/spark_log.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{spark_log} 4 | \alias{spark_log} 5 | \title{Retrieves entries from the Spark log} 6 | \usage{ 7 | spark_log(sc, n = 100, ...) 8 | } 9 | \arguments{ 10 | \item{sc}{\code{spark_connection}} 11 | 12 | \item{n}{Max number of log entries to retrieve (pass NULL to retrieve 13 | all lines of the log)} 14 | 15 | \item{...}{Unused (reserved for future use)} 16 | } 17 | \value{ 18 | Character vector with last \code{n} lines of the Spark log 19 | or for \code{spark_log_file} the full path to the log file. 
20 | } 21 | \description{ 22 | Retrieves entries from the Spark log 23 | } 24 | 25 | -------------------------------------------------------------------------------- /man/spark_version.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/version.R 3 | \name{spark_version} 4 | \alias{spark_version} 5 | \alias{spark_version_from_home} 6 | \title{Version of Spark for a connection} 7 | \usage{ 8 | spark_version(sc) 9 | 10 | spark_version_from_home(spark_home, default = NULL) 11 | } 12 | \arguments{ 13 | \item{sc}{\code{spark_connection}} 14 | 15 | \item{spark_home}{Path to SPARK_HOME} 16 | 17 | \item{default}{The version to use as default} 18 | } 19 | \value{ 20 | A \code{\link{numeric_version}} object 21 | } 22 | \description{ 23 | Version of Spark for a connection 24 | 25 | Version of Spark for a SPARK_HOME directory 26 | } 27 | 28 | -------------------------------------------------------------------------------- /R/dataframe.R: -------------------------------------------------------------------------------- 1 | #' Get the Spark DataFrame associated with an object 2 | #' 3 | #' S3 method to get the Spark DataFrame associated with objects of 4 | #' various types. 5 | #' 6 | #' @param x Object to get DataFrame from 7 | #' @param ... Reserved for future use 8 | #' @return Reference to DataFrame 9 | #' 10 | #' @export 11 | spark_dataframe <- function(x, ...) { 12 | UseMethod("spark_dataframe") 13 | } 14 | 15 | #' @export 16 | spark_dataframe.default <- function(x, ...) { 17 | stop("Unable to retrieve a Spark DataFrame from object of class ", 18 | paste(class(x), collapse = " "), call. = FALSE) 19 | } 20 | 21 | #' @export 22 | spark_dataframe.spark_jobj <- function(x, ...) { 23 | x 24 | } 25 | 26 | -------------------------------------------------------------------------------- /inst/scala/logging.scala: -------------------------------------------------------------------------------- 1 | package sparkapi 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.Calendar 5 | import java.util.Date 6 | 7 | object Logging { 8 | def getDate() : String = { 9 | val now = Calendar.getInstance().getTime() 10 | val logFormat = new SimpleDateFormat("yy/MM/dd HH:mm:ss") 11 | return logFormat.format(now) 12 | } 13 | 14 | def logError(message: String) = { 15 | System.err.println(getDate() + " ERROR " + message) 16 | } 17 | 18 | def logError(message: String, e: Exception) = { 19 | System.err.println(getDate() + " ERROR " + message + ": " + e.toString) 20 | } 21 | 22 | def logWarning(message: String) = { 23 | System.err.println(getDate() + " WARN " + message) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /man/invoke.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/invoke.R 3 | \name{invoke} 4 | \alias{invoke} 5 | \alias{invoke_new} 6 | \alias{invoke_static} 7 | \title{Execute a method on a remote Java object} 8 | \usage{ 9 | invoke(jobj, method, ...) 10 | 11 | invoke_static(sc, class, method, ...) 12 | 13 | invoke_new(sc, class, ...) 
14 | } 15 | \arguments{ 16 | \item{jobj}{Java object to execute method on.} 17 | 18 | \item{method}{Name of method to execute.} 19 | 20 | \item{...}{Arguments to pass to the method or constructor} 21 | 22 | \item{sc}{\code{spark_connection} to execute on.} 23 | 24 | \item{class}{Class to execute static method on.} 25 | } 26 | \description{ 27 | Execute a method on a remote Java object 28 | } 29 | 30 | -------------------------------------------------------------------------------- /man/connection_config.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/connection.R 3 | \name{connection_config} 4 | \alias{connection_config} 5 | \title{Read configuration values for a connection} 6 | \usage{ 7 | connection_config(sc, prefix, not_prefix = list()) 8 | } 9 | \arguments{ 10 | \item{sc}{\code{spark_connection}} 11 | 12 | \item{prefix}{Prefix to read parameters for 13 | (e.g. \code{spark.context.}, \code{spark.sql.}, etc.)} 14 | 15 | \item{not_prefix}{Prefix to not include.} 16 | } 17 | \value{ 18 | Named list of config parameters (note that if a prefix was 19 | specified then the names will not include the prefix) 20 | } 21 | \description{ 22 | Read configuration values for a connection 23 | } 24 | 25 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: sparkapi 2 | Type: Package 3 | Title: Spark API Interface 4 | Version: 0.3.22 5 | Authors@R: c( 6 | person("Javier", "Luraschi", email = "javier@rstudio.com", role = c("aut", "cre")), 7 | person(family = "The Apache Software Foundation", role = c("aut", "cph")), 8 | person("Kevin", "Ushey", role = "aut", email = "kevin@rstudio.com"), 9 | person("JJ", "Allaire", role = "aut", email = "jj@rstudio.com"), 10 | person(family = "RStudio", role = c("cph"))) 11 | Imports: 12 | utils, 13 | magrittr, 14 | withr 15 | Suggests: 16 | testthat 17 | Description: Low-level socket-based interface for calling the Spark API via the 18 | RBackend server included in Spark. 19 | License: file LICENSE 20 | Encoding: UTF-8 21 | LazyData: true 22 | RoxygenNote: 5.0.1 23 | -------------------------------------------------------------------------------- /man/invoke_method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/invoke.R 3 | \name{invoke_method} 4 | \alias{invoke_method} 5 | \title{Generic call interface for spark shell} 6 | \usage{ 7 | invoke_method(sc, static, object, method, ...) 8 | } 9 | \arguments{ 10 | \item{sc}{\code{spark_connection}} 11 | 12 | \item{static}{Is this a static method call (including a constructor). 
If so 13 | then the \code{object} parameter should be the name of a class (otherwise 14 | it should be a spark_jobj instance).} 15 | 16 | \item{object}{Object instance or name of class (for \code{static})} 17 | 18 | \item{method}{Name of method} 19 | 20 | \item{...}{Call parameters} 21 | } 22 | \description{ 23 | Generic call interface for spark shell 24 | } 25 | \keyword{internal} 26 | 27 | -------------------------------------------------------------------------------- /inst/scala/utils.scala: -------------------------------------------------------------------------------- 1 | package sparkapi 2 | 3 | import java.io._ 4 | import java.io.File 5 | import java.util.Arrays 6 | 7 | import org.apache.spark.{SparkEnv, SparkException} 8 | 9 | object Utils { 10 | var rPackages: Option[String] = None 11 | 12 | /** 13 | * Return a nice string representation of the exception. It will call "printStackTrace" to 14 | * recursively generate the stack trace including the exception and its causes. 15 | */ 16 | def exceptionString(e: Throwable): String = { 17 | if (e == null) { 18 | "" 19 | } else { 20 | // Use e.printStackTrace here because e.getStackTrace doesn't include the cause 21 | val stringWriter = new StringWriter() 22 | e.printStackTrace(new PrintWriter(stringWriter)) 23 | stringWriter.toString 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /man/register_extension.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extensions.R 3 | \name{register_extension} 4 | \alias{register_extension} 5 | \alias{registered_extensions} 6 | \title{Register a package that implements a sparkapi extension} 7 | \usage{ 8 | register_extension(package) 9 | 10 | registered_extensions() 11 | } 12 | \arguments{ 13 | \item{package}{Name of package to register} 14 | } 15 | \description{ 16 | Registering an extension package will result in the package being 17 | automatically scanned for spark dependencies when a connection 18 | to Spark is initiated (e.g. via \code{start_shell}). 19 | 20 | Enumerate all registered extension packages 21 | } 22 | \note{ 23 | Extensions are typically registered when packages are 24 | loaded onto the search path (i.e. in the \code{.onLoad} 25 | function). 
26 | } 27 | 28 | -------------------------------------------------------------------------------- /tests/testthat/test-config.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | 3 | context("Config") 4 | 5 | mock_spark_config <- function(master, config = list()) { 6 | list( 7 | master = master, 8 | config = config 9 | ) 10 | } 11 | 12 | test_that("connection_config can retrieve correct prefixes", { 13 | sc <- mock_spark_config(master = "local", config = list( 14 | "spark.session.value1" = "1", 15 | "spark.session.value2" = "2" 16 | )) 17 | 18 | params <- connection_config(sc, "spark.session.") 19 | 20 | expect_true(length(params) == 2) 21 | }) 22 | 23 | test_that("connection_config can filter out prefixes", { 24 | sc <- mock_spark_config(master = "local", config = list( 25 | "spark.sql.value" = "ok", 26 | "spark.value" = "not ok" 27 | )) 28 | 29 | params <- connection_config(sc, "spark.", c("spark.sql.", 30 | "spark.session.")) 31 | 32 | expect_true(length(params) == 1) 33 | }) 34 | -------------------------------------------------------------------------------- /configure.R: -------------------------------------------------------------------------------- 1 | compile_jars <- function() { 2 | verbose <- !is.na(Sys.getenv("NOT_CRAN", unset = NA)) 3 | 4 | # skip on Travis 5 | if (!is.na(Sys.getenv("TRAVIS", unset = NA))) { 6 | if (verbose) 7 | message("** skipping Scala compilation on Travis") 8 | return(FALSE) 9 | } 10 | 11 | # skip if no 'scalac' available 12 | if (!nzchar(Sys.which("scalac"))) { 13 | if (verbose) 14 | message("** skipping Scala compilation: 'scalac' not on PATH") 15 | return(FALSE) 16 | } 17 | 18 | # skip if no 'jar' available 19 | if (!nzchar(Sys.which("jar"))) { 20 | if (verbose) 21 | message("** skipping Scala compilation: 'jar' not on PATH") 22 | return(FALSE) 23 | } 24 | 25 | source("R/version.R") 26 | source("R/compile.R") 27 | 28 | tryCatch( 29 | source("inst/tools/compile-scala.R"), 30 | error = function(e) { 31 | if (nzchar(e$message)) { 32 | message(e$message) 33 | } 34 | } 35 | ) 36 | 37 | } 38 | 39 | invisible(compile_jars()) 40 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(connection_is_open,spark_shell_connection) 4 | S3method(invoke_method,spark_shell_connection) 5 | S3method(print,spark_jobj) 6 | S3method(print,spark_log) 7 | S3method(print,spark_web_url) 8 | S3method(print_jobj,spark_shell_connection) 9 | S3method(spark_connection,default) 10 | S3method(spark_connection,spark_connection) 11 | S3method(spark_connection,spark_jobj) 12 | S3method(spark_dataframe,default) 13 | S3method(spark_dataframe,spark_jobj) 14 | S3method(spark_jobj,default) 15 | S3method(spark_jobj,spark_jobj) 16 | S3method(spark_log,default) 17 | S3method(spark_log,spark_shell_connection) 18 | S3method(spark_web,default) 19 | S3method(spark_web,spark_shell_connection) 20 | export("%>%") 21 | export(connection_config) 22 | export(connection_is_open) 23 | export(hive_context) 24 | export(invoke) 25 | export(invoke_method) 26 | export(invoke_new) 27 | export(invoke_static) 28 | export(java_context) 29 | export(print_jobj) 30 | export(register_extension) 31 | export(registered_extensions) 32 | export(spark_compile) 33 | export(spark_connection) 34 | export(spark_context) 35 | export(spark_dataframe) 36 | export(spark_dependency) 37 | 
export(spark_jobj) 38 | export(spark_log) 39 | export(spark_version) 40 | export(spark_version_from_home) 41 | export(spark_web) 42 | export(start_shell) 43 | export(stop_shell) 44 | import(digest) 45 | import(rprojroot) 46 | importFrom(magrittr,"%>%") 47 | -------------------------------------------------------------------------------- /man/start_shell.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/shell.R 3 | \name{start_shell} 4 | \alias{start_shell} 5 | \alias{stop_shell} 6 | \title{Start the Spark R Shell} 7 | \usage{ 8 | start_shell(master, spark_home = Sys.getenv("SPARK_HOME"), 9 | spark_version = NULL, app_name = "sparkapi", config = list(), 10 | extensions = sparkapi::registered_extensions(), jars = NULL, 11 | packages = NULL, environment = NULL, shell_args = NULL) 12 | 13 | stop_shell(sc) 14 | } 15 | \arguments{ 16 | \item{master}{Spark cluster url to connect to. Use \code{"local"} to connect to a local 17 | instance of Spark} 18 | 19 | \item{spark_home}{Spark home directory (defaults to SPARK_HOME environment variable)} 20 | 21 | \item{spark_version}{Spark version, if not specified, version taken from SPARK_HOME} 22 | 23 | \item{app_name}{Application name to be used while running in the Spark cluster} 24 | 25 | \item{config}{Named character vector of spark.* options} 26 | 27 | \item{extensions}{Extension packages to include dependencies for 28 | (see \code{\link{spark_dependency}}).} 29 | 30 | \item{jars}{Paths to Jar files to include} 31 | 32 | \item{packages}{Spark packages to include} 33 | 34 | \item{environment}{Environment variables to set} 35 | 36 | \item{shell_args}{Additional command line arguments for spark_shell} 37 | 38 | \item{sc}{\code{spark_connection}} 39 | } 40 | \value{ 41 | \code{spark_connection} object 42 | } 43 | \description{ 44 | Start the Spark R Shell 45 | 46 | Stop the Spark R Shell 47 | } 48 | 49 | -------------------------------------------------------------------------------- /R/invoke.R: -------------------------------------------------------------------------------- 1 | 2 | #' Execute a method on a remote Java object 3 | #' 4 | #' @param sc \code{spark_connection} to execute on. 5 | #' @param jobj Java object to execute method on. 6 | #' @param class Class to execute static method on. 7 | #' @param method Name of method to execute. 8 | #' @param ... Arguments to pass to the method or constructor 9 | #' 10 | #' @export 11 | invoke <- function (jobj, method, ...) 12 | { 13 | invoke_method(spark_connection(jobj), 14 | FALSE, 15 | jobj, 16 | method, 17 | ...) 18 | } 19 | 20 | 21 | #' @name invoke 22 | #' @export 23 | invoke_static <- function (sc, class, method, ...) 24 | { 25 | invoke_method(sc, 26 | TRUE, 27 | class, 28 | method, 29 | ...) 30 | } 31 | 32 | 33 | #' @name invoke 34 | #' @export 35 | invoke_new <- function(sc, class, ...) 36 | { 37 | invoke_method(sc, 38 | TRUE, 39 | class, 40 | "", 41 | ...) 42 | } 43 | 44 | #' Generic call interface for spark shell 45 | #' 46 | #' @param sc \code{spark_connection} 47 | #' @param static Is this a static method call (including a constructor). If so 48 | #' then the \code{object} parameter should be the name of a class (otherwise 49 | #' it should be a spark_jobj instance). 50 | #' @param object Object instance or name of class (for \code{static}) 51 | #' @param method Name of method 52 | #' @param ... 
Call parameters 53 | #' 54 | #' @keywords internal 55 | #' 56 | #' @export 57 | invoke_method <- function(sc, static, object, method, ...) { 58 | UseMethod("invoke_method") 59 | } 60 | 61 | -------------------------------------------------------------------------------- /R/hive.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | create_hive_context <- function(sc) { 4 | if (spark_version(sc) >= "2.0.0") 5 | create_hive_context_v2(sc) 6 | else 7 | create_hive_context_v1(sc) 8 | } 9 | 10 | create_hive_context_v2 <- function(sc) { 11 | 12 | # SparkSession.builder().enableHiveSupport() 13 | builder <- invoke_static( 14 | sc, 15 | "org.apache.spark.sql.SparkSession", 16 | "builder" 17 | ) 18 | 19 | builder <- invoke( 20 | builder, 21 | "enableHiveSupport" 22 | ) 23 | 24 | session <- invoke( 25 | builder, 26 | "getOrCreate" 27 | ) 28 | 29 | # get config object 30 | conf <- invoke(session, "conf") 31 | 32 | # apply spark.sql. params 33 | params <- connection_config(sc, "spark.sql.") 34 | apply_config(params, conf, "set", "spark.sql.") 35 | 36 | # return session as hive context 37 | session 38 | } 39 | 40 | create_hive_context_v1 <- function(sc) { 41 | 42 | # get spark_context 43 | ctx <- spark_context(sc) 44 | 45 | # attempt to create hive_context 46 | hive_context <- tryCatch({ 47 | invoke_new( 48 | sc, 49 | "org.apache.spark.sql.hive.HiveContext", 50 | ctx 51 | )}, 52 | error = function(e) { 53 | NULL 54 | } 55 | ) 56 | 57 | # if we failed then create a SqlContext instead 58 | if (is.null(hive_context)) { 59 | 60 | warning("Failed to create Hive context, falling back to SQL. Some operations, ", 61 | "like window-functions, will not work") 62 | 63 | jsc <- invoke_static( 64 | sc, 65 | "org.apache.spark.api.java.JavaSparkContext", 66 | "fromSparkContext", 67 | ctx 68 | ) 69 | 70 | hive_context <- invoke_static( 71 | sc, 72 | "org.apache.spark.sql.api.r.SQLUtils", 73 | "createSQLContext", 74 | jsc 75 | ) 76 | } 77 | 78 | # apply configuration 79 | params <- connection_config(sc, "spark.sql.") 80 | apply_config(params, hive_context, "setConf", "spark.sql.") 81 | 82 | # return hive_context 83 | hive_context 84 | } 85 | 86 | apply_config <- function(params, object, method, prefix) { 87 | lapply(names(params), function(paramName) { 88 | configValue <- params[[paramName]] 89 | if (is.logical(configValue)) { 90 | configValue <- if (configValue) "true" else "false" 91 | } 92 | else { 93 | configValue <- as.character(configValue) 94 | } 95 | 96 | invoke( 97 | object, 98 | method, 99 | paste0(prefix, paramName), 100 | configValue 101 | ) 102 | }) 103 | } 104 | -------------------------------------------------------------------------------- /R/version.R: -------------------------------------------------------------------------------- 1 | spark_version_clean <- function(version) { 2 | gsub("([0-9]+\\.?)[^0-9\\.](.*)","\\1", version) 3 | } 4 | 5 | #' Version of Spark for a connection 6 | #' 7 | #' @param sc \code{spark_connection} 8 | #' 9 | #' @return A \code{\link{numeric_version}} object 10 | #' 11 | #' @export 12 | spark_version <- function(sc) { 13 | # get the version 14 | version <- invoke(spark_context(sc), "version") 15 | 16 | # Get rid of -preview and other suffix variations 17 | version <- spark_version_clean(version) 18 | 19 | # return numeric version 20 | numeric_version(version) 21 | } 22 | 23 | spark_version_from_home_version <- function() { 24 | version <- Sys.getenv("SPARK_HOME_VERSION") 25 | if (nchar(version) <= 0) NULL else version 26 | } 27 | 28 
| #' Version of Spark for a SPARK_HOME directory 29 | #' 30 | #' @param spark_home Path to SPARK_HOME 31 | #' @param default The version to use as default 32 | #' 33 | #' @rdname spark_version 34 | #' 35 | #' @export 36 | spark_version_from_home <- function(spark_home, default = NULL) { 37 | versionAttempts <- list( 38 | useReleaseFile = function() { 39 | versionedFile <- file.path(spark_home, "RELEASE") 40 | if (file.exists(versionedFile)) { 41 | releaseContents <- readLines(versionedFile) 42 | 43 | if (!is.null(releaseContents) && length(releaseContents) > 0) { 44 | gsub("Spark | built.*", "", releaseContents[[1]]) 45 | } 46 | } 47 | }, 48 | useAssemblies = function() { 49 | candidateVersions <- list( 50 | list(path = "lib", pattern = "spark-assembly-([0-9\\.]*)-hadoop.[0-9\\.]*\\.jar"), 51 | list(path = "yarn", pattern = "spark-([0-9\\.]*)-preview-yarn-shuffle\\.jar") 52 | ) 53 | 54 | candidateFiles <- lapply(candidateVersions, function(e) { 55 | c(e, 56 | list( 57 | files = list.files( 58 | file.path(spark_home, e$path), 59 | pattern = e$pattern 60 | ) 61 | ) 62 | ) 63 | }) 64 | 65 | filteredCandidates <- Filter(function(f) length(f$files) > 0, candidateFiles) 66 | if (length(filteredCandidates) > 0) { 67 | valid <- filteredCandidates[[1]] 68 | e <- regexec(valid$pattern, valid$files[[1]]) 69 | match <- regmatches(valid$files[[1]], e) 70 | if (length(match) > 0 && length(match[[1]]) > 1) { 71 | return(match[[1]][[2]]) 72 | } 73 | } 74 | }, 75 | useEnvironmentVariable = function() { 76 | spark_version_from_home_version() 77 | }, 78 | useDefault = function() { 79 | default 80 | } 81 | ) 82 | 83 | for (versionAttempt in versionAttempts) { 84 | result <- versionAttempt() 85 | if (length(result) > 0) 86 | return(spark_version_clean(result)) 87 | } 88 | 89 | stop( 90 | "Failed to detect version from SPARK_HOME or SPARK_HOME_VERSION. ", 91 | "Try passing the spark_version explicitly.") 92 | } 93 | -------------------------------------------------------------------------------- /R/extensions.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Register a package that implements a sparkapi extension 4 | #' 5 | #' Registering an extension package will result in the package being 6 | #' automatically scanned for spark dependencies when a connection 7 | #' to Spark is initiated (e.g. via \code{start_shell}). 8 | #' 9 | #' @param package Name of package to register 10 | #' 11 | #' @note Extensions are typically registered when packages are 12 | #' loaded onto the search path (i.e. in the \code{.onLoad} 13 | #' function). 14 | #' 15 | #' @export 16 | register_extension <- function(package) { 17 | .globals$extension_packages <- c(.globals$extension_packages, package) 18 | } 19 | 20 | #' Enumerate all registered extension packages 21 | 22 | #' @rdname register_extension 23 | #' @export 24 | registered_extensions <- function() { 25 | .globals$extension_packages 26 | } 27 | 28 | 29 | #' Define a Spark dependency 30 | #' 31 | #' Define a Spark dependency consisting of a set of custom JARs and Spark packages. 
32 | #' 33 | #' @param jars Character vector of full paths to JAR files 34 | #' @param packages Character vector of Spark package names 35 | #' 36 | #' @return An object of type `spark_dependency` 37 | #' 38 | #' @export 39 | spark_dependency <- function(jars = NULL, packages = NULL) { 40 | structure(class = "spark_dependency", list( 41 | jars = jars, 42 | packages = packages 43 | )) 44 | } 45 | 46 | spark_dependencies_from_extensions <- function(spark_version, scala_version, extensions) { 47 | 48 | jars <- character() 49 | packages <- character() 50 | 51 | lapply(extensions, function(extension) { 52 | dependencies <- spark_dependencies_from_extension(spark_version, scala_version, extension) 53 | lapply(dependencies, function(dependency) { 54 | jars <<- c(jars, dependency$jars) 55 | packages <<- c(packages, dependency$packages) 56 | }) 57 | }) 58 | 59 | list( 60 | jars = jars, 61 | packages = packages 62 | ) 63 | } 64 | 65 | spark_dependencies_from_extension <- function(spark_version, scala_version, extension) { 66 | 67 | # attempt to find the function 68 | spark_dependencies <- tryCatch({ 69 | get("spark_dependencies", asNamespace(extension), inherits = FALSE) 70 | }, 71 | error = function(e) { 72 | stop("spark_dependencies function not found within ", 73 | "extension package ", extension, call. = FALSE) 74 | } 75 | ) 76 | 77 | # reduce the spark_version to just major and minor versions 78 | spark_version <- package_version(spark_version) 79 | spark_version <- paste(spark_version$major, spark_version$minor, sep = '.') 80 | spark_version <- numeric_version(spark_version) 81 | 82 | # call the function 83 | dependency <- spark_dependencies(spark_version = spark_version, 84 | scala_version = scala_version) 85 | 86 | # if it's just a single dependency then wrap it in a list 87 | if (inherits(dependency, "spark_dependency")) 88 | dependency <- list(dependency) 89 | 90 | # return it 91 | dependency 92 | } 93 | 94 | -------------------------------------------------------------------------------- /R/compile.R: -------------------------------------------------------------------------------- 1 | #' Compiles Scala sources and packages them into a JAR file 2 | #' 3 | #' @export 4 | #' @param name The name of the target jar 5 | #' @param spark_home Path to the Spark installation to compile against 6 | #' 7 | #' @import rprojroot 8 | #' @import digest 9 | #' 10 | #' @keywords internal 11 | spark_compile <- function(name, spark_home) { 12 | spark_version <- spark_version_from_home(spark_home) 13 | version_numeric <- gsub("[-_a-zA-Z]", "", spark_version) 14 | version_sufix <- gsub("\\.|[-_a-zA-Z]", "", spark_version) 15 | jar_name <- paste0(name, "-", version_numeric, ".jar") 16 | 17 | root <- rprojroot::find_package_root_file() 18 | 19 | jar_path <- file.path(root, "inst", "java", jar_name) 20 | scala_files <- lapply( 21 | Filter( 22 | function(e) { 23 | # if filename has version only include version being built 24 | if (grepl(".*_\\d+\\.scala", e)) { 25 | grepl(version_sufix, e) 26 | } 27 | else { 28 | grepl(".*\\.scala$", e) 29 | } 30 | }, 31 | list.files(file.path(root, "inst", "scala")) 32 | ), 33 | function(e) file.path(root, "inst", "scala", e) 34 | ) 35 | scala_files_digest <- file.path(root, paste0( 36 | "inst/scala/sparklyr-", version_numeric, ".md5" 37 | )) 38 | 39 | scala_files_contents <- paste(lapply(scala_files, function(e) readLines(e))) 40 | scala_files_contents_path <- tempfile() 41 | scala_files_contents_file <- file(scala_files_contents_path, "w") 42 | writeLines(scala_files_contents, scala_files_contents_file) 43 | 
close(scala_files_contents_file) 44 | 45 | # Bail if files haven't changed 46 | md5 <- tools::md5sum(scala_files_contents_path) 47 | if (file.exists(scala_files_digest) && file.exists(jar_path)) { 48 | contents <- readChar(scala_files_digest, file.info(scala_files_digest)$size, TRUE) 49 | if (identical(contents, md5[[scala_files_contents_path]])) { 50 | return() 51 | } 52 | } 53 | 54 | message("** building '", jar_name, "' ...") 55 | 56 | cat(md5, file = scala_files_digest) 57 | 58 | execute <- function(...) { 59 | cmd <- paste(...) 60 | message("*** ", cmd) 61 | system(cmd) 62 | } 63 | 64 | if (!nzchar(Sys.which("scalac"))) 65 | stop("failed to discover 'scalac' on the PATH") 66 | 67 | if (!nzchar(Sys.which("jar"))) 68 | stop("failed to discover 'jar' on the PATH") 69 | 70 | # Work in temporary directory (as temporary class files 71 | # will be generated in there) 72 | dir <- file.path(tempdir(), paste0(name, "-", version_sufix, "-scala-compile")) 73 | if (!file.exists(dir)) 74 | if (!dir.create(dir)) 75 | stop("Failed to create '", dir, "'") 76 | owd <- setwd(dir) 77 | 78 | # list jars in the installation folder 79 | candidates <- c("jars", "lib") 80 | jars <- NULL 81 | for (candidate in candidates) { 82 | jars <- list.files( 83 | file.path(spark_home, candidate), 84 | full.names = TRUE, 85 | pattern = "jar$" 86 | ) 87 | 88 | if (length(jars)) 89 | break 90 | } 91 | 92 | if (!length(jars)) 93 | stop("failed to discover Spark jars") 94 | 95 | # construct classpath 96 | CLASSPATH <- paste(jars, collapse = .Platform$path.sep) 97 | 98 | # ensure 'inst/java' exists 99 | inst_java_path <- file.path(root, "inst/java") 100 | if (!file.exists(inst_java_path)) 101 | if (!dir.create(inst_java_path, recursive = TRUE)) 102 | stop("failed to create directory '", inst_java_path, "'") 103 | 104 | # call 'scalac' compiler 105 | classpath <- Sys.getenv("CLASSPATH") 106 | 107 | # set CLASSPATH environment variable rather than passing 108 | # in on command line (mostly aesthetic) 109 | Sys.setenv(CLASSPATH = CLASSPATH) 110 | execute("scalac", paste(shQuote(scala_files), collapse = " ")) 111 | Sys.setenv(CLASSPATH = classpath) 112 | 113 | # call 'jar' to create our jar 114 | class_files <- file.path(name, list.files(name, pattern = "class$")) 115 | execute("jar cf", jar_path, paste(shQuote(class_files), collapse = " ")) 116 | 117 | # double-check existence of jar 118 | if (file.exists(jar_path)) { 119 | message("*** ", basename(jar_path), " successfully created.") 120 | } else { 121 | stop("*** failed to create ", jar_name) 122 | } 123 | 124 | setwd(owd) 125 | } 126 | -------------------------------------------------------------------------------- /inst/scala/backend.scala: -------------------------------------------------------------------------------- 1 | package sparkapi 2 | 3 | import java.io.{DataOutputStream, File, FileOutputStream, IOException} 4 | import java.net.{InetAddress, InetSocketAddress, ServerSocket} 5 | import java.util.concurrent.TimeUnit 6 | 7 | import io.netty.bootstrap.ServerBootstrap 8 | import io.netty.channel.{ChannelFuture, ChannelInitializer, EventLoopGroup} 9 | import io.netty.channel.nio.NioEventLoopGroup 10 | import io.netty.channel.socket.SocketChannel 11 | import io.netty.channel.socket.nio.NioServerSocketChannel 12 | import io.netty.handler.codec.LengthFieldBasedFrameDecoder 13 | import io.netty.handler.codec.bytes.{ByteArrayDecoder, ByteArrayEncoder} 14 | 15 | import org.apache.spark.SparkConf 16 | 17 | import sparkapi.Logging._ 18 | 19 | class Backend { 20 | 21 
| private[this] var channelFuture: ChannelFuture = null 22 | private[this] var bootstrap: ServerBootstrap = null 23 | private[this] var bossGroup: EventLoopGroup = null 24 | 25 | def init(): Int = { 26 | val conf = new SparkConf() 27 | bossGroup = new NioEventLoopGroup(conf.getInt("sparkapi.backend.threads", 2)) 28 | val workerGroup = bossGroup 29 | val handler = new Handler(this) 30 | 31 | bootstrap = new ServerBootstrap() 32 | .group(bossGroup, workerGroup) 33 | .channel(classOf[NioServerSocketChannel]) 34 | 35 | bootstrap.childHandler(new ChannelInitializer[SocketChannel]() { 36 | def initChannel(ch: SocketChannel): Unit = { 37 | ch.pipeline() 38 | .addLast("encoder", new ByteArrayEncoder()) 39 | .addLast("frameDecoder", 40 | new LengthFieldBasedFrameDecoder(Integer.MAX_VALUE, 0, 4, 0, 4)) 41 | .addLast("decoder", new ByteArrayDecoder()) 42 | .addLast("handler", handler) 43 | } 44 | }) 45 | 46 | channelFuture = bootstrap.bind(new InetSocketAddress("localhost", 0)) 47 | channelFuture.syncUninterruptibly() 48 | channelFuture.channel().localAddress().asInstanceOf[InetSocketAddress].getPort() 49 | } 50 | 51 | def run(): Unit = { 52 | channelFuture.channel.closeFuture().syncUninterruptibly() 53 | } 54 | 55 | def close(): Unit = { 56 | if (channelFuture != null) { 57 | // close is a local operation and should finish within milliseconds; timeout just to be safe 58 | channelFuture.channel().close().awaitUninterruptibly(10, TimeUnit.SECONDS) 59 | channelFuture = null 60 | } 61 | if (bootstrap != null && bootstrap.group() != null) { 62 | bootstrap.group().shutdownGracefully() 63 | } 64 | if (bootstrap != null && bootstrap.childGroup() != null) { 65 | bootstrap.childGroup().shutdownGracefully() 66 | } 67 | bootstrap = null 68 | } 69 | 70 | } 71 | 72 | object Backend { 73 | def main(args: Array[String]): Unit = { 74 | if (args.length < 1) { 75 | System.err.println("Usage: Backend <path to output file>") 76 | System.exit(-1) 77 | } 78 | 79 | val backend = new Backend() 80 | try { 81 | // bind to random port 82 | val boundPort = backend.init() 83 | val serverSocket = new ServerSocket(0, 1, InetAddress.getByName("localhost")) 84 | val listenPort = serverSocket.getLocalPort() 85 | 86 | // tell the R process via temporary file 87 | val path = args(0) 88 | val f = new File(path + ".tmp") 89 | val dos = new DataOutputStream(new FileOutputStream(f)) 90 | dos.writeInt(boundPort) 91 | dos.writeInt(listenPort) 92 | Serializer.writeString(dos, Utils.rPackages.getOrElse("")) 93 | dos.close() 94 | f.renameTo(new File(path)) 95 | 96 | // wait for the R client socket to close, then exit 97 | new Thread("wait for socket to close") { 98 | setDaemon(true) 99 | override def run(): Unit = { 100 | // any uncaught exception will also shut down the JVM 101 | val buf = new Array[Byte](1024) 102 | // shut down the JVM if R does not connect back in 10 seconds 103 | serverSocket.setSoTimeout(10000) 104 | try { 105 | val inSocket = serverSocket.accept() 106 | serverSocket.close() 107 | // wait for the end of the socket, closed when the R process dies 108 | inSocket.getInputStream().read(buf) 109 | } finally { 110 | backend.close() 111 | System.exit(0) 112 | } 113 | } 114 | }.start() 115 | 116 | backend.run() 117 | } catch { 118 | case e: IOException => 119 | logError("Server shutting down: failed with exception ", e) 120 | backend.close() 121 | System.exit(1) 122 | } 123 | System.exit(0) 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /R/jobj.R: -------------------------------------------------------------------------------- 
1 | # Imported from: 2 | # https://raw.githubusercontent.com/apache/spark/branch-1.6/R/pkg/R/jobj.R 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # References to objects that exist on the JVM backend 21 | # are maintained using the jobj. 22 | 23 | 24 | #' Get the spark_jobj associated with an object 25 | #' 26 | #' S3 method to get the spark_jobj associated with objects of 27 | #' various types. 28 | #' 29 | #' @param x Object to extract jobj from 30 | #' @param ... Reserved for future use 31 | #' @return A \code{spark_jobj} object that can be passed to 32 | #' \code{\link{invoke}}. 33 | #' 34 | #' @seealso \code{\link{invoke}} 35 | #' 36 | #' @export 37 | spark_jobj <- function(x, ...) { 38 | UseMethod("spark_jobj") 39 | } 40 | 41 | 42 | #' @export 43 | spark_jobj.default <- function(x, ...) { 44 | stop("Unable to retrieve a spark_jobj from object of class ", 45 | paste(class(x), collapse = " "), call. = FALSE) 46 | } 47 | 48 | #' @export 49 | spark_jobj.spark_jobj <- function(x, ...) { 50 | x 51 | } 52 | 53 | #' @export 54 | print.spark_jobj <- function(x, ...) { 55 | print_jobj(spark_connection(x), x, ...) 56 | } 57 | 58 | #' Generic method for print jobj for a connection type 59 | #' 60 | #' @param sc \code{spark_connection} (used for type dispatch) 61 | #' @param jobj Object to print 62 | #' 63 | #' @keywords internal 64 | #' 65 | #' @export 66 | print_jobj <- function(sc, jobj, ...) { 67 | UseMethod("print_jobj") 68 | } 69 | 70 | 71 | # Maintain a reference count of Java object references 72 | # This allows us to GC the Java object when it is safe 73 | .validJobjs <- new.env(parent = emptyenv()) 74 | 75 | # List of object ids to be removed 76 | .toRemoveJobjs <- new.env(parent = emptyenv()) 77 | 78 | # Check if jobj was created with the current SparkContext 79 | isValidJobj <- function(jobj) { 80 | TRUE 81 | } 82 | 83 | getJobj <- function(objId) { 84 | newObj <- jobj_create(objId) 85 | if (exists(objId, .validJobjs)) { 86 | .validJobjs[[objId]] <- .validJobjs[[objId]] + 1 87 | } else { 88 | .validJobjs[[objId]] <- 1 89 | } 90 | newObj 91 | } 92 | 93 | # Handler for a Java object that exists on the backend. 94 | jobj_create <- function(objId) { 95 | if (!is.character(objId)) { 96 | stop("object id must be a character") 97 | } 98 | # NOTE: We need a new env for a jobj as we can only register 99 | # finalizers for environments or external reference pointers. 
100 | obj <- structure(new.env(parent = emptyenv()), class = "spark_jobj") 101 | obj$id <- objId 102 | 103 | # Register a finalizer to remove the Java object when this reference 104 | # is garbage collected in R 105 | reg.finalizer(obj, cleanup.jobj) 106 | obj 107 | } 108 | 109 | jobj_info <- function(jobj) { 110 | if (!inherits(jobj, "spark_jobj")) 111 | stop("'jobj_info' called on non-jobj") 112 | 113 | class <- NULL 114 | repr <- NULL 115 | 116 | tryCatch({ 117 | class <- invoke(jobj, "getClass") 118 | if (inherits(class, "spark_jobj")) 119 | class <- invoke(class, "toString") 120 | }, error = function(e) { 121 | }) 122 | tryCatch({ 123 | repr <- invoke(jobj, "toString") 124 | }, error = function(e) { 125 | }) 126 | list( 127 | class = class, 128 | repr = repr 129 | ) 130 | } 131 | 132 | jobj_inspect <- function(jobj) { 133 | print(jobj) 134 | if (!connection_is_open(spark_connection(jobj))) 135 | return(jobj) 136 | 137 | class <- invoke(jobj, "getClass") 138 | 139 | cat("Fields:\n") 140 | fields <- invoke(class, "getDeclaredFields") 141 | lapply(fields, function(field) { print(field) }) 142 | 143 | cat("Methods:\n") 144 | methods <- invoke(class, "getDeclaredMethods") 145 | lapply(methods, function(method) { print(method) }) 146 | 147 | jobj 148 | } 149 | 150 | cleanup.jobj <- function(jobj) { 151 | if (isValidJobj(jobj)) { 152 | objId <- jobj$id 153 | # If we don't know anything about this jobj, ignore it 154 | if (exists(objId, envir = .validJobjs)) { 155 | .validJobjs[[objId]] <- .validJobjs[[objId]] - 1 156 | 157 | if (.validJobjs[[objId]] == 0) { 158 | rm(list = objId, envir = .validJobjs) 159 | # NOTE: We cannot call removeJObject here as the finalizer may be run 160 | # in the middle of another RPC. Thus we queue up this object Id to be removed 161 | # and then run all the removeJObject when the next RPC is called. 
162 | .toRemoveJobjs[[objId]] <- 1 163 | } 164 | } 165 | } 166 | } 167 | 168 | clearJobjs <- function() { 169 | valid <- ls(.validJobjs) 170 | rm(list = valid, envir = .validJobjs) 171 | 172 | removeList <- ls(.toRemoveJobjs) 173 | rm(list = removeList, envir = .toRemoveJobjs) 174 | } 175 | 176 | -------------------------------------------------------------------------------- /R/connection.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #' Get the SparkContext associated with a connection 5 | #' 6 | #' Get the SparkContext \code{spark_jobj} associated with a 7 | #' \code{spark_connection} 8 | #' 9 | #' @param sc Connection to get SparkContext from 10 | #' 11 | #' @return Reference to SparkContext 12 | #' @export 13 | spark_context <- function(sc) { 14 | sc$spark_context 15 | } 16 | 17 | #' Get the JavaSparkContext associated with a connection 18 | #' 19 | #' Get the JavaSparkContext \code{spark_jobj} associated with a 20 | #' \code{spark_connection} 21 | #' 22 | #' @param sc Connection to get JavaSparkContext from 23 | #' 24 | #' @return Reference to JavaSparkContext 25 | #' @export 26 | java_context <- function(sc) { 27 | sc$java_context 28 | } 29 | 30 | #' Get the HiveContext associated with a connection 31 | #' 32 | #' Get the HiveContext \code{spark_jobj} associated with a 33 | #' \code{spark_connection} 34 | #' 35 | #' @param sc Connection to get HiveContext from 36 | #' 37 | #' @return Reference to HiveContext 38 | #' @export 39 | hive_context <- function(sc) { 40 | sc$hive_context 41 | } 42 | 43 | 44 | #' Get the spark_connection associated with an object 45 | #' 46 | #' S3 method to get the spark_connection associated with objects of 47 | #' various types. 48 | #' 49 | #' @param x Object to extract connection from 50 | #' @param ... Reserved for future use 51 | #' @return A \code{spark_connection} object that can be passed to 52 | #' \code{\link{invoke_new}} and \code{\link{invoke_static}}. 53 | #' 54 | #' @export 55 | spark_connection <- function(x, ...) { 56 | UseMethod("spark_connection") 57 | } 58 | 59 | #' @export 60 | spark_connection.default <- function(x, ...) { 61 | stop("Unable to retrieve a spark_connection from object of class ", 62 | paste(class(x), collapse = " "), call. = FALSE) 63 | } 64 | 65 | #' @export 66 | spark_connection.spark_connection <- function(x, ...) { 67 | x 68 | } 69 | 70 | #' @export 71 | spark_connection.spark_jobj <- function(x, ...) { 72 | x$connection 73 | } 74 | 75 | #' Check whether the connection is open 76 | #' 77 | #' @param sc \code{spark_connection} 78 | #' 79 | #' @keywords internal 80 | #' 81 | #' @export 82 | connection_is_open <- function(sc) { 83 | UseMethod("connection_is_open") 84 | } 85 | 86 | #' Read configuration values for a connection 87 | #' 88 | #' @param sc \code{spark_connection} 89 | #' @param prefix Prefix to read parameters for 90 | #' (e.g. \code{spark.context.}, \code{spark.sql.}, etc.) 91 | #' @param not_prefix Prefix to not include. 
92 | #' 93 | #' @return Named list of config parameters (note that if a prefix was 94 | #' specified then the names will not include the prefix) 95 | #' 96 | #' @export 97 | connection_config <- function(sc, prefix, not_prefix = list()) { 98 | 99 | config <- sc$config 100 | master <- sc$master 101 | isLocal <- spark_master_is_local(master) 102 | 103 | configNames <- Filter(function(e) { 104 | found <- is.null(prefix) || 105 | (substring(e, 1, nchar(prefix)) == prefix) 106 | 107 | if (grepl("\\.local$", e) && !isLocal) 108 | found <- FALSE 109 | 110 | if (grepl("\\.remote$", e) && isLocal) 111 | found <- FALSE 112 | 113 | found 114 | }, names(config)) 115 | 116 | lapply(not_prefix, function(notPrefix) { 117 | configNames <<- Filter(function(e) { 118 | substring(e, 1, nchar(notPrefix)) != notPrefix 119 | }, configNames) 120 | }) 121 | 122 | paramsNames <- lapply(configNames, function(configName) { 123 | paramName <- substr(configName, nchar(prefix) + 1, nchar(configName)) 124 | paramName <- sub("(\\.local$)|(\\.remote$)", "", paramName, perl = TRUE) 125 | 126 | paramName 127 | }) 128 | 129 | params <- lapply(configNames, function(configName) { 130 | config[[configName]] 131 | }) 132 | 133 | names(params) <- paramsNames 134 | params 135 | } 136 | 137 | spark_master_is_local <- function(master) { 138 | grepl("^local(\\[[0-9\\*]*\\])?$", master, perl = TRUE) 139 | } 140 | 141 | 142 | #' Retrieves entries from the Spark log 143 | #' 144 | #' @param sc \code{spark_connection} 145 | #' @param n Max number of log entries to retrieve (pass NULL to retrieve 146 | #' all lines of the log) 147 | #' @param ... Unused (reserved for future use) 148 | #' 149 | #' @return Character vector with last \code{n} lines of the Spark log 150 | #' or for \code{spark_log_file} the full path to the log file. 151 | #' 152 | #' @export 153 | spark_log <- function(sc, n = 100, ...) { 154 | UseMethod("spark_log") 155 | } 156 | 157 | #' @export 158 | spark_log.default <- function(sc, n = 100, ...) { 159 | stop("Invalid class passed to spark_log") 160 | } 161 | 162 | #' @export 163 | print.spark_log <- function(x, ...) { 164 | cat(x, sep = "\n") 165 | cat("\n") 166 | } 167 | 168 | #' Open the Spark web interface 169 | #' 170 | #' @inheritParams spark_log 171 | #' 172 | #' @export 173 | spark_web <- function(sc, ...) { 174 | UseMethod("spark_web") 175 | } 176 | 177 | #' @export 178 | spark_web.default <- function(sc, ...) { 179 | stop("Invalid class passed to spark_web") 180 | } 181 | 182 | 183 | #' @export 184 | print.spark_web_url <- function(x, ...) 
{ 185 | utils::browseURL(x) 186 | } 187 | 188 | initialize_connection <- function(sc) { 189 | 190 | # create the spark config 191 | conf <- invoke_new(sc, "org.apache.spark.SparkConf") 192 | conf <- invoke(conf, "setAppName", sc$app_name) 193 | conf <- invoke(conf, "setMaster", sc$master) 194 | conf <- invoke(conf, "setSparkHome", sc$spark_home) 195 | 196 | context_config <- connection_config(sc, "spark.", c("spark.sql.")) 197 | apply_config(context_config, conf, "set", "spark.") 198 | 199 | # create the spark context and assign the connection to it 200 | sc$spark_context <- invoke_new( 201 | sc, 202 | "org.apache.spark.SparkContext", 203 | conf 204 | ) 205 | sc$spark_context$connection <- sc 206 | 207 | # create the java spark context and assign the connection to it 208 | sc$java_context <- invoke_new( 209 | sc, 210 | "org.apache.spark.api.java.JavaSparkContext", 211 | sc$spark_context 212 | ) 213 | sc$java_context$connection <- sc 214 | 215 | # create the hive context and assign the connection to it 216 | sc$hive_context <- create_hive_context(sc) 217 | sc$hive_context$connection <- sc 218 | 219 | # return the modified connection 220 | sc 221 | } 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | -------------------------------------------------------------------------------- /R/deserialize.R: -------------------------------------------------------------------------------- 1 | # Imported from: 2 | # https://raw.githubusercontent.com/apache/spark/branch-1.6/R/pkg/R/deserialize.R 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Utility functions to deserialize objects from Java. 
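#
# Illustrative sketch (not part of the upstream SparkR code; guarded so it
# never runs): the wire format consumed by the readers below is a one-byte
# type tag followed by big-endian payloads, mirroring readType()/readInt().
if (FALSE) {
  con <- rawConnection(raw(0), "r+")
  writeBin(charToRaw("i"), con)                   # type tag for an Int
  writeBin(42L, con, endian = "big")              # 4-byte big-endian payload
  seek(con, 0)
  rawToChar(readBin(con, "raw", n = 1L))          # "i"  -- what readType() sees
  readBin(con, integer(), n = 1, endian = "big")  # 42   -- what readInt() reads
  close(con)
}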
21 | 22 | # nolint start 23 | # Type mapping from Java to R 24 | # 25 | # void -> NULL 26 | # Int -> integer 27 | # String -> character 28 | # Boolean -> logical 29 | # Float -> double 30 | # Double -> double 31 | # Long -> double 32 | # Array[Byte] -> raw 33 | # Date -> Date 34 | # Time -> POSIXct 35 | # 36 | # Array[T] -> list() 37 | # Object -> jobj 38 | # 39 | # nolint end 40 | 41 | readObject <- function(con) { 42 | # Read type first 43 | type <- readType(con) 44 | readTypedObject(con, type) 45 | } 46 | 47 | readTypedObject <- function(con, type) { 48 | switch (type, 49 | "i" = readInt(con), 50 | "c" = readString(con), 51 | "b" = readBoolean(con), 52 | "d" = readDouble(con), 53 | "r" = readRaw(con), 54 | "D" = readDate(con), 55 | "t" = readTime(con), 56 | "a" = readArray(con), 57 | "l" = readList(con), 58 | "e" = readEnv(con), 59 | "s" = readStruct(con), 60 | "n" = NULL, 61 | "j" = getJobj(readString(con)), 62 | stop(paste("Unsupported type for deserialization", type))) 63 | } 64 | 65 | readString <- function(con) { 66 | stringLen <- readInt(con) 67 | raw <- readBin(con, raw(), stringLen, endian = "big") 68 | string <- rawToChar(raw) 69 | Encoding(string) <- "UTF-8" 70 | string 71 | } 72 | 73 | readInt <- function(con, n = 1) { 74 | readBin(con, integer(), n = n, endian = "big") 75 | } 76 | 77 | readDouble <- function(con, n = 1) { 78 | readBin(con, double(), n = n, endian = "big") 79 | } 80 | 81 | readBoolean <- function(con, n = 1) { 82 | as.logical(readInt(con, n = n)) 83 | } 84 | 85 | readType <- function(con) { 86 | rawToChar(readBin(con, "raw", n = 1L)) 87 | } 88 | 89 | readDate <- function(con) { 90 | as.Date(readString(con)) 91 | } 92 | 93 | readTime <- function(con, n = 1) { 94 | t <- readDouble(con, n) 95 | as.POSIXct(t, origin = "1970-01-01") 96 | } 97 | 98 | readArray <- function(con) { 99 | type <- readType(con) 100 | len <- readInt(con) 101 | 102 | # short-circuit for reading arrays of double, int, logical 103 | if (type == "d") { 104 | return(readDouble(con, n = len)) 105 | } else if (type == "i") { 106 | return(readInt(con, n = len)) 107 | } else if (type == "b") { 108 | return(readBoolean(con, n = len)) 109 | } 110 | 111 | if (len > 0) { 112 | l <- vector("list", len) 113 | for (i in 1:len) { 114 | l[[i]] <- readTypedObject(con, type) 115 | } 116 | l 117 | } else { 118 | list() 119 | } 120 | } 121 | 122 | # Read a list. Types of each element may be different. 123 | # Null objects are read as NA. 
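# For example (illustrative): a JVM-side list containing a String, an Int and
# a null is read back into R as list("a", 1L, NA).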
124 | readList <- function(con) { 125 | len <- readInt(con) 126 | if (len > 0) { 127 | l <- vector("list", len) 128 | for (i in 1:len) { 129 | elem <- readObject(con) 130 | if (is.null(elem)) { 131 | elem <- NA 132 | } 133 | l[[i]] <- elem 134 | } 135 | l 136 | } else { 137 | list() 138 | } 139 | } 140 | 141 | readEnv <- function(con) { 142 | env <- new.env() 143 | len <- readInt(con) 144 | if (len > 0) { 145 | for (i in 1:len) { 146 | key <- readString(con) 147 | value <- readObject(con) 148 | env[[key]] <- value 149 | } 150 | } 151 | env 152 | } 153 | 154 | # Convert a named list to struct so that 155 | # SerDe won't confuse between a normal named list and struct 156 | listToStruct <- function(list) { 157 | stopifnot(class(list) == "list") 158 | stopifnot(!is.null(names(list))) 159 | class(list) <- "struct" 160 | list 161 | } 162 | 163 | # Read a field of StructType from DataFrame 164 | # into a named list in R whose class is "struct" 165 | readStruct <- function(con) { 166 | names <- readObject(con) 167 | fields <- readObject(con) 168 | names(fields) <- names 169 | listToStruct(fields) 170 | } 171 | 172 | readRaw <- function(con) { 173 | dataLen <- readInt(con) 174 | readBin(con, raw(), as.integer(dataLen), endian = "big") 175 | } 176 | 177 | readRawLen <- function(con, dataLen) { 178 | readBin(con, raw(), as.integer(dataLen), endian = "big") 179 | } 180 | 181 | readDeserialize <- function(con) { 182 | # We have two cases that are possible - In one, the entire partition is 183 | # encoded as a byte array, so we have only one value to read. If so just 184 | # return firstData 185 | dataLen <- readInt(con) 186 | firstData <- unserialize( 187 | readBin(con, raw(), as.integer(dataLen), endian = "big")) 188 | 189 | # Else, read things into a list 190 | dataLen <- readInt(con) 191 | if (length(dataLen) > 0 && dataLen > 0) { 192 | data <- list(firstData) 193 | while (length(dataLen) > 0 && dataLen > 0) { 194 | data[[length(data) + 1L]] <- unserialize( 195 | readBin(con, raw(), as.integer(dataLen), endian = "big")) 196 | dataLen <- readInt(con) 197 | } 198 | unlist(data, recursive = FALSE) 199 | } else { 200 | firstData 201 | } 202 | } 203 | 204 | readMultipleObjects <- function(inputCon) { 205 | # readMultipleObjects will read multiple continuous objects from 206 | # a DataOutputStream. There is no preceding field telling the count 207 | # of the objects, so the number of objects varies, we try to read 208 | # all objects in a loop until the end of the stream. 209 | data <- list() 210 | while (TRUE) { 211 | # If reaching the end of the stream, type returned should be "". 212 | type <- readType(inputCon) 213 | if (type == "") { 214 | break 215 | } 216 | data[[length(data) + 1L]] <- readTypedObject(inputCon, type) 217 | } 218 | data # this is a list of named lists now 219 | } 220 | 221 | readRowList <- function(obj) { 222 | # readRowList is meant for use inside an lapply. As a result, it is 223 | # necessary to open a standalone connection for the row and consume 224 | # the numCols bytes inside the read function in order to correctly 225 | # deserialize the row. 
226 | rawObj <- rawConnection(obj, "r+") 227 | on.exit(close(rawObj)) 228 | readObject(rawObj) 229 | } 230 | -------------------------------------------------------------------------------- /R/serialize.R: -------------------------------------------------------------------------------- 1 | # Imported from: 2 | # https://raw.githubusercontent.com/apache/spark/branch-1.6/R/pkg/R/serialize.R 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Utility functions to serialize R objects so they can be read in Java. 21 | 22 | # nolint start 23 | # Type mapping from R to Java 24 | # 25 | # NULL -> Void 26 | # integer -> Int 27 | # character -> String 28 | # logical -> Boolean 29 | # double, numeric -> Double 30 | # raw -> Array[Byte] 31 | # Date -> Date 32 | # POSIXct,POSIXlt -> Time 33 | # 34 | # list[T] -> Array[T], where T is one of above mentioned types 35 | # environment -> Map[String, T], where T is a native type 36 | # jobj -> Object, where jobj is an object created in the backend 37 | # nolint end 38 | 39 | getSerdeType <- function(object) { 40 | type <- class(object)[[1]] 41 | if (type != "list") { 42 | type 43 | } else { 44 | # Check if all elements are of same type 45 | elemType <- unique(sapply(object, function(elem) { getSerdeType(elem) })) 46 | if (length(elemType) <= 1) { 47 | "array" 48 | } else { 49 | "list" 50 | } 51 | } 52 | } 53 | 54 | writeObject <- function(con, object, writeType = TRUE) { 55 | # NOTE: In R vectors have same type as objects. So we don't support 56 | # passing in vectors as arrays and instead require arrays to be passed 57 | # as lists. 
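  # For example (illustrative): to send an Array[Int] to the JVM, pass
  # list(1L, 2L, 3L) rather than c(1L, 2L, 3L); multi-element atomic vectors
  # are not serialized as arrays here.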
58 | type <- class(object)[[1]] # class of POSIXlt is c("POSIXlt", "POSIXt") 59 | # Checking types is needed here, since 'is.na' only handles atomic vectors, 60 | # lists and pairlists 61 | if (type %in% c("integer", "character", "logical", "double", "numeric")) { 62 | if (is.na(object)) { 63 | object <- NULL 64 | type <- "NULL" 65 | } 66 | } 67 | 68 | serdeType <- getSerdeType(object) 69 | if (writeType) { 70 | writeType(con, serdeType) 71 | } 72 | switch(serdeType, 73 | NULL = writeVoid(con), 74 | integer = writeInt(con, object), 75 | character = writeString(con, object), 76 | logical = writeBoolean(con, object), 77 | double = writeDouble(con, object), 78 | numeric = writeDouble(con, object), 79 | raw = writeRaw(con, object), 80 | array = writeArray(con, object), 81 | list = writeList(con, object), 82 | struct = writeList(con, object), 83 | spark_jobj = writeJobj(con, object), 84 | environment = writeEnv(con, object), 85 | Date = writeDate(con, object), 86 | POSIXlt = writeTime(con, object), 87 | POSIXct = writeTime(con, object), 88 | factor = writeFactor(con, object), 89 | stop(paste("Unsupported type for serialization", type))) 90 | } 91 | 92 | writeVoid <- function(con) { 93 | # no value for NULL 94 | } 95 | 96 | writeJobj <- function(con, value) { 97 | if (!isValidJobj(value)) { 98 | stop("invalid jobj ", value$id) 99 | } 100 | writeString(con, value$id) 101 | } 102 | 103 | writeString <- function(con, value) { 104 | utfVal <- enc2utf8(value) 105 | writeInt(con, as.integer(nchar(utfVal, type = "bytes") + 1)) 106 | writeBin(utfVal, con, endian = "big", useBytes = TRUE) 107 | } 108 | 109 | writeInt <- function(con, value) { 110 | writeBin(as.integer(value), con, endian = "big") 111 | } 112 | 113 | writeDouble <- function(con, value) { 114 | writeBin(value, con, endian = "big") 115 | } 116 | 117 | writeBoolean <- function(con, value) { 118 | # TRUE becomes 1, FALSE becomes 0 119 | writeInt(con, as.integer(value)) 120 | } 121 | 122 | writeRawSerialize <- function(outputCon, batch) { 123 | outputSer <- serialize(batch, ascii = FALSE, connection = NULL) 124 | writeRaw(outputCon, outputSer) 125 | } 126 | 127 | writeRowSerialize <- function(outputCon, rows) { 128 | invisible(lapply(rows, function(r) { 129 | bytes <- serializeRow(r) 130 | writeRaw(outputCon, bytes) 131 | })) 132 | } 133 | 134 | serializeRow <- function(row) { 135 | rawObj <- rawConnection(raw(0), "wb") 136 | on.exit(close(rawObj)) 137 | writeList(rawObj, row) 138 | rawConnectionValue(rawObj) 139 | } 140 | 141 | writeRaw <- function(con, batch) { 142 | writeInt(con, length(batch)) 143 | writeBin(batch, con, endian = "big") 144 | } 145 | 146 | writeType <- function(con, class) { 147 | type <- switch(class, 148 | NULL = "n", 149 | integer = "i", 150 | character = "c", 151 | logical = "b", 152 | double = "d", 153 | numeric = "d", 154 | raw = "r", 155 | array = "a", 156 | list = "l", 157 | struct = "s", 158 | spark_jobj = "j", 159 | environment = "e", 160 | Date = "D", 161 | POSIXlt = "t", 162 | POSIXct = "t", 163 | factor = "c", 164 | stop(paste("Unsupported type for serialization", class))) 165 | writeBin(charToRaw(type), con) 166 | } 167 | 168 | # Used to pass arrays where all the elements are of the same type 169 | writeArray <- function(con, arr) { 170 | # TODO: Empty lists are given type "character" right now. 171 | # This may not work if the Java side expects array of any other type. 
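  # In practice this means writeArray(con, list()) emits a zero-length
  # character array: a "c" type tag followed by a length of 0.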
172 | if (length(arr) == 0) { 173 | elemType <- class("somestring") 174 | } else { 175 | elemType <- getSerdeType(arr[[1]]) 176 | } 177 | 178 | writeType(con, elemType) 179 | writeInt(con, length(arr)) 180 | 181 | if (length(arr) > 0) { 182 | for (a in arr) { 183 | writeObject(con, a, FALSE) 184 | } 185 | } 186 | } 187 | 188 | # Used to pass arrays where the elements can be of different types 189 | writeList <- function(con, list) { 190 | writeInt(con, length(list)) 191 | for (elem in list) { 192 | writeObject(con, elem) 193 | } 194 | } 195 | 196 | # Used to pass in hash maps required on Java side. 197 | writeEnv <- function(con, env) { 198 | len <- length(env) 199 | 200 | writeInt(con, len) 201 | if (len > 0) { 202 | writeArray(con, as.list(ls(env))) 203 | vals <- lapply(ls(env), function(x) { env[[x]] }) 204 | writeList(con, as.list(vals)) 205 | } 206 | } 207 | 208 | writeDate <- function(con, date) { 209 | writeString(con, as.character(date)) 210 | } 211 | 212 | writeTime <- function(con, time) { 213 | writeDouble(con, as.double(time)) 214 | } 215 | 216 | writeFactor <- function(con, factor) { 217 | writeString(con, as.character(factor)) 218 | } 219 | 220 | # Used to serialize in a list of objects where each 221 | # object can be of a different type. Serialization format is 222 | # <object type> <object> for each object 223 | writeArgs <- function(con, args) { 224 | if (length(args) > 0) { 225 | for (a in args) { 226 | writeObject(con, a) 227 | } 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /inst/scala/handler.scala: -------------------------------------------------------------------------------- 1 | package sparkapi 2 | 3 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} 4 | 5 | import scala.collection.mutable.HashMap 6 | import scala.language.existentials 7 | 8 | import io.netty.channel.{ChannelHandlerContext, SimpleChannelInboundHandler} 9 | import io.netty.channel.ChannelHandler.Sharable 10 | 11 | import sparkapi.Logging._ 12 | import sparkapi.Serializer._ 13 | 14 | @Sharable 15 | class Handler(server: Backend) 16 | extends SimpleChannelInboundHandler[Array[Byte]] { 17 | 18 | override def channelRead0(ctx: ChannelHandlerContext, msg: Array[Byte]): Unit = { 19 | val bis = new ByteArrayInputStream(msg) 20 | val dis = new DataInputStream(bis) 21 | 22 | val bos = new ByteArrayOutputStream() 23 | val dos = new DataOutputStream(bos) 24 | 25 | // First bit is isStatic 26 | val isStatic = readBoolean(dis) 27 | val objId = readString(dis) 28 | val methodName = readString(dis) 29 | val numArgs = readInt(dis) 30 | 31 | if (objId == "Handler") { 32 | methodName match { 33 | // This function is for test purposes only 34 | case "echo" => 35 | val args = readArgs(numArgs, dis) 36 | assert(numArgs == 1) 37 | 38 | writeInt(dos, 0) 39 | writeObject(dos, args(0)) 40 | case "stopBackend" => 41 | writeInt(dos, 0) 42 | writeType(dos, "void") 43 | server.close() 44 | case "rm" => 45 | try { 46 | val t = readObjectType(dis) 47 | assert(t == 'c') 48 | val objToRemove = readString(dis) 49 | JVMObjectTracker.remove(objToRemove) 50 | writeInt(dos, 0) 51 | writeObject(dos, null) 52 | } catch { 53 | case e: Exception => 54 | logError(s"Removing $objId failed", e) 55 | writeInt(dos, -1) 56 | writeString(dos, s"Removing $objId failed: ${e.getMessage}") 57 | } 58 | case _ => 59 | dos.writeInt(-1) 60 | writeString(dos, s"Error: unknown method $methodName") 61 | } 62 | } else { 63 | handleMethodCall(isStatic, objId, methodName, numArgs,
dis, dos) 64 | } 65 | 66 | val reply = bos.toByteArray 67 | ctx.write(reply) 68 | } 69 | 70 | override def channelReadComplete(ctx: ChannelHandlerContext): Unit = { 71 | ctx.flush() 72 | } 73 | 74 | override def exceptionCaught(ctx: ChannelHandlerContext, cause: Throwable): Unit = { 75 | // Close the connection when an exception is raised. 76 | cause.printStackTrace() 77 | ctx.close() 78 | } 79 | 80 | def handleMethodCall( 81 | isStatic: Boolean, 82 | objId: String, 83 | methodName: String, 84 | numArgs: Int, 85 | dis: DataInputStream, 86 | dos: DataOutputStream): Unit = { 87 | var obj: Object = null 88 | try { 89 | val cls = if (isStatic) { 90 | Class.forName(objId) 91 | } else { 92 | JVMObjectTracker.get(objId) match { 93 | case None => throw new IllegalArgumentException("Object not found " + objId) 94 | case Some(o) => 95 | obj = o 96 | o.getClass 97 | } 98 | } 99 | 100 | val args = readArgs(numArgs, dis) 101 | 102 | val methods = cls.getMethods 103 | val selectedMethods = methods.filter(m => m.getName == methodName) 104 | if (selectedMethods.length > 0) { 105 | val index = findMatchedSignature( 106 | selectedMethods.map(_.getParameterTypes), 107 | args) 108 | 109 | if (index.isEmpty) { 110 | logWarning(s"cannot find matching method ${cls}.$methodName. " 111 | + s"Candidates are:") 112 | selectedMethods.foreach { method => 113 | logWarning(s"$methodName(${method.getParameterTypes.mkString(",")})") 114 | } 115 | throw new Exception(s"No matched method found for $cls.$methodName") 116 | } 117 | 118 | val ret = selectedMethods(index.get).invoke(obj, args : _*) 119 | 120 | // Write status bit 121 | writeInt(dos, 0) 122 | writeObject(dos, ret.asInstanceOf[AnyRef]) 123 | } else if (methodName == "") { 124 | // methodName should be "" for constructor 125 | val ctors = cls.getConstructors 126 | val index = findMatchedSignature( 127 | ctors.map(_.getParameterTypes), 128 | args) 129 | 130 | if (index.isEmpty) { 131 | logWarning(s"cannot find matching constructor for ${cls}. " 132 | + s"Candidates are:") 133 | ctors.foreach { ctor => 134 | logWarning(s"$cls(${ctor.getParameterTypes.mkString(",")})") 135 | } 136 | throw new Exception(s"No matched constructor found for $cls") 137 | } 138 | 139 | val obj = ctors(index.get).newInstance(args : _*) 140 | 141 | writeInt(dos, 0) 142 | writeObject(dos, obj.asInstanceOf[AnyRef]) 143 | } else { 144 | throw new IllegalArgumentException("invalid method " + methodName + " for object " + objId) 145 | } 146 | } catch { 147 | case e: Exception => 148 | logError(s"$methodName on $objId failed") 149 | writeInt(dos, -1) 150 | // Writing the error message of the cause for the exception. This will be returned 151 | // to user in the R process. 152 | writeString(dos, Utils.exceptionString(e.getCause)) 153 | } 154 | } 155 | 156 | // Read a number of arguments from the data input stream 157 | def readArgs(numArgs: Int, dis: DataInputStream): Array[java.lang.Object] = { 158 | (0 until numArgs).map { _ => 159 | readObject(dis) 160 | }.toArray 161 | } 162 | 163 | // Find a matching method signature in an array of signatures of constructors 164 | // or methods of the same name according to the passed arguments. Arguments 165 | // may be converted in order to match a signature. 166 | // 167 | // Note that in Java reflection, constructors and normal methods are of different 168 | // classes, and share no parent class that provides methods for reflection uses. 169 | // There is no unified way to handle them in this function. 
So an array of signatures 170 | // is passed in instead of an array of candidate constructors or methods. 171 | // 172 | // Returns an Option[Int] which is the index of the matched signature in the array. 173 | def findMatchedSignature( 174 | parameterTypesOfMethods: Array[Array[Class[_]]], 175 | args: Array[Object]): Option[Int] = { 176 | val numArgs = args.length 177 | 178 | for (index <- 0 until parameterTypesOfMethods.length) { 179 | val parameterTypes = parameterTypesOfMethods(index) 180 | 181 | if (parameterTypes.length == numArgs) { 182 | var argMatched = true 183 | var i = 0 184 | while (i < numArgs && argMatched) { 185 | val parameterType = parameterTypes(i) 186 | 187 | if (parameterType == classOf[Seq[Any]] && args(i).getClass.isArray) { 188 | // The case that the parameter type is a Scala Seq and the argument 189 | // is a Java array is considered matching. The array will be converted 190 | // to a Seq later if this method is matched. 191 | } else { 192 | var parameterWrapperType = parameterType 193 | 194 | // Convert native parameters to Object types as args is Array[Object] here 195 | if (parameterType.isPrimitive) { 196 | parameterWrapperType = parameterType match { 197 | case java.lang.Integer.TYPE => classOf[java.lang.Integer] 198 | case java.lang.Long.TYPE => classOf[java.lang.Integer] 199 | case java.lang.Double.TYPE => classOf[java.lang.Double] 200 | case java.lang.Boolean.TYPE => classOf[java.lang.Boolean] 201 | case _ => parameterType 202 | } 203 | } 204 | if ((parameterType.isPrimitive || args(i) != null) && 205 | !parameterWrapperType.isInstance(args(i))) { 206 | argMatched = false 207 | } 208 | } 209 | 210 | i = i + 1 211 | } 212 | 213 | if (argMatched) { 214 | // Convert args if needed 215 | val parameterTypes = parameterTypesOfMethods(index) 216 | 217 | (0 until numArgs).map { i => 218 | if (parameterTypes(i) == classOf[Seq[Any]] && args(i).getClass.isArray) { 219 | // Convert a Java array to scala Seq 220 | args(i) = args(i).asInstanceOf[Array[_]].toSeq 221 | } 222 | } 223 | 224 | return Some(index) 225 | } 226 | } 227 | } 228 | None 229 | } 230 | } 231 | 232 | /** 233 | * Helper singleton that tracks JVM objects returned to R. 234 | * This is useful for referencing these objects in RPC calls. 235 | */ 236 | object JVMObjectTracker { 237 | 238 | private[this] val objMap = new HashMap[String, Object] 239 | 240 | private[this] var objCounter: Int = 0 241 | 242 | def getObject(id: String): Object = { 243 | objMap(id) 244 | } 245 | 246 | def get(id: String): Option[Object] = { 247 | objMap.get(id) 248 | } 249 | 250 | def put(obj: Object): String = { 251 | val objId = objCounter.toString 252 | objCounter = objCounter + 1 253 | objMap.put(objId, obj) 254 | objId 255 | } 256 | 257 | def remove(id: String): Option[Object] = { 258 | objMap.remove(id) 259 | } 260 | } 261 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /R/shell.R: -------------------------------------------------------------------------------- 1 | #' Start the Spark R Shell 2 | #' 3 | #' @param master Spark cluster URL to connect to. Use \code{"local"} to connect to a local 4 | #' instance of Spark 5 | #' @param spark_home Spark home directory (defaults to SPARK_HOME environment variable) 6 | #' @param spark_version Spark version; if not specified, the version is taken from SPARK_HOME 7 | #' @param app_name Application name to be used while running in the Spark cluster 8 | #' @param config Named character vector of spark.* options 9 | #' @param jars Paths to Jar files to include 10 | #' @param packages Spark packages to include 11 | #' @param extensions Extension packages to include dependencies for 12 | #' (see \code{\link{spark_dependency}}). 13 | #' @param environment Environment variables to set 14 | #' @param shell_args Additional command line arguments for spark_shell 15 | #' @param sc \code{spark_connection} 16 | #' 17 | #' @return \code{spark_connection} object 18 | #' 19 | #' @export 20 | start_shell <- function(master, 21 | spark_home = Sys.getenv("SPARK_HOME"), 22 | spark_version = NULL, 23 | app_name = "sparkapi", 24 | config = list(), 25 | extensions = sparkapi::registered_extensions(), 26 | jars = NULL, 27 | packages = NULL, 28 | environment = NULL, 29 | shell_args = NULL) { 30 | # read app jar through config; this allows "sparkr-shell" to test the sparkr backend 31 | app_jar <- spark_config_value(config, "sparkapi.app.jar", NULL) 32 | if (is.null(app_jar)) { 33 | app_jar <- shQuote(normalizePath(system.file(file.path("java", "sparkapi-1.6.1.jar"), package = "sparkapi"), 34 | mustWork = FALSE)) 35 | shell_args <- c(shell_args, "--class", "sparkapi.Backend") 36 | } 37 | 38 | # validate and normalize spark_home 39 | if (!nzchar(spark_home)) 40 | stop("No spark_home specified (defaults to SPARK_HOME environment variable).") 41 | if (!dir.exists(spark_home)) 42 | stop("SPARK_HOME directory '", spark_home, "' not found") 43 | spark_home <- normalizePath(spark_home) 44 | 45 | # set SPARK_HOME into child process environment 46 | if (is.null(environment)) 47 | environment <- list() 48 | environment$SPARK_HOME <- spark_home 49 | 50 | # provide empty config if necessary 51 | if (is.null(config)) 52 | config <- list() 53 | 54 | # determine path to spark_submit 55 | spark_submit <- switch(.Platform$OS.type, 56 | unix = "spark-submit", 57 | windows = "spark-submit.cmd" 58 | ) 59 | spark_submit_path <- normalizePath(file.path(spark_home, "bin", spark_submit)) 60 | 61 | # resolve extensions 62 | spark_version <- numeric_version( 63 | ifelse(is.null(spark_version), 64 | spark_version_from_home(spark_home), 65 | gsub("[-_a-zA-Z]", "", spark_version) 66 | ) 67 | ) 68 | scala_version <- numeric_version("2.10") 69 | extensions <- spark_dependencies_from_extensions(spark_version, scala_version, extensions) 70 | 71 | # combine passed jars and packages with extensions 72 | jars <- normalizePath(unique(c(jars, extensions$jars))) 73 |
packages <- unique(c(packages, extensions$packages)) 74 | 75 | # add jars to arguments 76 | if (length(jars) > 0) { 77 | shell_args <- c(shell_args, "--jars", paste(shQuote(jars), collapse=",")) 78 | } 79 | 80 | # add packages to arguments 81 | if (length(packages) > 0) { 82 | shell_args <- c(shell_args, "--packages", paste(shQuote(packages), collapse=",")) 83 | } 84 | 85 | # add sparkr-shell to args 86 | shell_args <- c(shell_args, app_jar) 87 | 88 | # create temporary file for shell ports output and add it to the args 89 | shell_output_path <- spark_config_value(config, 90 | "sparkapi.ports.file", 91 | normalizePath(tempfile(fileext = ".out"), 92 | mustWork = FALSE)) 93 | 94 | on.exit(unlink(shell_output_path)) 95 | shell_args <- c(shell_args, shell_output_path) 96 | 97 | # create temp file for stdout and stderr 98 | output_file <- tempfile(fileext = "_spark.log") 99 | error_file <- tempfile(fileext = "_spark.err") 100 | 101 | # start the shell (w/ specified additional environment variables) 102 | env <- unlist(environment) 103 | withr::with_envvar(env, { 104 | if (.Platform$OS.type == "windows") { 105 | shell(paste( 106 | spark_submit_path, 107 | paste(shell_args, collapse = " "), 108 | ">", 109 | output_file, 110 | "2>", 111 | error_file 112 | ), 113 | wait = FALSE) 114 | } 115 | else { 116 | system2(spark_submit_path, 117 | args = shell_args, 118 | stdout = output_file, 119 | stderr = output_file, 120 | wait = FALSE) 121 | } 122 | }) 123 | 124 | # wait for the shell output file 125 | waitSeconds <- spark_config_value(config, "sparkapi.ports.wait.seconds", 100) 126 | if (!wait_file_exists(shell_output_path, waitSeconds)) { 127 | stop(paste( 128 | "Failed to launch Spark shell. Ports file does not exist.\n", 129 | " Path: ", spark_submit_path, "\n", 130 | " Parameters: ", paste(shell_args, collapse = ", "), "\n", 131 | " \n", 132 | paste(readLines(output_file), collapse = "\n"), 133 | if (file.exists(error_file)) paste(readLines(error_file), collapse = "\n") else "", 134 | sep = "")) 135 | } 136 | 137 | # read the shell output file 138 | shell_file <- read_shell_file(shell_output_path) 139 | 140 | # bind to the monitor and backend ports 141 | tryCatch({ 142 | monitor <- socketConnection(port = shell_file$monitorPort) 143 | }, error = function(err) { 144 | stop("Failed to open connection to monitor") 145 | }) 146 | 147 | tryCatch({ 148 | backend <- socketConnection(host = "localhost", 149 | port = shell_file$backendPort, 150 | server = FALSE, 151 | blocking = TRUE, 152 | open = "wb", 153 | timeout = 6000) 154 | }, error = function(err) { 155 | stop("Failed to open connection to backend") 156 | }) 157 | 158 | # create the shell connection 159 | sc <- structure(class = c("spark_connection", "spark_shell_connection"), list( 160 | # spark_connection 161 | master = master, 162 | spark_home = spark_home, 163 | app_name = app_name, 164 | config = config, 165 | # spark_shell_connection 166 | backend = backend, 167 | monitor = monitor, 168 | output_file = output_file 169 | )) 170 | 171 | # stop shell on R exit 172 | reg.finalizer(baseenv(), function(x) { 173 | if (connection_is_open(sc)) { 174 | stop_shell(sc) 175 | } 176 | }, onexit = TRUE) 177 | 178 | # initialize and return the connection 179 | initialize_connection(sc) 180 | } 181 | 182 | 183 | #' Stop the Spark R Shell 184 | #' 185 | #' @rdname start_shell 186 | #' 187 | #' @export 188 | stop_shell <- function(sc) { 189 | invoke_method(sc, 190 | FALSE, 191 | "Handler", 192 | "stopBackend") 193 | 194 | close(sc$backend) 195 | 
close(sc$monitor) 196 | } 197 | 198 | #' @export 199 | connection_is_open.spark_shell_connection <- function(sc) { 200 | bothOpen <- FALSE 201 | if (!identical(sc, NULL)) { 202 | tryCatch({ 203 | bothOpen <- isOpen(sc$backend) && isOpen(sc$monitor) 204 | }, error = function(e) { 205 | }) 206 | } 207 | bothOpen 208 | } 209 | 210 | #' @export 211 | spark_log.spark_shell_connection <- function(sc, n = 100, ...) { 212 | log <- file(sc$output_file) 213 | lines <- readLines(log) 214 | close(log) 215 | 216 | if (!is.null(n)) 217 | linesLog <- utils::tail(lines, n = n) 218 | else 219 | linesLog <- lines 220 | attr(linesLog, "class") <- "spark_log" 221 | 222 | linesLog 223 | } 224 | 225 | #' @export 226 | spark_web.spark_shell_connection <- function(sc, ...) { 227 | lines <- spark_log(sc, n = 200) 228 | 229 | uiLine <- grep("Started SparkUI at ", lines, perl=TRUE, value=TRUE) 230 | if (length(uiLine) > 0) { 231 | matches <- regexpr("http://.*", uiLine, perl=TRUE) 232 | match <- regmatches(uiLine, matches) 233 | if (length(match) > 0) { 234 | return(structure(match, class = "spark_web_url")) 235 | } 236 | } 237 | 238 | uiLine <- grep(".*Bound SparkUI to.*", lines, perl=TRUE, value=TRUE) 239 | if (length(uiLine) > 0) { 240 | matches <- regexec(".*Bound SparkUI to.*and started at (http.*)", uiLine, perl=TRUE) 241 | match <- regmatches(uiLine, matches) 242 | if (length(match) > 0 && length(match[[1]]) > 1) { 243 | return(structure(match[[1]][[2]], class = "spark_web_url")) 244 | } 245 | } 246 | 247 | warning("Spark UI URL not found in logs, attempting to guess.") 248 | structure("http://localhost:4040", class = "spark_web_url") 249 | } 250 | 251 | #' @export 252 | invoke_method.spark_shell_connection <- function(sc, static, object, method, ...) 253 | { 254 | if (is.null(sc)) { 255 | stop("The connection is no longer valid.") 256 | } 257 | 258 | # if the object is a jobj then get its id 259 | if (inherits(object, "spark_jobj")) 260 | object <- object$id 261 | 262 | rc <- rawConnection(raw(), "r+") 263 | writeBoolean(rc, static) 264 | writeString(rc, object) 265 | writeString(rc, method) 266 | 267 | args <- list(...) 268 | writeInt(rc, length(args)) 269 | writeArgs(rc, args) 270 | bytes <- rawConnectionValue(rc) 271 | close(rc) 272 | 273 | rc <- rawConnection(raw(0), "r+") 274 | writeInt(rc, length(bytes)) 275 | writeBin(bytes, rc) 276 | con <- rawConnectionValue(rc) 277 | close(rc) 278 | 279 | backend <- sc$backend 280 | writeBin(con, backend) 281 | 282 | returnStatus <- readInt(backend) 283 | if (length(returnStatus) == 0) 284 | stop("No status is returned. Spark R backend might have failed.") 285 | if (returnStatus != 0) { 286 | # get error message from backend and report to R 287 | msg <- readString(backend) 288 | if (nzchar(msg)) 289 | stop(msg, call. = FALSE) 290 | else { 291 | # read the spark log 292 | msg <- read_spark_log_error(sc) 293 | stop(msg, call. = FALSE) 294 | } 295 | } 296 | 297 | object <- readObject(backend) 298 | attach_connection(object, sc) 299 | } 300 | 301 | #' @export 302 | print_jobj.spark_shell_connection <- function(sc, jobj, ...)
{ 303 | if (connection_is_open(sc)) { 304 | info <- jobj_info(jobj) 305 | fmt <- "<jobj[%s]>\n  %s\n  %s\n" 306 | cat(sprintf(fmt, jobj$id, info$class, info$repr)) 307 | } else { 308 | fmt <- "<jobj[%s]>\n  <detached>" 309 | cat(sprintf(fmt, jobj$id)) 310 | } 311 | } 312 | 313 | 314 | attach_connection <- function(jobj, connection) { 315 | 316 | if (inherits(jobj, "spark_jobj")) { 317 | jobj$connection <- connection 318 | } 319 | else if (is.list(jobj) || inherits(jobj, "struct")) { 320 | jobj <- lapply(jobj, function(e) { 321 | attach_connection(e, connection) 322 | }) 323 | } 324 | else if (is.environment(jobj)) { 325 | jobj <- eapply(jobj, function(e) { 326 | attach_connection(e, connection) 327 | }) 328 | } 329 | 330 | jobj 331 | } 332 | 333 | 334 | read_shell_file <- function(shell_file) { 335 | 336 | shellOutputFile <- file(shell_file, open = "rb") 337 | backendPort <- readInt(shellOutputFile) 338 | monitorPort <- readInt(shellOutputFile) 339 | rLibraryPath <- readString(shellOutputFile) 340 | close(shellOutputFile) 341 | 342 | success <- length(backendPort) > 0 && backendPort > 0 && 343 | length(monitorPort) > 0 && monitorPort > 0 && 344 | length(rLibraryPath) == 1 345 | 346 | if (!success) 347 | stop("Invalid values found in shell output") 348 | 349 | list( 350 | backendPort = backendPort, 351 | monitorPort = monitorPort, 352 | rLibraryPath = rLibraryPath 353 | ) 354 | } 355 | 356 | 357 | wait_file_exists <- function(filename, seconds) { 358 | retries <- seconds * 10 359 | while (!file.exists(filename) && retries >= 0) { 360 | retries <- retries - 1 361 | Sys.sleep(0.1) 362 | } 363 | 364 | file.exists(filename) 365 | } 366 | 367 | read_spark_log_error <- function(sc) { 368 | # if no error message was reported, return information 369 | # from the Spark logs instead: all entries that share 370 | # the most recent timestamp 371 | msg <- "failed to invoke spark command (unknown reason)" 372 | try(silent = TRUE, { 373 | log <- sc$output_file 374 | splat <- strsplit(log, "\\s+", perl = TRUE) 375 | n <- length(splat) 376 | timestamp <- splat[[n]][[2]] 377 | regex <- paste("\\b", timestamp, "\\b", sep = "") 378 | entries <- grep(regex, log, perl = TRUE, value = TRUE) 379 | pasted <- paste(entries, collapse = "\n") 380 | msg <- paste("failed to invoke spark command", pasted, sep = "\n") 381 | }) 382 | msg 383 | } 384 | 385 | spark_config_value <- function(config, name, default = NULL) { 386 | if (is.null(config[[name]])) default else config[[name]] 387 | } 388 | -------------------------------------------------------------------------------- /inst/scala/serializer.scala: -------------------------------------------------------------------------------- 1 | package sparkapi 2 | 3 | import java.io.{DataInputStream, DataOutputStream} 4 | import java.nio.charset.StandardCharsets 5 | import java.sql.{Date, Time, Timestamp} 6 | 7 | import scala.collection.JavaConverters._ 8 | import scala.collection.mutable.WrappedArray 9 | 10 | object Serializer { 11 | type ReadObject = (DataInputStream, Char) => Object 12 | type WriteObject = (DataOutputStream, Object) => Boolean 13 | 14 | var sqlSerDe: (ReadObject, WriteObject) = _ 15 | 16 | def registerSqlSerDe(sqlSerDe: (ReadObject, WriteObject)): Unit = { 17 | this.sqlSerDe = sqlSerDe 18 | } 19 | 20 | // Type mapping from R to Java 21 | // 22 | // NULL -> void 23 | // integer -> Int 24 | // character -> String 25 | // logical -> Boolean 26 | // double, numeric -> Double 27 | // raw -> Array[Byte] 28 | // Date -> Date 29 | // POSIXlt/POSIXct -> Time 30 | // 31 | // list[T] -> Array[T], where T
is one of above mentioned types 32 | // environment -> Map[String, T], where T is a native type 33 | // jobj -> Object, where jobj is an object created in the backend 34 | 35 | def readObjectType(dis: DataInputStream): Char = { 36 | dis.readByte().toChar 37 | } 38 | 39 | def readObject(dis: DataInputStream): Object = { 40 | val dataType = readObjectType(dis) 41 | readTypedObject(dis, dataType) 42 | } 43 | 44 | def readTypedObject( 45 | dis: DataInputStream, 46 | dataType: Char): Object = { 47 | dataType match { 48 | case 'n' => null 49 | case 'i' => new java.lang.Integer(readInt(dis)) 50 | case 'd' => new java.lang.Double(readDouble(dis)) 51 | case 'b' => new java.lang.Boolean(readBoolean(dis)) 52 | case 'c' => readString(dis) 53 | case 'e' => readMap(dis) 54 | case 'r' => readBytes(dis) 55 | case 'a' => readArray(dis) 56 | case 'l' => readList(dis) 57 | case 'D' => readDate(dis) 58 | case 't' => readTime(dis) 59 | case 'j' => JVMObjectTracker.getObject(readString(dis)) 60 | case _ => 61 | if (sqlSerDe == null || sqlSerDe._1 == null) { 62 | throw new IllegalArgumentException (s"Invalid type $dataType") 63 | } else { 64 | val obj = (sqlSerDe._1)(dis, dataType) 65 | if (obj == null) { 66 | throw new IllegalArgumentException (s"Invalid type $dataType") 67 | } else { 68 | obj 69 | } 70 | } 71 | } 72 | } 73 | 74 | def readBytes(in: DataInputStream): Array[Byte] = { 75 | val len = readInt(in) 76 | val out = new Array[Byte](len) 77 | val bytesRead = in.readFully(out) 78 | out 79 | } 80 | 81 | def readInt(in: DataInputStream): Int = { 82 | in.readInt() 83 | } 84 | 85 | def readDouble(in: DataInputStream): Double = { 86 | in.readDouble() 87 | } 88 | 89 | def readStringBytes(in: DataInputStream, len: Int): String = { 90 | val bytes = new Array[Byte](len) 91 | in.readFully(bytes) 92 | assert(bytes(len - 1) == 0) 93 | val str = new String(bytes.dropRight(1), StandardCharsets.UTF_8) 94 | str 95 | } 96 | 97 | def readString(in: DataInputStream): String = { 98 | val len = in.readInt() 99 | readStringBytes(in, len) 100 | } 101 | 102 | def readBoolean(in: DataInputStream): Boolean = { 103 | val intVal = in.readInt() 104 | if (intVal == 0) false else true 105 | } 106 | 107 | def readDate(in: DataInputStream): Date = { 108 | Date.valueOf(readString(in)) 109 | } 110 | 111 | def readTime(in: DataInputStream): Timestamp = { 112 | val seconds = in.readDouble() 113 | val sec = Math.floor(seconds).toLong 114 | val t = new Timestamp(sec * 1000L) 115 | t.setNanos(((seconds - sec) * 1e9).toInt) 116 | t 117 | } 118 | 119 | def readBytesArr(in: DataInputStream): Array[Array[Byte]] = { 120 | val len = readInt(in) 121 | (0 until len).map(_ => readBytes(in)).toArray 122 | } 123 | 124 | def readIntArr(in: DataInputStream): Array[Int] = { 125 | val len = readInt(in) 126 | (0 until len).map(_ => readInt(in)).toArray 127 | } 128 | 129 | def readDoubleArr(in: DataInputStream): Array[Double] = { 130 | val len = readInt(in) 131 | (0 until len).map(_ => readDouble(in)).toArray 132 | } 133 | 134 | def readBooleanArr(in: DataInputStream): Array[Boolean] = { 135 | val len = readInt(in) 136 | (0 until len).map(_ => readBoolean(in)).toArray 137 | } 138 | 139 | def readStringArr(in: DataInputStream): Array[String] = { 140 | val len = readInt(in) 141 | (0 until len).map(_ => readString(in)).toArray 142 | } 143 | 144 | // All elements of an array must be of the same type 145 | def readArray(dis: DataInputStream): Array[_] = { 146 | val arrType = readObjectType(dis) 147 | arrType match { 148 | case 'i' => readIntArr(dis) 149 | case 'c' => 
readStringArr(dis) 150 | case 'd' => readDoubleArr(dis) 151 | case 'b' => readBooleanArr(dis) 152 | case 'j' => readStringArr(dis).map(x => JVMObjectTracker.getObject(x)) 153 | case 'r' => readBytesArr(dis) 154 | case 'a' => 155 | val len = readInt(dis) 156 | (0 until len).map(_ => readArray(dis)).toArray 157 | case 'l' => 158 | val len = readInt(dis) 159 | (0 until len).map(_ => readList(dis)).toArray 160 | case _ => 161 | if (sqlSerDe == null || sqlSerDe._1 == null) { 162 | throw new IllegalArgumentException (s"Invalid array type $arrType") 163 | } else { 164 | val len = readInt(dis) 165 | (0 until len).map { _ => 166 | val obj = (sqlSerDe._1)(dis, arrType) 167 | if (obj == null) { 168 | throw new IllegalArgumentException (s"Invalid array type $arrType") 169 | } else { 170 | obj 171 | } 172 | }.toArray 173 | } 174 | } 175 | } 176 | 177 | // Each element of a list can be of different type. They are all represented 178 | // as Object on JVM side 179 | def readList(dis: DataInputStream): Array[Object] = { 180 | val len = readInt(dis) 181 | (0 until len).map(_ => readObject(dis)).toArray 182 | } 183 | 184 | def readMap(in: DataInputStream): java.util.Map[Object, Object] = { 185 | val len = readInt(in) 186 | if (len > 0) { 187 | // Keys is an array of String 188 | val keys = readArray(in).asInstanceOf[Array[Object]] 189 | val values = readList(in) 190 | 191 | keys.zip(values).toMap.asJava 192 | } else { 193 | new java.util.HashMap[Object, Object]() 194 | } 195 | } 196 | 197 | // Methods to write out data from Java to R 198 | // 199 | // Type mapping from Java to R 200 | // 201 | // void -> NULL 202 | // Int -> integer 203 | // String -> character 204 | // Boolean -> logical 205 | // Float -> double 206 | // Double -> double 207 | // Decimal -> double 208 | // Long -> double 209 | // Array[Byte] -> raw 210 | // Date -> Date 211 | // Time -> POSIXct 212 | // 213 | // Array[T] -> list() 214 | // Object -> jobj 215 | 216 | def writeType(dos: DataOutputStream, typeStr: String): Unit = { 217 | typeStr match { 218 | case "void" => dos.writeByte('n') 219 | case "character" => dos.writeByte('c') 220 | case "double" => dos.writeByte('d') 221 | case "integer" => dos.writeByte('i') 222 | case "logical" => dos.writeByte('b') 223 | case "date" => dos.writeByte('D') 224 | case "time" => dos.writeByte('t') 225 | case "raw" => dos.writeByte('r') 226 | // Array of primitive types 227 | case "array" => dos.writeByte('a') 228 | // Array of objects 229 | case "list" => dos.writeByte('l') 230 | case "map" => dos.writeByte('e') 231 | case "jobj" => dos.writeByte('j') 232 | case _ => throw new IllegalArgumentException(s"Invalid type $typeStr") 233 | } 234 | } 235 | 236 | private def writeKeyValue(dos: DataOutputStream, key: Object, value: Object): Unit = { 237 | if (key == null) { 238 | throw new IllegalArgumentException("Key in map can't be null.") 239 | } else if (!key.isInstanceOf[String]) { 240 | throw new IllegalArgumentException(s"Invalid map key type: ${key.getClass.getName}") 241 | } 242 | 243 | writeString(dos, key.asInstanceOf[String]) 244 | writeObject(dos, value) 245 | } 246 | 247 | def writeObject(dos: DataOutputStream, obj: Object): Unit = { 248 | if (obj == null) { 249 | writeType(dos, "void") 250 | } else { 251 | // Convert ArrayType collected from DataFrame to Java array 252 | // Collected data of ArrayType from a DataFrame is observed to be of 253 | // type "scala.collection.mutable.WrappedArray" 254 | val value = 255 | if (obj.isInstanceOf[WrappedArray[_]]) { 256 | 
obj.asInstanceOf[WrappedArray[_]].toArray 257 | } else { 258 | obj 259 | } 260 | 261 | value match { 262 | case v: java.lang.Character => 263 | writeType(dos, "character") 264 | writeString(dos, v.toString) 265 | case v: java.lang.String => 266 | writeType(dos, "character") 267 | writeString(dos, v) 268 | case v: java.lang.Long => 269 | writeType(dos, "double") 270 | writeDouble(dos, v.toDouble) 271 | case v: java.lang.Float => 272 | writeType(dos, "double") 273 | writeDouble(dos, v.toDouble) 274 | case v: java.math.BigDecimal => 275 | writeType(dos, "double") 276 | writeDouble(dos, scala.math.BigDecimal(v).toDouble) 277 | case v: java.lang.Double => 278 | writeType(dos, "double") 279 | writeDouble(dos, v) 280 | case v: java.lang.Byte => 281 | writeType(dos, "integer") 282 | writeInt(dos, v.toInt) 283 | case v: java.lang.Short => 284 | writeType(dos, "integer") 285 | writeInt(dos, v.toInt) 286 | case v: java.lang.Integer => 287 | writeType(dos, "integer") 288 | writeInt(dos, v) 289 | case v: java.lang.Boolean => 290 | writeType(dos, "logical") 291 | writeBoolean(dos, v) 292 | case v: java.sql.Date => 293 | writeType(dos, "date") 294 | writeDate(dos, v) 295 | case v: java.sql.Time => 296 | writeType(dos, "time") 297 | writeTime(dos, v) 298 | case v: java.sql.Timestamp => 299 | writeType(dos, "time") 300 | writeTime(dos, v) 301 | 302 | // Handle arrays 303 | 304 | // Array of primitive types 305 | 306 | // Special handling for byte array 307 | case v: Array[Byte] => 308 | writeType(dos, "raw") 309 | writeBytes(dos, v) 310 | 311 | case v: Array[Char] => 312 | writeType(dos, "array") 313 | writeStringArr(dos, v.map(_.toString)) 314 | case v: Array[Short] => 315 | writeType(dos, "array") 316 | writeIntArr(dos, v.map(_.toInt)) 317 | case v: Array[Int] => 318 | writeType(dos, "array") 319 | writeIntArr(dos, v) 320 | case v: Array[Long] => 321 | writeType(dos, "array") 322 | writeDoubleArr(dos, v.map(_.toDouble)) 323 | case v: Array[Float] => 324 | writeType(dos, "array") 325 | writeDoubleArr(dos, v.map(_.toDouble)) 326 | case v: Array[Double] => 327 | writeType(dos, "array") 328 | writeDoubleArr(dos, v) 329 | case v: Array[Boolean] => 330 | writeType(dos, "array") 331 | writeBooleanArr(dos, v) 332 | 333 | // Array of objects, null objects use "void" type 334 | case v: Array[Object] => 335 | writeType(dos, "list") 336 | writeInt(dos, v.length) 337 | v.foreach(elem => writeObject(dos, elem)) 338 | 339 | // Handle Properties 340 | // This must be above the case java.util.Map below. 
341 | // (Properties implements Map and will be serialized as map otherwise) 342 | case v: java.util.Properties => 343 | writeType(dos, "jobj") 344 | writeJObj(dos, value) 345 | 346 | // Handle map 347 | case v: java.util.Map[_, _] => 348 | writeType(dos, "map") 349 | writeInt(dos, v.size) 350 | val iter = v.entrySet.iterator 351 | while(iter.hasNext) { 352 | val entry = iter.next 353 | val key = entry.getKey 354 | val value = entry.getValue 355 | 356 | writeKeyValue(dos, key.asInstanceOf[Object], value.asInstanceOf[Object]) 357 | } 358 | case v: scala.collection.Map[_, _] => 359 | writeType(dos, "map") 360 | writeInt(dos, v.size) 361 | v.foreach { case (key, value) => 362 | writeKeyValue(dos, key.asInstanceOf[Object], value.asInstanceOf[Object]) 363 | } 364 | 365 | case _ => 366 | if (sqlSerDe == null || sqlSerDe._2 == null || !(sqlSerDe._2)(dos, value)) { 367 | writeType(dos, "jobj") 368 | writeJObj(dos, value) 369 | } 370 | } 371 | } 372 | } 373 | 374 | def writeInt(out: DataOutputStream, value: Int): Unit = { 375 | out.writeInt(value) 376 | } 377 | 378 | def writeDouble(out: DataOutputStream, value: Double): Unit = { 379 | out.writeDouble(value) 380 | } 381 | 382 | def writeBoolean(out: DataOutputStream, value: Boolean): Unit = { 383 | val intValue = if (value) 1 else 0 384 | out.writeInt(intValue) 385 | } 386 | 387 | def writeDate(out: DataOutputStream, value: Date): Unit = { 388 | writeString(out, value.toString) 389 | } 390 | 391 | def writeTime(out: DataOutputStream, value: Time): Unit = { 392 | out.writeDouble(value.getTime.toDouble / 1000.0) 393 | } 394 | 395 | def writeTime(out: DataOutputStream, value: Timestamp): Unit = { 396 | out.writeDouble((value.getTime / 1000).toDouble + value.getNanos.toDouble / 1e9) 397 | } 398 | 399 | def writeString(out: DataOutputStream, value: String): Unit = { 400 | val utf8 = value.getBytes(StandardCharsets.UTF_8) 401 | val len = utf8.length 402 | out.writeInt(len) 403 | out.write(utf8, 0, len) 404 | } 405 | 406 | def writeBytes(out: DataOutputStream, value: Array[Byte]): Unit = { 407 | out.writeInt(value.length) 408 | out.write(value) 409 | } 410 | 411 | def writeJObj(out: DataOutputStream, value: Object): Unit = { 412 | val objId = JVMObjectTracker.put(value) 413 | writeString(out, objId) 414 | } 415 | 416 | def writeIntArr(out: DataOutputStream, value: Array[Int]): Unit = { 417 | writeType(out, "integer") 418 | out.writeInt(value.length) 419 | value.foreach(v => out.writeInt(v)) 420 | } 421 | 422 | def writeDoubleArr(out: DataOutputStream, value: Array[Double]): Unit = { 423 | writeType(out, "double") 424 | out.writeInt(value.length) 425 | value.foreach(v => out.writeDouble(v)) 426 | } 427 | 428 | def writeBooleanArr(out: DataOutputStream, value: Array[Boolean]): Unit = { 429 | writeType(out, "logical") 430 | out.writeInt(value.length) 431 | value.foreach(v => writeBoolean(out, v)) 432 | } 433 | 434 | def writeStringArr(out: DataOutputStream, value: Array[String]): Unit = { 435 | writeType(out, "character") 436 | out.writeInt(value.length) 437 | value.foreach(v => writeString(out, v)) 438 | } 439 | 440 | } 441 | 442 | object SerializationFormats { 443 | val BYTE = "byte" 444 | val STRING = "string" 445 | val ROW = "row" 446 | } 447 | --------------------------------------------------------------------------------