├── cluster ├── code │ └── .keep ├── .gitignore ├── .dockerignore ├── spark-env.sh ├── submit.sh ├── Dockerfile ├── README.md └── docker-compose.yml ├── .gitignore ├── sparkplug-core ├── .gitignore ├── test │ └── sparkplug │ │ ├── function │ │ └── test_fns.clj │ │ ├── kryo_test.clj │ │ ├── core_test.clj │ │ └── function_test.clj ├── resources │ └── sparkplug │ │ └── kryo │ │ └── registry │ │ ├── sparkplug.conf │ │ └── clojure.conf ├── dev-resources │ └── log4j.properties ├── README.md ├── src │ ├── java │ │ └── sparkplug │ │ │ ├── function │ │ │ ├── Fn1.java │ │ │ ├── ComparatorFn.java │ │ │ ├── VoidFn.java │ │ │ ├── Fn2.java │ │ │ ├── Fn3.java │ │ │ ├── FlatMapFn1.java │ │ │ ├── FlatMapFn2.java │ │ │ ├── PairFlatMapFn.java │ │ │ ├── PairFn.java │ │ │ └── SerializableFn.java │ │ │ ├── kryo │ │ │ └── ClassPathRegistrator.java │ │ │ ├── core │ │ │ └── UnionHelper.java │ │ │ ├── partition │ │ │ └── FnHashPartitioner.java │ │ │ └── broadcast │ │ │ └── DerefBroadcast.java │ └── clojure │ │ └── sparkplug │ │ ├── accumulator.clj │ │ ├── config.clj │ │ ├── scala.clj │ │ ├── function.clj │ │ ├── context.clj │ │ ├── rdd.clj │ │ ├── kryo.clj │ │ └── core.clj ├── project.clj └── dev │ └── user.clj ├── sparkplug-repl ├── .gitignore ├── src │ └── sparkplug │ │ └── repl │ │ ├── work.clj │ │ └── main.clj ├── project.clj └── README.md ├── .cljstyle ├── .clj-kondo ├── hooks │ └── sparkplug │ │ └── function.clj └── config.edn ├── LICENSE ├── project.clj ├── README.md ├── CHANGELOG.md ├── doc ├── serialization.md └── sparkling.md └── .circleci └── config.yml /cluster/code/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cluster/.gitignore: -------------------------------------------------------------------------------- 1 | /code 2 | /data 3 | -------------------------------------------------------------------------------- /cluster/.dockerignore: -------------------------------------------------------------------------------- 1 | /data 2 | /jars 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | classes 3 | checkouts 4 | .clj-kondo/.cache 5 | .lein-* 6 | .nrepl-port 7 | pom.xml 8 | pom.xml.asc 9 | *.jar 10 | *.class 11 | -------------------------------------------------------------------------------- /sparkplug-core/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | /.lein-* 5 | /.nrepl-port 6 | pom.xml 7 | pom.xml.asc 8 | *.jar 9 | *.class 10 | -------------------------------------------------------------------------------- /sparkplug-repl/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | /.lein-* 5 | /.nrepl-port 6 | pom.xml 7 | pom.xml.asc 8 | *.jar 9 | *.class 10 | -------------------------------------------------------------------------------- /cluster/spark-env.sh: -------------------------------------------------------------------------------- 1 | # Spark environment customizations 2 | 3 | export SPARK_DIST_CLASSPATH=$(/opt/hadoop/bin/hadoop classpath) 4 | export SPARK_NO_DAEMONIZE=1 5 | -------------------------------------------------------------------------------- /sparkplug-core/test/sparkplug/function/test_fns.clj: 
-------------------------------------------------------------------------------- 1 | (ns sparkplug.function.test-fns 2 | "AOT-compiled test functions.") 3 | 4 | 5 | (defn bool-closure 6 | [b] 7 | (fn inner 8 | [x] 9 | (when b 10 | x))) 11 | -------------------------------------------------------------------------------- /.cljstyle: -------------------------------------------------------------------------------- 1 | ;; vim: ft=clojure 2 | {:files 3 | {:ignore #{"checkouts" "target"}} 4 | 5 | :rules 6 | {:namespaces 7 | {:import-break-width 80} 8 | 9 | :indentation 10 | {:indents {for-all [[:block 1]] 11 | with-context [[:block 1]]}}}} 12 | -------------------------------------------------------------------------------- /cluster/submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | APP_DRIVER="$1" 4 | 5 | if [[ -z $APP_DRIVER ]]; then 6 | echo "No application driver code provided!" >&2 7 | exit 1 8 | fi 9 | 10 | if [[ ! -f code/$APP_DRIVER ]]; then 11 | echo "Couldn't find code/$APP_DRIVER - did you copy it in place?" >&2 12 | exit 2 13 | fi 14 | 15 | docker compose exec master \ 16 | /opt/spark/bin/spark-submit \ 17 | --master spark://master:7077 \ 18 | /mnt/code/$APP_DRIVER 19 | -------------------------------------------------------------------------------- /sparkplug-core/resources/sparkplug/kryo/registry/sparkplug.conf: -------------------------------------------------------------------------------- 1 | # SparkPlug types 2 | 3 | # Functions 4 | register sparkplug.function.Fn1 5 | register sparkplug.function.Fn2 6 | register sparkplug.function.Fn3 7 | register sparkplug.function.FlatMapFn1 8 | register sparkplug.function.FlatMapFn2 9 | register sparkplug.function.PairFn 10 | register sparkplug.function.PairFlatMapFn 11 | register sparkplug.function.ComparatorFn 12 | register sparkplug.function.VoidFn 13 | 14 | # Misc 15 | register sparkplug.broadcast.DerefBroadcast 16 | -------------------------------------------------------------------------------- /.clj-kondo/hooks/sparkplug/function.clj: -------------------------------------------------------------------------------- 1 | (ns hooks.sparkplug.function 2 | (:require 3 | [clj-kondo.hooks-api :as api])) 4 | 5 | 6 | (defn gen-function 7 | "Macro analysis for `sparkplug.function/gen-function`." 8 | [form] 9 | (let [name-sym (-> form :node :children (nth 2)) 10 | constructor (api/list-node 11 | [(api/token-node 'defn) 12 | name-sym 13 | (api/vector-node 14 | [(api/token-node '_f)])])] 15 | {:node constructor})) 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Amperity, Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /sparkplug-core/dev-resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # log4j config for clojure development 2 | log4j.rootLogger=WARN, stdout 3 | 4 | # Console appender 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %5p %c{2}:%L - %m%n 8 | 9 | log4j.logger.sparkplug=DEBUG 10 | log4j.logger.sparkplug.kryo=INFO 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 13 | log4j.logger.org.eclipse.jetty=WARN 14 | -------------------------------------------------------------------------------- /sparkplug-core/README.md: -------------------------------------------------------------------------------- 1 | Spark Core API 2 | ============== 3 | 4 | [![cljdoc](https://cljdoc.org/badge/amperity/sparkplug-core)](https://cljdoc.org/d/amperity/sparkplug-core/CURRENT) 5 | 6 | This library contains the core API for working with Spark. If you want to get 7 | the basic building blocks of a Spark application, you can use this directly. 8 | 9 | 10 | ## Installation 11 | 12 | Library releases are published on Clojars. To use the latest version with 13 | Leiningen, add the following dependency to your project: 14 | 15 | [![Clojars Project](https://clojars.org/amperity/sparkplug-core/latest-version.svg)](https://clojars.org/amperity/sparkplug-core) 16 | -------------------------------------------------------------------------------- /.clj-kondo/config.edn: -------------------------------------------------------------------------------- 1 | {:linters 2 | {:consistent-alias 3 | {:level :warning 4 | :aliases {clojure.java.io io 5 | clojure.set set 6 | clojure.string str 7 | clojure.tools.logging log 8 | sparkplug.function f 9 | sparkplug.kryo kryo 10 | sparkplug.rdd rdd}}} 11 | 12 | :lint-as 13 | {clojure.test.check.clojure-test/defspec clj-kondo.lint-as/def-catch-all 14 | sparkplug.context/with-context clojure.core/let 15 | sparkplug.kryo/defserializer clj-kondo.lint-as/def-catch-all} 16 | 17 | :hooks 18 | {:analyze-call {sparkplug.function/gen-function hooks.sparkplug.function/gen-function}}} 19 | -------------------------------------------------------------------------------- /sparkplug-core/src/java/sparkplug/function/Fn1.java: -------------------------------------------------------------------------------- 1 | package sparkplug.function; 2 | 3 | 4 | import clojure.lang.IFn; 5 | 6 | import java.util.Collection; 7 | 8 | import org.apache.spark.api.java.function.Function; 9 | 10 | 11 | /** 12 | * Compatibility wrapper for a Spark `Function` of one argument. 
13 | */ 14 | public class Fn1 extends SerializableFn implements Function { 15 | 16 | public Fn1(IFn f, Collection namespaces) { 17 | super(f, namespaces); 18 | } 19 | 20 | 21 | @Override 22 | @SuppressWarnings("unchecked") 23 | public Object call(Object v1) throws Exception { 24 | return f.invoke(v1); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /sparkplug-core/src/java/sparkplug/function/ComparatorFn.java: -------------------------------------------------------------------------------- 1 | package sparkplug.function; 2 | 3 | 4 | import clojure.lang.IFn; 5 | 6 | import java.util.Collection; 7 | import java.util.Comparator; 8 | 9 | 10 | /** 11 | * Compatibility wrapper for a `Comparator` of two arguments. 12 | */ 13 | public class ComparatorFn extends SerializableFn implements Comparator { 14 | 15 | public ComparatorFn(IFn f, Collection namespaces) { 16 | super(f, namespaces); 17 | } 18 | 19 | 20 | @Override 21 | @SuppressWarnings("unchecked") 22 | public int compare(Object v1, Object v2) { 23 | return (int)f.invoke(v1, v2); 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /sparkplug-core/src/java/sparkplug/function/VoidFn.java: -------------------------------------------------------------------------------- 1 | package sparkplug.function; 2 | 3 | 4 | import clojure.lang.IFn; 5 | 6 | import java.util.Collection; 7 | 8 | import org.apache.spark.api.java.function.VoidFunction; 9 | 10 | 11 | /** 12 | * Compatibility wrapper for a Spark `VoidFunction` of one argument. 13 | */ 14 | public class VoidFn extends SerializableFn implements VoidFunction { 15 | 16 | public VoidFn(IFn f, Collection namespaces) { 17 | super(f, namespaces); 18 | } 19 | 20 | 21 | @Override 22 | @SuppressWarnings("unchecked") 23 | public void call(Object v1) throws Exception { 24 | f.invoke(v1); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /sparkplug-core/src/java/sparkplug/function/Fn2.java: -------------------------------------------------------------------------------- 1 | package sparkplug.function; 2 | 3 | 4 | import clojure.lang.IFn; 5 | 6 | import java.util.Collection; 7 | 8 | import org.apache.spark.api.java.function.Function2; 9 | 10 | 11 | /** 12 | * Compatibility wrapper for a Spark `Function2` of two arguments. 13 | */ 14 | public class Fn2 extends SerializableFn implements Function2 { 15 | 16 | public Fn2(IFn f, Collection namespaces) { 17 | super(f, namespaces); 18 | } 19 | 20 | 21 | @Override 22 | @SuppressWarnings("unchecked") 23 | public Object call(Object v1, Object v2) throws Exception { 24 | return f.invoke(v1, v2); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /sparkplug-core/src/java/sparkplug/function/Fn3.java: -------------------------------------------------------------------------------- 1 | package sparkplug.function; 2 | 3 | 4 | import clojure.lang.IFn; 5 | 6 | import java.util.Collection; 7 | 8 | import org.apache.spark.api.java.function.Function3; 9 | 10 | 11 | /** 12 | * Compatibility wrapper for a Spark `Function3` of three arguments. 
13 | */ 14 | public class Fn3 extends SerializableFn implements Function3 { 15 | 16 | public Fn3(IFn f, Collection namespaces) { 17 | super(f, namespaces); 18 | } 19 | 20 | 21 | @Override 22 | @SuppressWarnings("unchecked") 23 | public Object call(Object v1, Object v2, Object v3) throws Exception { 24 | return f.invoke(v1, v2, v3); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /sparkplug-core/test/sparkplug/kryo_test.clj: -------------------------------------------------------------------------------- 1 | (ns sparkplug.kryo-test 2 | (:require 3 | [clojure.test :refer [deftest is]] 4 | [clojure.test.check.clojure-test :refer [defspec]] 5 | [clojure.test.check.generators :as gen] 6 | [clojure.test.check.properties :as prop] 7 | [sparkplug.kryo :as kryo])) 8 | 9 | 10 | (deftest classpath-search 11 | (let [registries (kryo/classpath-registries)] 12 | (is (sequential? registries)) 13 | (is (<= 2 (count registries))))) 14 | 15 | 16 | (def kryo (kryo/initialize)) 17 | 18 | 19 | (defspec clojure-data-roundtrip 20 | {:num-tests 1000 21 | :max-size 20} 22 | (prop/for-all [x gen/any-equatable] 23 | (is (= x (->> x (kryo/encode kryo) (kryo/decode kryo)))))) 24 | -------------------------------------------------------------------------------- /sparkplug-repl/src/sparkplug/repl/work.clj: -------------------------------------------------------------------------------- 1 | (ns sparkplug.repl.work 2 | (:require 3 | [clojure.java.io :as io] 4 | [clojure.repl :refer :all] 5 | [clojure.set :as set] 6 | [clojure.stacktrace :refer [print-cause-trace]] 7 | [clojure.string :as str] 8 | [sparkplug.config :as conf] 9 | [sparkplug.context :as ctx] 10 | [sparkplug.core :as spark] 11 | [sparkplug.function :as f] 12 | [sparkplug.kryo :as kryo] 13 | [sparkplug.rdd :as rdd] 14 | [sparkplug.scala :as scala])) 15 | 16 | 17 | (def spark-context 18 | "The currently active Spark context." 19 | nil) 20 | 21 | 22 | (defn exit! 23 | "Exit the running REPL gracefully." 24 | [] 25 | (deliver @(resolve 'sparkplug.repl.main/exit-promise) :exit)) 26 | -------------------------------------------------------------------------------- /sparkplug-core/src/java/sparkplug/function/FlatMapFn1.java: -------------------------------------------------------------------------------- 1 | package sparkplug.function; 2 | 3 | 4 | import clojure.lang.IFn; 5 | 6 | import java.util.Iterator; 7 | import java.util.Collection; 8 | 9 | import org.apache.spark.api.java.function.FlatMapFunction; 10 | 11 | 12 | /** 13 | * Compatibility wrapper for a Spark `FlatMapFunction` of one argument. 
14 | */ 15 | public class FlatMapFn1 extends SerializableFn implements FlatMapFunction { 16 | 17 | public FlatMapFn1(IFn f, Collection namespaces) { 18 | super(f, namespaces); 19 | } 20 | 21 | 22 | @Override 23 | @SuppressWarnings("unchecked") 24 | public Iterator call(Object v1) throws Exception { 25 | Collection results = (Collection)f.invoke(v1); 26 | return results.iterator(); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject amperity/sparkplug "1.1.0" 2 | :description "Clojure API for Apache Spark" 3 | :url "https://github.com/amperity/sparkplug" 4 | :license {:name "Apache License 2.0" 5 | :url "http://www.apache.org/licenses/LICENSE-2.0"} 6 | 7 | :deploy-repositories {"releases" {:url "https://repo.clojars.org"}} 8 | :deploy-branches ["main"] 9 | :pedantic? :warn 10 | 11 | :plugins 12 | [[lein-cloverage "1.2.2"] 13 | [lein-monolith "1.7.0"]] 14 | 15 | :dependencies 16 | [[org.clojure/clojure "1.12.0"] 17 | [amperity/sparkplug-core "1.1.0"]] 18 | 19 | :profiles 20 | {:dev 21 | {:dependencies 22 | [[org.clojure/test.check "1.1.1"]]}} 23 | 24 | :monolith 25 | {:project-dirs ["sparkplug-core" 26 | "sparkplug-repl"] 27 | :inherit [:deploy-branches 28 | :pedantic?]}) 29 | -------------------------------------------------------------------------------- /sparkplug-core/src/java/sparkplug/function/FlatMapFn2.java: -------------------------------------------------------------------------------- 1 | package sparkplug.function; 2 | 3 | 4 | import clojure.lang.IFn; 5 | 6 | import java.util.Iterator; 7 | import java.util.Collection; 8 | 9 | import org.apache.spark.api.java.function.FlatMapFunction2; 10 | 11 | 12 | /** 13 | * Compatibility wrapper for a Spark `FlatMapFunction2` of two arguments. 
14 | */ 15 | public class FlatMapFn2 extends SerializableFn implements FlatMapFunction2 { 16 | 17 | public FlatMapFn2(IFn f, Collection namespaces) { 18 | super(f, namespaces); 19 | } 20 | 21 | 22 | @Override 23 | @SuppressWarnings("unchecked") 24 | public Iterator call(Object v1, Object v2) throws Exception { 25 | Collection results = (Collection)f.invoke(v1, v2); 26 | return results.iterator(); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /cluster/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM eclipse-temurin:11-jdk 2 | 3 | RUN apt update 4 | RUN apt install -yy ca-certificates wget bash procps coreutils python3 5 | RUN update-ca-certificates 6 | 7 | RUN mkdir -p /opt 8 | WORKDIR /opt 9 | 10 | ARG HADOOP_VERSION 11 | RUN wget https://dlcdn.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz && \ 12 | tar -xzvf hadoop-${HADOOP_VERSION}.tar.gz && \ 13 | rm hadoop-${HADOOP_VERSION}.tar.gz && \ 14 | mv hadoop-${HADOOP_VERSION} hadoop 15 | 16 | ARG SPARK_VERSION 17 | ARG SPARK_VARIANT=without-hadoop 18 | RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${SPARK_VARIANT}.tgz && \ 19 | tar -xzvf spark-${SPARK_VERSION}-bin-${SPARK_VARIANT}.tgz && \ 20 | rm spark-${SPARK_VERSION}-bin-${SPARK_VARIANT}.tgz && \ 21 | mv spark-${SPARK_VERSION}-bin-${SPARK_VARIANT} spark 22 | 23 | ENV HADOOP_HOME=/opt/hadoop 24 | ENV SPARK_HOME=/opt/spark 25 | ADD spark-env.sh /opt/spark/conf/spark-env.sh 26 | 27 | RUN mkdir -p /tmp/spark-events 28 | -------------------------------------------------------------------------------- /sparkplug-repl/project.clj: -------------------------------------------------------------------------------- 1 | (defproject amperity/sparkplug-repl "1.1.0" 2 | :description "Clojure REPL for Spark exploration" 3 | :url "https://github.com/amperity/sparkplug" 4 | :scm {:dir ".."} 5 | :license {:name "Apache License 2.0" 6 | :url "http://www.apache.org/licenses/LICENSE-2.0"} 7 | 8 | :monolith/inherit true 9 | 10 | :dependencies 11 | [[org.clojure/clojure "1.12.0"] 12 | [amperity/sparkplug-core "1.1.0"] 13 | [mvxcvi/whidbey "2.2.1"] 14 | [nrepl "1.3.0"]] 15 | 16 | :main sparkplug.repl.main 17 | 18 | :profiles 19 | {:default 20 | [:base :system :user :provided :spark-3.5 :dev] 21 | 22 | :repl 23 | {:repl-options 24 | {:custom-init (whidbey.repl/update-print-fn!) 
25 | :init-ns sparkplug.repl.work}} 26 | 27 | :spark-3.1 28 | ^{:pom-scope :provided} 29 | {:dependencies 30 | [[org.apache.spark/spark-core_2.12 "3.1.3"] 31 | [org.apache.spark/spark-sql_2.12 "3.1.3"]]} 32 | 33 | :spark-3.5 34 | ^{:pom-scope :provided} 35 | {:dependencies 36 | [[org.apache.spark/spark-core_2.12 "3.5.1"] 37 | [org.apache.spark/spark-sql_2.12 "3.5.1"]]} 38 | 39 | :uberjar 40 | {:target-path "target/uberjar" 41 | :uberjar-name "sparkplug-repl.jar" 42 | :aot :all}}) 43 | -------------------------------------------------------------------------------- /sparkplug-core/src/java/sparkplug/function/PairFlatMapFn.java: -------------------------------------------------------------------------------- 1 | package sparkplug.function; 2 | 3 | 4 | import clojure.lang.IFn; 5 | 6 | import java.util.Collection; 7 | import java.util.Iterator; 8 | 9 | import org.apache.spark.api.java.function.PairFlatMapFunction; 10 | 11 | import scala.Tuple2; 12 | 13 | 14 | /** 15 | * Compatibility wrapper for a Spark `PairFlatMapFunction` of one argument 16 | * which returns a sequence of pairs. 17 | */ 18 | public class PairFlatMapFn extends SerializableFn implements PairFlatMapFunction { 19 | 20 | public PairFlatMapFn(IFn f, Collection namespaces) { 21 | super(f, namespaces); 22 | } 23 | 24 | 25 | @Override 26 | @SuppressWarnings("unchecked") 27 | public Iterator> call(Object v1) throws Exception { 28 | Collection result = (Collection)f.invoke(v1); 29 | Iterator results = result.iterator(); 30 | return new Iterator>() { 31 | public boolean hasNext() { 32 | return results.hasNext(); 33 | } 34 | 35 | public Tuple2 next() { 36 | return PairFn.coercePair(f, results.next()); 37 | } 38 | }; 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /sparkplug-core/src/java/sparkplug/kryo/ClassPathRegistrator.java: -------------------------------------------------------------------------------- 1 | package sparkplug.kryo; 2 | 3 | 4 | import clojure.lang.IFn; 5 | import clojure.lang.RT; 6 | import clojure.lang.Symbol; 7 | 8 | import com.esotericsoftware.kryo.Kryo; 9 | 10 | import org.apache.spark.serializer.KryoRegistrator; 11 | 12 | 13 | /** 14 | * Spark interop class to register types for serialization with Kryo. 15 | */ 16 | public class ClassPathRegistrator implements KryoRegistrator { 17 | 18 | /** 19 | * Wrapper class to efficiently ensure the configuration function is only 20 | * loaded once. 
21 | */ 22 | private static class Singleton { 23 | 24 | private static final IFn configure; 25 | 26 | static { 27 | IFn resolve = RT.var("clojure.core", "requiring-resolve"); 28 | Symbol name = Symbol.intern("sparkplug.kryo", "load-configuration"); 29 | IFn loader = (IFn)resolve.invoke(name); 30 | configure = (IFn)loader.invoke(); 31 | } 32 | 33 | } 34 | 35 | 36 | @Override 37 | public void registerClasses(Kryo kryo) { 38 | 39 | IFn configure = Singleton.configure; 40 | 41 | if (configure == null) { 42 | throw new RuntimeException("Could not construct kryo configuration function!"); 43 | } 44 | 45 | configure.invoke(kryo); 46 | 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /sparkplug-core/src/java/sparkplug/core/UnionHelper.java: -------------------------------------------------------------------------------- 1 | package sparkplug.core; 2 | 3 | import org.apache.spark.api.java.JavaPairRDD; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | 7 | /** 8 | * This is a simple wrapper to call the `union` method on `JavaSparkContext`. 9 | * 10 | * It is written in Java because: 11 | * 12 | * The non-varargs version of `union` was removed in Spark 3, leaving the varargs version as the 13 | * only one that is compatible with both Spark 2 and Spark 3. See: 14 | * 15 | * 16 | * Unfortunately, Clojure is unable to call the varargs version, due to a compiler bug. Doing so 17 | * will fail with errors such as: 18 | * 19 | * IllegalArgumentException: Can't call public method of non-public class: public final 20 | * org.apache.spark.api.java.JavaPairRDD 21 | * org.apache.spark.api.java.JavaSparkContextVarargsWorkaround.union(org.apache.spark.api.java.JavaPairRDD[]) 22 | * 23 | * See: 24 | */ 25 | public class UnionHelper { 26 | public static JavaRDD unionJavaRDDs(JavaSparkContext jsc, JavaRDD[] rdds) { 27 | return jsc.union(rdds); 28 | } 29 | 30 | public static JavaPairRDD unionJavaPairRDDs(JavaSparkContext jsc, JavaPairRDD[] rdds) { 31 | return jsc.union(rdds); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Clojure Spark API 2 | ================= 3 | 4 | [![CircleCI](https://dl.circleci.com/status-badge/img/gh/amperity/sparkplug/tree/main.svg?style=shield)](https://dl.circleci.com/status-badge/redirect/gh/amperity/sparkplug/tree/main) 5 | [![codecov](https://codecov.io/gh/amperity/sparkplug/branch/master/graph/badge.svg)](https://codecov.io/gh/amperity/sparkplug) 6 | [![cljdoc](https://cljdoc.org/badge/amperity/sparkplug)](https://cljdoc.org/d/amperity/sparkplug/CURRENT) 7 | 8 | SparkPlug is a Clojure API for [Apache Spark](http://spark.apache.org/). 9 | 10 | 11 | ## Installation 12 | 13 | Library releases are published on Clojars. To use the latest version with 14 | Leiningen, add the following dependency to your project: 15 | 16 | [![Clojars Project](https://clojars.org/amperity/sparkplug/latest-version.svg)](https://clojars.org/amperity/sparkplug) 17 | 18 | This will pull in the omnibus package, which in turn depends on each subproject 19 | of the same version. You may instead depend on the subprojects directly if you 20 | wish to omit some functionality, such as Spark SQL or Machine Learning 21 | dependencies. 
22 | 23 | 24 | ## Usage 25 | 26 | The sparkplug-core package provides functions for working with RDDs, broadcasts, 27 | and accumulators with the classic Spark context API. 28 | See the [cljdoc](https://cljdoc.org/d/amperity/sparkplug-core/CURRENT) for API docs. 29 | 30 | 31 | ## License 32 | 33 | Licensed under the Apache License, Version 2.0. See the [LICENSE](LICENSE) file 34 | for rights and restrictions. 35 | -------------------------------------------------------------------------------- /sparkplug-repl/README.md: -------------------------------------------------------------------------------- 1 | Spark REPL 2 | ========== 3 | 4 | This project provides a server providing an interactive REPL experience while 5 | connected to a Spark cluster. 6 | 7 | 8 | ## Usage 9 | 10 | First, build the REPL uberjar and copy it into the Docker cluster: 11 | 12 | ```shell 13 | lein uberjar 14 | cp target/uberjar/sparkplug-repl.jar ../cluster/code 15 | ``` 16 | 17 | Next, start up the REPL container in another terminal: 18 | 19 | ``` 20 | $ docker-compose up repl 21 | ``` 22 | 23 | Finally, connect to the REPL running in the container: 24 | 25 | ``` 26 | $ lein repl :connect 8765 27 | ``` 28 | 29 | If all goes well, you should see the prompt: 30 | 31 | ``` 32 | sparkplug.repl.work=> 33 | ``` 34 | 35 | The currently running Spark application context is available via the 36 | `spark-context` var, and the WebUI runs on http://localhost:4050/. When you're 37 | done with the REPL you can hit `^D` (Control + D) to hang up and leave the 38 | container running, or call `(exit!)` to shut it down cleanly and stop the Spark 39 | application. 40 | 41 | 42 | ## Limitations 43 | 44 | Currently, you cannot use any dynamically-defined functions. Because these 45 | classes are defined locally, Spark won't be able to deserialize the instances on 46 | the executors. 47 | 48 | If you can express your logic in terms of existing higher-order functions, this 49 | will still work: 50 | 51 | ```clojure 52 | ;; won't work 53 | (spark/map->pairs #(vector % 1)) 54 | 55 | ;; will work! 
56 | (spark/map->pairs (juxt identity (constantly 1))) 57 | ``` 58 | -------------------------------------------------------------------------------- /sparkplug-core/project.clj: -------------------------------------------------------------------------------- 1 | (defproject amperity/sparkplug-core "1.1.0" 2 | :description "Clojure API for Apache Spark" 3 | :url "https://github.com/amperity/sparkplug" 4 | :scm {:dir ".."} 5 | :license {:name "Apache License 2.0" 6 | :url "http://www.apache.org/licenses/LICENSE-2.0"} 7 | 8 | :monolith/inherit true 9 | 10 | :dependencies 11 | [[org.clojure/clojure "1.12.0"] 12 | [org.clojure/java.classpath "1.1.0"] 13 | [org.clojure/tools.logging "1.3.0"]] 14 | 15 | :source-paths ["src/clojure"] 16 | :java-source-paths ["src/java"] 17 | 18 | :profiles 19 | {:default 20 | [:base :system :user :provided :spark-3.5 :dev] 21 | 22 | :dev 23 | {:dependencies 24 | [[org.clojure/test.check "1.1.1"] 25 | [org.slf4j/slf4j-api "2.0.16"] 26 | [org.slf4j/slf4j-simple "2.0.16"]] 27 | :jvm-opts ["-Xmx2g" 28 | "-XX:-OmitStackTraceInFastThrow" 29 | "-Dorg.slf4j.simpleLogger.defaultLogLevel=warn" 30 | "-Dorg.slf4j.simpleLogger.log.org.apache=warn"]} 31 | 32 | :repl 33 | {:source-paths ["dev"] 34 | :aot [sparkplug.function.test-fns] 35 | :dependencies 36 | [[org.clojure/tools.namespace "1.5.0"]]} 37 | 38 | :test 39 | {:aot [sparkplug.function.test-fns]} 40 | 41 | :spark-3.1 42 | ^{:pom-scope :provided} 43 | {:dependencies 44 | [[org.apache.spark/spark-core_2.12 "3.1.3"]]} 45 | 46 | :spark-3.5 47 | ^{:pom-scope :provided} 48 | {:dependencies 49 | [[org.apache.spark/spark-core_2.12 "3.5.1" 50 | :exclusions [org.apache.logging.log4j/log4j-slf4j2-impl]] 51 | 52 | ;; Conflict resolution 53 | [com.fasterxml.jackson.core/jackson-core "2.15.2"] 54 | [com.google.code.findbugs/jsr305 "3.0.2"]]}}) 55 | -------------------------------------------------------------------------------- /sparkplug-core/src/java/sparkplug/partition/FnHashPartitioner.java: -------------------------------------------------------------------------------- 1 | package sparkplug.partition; 2 | 3 | import static clojure.lang.Util.hasheq; 4 | import org.apache.spark.Partitioner; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import sparkplug.function.Fn1; 9 | 10 | 11 | /** 12 | * A Partitioner Similar to Spark's HashPartitioner, which also accepts a key 13 | * function to translate an Object into a hashable key, and uses Clojure's 14 | * hash function instead of Object.hashCode(). 15 | */ 16 | public class FnHashPartitioner extends Partitioner { 17 | 18 | private static final Logger logger = LoggerFactory.getLogger(FnHashPartitioner.class); 19 | 20 | private final int numPartitions; 21 | private final Fn1 keyFn; 22 | 23 | public FnHashPartitioner(int numPartitions, Fn1 keyFn) { 24 | if (numPartitions <= 0) { 25 | throw new IllegalArgumentException("Number of partitions must be positive, got " + numPartitions); 26 | } 27 | if (keyFn == null) { 28 | throw new IllegalArgumentException("Key function must not be null"); 29 | } 30 | this.numPartitions = numPartitions; 31 | this.keyFn = keyFn; 32 | } 33 | 34 | @Override 35 | public int numPartitions() { 36 | return this.numPartitions; 37 | } 38 | 39 | @Override 40 | public int getPartition(Object key) { 41 | Object transformedKey = null; 42 | try { 43 | transformedKey = this.keyFn.call(key); 44 | } catch (Exception e) { 45 | logger.error("Key function threw an exception, so this key will be hashed as if it were null." 
46 |                 + " This is likely to cause skewed partitioning.", e);
47 |         }
48 | 
49 |         return Math.floorMod(hasheq(transformedKey), this.numPartitions);
50 |     }
51 | 
52 | }
53 | 


--------------------------------------------------------------------------------
/sparkplug-core/resources/sparkplug/kryo/registry/clojure.conf:
--------------------------------------------------------------------------------
 1 | # Clojure language types
 2 | 
 3 | # Value types
 4 | register clojure.lang.BigInt sparkplug.kryo/bigint-serializer
 5 | register clojure.lang.Keyword sparkplug.kryo/ident-serializer
 6 | register clojure.lang.Symbol sparkplug.kryo/ident-serializer
 7 | register clojure.lang.Ratio sparkplug.kryo/ratio-serializer
 8 | register clojure.lang.Var sparkplug.kryo/var-serializer
 9 | 
10 | # Sequences
11 | register clojure.lang.Cons sparkplug.kryo/sequence-serializer
12 | register clojure.lang.PersistentList$EmptyList sparkplug.kryo/sequence-serializer
13 | register clojure.lang.PersistentList sparkplug.kryo/sequence-serializer
14 | register clojure.lang.LazySeq sparkplug.kryo/sequence-serializer
15 | register clojure.lang.IteratorSeq sparkplug.kryo/sequence-serializer
16 | register clojure.lang.ArraySeq sparkplug.kryo/sequence-serializer
17 | register clojure.lang.PersistentVector$ChunkedSeq sparkplug.kryo/sequence-serializer
18 | register clojure.lang.StringSeq sparkplug.kryo/string-seq-serializer
19 | 
20 | # Vectors
21 | register clojure.lang.MapEntry sparkplug.kryo/vector-serializer
22 | register clojure.lang.PersistentVector sparkplug.kryo/vector-serializer
23 | register clojure.lang.APersistentVector$SubVector sparkplug.kryo/vector-serializer
24 | 
25 | # Maps
26 | register clojure.lang.PersistentArrayMap sparkplug.kryo/map-serializer
27 | register clojure.lang.PersistentHashMap sparkplug.kryo/map-serializer
28 | register clojure.lang.PersistentStructMap sparkplug.kryo/map-serializer
29 | register clojure.lang.PersistentTreeMap sparkplug.kryo/ordered-map-serializer
30 | 
31 | # Sets
32 | register clojure.lang.PersistentHashSet sparkplug.kryo/set-serializer
33 | register clojure.lang.PersistentTreeSet sparkplug.kryo/ordered-map-serializer
34 | 
35 | # Others
36 | register clojure.lang.MethodImplCache
37 | register clojure.lang.RT$DefaultComparator


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/broadcast/DerefBroadcast.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.broadcast;
 2 | 
 3 | 
 4 | import clojure.lang.IDeref;
 5 | 
 6 | import org.apache.spark.broadcast.Broadcast;
 7 | 
 8 | import scala.reflect.ClassTag;
 9 | 
10 | 
11 | /**
12 |  * This class extends Spark's broadcast type so that it can be used with the
13 |  * Clojure <code>deref</code> function and reader macro.
14 |  */
15 | public class DerefBroadcast<T> extends Broadcast<T> implements IDeref {
16 | 
17 |     public final Broadcast wrapped;
18 | 
19 | 
20 |     /**
21 |      * Construct a new DerefBroadcast wrapping the given broadcast value.
22 |      */
23 |     public DerefBroadcast(Broadcast wrapped, Class cls) {
24 |         super(wrapped.id(), ClassTag.apply(cls));
25 |         this.wrapped = wrapped;
26 |     }
27 | 
28 | 
29 |     @Override
30 |     public boolean equals(Object other) {
31 |         if (this == other) {
32 |             return true;
33 |         } else if (other instanceof DerefBroadcast) {
34 |             DerefBroadcast db = (DerefBroadcast)other;
35 |             return wrapped.equals(db.wrapped);
36 |         } else {
37 |             return false;
38 |         }
39 |     }
40 | 
41 | 
42 |     @Override
43 |     public int hashCode() {
44 |         return wrapped.hashCode();
45 |     }
46 | 
47 | 
48 |     @Override
49 |     public String toString() {
50 |         return wrapped.toString();
51 |     }
52 | 
53 | 
54 |     @Override
55 |     public Object deref() {
56 |         return wrapped.value();
57 |     }
58 | 
59 | 
60 |     @Override
61 |     public T getValue() {
62 |         return wrapped.value();
63 |     }
64 | 
65 | 
66 |     @Override
67 |     public void doUnpersist(boolean blocking) {
68 |         wrapped.doUnpersist(blocking);
69 |     }
70 | 
71 | 
72 |     @Override
73 |     public void doDestroy(boolean blocking) {
74 |         wrapped.doDestroy(blocking);
75 |     }
76 | 
77 | }
78 | 
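
Because `DerefBroadcast` implements `IDeref`, a broadcast value built through SparkPlug can be read with `deref`/`@` inside task functions. A minimal sketch, assuming `sparkplug.core/broadcast` is the constructor that wraps a context broadcast in this class and that `sc` is an existing Spark context (both are assumptions, not verified against the API above):

```clojure
(require '[sparkplug.core :as spark]
         '[sparkplug.rdd :as rdd])

;; Hypothetical usage; `spark/broadcast` and `sc` are assumed names.
(let [lookup (spark/broadcast sc {"a" 1, "b" 2})]
  (->> (rdd/parallelize sc ["a" "b" "c"])
       ;; @lookup goes through DerefBroadcast.deref() on the executors.
       (spark/map (fn lookup-count [k] (get @lookup k 0)))
       (spark/into [])))
;; => [1 2 0]
```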


--------------------------------------------------------------------------------
/cluster/README.md:
--------------------------------------------------------------------------------
 1 | Docker Spark Cluster
 2 | ====================
 3 | 
 4 | A simple Spark cluster with a master and a single worker, for use in testing and
 5 | debugging deployed Spark applications. This setup surfaces serialization and
 6 | classpath issues that do not occur in local development contexts.
 7 | 
 8 | 
 9 | ## Usage
10 | 
11 | Start the cluster, which contains a master and one worker:
12 | 
13 | ```shell
14 | docker compose up -d
15 | ```
16 | 
17 | You can submit an application with the submit script:
18 | 
19 | ```shell
20 | cp $PROJECT/target/uberjar/my-app.jar cluster/code/
21 | ./submit.sh my-app.jar
22 | ```
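
The jar you copy in must be an AOT-compiled uberjar whose main namespace matches what you submit (the REST example below uses `my_app.main`). A minimal hypothetical driver, shown only to illustrate the shape; all names are placeholders, not part of this repo:

```clojure
(ns my-app.main
  "Hypothetical word-count driver for the submit.sh example."
  (:gen-class)
  (:require
    [clojure.string :as str]
    [sparkplug.config :as conf]
    [sparkplug.context :as ctx]
    [sparkplug.core :as spark]
    [sparkplug.rdd :as rdd]))


(defn -main
  [& args]
  ;; The master URL and serializer settings come from spark-submit and the
  ;; conf builder defaults.
  (ctx/with-context [sc (-> (conf/spark-conf)
                            (conf/app-name "my-app"))]
    (->> (rdd/text-file sc (or (first args) "file:///data/hamlet.txt"))
         (spark/mapcat #(str/split % #"\s+"))
         (spark/map->pairs (juxt identity (constantly 1)))
         (spark/reduce-by-key +)
         (spark/into {})
         (prn))))
```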
23 | 
24 | You can also submit an application using the Spark master's REST API. First,
25 | create a JSON file with the request body:
26 | 
27 | ```json
28 | {
29 |     "action": "CreateSubmissionRequest",
30 |     "appArgs": ["file:///data/hamlet.txt"],
31 |     "appResource": "file:///mnt/code/my-app.jar",
32 |     "clientSparkVersion": "3.5.1",
33 |     "environmentVariables": {"SPARK_ENV_LOADED": "1"},
34 |     "mainClass": "my_app.main",
35 |     "sparkProperties":
36 |     {
37 |         "spark.app.name": "my-app",
38 |         "spark.submit.deployMode": "cluster",
39 |         "spark.jars": "file:///mnt/code/my-app.jar",
40 |         "spark.driver.cores": 1,
41 |         "spark.driver.memory": "1G",
42 |         "spark.driver.supervise": "false",
43 |         "spark.executor.cores": 1,
44 |         "spark.executor.count": 1,
45 |         "spark.executor.memory": "1G",
46 |         "spark.logConf": "true"
47 |     }
48 | }
49 | ```
50 | 
51 | Then submit it to the scheduling HTTP endpoint:
52 | 
53 | ```shell
54 | curl http://localhost:6066/v1/submissions/create --data @request.json
55 | ```
56 | 
57 | ## Endpoints
58 | 
59 | All of these are reachable from the Docker host:
60 | 
61 | * spark-master [http:8080](http://localhost:8080)
62 | * spark-driver [http:4040](http://localhost:4040) (when an application is running)
63 | * legacy submission [spark:7077](spark://localhost:7077)
64 | * REST API submission [spark:6066](spark://localhost:6066)
65 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | Change Log
 2 | ==========
 3 | 
 4 | All notable changes to this project will be documented in this file, which
 5 | follows the conventions of [keepachangelog.com](http://keepachangelog.com/).
 6 | This project adheres to [Semantic Versioning](http://semver.org/).
 7 | 
 8 | ## [Unreleased]
 9 | 
10 | ...
11 | 
12 | 
13 | ## [1.1.0] - 2024-10-10
14 | 
15 | ### Changed
16 | - Sparkplug is now tested with Java 11 + Spark 3.1.3, and Java 11 + Spark 3.5.1.
17 |   Java 8 test coverage was dropped.
18 | - Bump Clojure to 1.12.0.
19 | - Update various dependency versions.
20 | - Add clj-kondo linting to test suite.
21 | - Fix bug when serializing functions which close over a boolean value.
22 |   [#27](https://github.com/amperity/sparkplug/issues/27)
23 |   [#28](https://github.com/amperity/sparkplug/pull/28)
24 | 
25 | 
26 | ## [1.0.0] - 2022-05-31
27 | 
28 | ### Changed
29 | - Update some project dependencies to the latest versions.
30 | - The `sparkplug-sql` sub-project, which has been empty since its creation
31 |   over two years ago, has been removed for now.
32 | 
33 | 
34 | ## [0.1.9] - 2022-04-25
35 | 
36 | ### Changed
37 | - Sparkplug is now tested with Spark 3.1.3 and Spark 3.2.1.
38 |   Spark 2.4.x and related dependencies were dropped.
39 | - The `sparkplug-ml` sub-project, which has been empty since its creation
40 |   over two years ago, has been removed for now.
41 | 
42 | ### Fixed
43 | - Correctly detect namespace to require when serializing a closure defined
44 |   inside a record type.
45 |   [#23](https://github.com/amperity/sparkplug/pull/23)
46 | 
47 | 
48 | ## [0.1.8] - 2021-08-06
49 | 
50 | ### Fixed
51 | - `sparkplug.core/union` now works with Spark 3.
52 |   [#21](https://github.com/amperity/sparkplug/pull/21)
53 | 
54 | 
55 | [Unreleased]: https://github.com/amperity/sparkplug/compare/1.1.0...HEAD
56 | [1.1.0]: https://github.com/amperity/sparkplug/compare/1.0.0...1.1.0
57 | [1.0.0]: https://github.com/amperity/sparkplug/compare/0.1.9...1.0.0
58 | [0.1.9]: https://github.com/amperity/sparkplug/compare/0.1.8...0.1.9
59 | [0.1.8]: https://github.com/amperity/sparkplug/compare/0.1.7...0.1.8
60 | 


--------------------------------------------------------------------------------
/cluster/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   master:
 3 |     build:
 4 |       context: .
 5 |       dockerfile: Dockerfile
 6 |       args:
 7 |         HADOOP_VERSION: 3.3.5
 8 |         SPARK_VERSION: 3.5.1
 9 |     command: /opt/spark/sbin/start-master.sh
10 |     restart: on-failure
11 |     hostname: master
12 |     environment:
13 |       SPARK_PUBLIC_DNS: localhost
14 |       SPARK_MASTER_PORT: 7077
15 |       SPARK_MASTER_WEBUI_PORT: 8080
16 |       SPARK_MASTER_OPTS: "-Dspark.master.rest.enabled=true"
17 |     expose:
18 |       - 6066
19 |       - 7001
20 |       - 7002
21 |       - 7003
22 |       - 7004
23 |       - 7005
24 |       - 7006
25 |       - 7077
26 |     ports:
27 |       - 6066:6066
28 |       - 7077:7077
29 |       - 8080:8080
30 |     volumes:
31 |       - ./code:/mnt/code
32 | 
33 |   worker-1:
34 |     build:
35 |       context: .
36 |       dockerfile: Dockerfile
37 |       args:
38 |         HADOOP_VERSION: 3.3.5
39 |         SPARK_VERSION: 3.5.1
40 |     command: /opt/spark/sbin/start-worker.sh spark://master:7077
41 |     restart: on-failure
42 |     hostname: worker-1
43 |     environment:
44 |       SPARK_PUBLIC_DNS: localhost
45 |       SPARK_WORKER_PORT: 8881
46 |       SPARK_WORKER_WEBUI_PORT: 8081
47 |       SPARK_WORKER_CORES: 2
48 |       SPARK_WORKER_MEMORY: 2g
49 |     links:
50 |       - master
51 |     depends_on:
52 |       - master
53 |     expose:
54 |       - 4040
55 |       - 7012
56 |       - 7013
57 |       - 7014
58 |       - 7015
59 |       - 7016
60 |       - 8881
61 |     ports:
62 |       - 4040:4040
63 |       - 8081:8081
64 |       - 8881:8881
65 |     volumes:
66 |       - ./code:/mnt/code
67 |       - ./data:/data
68 | 
69 |   repl:
70 |     image: eclipse-temurin:11-jdk
71 |     command: java -jar /sparkplug-repl.jar
72 |     restart: on-failure
73 |     hostname: repl
74 |     environment:
75 |       SPARKPLUG_REPL_MASTER: spark://master:7077
76 |       SPARKPLUG_REPL_PORT: 8765
77 |     ports:
78 |       - 4050:4040
79 |       - 8765:8765
80 |     volumes:
81 |       - ./code/sparkplug-repl.jar:/sparkplug-repl.jar
82 |       - ./data:/data
83 | 
84 | networks:
85 |   default:
86 |     ipam:
87 |       config:
88 |         - subnet: "10.128.99.0/24"
89 | 


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/function/PairFn.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.function;
 2 | 
 3 | 
 4 | import clojure.lang.IFn;
 5 | import clojure.lang.IMapEntry;
 6 | import clojure.lang.IPersistentVector;
 7 | 
 8 | import java.util.Collection;
 9 | 
10 | import org.apache.spark.api.java.function.PairFunction;
11 | 
12 | import scala.Tuple2;
13 | 
14 | 
15 | /**
16 |  * Compatibility wrapper for a Spark `PairFunction` of one argument which
17 |  * returns a pair.
18 |  */
19 | public class PairFn extends SerializableFn implements PairFunction<Object, Object, Object> {
20 | 
21 |     public PairFn(IFn f, Collection namespaces) {
22 |         super(f, namespaces);
23 |     }
24 | 
25 | 
26 |     @Override
27 |     @SuppressWarnings("unchecked")
28 |     public Tuple2<Object, Object> call(Object v1) throws Exception {
29 |         return coercePair(f, f.invoke(v1));
30 |     }
31 | 
32 | 
33 |     /**
34 |      * Coerce a result value into a Scala `Tuple2` as the result of a function.
35 |      *
36 |      * @param f the function which produced the result, to report in error messages
37 |      * @param result object to try to coerce
38 |      * @return a Scala tuple with two values
39 |      */
40 |     public static Tuple2<Object, Object> coercePair(IFn f, Object result) {
41 |         // Null can't be coerced.
42 |         if (result == null) {
43 |             throw new RuntimeException("Wrapped pair function " + f + " returned a null");
44 |         // Scala tuples can be returned directly.
45 |         } else if (result instanceof Tuple2) {
46 |             return (Tuple2)result;
47 |         // Use key/value from Clojure map entries to construct a tuple.
48 |         } else if (result instanceof IMapEntry) {
49 |             IMapEntry entry = (IMapEntry)result;
50 |             return new Tuple2(entry.key(), entry.val());
51 |         // Try to generically coerce a sequential result into a tuple.
52 |         } else if (result instanceof IPersistentVector) {
53 |             IPersistentVector vector = (IPersistentVector)result;
54 |             if (vector.count() != 2) {
55 |                 throw new RuntimeException("Wrapped pair function " + f + " returned a vector without exactly two values: " + vector.count());
56 |             }
57 |             return new Tuple2(vector.nth(0), vector.nth(1));
58 |         // Unknown type, can't coerce.
59 |         } else {
60 |             throw new RuntimeException("Wrapped pair function " + f + " returned an invalid pair type: " + result.getClass().getName());
61 |         }
62 |     }
63 | 
64 | }
65 | 
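
In Clojure terms, `coercePair` means a pair-producing task function may return a Scala `Tuple2`, a Clojure map entry, or a two-element vector, and anything else fails at runtime. A rough sketch of the equivalent calls from the Clojure API (`sc` here is an assumed existing context):

```clojure
(require '[sparkplug.core :as spark]
         '[sparkplug.rdd :as rdd])

;; Both forms produce the pairs [:a 1] and [:b 1]; the vector and the
;; map entry are coerced into scala.Tuple2 by PairFn.coercePair.
(spark/map->pairs #(vector % 1) (rdd/parallelize sc [:a :b]))
(spark/map->pairs #(first {% 1}) (rdd/parallelize sc [:a :b]))

;; Returning nil, or a vector whose count is not exactly two, throws.
```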


--------------------------------------------------------------------------------
/sparkplug-core/dev/user.clj:
--------------------------------------------------------------------------------
 1 | (ns user
 2 |   (:require
 3 |     [clojure.java.io :as io]
 4 |     [clojure.repl :refer :all]
 5 |     [clojure.stacktrace :refer [print-cause-trace]]
 6 |     [clojure.string :as str]
 7 |     [clojure.tools.namespace.repl :refer [refresh]]
 8 |     [sparkplug.config :as conf]
 9 |     [sparkplug.context :as ctx]
10 |     [sparkplug.core :as spark]
11 |     [sparkplug.function :as f]
12 |     [sparkplug.function.test-fns :as test-fns]
13 |     [sparkplug.kryo :as kryo]
14 |     [sparkplug.rdd :as rdd]
15 |     [sparkplug.scala :as scala])
16 |   (:import
17 |     com.esotericsoftware.kryo.Kryo
18 |     (java.io
19 |       ByteArrayInputStream
20 |       ByteArrayOutputStream
21 |       ObjectInputStream
22 |       ObjectOutputStream)))
23 | 
24 | 
25 | (def local-conf
26 |   (-> (conf/spark-conf)
27 |       (conf/master "local[*]")
28 |       (conf/app-name "user")))
29 | 
30 | 
31 | (def spark-context nil)
32 | 
33 | 
34 | (defn letter-frequencies
35 |   "Calculate the number of times each letter appears in the given text."
36 |   ([]
37 |    (letter-frequencies "/usr/share/dict/words"))
38 |   ([path]
39 |    (ctx/with-context [ctx (-> (conf/spark-conf)
40 |                               (conf/master "local[2]")
41 |                               (conf/app-name "letter-frequencies"))]
42 |      (alter-var-root #'spark-context (constantly ctx))
43 |      (->
44 |        (->>
45 |          (rdd/text-file ctx (str "file://" path))
46 |          (spark/map str/lower-case)
47 |          (spark/mapcat seq)
48 |          (spark/map->pairs #(vector % 1))
49 |          (spark/reduce-by-key +)
50 |          (spark/into {}))
51 |        (as-> result
52 |          (do (println "Done, press enter to continue...")
53 |              (read-line)
54 |              result))))))
55 | 
56 | 
57 | (def kryo
58 |   (delay (kryo/initialize)))
59 | 
60 | 
61 | (defn inspect-bytes
62 |   [data]
63 |   (->>
64 |     (seq data)
65 |     (map #(let [c (char (if (neg? %)
66 |                           (+ % 256)
67 |                           %))]
68 |             (if (<= 32 (int c))
69 |               c
70 |               \.)))
71 |     (partition-all 32)
72 |     (map str/join)
73 |     (str/join "\n")
74 |     (println)))
75 | 
76 | 
77 | (defn serialize
78 |   [f]
79 |   (let [baos (ByteArrayOutputStream.)]
80 |     (with-open [out (ObjectOutputStream. baos)]
81 |       (.writeObject out f))
82 |     (.toByteArray baos)))
83 | 
84 | 
85 | (defn deserialize
86 |   [bs]
87 |   (with-open [in (ObjectInputStream. (ByteArrayInputStream. bs))]
88 |     (.readObject in)))
89 | 


--------------------------------------------------------------------------------
/sparkplug-core/test/sparkplug/core_test.clj:
--------------------------------------------------------------------------------
 1 | (ns sparkplug.core-test
 2 |   (:require
 3 |     [clojure.test :refer [deftest is testing use-fixtures]]
 4 |     [sparkplug.config :as conf]
 5 |     [sparkplug.context :as context]
 6 |     [sparkplug.core :as spark]
 7 |     [sparkplug.rdd :as rdd]))
 8 | 
 9 | 
10 | (def ^:dynamic *sc*
11 |   nil)
12 | 
13 | 
14 | (def local-conf
15 |   (-> (conf/spark-conf)
16 |       (conf/master "local[*]")
17 |       (conf/app-name "user")
18 |       (conf/set-param "spark.ui.enabled" "false")))
19 | 
20 | 
21 | (defn spark-context-fixture
22 |   [f]
23 |   (context/with-context [sc local-conf]
24 |     (binding [*sc* sc]
25 |       (f))))
26 | 
27 | 
28 | (use-fixtures :once spark-context-fixture)
29 | 
30 | 
31 | (deftest core-transforms
32 |   (testing "aggregate-by-key"
33 |     (is (= [[1 (reduce + (range 10))]]
34 |            (->> (rdd/parallelize-pairs *sc* (map vector (repeat 10 1) (range 10)))
35 |                 (spark/aggregate-by-key + + 0)
36 |                 (spark/into []))))
37 |     (is (= [[1 (reduce + (range 10))]]
38 |            (->> (rdd/parallelize-pairs *sc* (map vector (repeat 10 1) (range 10)))
39 |                 (spark/aggregate-by-key + + 0 2)
40 |                 (spark/into []))))
41 |     (is (= [[1 (reduce + (range 10))]]
42 |            (->> (rdd/parallelize-pairs *sc* (map vector (repeat 10 1) (range 10)))
43 |                 (spark/aggregate-by-key + + 0 (rdd/hash-partitioner 2))
44 |                 (spark/into [])))))
45 | 
46 |   (testing "sort-by"
47 |     (is (= (vec (reverse (range 10)))
48 |            (->> (rdd/parallelize *sc* (shuffle (range 10)))
49 |                 (spark/sort-by -)
50 |                 (spark/into []))
51 |            (->> (rdd/parallelize *sc* (shuffle (range 10)))
52 |                 (spark/sort-by identity false)
53 |                 (spark/into [])))))
54 | 
55 |   (testing "union"
56 |     (is (= #{:a :b}
57 |            (spark/into #{} (spark/union (rdd/parallelize *sc* [:a :b])))))
58 |     (is (= #{:a :b :c :d}
59 |            (spark/into
60 |              #{}
61 |              (spark/union
62 |                (rdd/parallelize *sc* [:a :b])
63 |                (rdd/parallelize *sc* [:c :d])))))
64 |     (is (= #{:a :b :c :d :e :f}
65 |            (spark/into
66 |              #{}
67 |              (spark/union
68 |                (rdd/parallelize *sc* [:a :b])
69 |                (rdd/parallelize *sc* [:c :d])
70 |                (rdd/parallelize *sc* [:e :f])))))
71 |     (is (= #{[:a :b]}
72 |            (spark/into #{} (spark/union (rdd/parallelize-pairs *sc* [[:a :b]])))))
73 |     (is (= #{[:a :b] [:c :d]}
74 |            (spark/into
75 |              #{}
76 |              (spark/union
77 |                (rdd/parallelize-pairs *sc* [[:a :b]])
78 |                (rdd/parallelize-pairs *sc* [[:c :d]])))))
79 |     (is (= #{[:a :b] [:c :d] [:e :f]}
80 |            (spark/into
81 |              #{}
82 |              (spark/union
83 |                (rdd/parallelize-pairs *sc* [[:a :b]])
84 |                (rdd/parallelize-pairs *sc* [[:c :d]])
85 |                (rdd/parallelize-pairs *sc* [[:e :f]])))))))
86 | 


--------------------------------------------------------------------------------
/sparkplug-repl/src/sparkplug/repl/main.clj:
--------------------------------------------------------------------------------
  1 | (ns sparkplug.repl.main
  2 |   (:gen-class)
  3 |   (:require
  4 |     [clojure.java.io :as io]
  5 |     [clojure.tools.logging :as log]
  6 |     [nrepl.middleware :as middleware]
  7 |     [nrepl.middleware.session :as mw-session]
  8 |     [nrepl.server :as server]
  9 |     [sparkplug.config :as conf]
 10 |     [sparkplug.context :as ctx]
 11 |     [sparkplug.core :as spark]
 12 |     [whidbey.repl :as whidbey]))
 13 | 
 14 | 
 15 | (def whidbey-opts
 16 |   {:width 200
 17 |    :namespace-maps true
 18 |    :color-scheme {:nil [:blue]}
 19 |    :tag-types {java.lang.Class {'java/class #(symbol (.getName ^Class %))}
 20 |                java.time.Instant {'inst str}}})
 21 | 
 22 | 
 23 | ;; ## REPL Middleware
 24 | 
 25 | (def repl-ns 'sparkplug.repl.work)
 26 | 
 27 | 
 28 | (defn wrap-repl-init
 29 |   "Middleware constructor which ensures the admin-repl system namespace is
 30 |   loaded and available before configuring the new session to use it."
 31 |   [handler]
 32 |   (with-local-vars [sentinel nil]
 33 |     (fn [{:keys [session] :as msg}]
 34 |       (when-not (@session sentinel)
 35 |         (swap! session assoc
 36 |                #'*ns*
 37 |                (try
 38 |                  (require repl-ns)
 39 |                  (create-ns repl-ns)
 40 |                  (catch Throwable t
 41 |                    (log/error t "Failed to switch to repl-ns" repl-ns)
 42 |                    (create-ns 'user)))
 43 |                sentinel true))
 44 |       (handler msg))))
 45 | 
 46 | 
 47 | (middleware/set-descriptor!
 48 |   #'wrap-repl-init
 49 |   {:requires #{#'mw-session/session}
 50 |    :expects #{"eval"}})
 51 | 
 52 | 
 53 | ;; ## Spark Lifecycle
 54 | 
 55 | (defn- initialize-context!
 56 |   "Construct a new Spark context and intern it in the repl namespace."
 57 |   [master]
 58 |   (require repl-ns)
 59 |   (let [ctx (-> (conf/spark-conf)
 60 |                 (conf/master master)
 61 |                 (conf/app-name "sparkplug-repl")
 62 |                 (conf/jars ["sparkplug-repl.jar"])
 63 |                 (ctx/spark-context))]
 64 |     (intern repl-ns 'spark-context ctx)))
 65 | 
 66 | 
 67 | (defn- stop-context!
 68 |   "Stop the running Spark context, if any."
 69 |   []
 70 |   (let [ctx-var (ns-resolve repl-ns 'spark-context)]
 71 |     (when-let [ctx (and ctx-var @ctx-var)]
 72 |       (ctx/stop! ctx))))
 73 | 
 74 | 
 75 | ;; ## Main Entry
 76 | 
 77 | (def nrepl-server nil)
 78 | (def exit-promise (promise))
 79 | 
 80 | 
 81 | (defn -main
 82 |   "Main entry point for launching the nREPL server."
 83 |   [& args]
 84 |   (let [master (or (System/getenv "SPARKPLUG_REPL_MASTER")
 85 |                    "local[*]")
 86 |         port (-> (System/getenv "SPARKPLUG_REPL_PORT")
 87 |                  (or "8765")
 88 |                  (Integer/parseInt))]
 89 |     (try
 90 |       (whidbey/init! whidbey-opts)
 91 |       (catch Exception ex
 92 |         (log/warn ex "Failed to initialize whidbey middleware!")))
 93 |     (try
 94 |       (log/info "Initializing Spark context...")
 95 |       (require repl-ns)
 96 |       (initialize-context! master)
 97 |       (catch Exception ex
 98 |         (log/error ex "Failed to initialize Spark context!")
 99 |         (System/exit 10)))
100 |     (log/info "Starting nrepl server on port:" port)
101 |     (let [server (server/start-server
102 |                    :bind "0.0.0.0"
103 |                    :port port
104 |                    :handler (server/default-handler #'wrap-repl-init))]
105 |       (alter-var-root #'nrepl-server (constantly server)))
106 |     @exit-promise
107 |     (log/info "Stopping Spark context...")
108 |     (stop-context!)
109 |     (log/info "Stopping nrepl server...")
110 |     (server/stop-server nrepl-server)
111 |     (System/exit 0)))
112 | 
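
Once `-main` is running and you have connected as described in the REPL README, the interned `spark-context` can drive a small job end to end. A hypothetical exchange at the `sparkplug.repl.work=>` prompt, using only vars that already exist on the executors:

```clojure
(->> (rdd/parallelize spark-context (range 1000))
     (spark/map->pairs (juxt even? (constantly 1)))
     (spark/reduce-by-key +)
     (spark/into {}))
;; => {true 500, false 500}
```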


--------------------------------------------------------------------------------
/doc/serialization.md:
--------------------------------------------------------------------------------
 1 | ## Serialization
 2 | 
 3 | A major concern of SparkPlug is reliable and efficient serialization for Spark
 4 | programs written in Clojure.
 5 | 
 6 | Under the umbrella of serialization, there are two separate problems: task functions,
 7 | and task results.
 8 | 
 9 | 
10 | ### Task functions
11 | 
12 | These are the functions you pass to RDD transformations, like map and filter.
13 | When you invoke an action on the resulting RDD, the driver will serialize these
14 | functions and broadcast them to executors. Executors must be able to
15 | deserialize the functions and run them across multiple threads.
16 | 
17 | Due to challenges of serializing functions, Spark uses built-in Java serialization
18 | for task functions. The main difficulty with Clojure functions is that they have
19 | implicit dependencies on namespaces and Vars being available at runtime. If Clojure
20 | functions are not serialized correctly, your application is bound to crash with
21 | confusing errors like "attempting to call unbound fn". To address this,
22 | SparkPlug takes the following approach:
23 | * On the driver side: Any function passed to an RDD transformation (map,
24 |   filter, etc.) is serialized along with a list of the namespaces that it
25 |   implicitly depends on. This list is built by reflecting on the _function
26 |   object_ itself, instead of analyzing code.
27 | * On the executor side: When the function is deserialized, first require each
28 |   of those namespaces to ensure they are available before calling the function.
29 |   It's important to synchronize these requires, because `clojure.core/require`
30 |   is not thread-safe! Without synchronization, it's likely to result in
31 |   non-deterministic "unbound fn" and "unbound Var" errors.
32 | 
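
To see the driver-side half of this in action, the reflection is exposed as
`sparkplug.function/namespace-references`, which can be called on any function
value. A minimal REPL sketch (the exact result depends on where the function
is defined):

```clojure
(require '[sparkplug.function :as f]
         '[clojure.string :as str])

;; The closure references the #'clojure.string/upper-case Var, so the
;; discovered set includes clojure.string as well as the defining namespace.
(f/namespace-references
  (fn [s] (str/upper-case s)))
;; => #{user clojure.string}   (in a fresh REPL; clojure.core is elided)
```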
33 | 
34 | ### Task results
35 | 
36 | This refers to the data produced by executing tasks. Executors will either send
37 | results back to the driver (as in a "collect" action), or pass them on to the
38 | next stage for executors to read again.
39 | 
40 | For task results of Clojure data, such as keywords, maps, and vectors,
41 | Java serialization with `java.io.Serializable` is highly inefficient.
42 | For example, the keyword `:a` gets encoded to a whopping 218 bytes, and
43 | the empty vector `[]` becomes 405 bytes!
44 | 
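
These numbers are easy to reproduce with a small helper that round-trips a
value through plain JDK serialization (a throwaway REPL sketch, not part of
SparkPlug):

```clojure
(defn serialized-size
  "Return the number of bytes produced by serializing `x` with
  java.io.Serializable."
  [x]
  (let [baos (java.io.ByteArrayOutputStream.)]
    (with-open [out (java.io.ObjectOutputStream. baos)]
      (.writeObject out x))
    (count (.toByteArray baos))))

(serialized-size :a)   ;; several hundred bytes
(serialized-size [])   ;; even more, as noted above
```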
45 | SparkPlug solves this using Spark's support for [Kryo serialization](https://github.com/EsotericSoftware/kryo),
46 | by defining custom serializers and a registrator to handle common Clojure data types.
47 | To use SparkPlug's Kryo serialization, set these Spark properties:
48 | 
49 | | Property                 | Value                                        |
50 | | ------------------------ | -------------------------------------------- |
51 | | `spark.serializer`       | `org.apache.spark.serializer.KryoSerializer` |
52 | | `spark.kryo.registrator` | `sparkplug.kryo.ClassPathRegistrator`        |
53 | 
 54 | For convenience, SparkPlug's configuration builder functions include these
55 | properties by default.
56 | 
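
A configuration built with `sparkplug.config` already carries both settings,
for instance (the master and app name below are just placeholders):

```clojure
(require '[sparkplug.config :as conf])

(def my-conf
  (-> (conf/spark-conf)
      (conf/master "local[*]")
      (conf/app-name "kryo-example")))

(conf/get-param my-conf "spark.serializer")
;; => "org.apache.spark.serializer.KryoSerializer"

(conf/get-param my-conf "spark.kryo.registrator")
;; => "sparkplug.kryo.ClassPathRegistrator"
```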
57 | The registrator is also extensible, so that applications can easily add more
58 | serializers and have them included in the registry. See the
59 | [sparkplug.kryo](https://cljdoc.org/d/amperity/sparkplug-core/CURRENT/api/sparkplug.kryo)
60 | namespace for details.
61 | 
62 | 
63 | ## Tips
64 | 
65 | Since task functions are serialized with `java.io.Serializable`, any Clojure
66 | data _closed over by_ a task function is also serialized this way. If you need
67 | to close over a relatively large piece of Clojure data in a task function, such
68 | as a static lookup table, using a broadcast variable will provide much better
69 | performance because it will use the same serialization path as task results.
70 | 
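
As a hypothetical sketch (the `spark-context` and `words` bindings are
placeholders, and the argument order of `broadcast` and `map` should be
checked against the `sparkplug.core` docs):

```clojure
(require '[sparkplug.core :as spark])

;; Wrap the lookup table in a broadcast variable once on the driver...
(def lookup
  (spark/broadcast spark-context {"a" 1 "b" 2}))

;; ...then close over the small broadcast handle in task functions and
;; deref it on the executors, instead of serializing the whole map with
;; every task.
(spark/map (fn [word] (get @lookup word)) words)
```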
71 | If you are caching RDDs of Clojure data, consider using a serialized storage
72 | level. This will use Kryo serialization, and will save a lot of memory on executors.
73 | The tradeoff is that this increases CPU time to access the data.
74 | 


--------------------------------------------------------------------------------
/sparkplug-core/test/sparkplug/function_test.clj:
--------------------------------------------------------------------------------
  1 | (ns sparkplug.function-test
  2 |   (:require
  3 |     [clojure.test :refer [are deftest is testing]]
  4 |     [sparkplug.function :as f]
  5 |     [sparkplug.function.test-fns :as test-fns])
  6 |   (:import
  7 |     (java.io
  8 |       ByteArrayInputStream
  9 |       ByteArrayOutputStream
 10 |       ObjectInputStream
 11 |       ObjectOutputStream)))
 12 | 
 13 | 
 14 | (def this-ns
 15 |   (ns-name *ns*))
 16 | 
 17 | 
 18 | (defprotocol TestProto
 19 | 
 20 |   (proto-method [this])
 21 | 
 22 |   (get-closure [this]))
 23 | 
 24 | 
 25 | (defrecord TestRecord
 26 |   [example-fn]
 27 | 
 28 |   TestProto
 29 | 
 30 |   (proto-method
 31 |     [_]
 32 |     (example-fn))
 33 | 
 34 | 
 35 |   (get-closure
 36 |     [_]
 37 |     (fn inside-fn
 38 |       []
 39 |       nil)))
 40 | 
 41 | 
 42 | (deftest resolve-namespace-references
 43 |   (are [expected-references obj] (= expected-references (f/namespace-references obj))
 44 | 
 45 |     ;; Simple data
 46 |     #{} nil
 47 |     #{} :keyword
 48 |     #{} 5
 49 |     #{} true
 50 |     #{} "str"
 51 |     #{} 'sym
 52 | 
 53 |     ;; Functions
 54 |     #{this-ns}
 55 |     (fn [])
 56 | 
 57 |     #{this-ns 'sparkplug.function}
 58 |     (fn []
 59 |       (f/namespace-references (fn [])))
 60 | 
 61 |     #{this-ns 'sparkplug.function}
 62 |     (fn []
 63 |       (let [x (f/namespace-references (fn []))]
 64 |         (x)))
 65 | 
 66 |     #{this-ns}
 67 |     [(fn [])]
 68 | 
 69 |     #{this-ns}
 70 |     (list (fn []))
 71 | 
 72 |     #{this-ns}
 73 |     (doto (java.util.ArrayList.)
 74 |       (.add (fn [])))
 75 | 
 76 |     #{this-ns}
 77 |     (doto (java.util.HashMap.)
 78 |       (.put "key" (fn [])))
 79 | 
 80 |     #{this-ns}
 81 |     {:key (fn [])}
 82 | 
 83 |     #{this-ns}
 84 |     {:key {:nested (fn [])}}
 85 | 
 86 |     #{this-ns}
 87 |     {:key {:nested [(fn [])]}}
 88 | 
 89 |     ;; Record fields.
 90 |     #{this-ns 'sparkplug.function}
 91 |     (->TestRecord
 92 |       (fn []
 93 |         (f/namespace-references nil)))
 94 | 
 95 |     ;; Function that closes over an object invoking a protocol method.
 96 |     #{this-ns 'sparkplug.function}
 97 |     (let [inst (->TestRecord
 98 |                  (fn []
 99 |                    (f/namespace-references nil)))]
100 |       (fn [] (proto-method inst)))
101 | 
102 |     ;; Function closure defined inside a record class.
103 |     #{this-ns}
104 |     (let [x (->TestRecord nil)]
105 |       (get-closure x))))
106 | 
107 | 
108 | ;; This is a regression test which ensures that decoded functions which close
109 | ;; over a boolean value are updated to use the canonical `Boolean` static
110 | ;; instances. Otherwise, users see bugs where a false value evaluates as truthy.
111 | (deftest canonical-booleans
112 |   (letfn [(serialize
113 |             [f]
114 |             (let [baos (ByteArrayOutputStream.)]
115 |               (with-open [out (ObjectOutputStream. baos)]
116 |                 (.writeObject out f))
117 |               (.toByteArray baos)))
118 | 
119 |           (deserialize
120 |             [bs]
121 |             (with-open [in (ObjectInputStream. (ByteArrayInputStream. bs))]
122 |               (.readObject in)))]
123 |     (testing "closure over true value"
124 |       (let [original-fn (f/fn1 (test-fns/bool-closure true))
125 |             decoded-fn (-> original-fn serialize deserialize)]
126 |         (testing "original behavior"
127 |           (is (= :x (.call original-fn :x))
128 |               "should return value"))
129 |         (testing "decoded behavior"
130 |           (is (= :x (.call decoded-fn :x))
131 |               "should return value"))))
132 |     (testing "closure over false value"
133 |       (let [original-fn (f/fn1 (test-fns/bool-closure false))
134 |             decoded-fn (-> original-fn serialize deserialize)]
135 |         (testing "original behavior"
136 |           (is (nil? (.call original-fn :x))
137 |               "should not return value"))
138 |         (testing "decoded behavior"
139 |           (is (nil? (.call decoded-fn :x))
140 |               "should not return value"))))))
141 | 


--------------------------------------------------------------------------------
/sparkplug-core/src/clojure/sparkplug/accumulator.clj:
--------------------------------------------------------------------------------
  1 | (ns sparkplug.accumulator
  2 |   "Functions for working with Accumulator objects which can aggregate values
  3 |   across executors."
  4 |   (:refer-clojure :exclude [count empty? name reset!])
  5 |   (:require
  6 |     [sparkplug.scala :as scala])
  7 |   (:import
  8 |     org.apache.spark.api.java.JavaSparkContext
  9 |     (org.apache.spark.util
 10 |       AccumulatorV2
 11 |       DoubleAccumulator
 12 |       LongAccumulator)))
 13 | 
 14 | 
 15 | ;; ## Constructors
 16 | 
 17 | (defn long-accumulator
 18 |   "Create and register a long accumulator, which starts with 0 and accumulates
 19 |   inputs by summing them."
 20 |   ([^JavaSparkContext spark-context]
 21 |    (.longAccumulator (.sc spark-context)))
 22 |   ([^JavaSparkContext spark-context acc-name]
 23 |    (.longAccumulator (.sc spark-context) acc-name)))
 24 | 
 25 | 
 26 | (defn double-accumulator
 27 |   "Create and register a double accumulator, which starts with 0.0 and
 28 |   accumulates inputs by summing them."
 29 |   ([^JavaSparkContext spark-context]
 30 |    (.doubleAccumulator (.sc spark-context)))
 31 |   ([^JavaSparkContext spark-context acc-name]
 32 |    (.doubleAccumulator (.sc spark-context) acc-name)))
 33 | 
 34 | 
 35 | (defn collection-accumulator
 36 |   "Create and register a collection accumulator, which starts with an empty list
 37 |   and accumulates inputs by adding them into the list."
 38 |   ([^JavaSparkContext spark-context]
 39 |    (.collectionAccumulator (.sc spark-context)))
 40 |   ([^JavaSparkContext spark-context acc-name]
 41 |    (.collectionAccumulator (.sc spark-context) acc-name)))
 42 | 
 43 | 
 44 | ;; ## Accumulator Methods
 45 | 
 46 | (defn name
 47 |   "Return the name of the accumulator, if any."
 48 |   [^AccumulatorV2 acc]
 49 |   (scala/resolve-option (.name acc)))
 50 | 
 51 | 
 52 | (defn value
 53 |   "Return the current value of the accumulator. This can only be called by the
 54 |   driver."
 55 |   [^AccumulatorV2 acc]
 56 |   (.value acc))
 57 | 
 58 | 
 59 | (defn empty?
 60 |   "True if the accumulator has not had any values added to it."
 61 |   [^AccumulatorV2 acc]
 62 |   (.isZero acc))
 63 | 
 64 | 
 65 | (defn add!
 66 |   "Add an element to the accumulated value."
 67 |   [^AccumulatorV2 acc v]
 68 |   (.add acc v))
 69 | 
 70 | 
 71 | (defn merge!
 72 |   "Merge an accumulator `b` into `a`. Both accumulators must have the same
 73 |   type."
 74 |   [^AccumulatorV2 a ^AccumulatorV2 b]
 75 |   (.merge a b))
 76 | 
 77 | 
 78 | (defn reset!
 79 |   "Reset the accumulator to its empty or zero value."
 80 |   [^AccumulatorV2 acc]
 81 |   (.reset acc))
 82 | 
 83 | 
 84 | ;; ## Numeric Accumulators
 85 | 
 86 | (defn count
 87 |   "Return the number of values added to the accumulator. The accumulator must
 88 |   hold either long or double values."
 89 |   [acc]
 90 |   (condp instance? acc
 91 |     LongAccumulator
 92 |     (.count ^LongAccumulator acc)
 93 | 
 94 |     DoubleAccumulator
 95 |     (.count ^DoubleAccumulator acc)
 96 | 
 97 |     (throw (IllegalArgumentException.
 98 |              (str "Cannot call count on accumulator type "
 99 |                   (class acc))))))
100 | 
101 | 
102 | (defn sum
103 |   "Return the sum of all the values added to the accumulator. The accumulator
104 |   must hold either long or double values."
105 |   [acc]
106 |   (condp instance? acc
107 |     LongAccumulator
108 |     (.sum ^LongAccumulator acc)
109 | 
110 |     DoubleAccumulator
111 |     (.sum ^DoubleAccumulator acc)
112 | 
113 |     (throw (IllegalArgumentException.
114 |              (str "Cannot call sum on accumulator type "
115 |                   (class acc))))))
116 | 
117 | 
118 | (defn avg
119 |   "Return the average of all the values added to the accumulator. The
120 |   accumulator must hold either long or double values."
121 |   [acc]
122 |   (condp instance? acc
123 |     LongAccumulator
124 |     (.avg ^LongAccumulator acc)
125 | 
126 |     DoubleAccumulator
127 |     (.avg ^DoubleAccumulator acc)
128 | 
129 |     (throw (IllegalArgumentException.
130 |              (str "Cannot call avg on accumulator type "
131 |                   (class acc))))))
132 | 
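;; Usage sketch (illustrative only; assumes a `spark-context` bound elsewhere):
(comment
  (def counter
    (long-accumulator spark-context "event-count"))

  ;; Tasks running on executors add values...
  (add! counter 1)

  ;; ...and the driver reads back the aggregated results.
  (value counter)
  (count counter)
  (avg counter))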


--------------------------------------------------------------------------------
/sparkplug-core/src/clojure/sparkplug/config.clj:
--------------------------------------------------------------------------------
  1 | (ns sparkplug.config
  2 |   "Functions for working with Spark configuration."
  3 |   (:import
  4 |     org.apache.spark.SparkConf))
  5 | 
  6 | 
  7 | ;; ## Constructor
  8 | 
  9 | (defn spark-conf
 10 |   "Construct a new Spark configuration. Optionally accepts a boolean to control
 11 |   whether default configuration is loaded from the system properties."
 12 |   ^SparkConf
 13 |   ([]
 14 |    (spark-conf true))
 15 |   ^SparkConf
 16 |   ([defaults?]
 17 |    (-> (SparkConf. (boolean defaults?))
 18 |        (.set "spark.serializer" "org.apache.spark.serializer.KryoSerializer")
 19 |        (.set "spark.kryo.registrator" "sparkplug.kryo.ClassPathRegistrator"))))
 20 | 
 21 | 
 22 | ;; ## Property Accessors
 23 | 
 24 | (defn contains-key?
 25 |   "True if the given spark configuration contains the named parameter."
 26 |   [^SparkConf conf ^String k]
 27 |   (.contains conf k))
 28 | 
 29 | 
 30 | (defn get-all
 31 |   "Get all configuration parameters as a map."
 32 |   [^SparkConf conf]
 33 |   (into {}
 34 |         (map (fn tuple->entry
 35 |                [^scala.Tuple2 entry]
 36 |                [(._1 entry) (._2 entry)]))
 37 |         (.getAll conf)))
 38 | 
 39 | 
 40 | (defn get-param
 41 |   "Get a configuration parameter `k` in `conf`. If not set, this throws a
 42 |   `NoSuchElementException` or returns `not-found` if provided."
 43 |   ([^SparkConf conf ^String k]
 44 |    (.get conf k))
 45 |   ([^SparkConf conf ^String k ^String not-found]
 46 |    (.get conf k not-found)))
 47 | 
 48 | 
 49 | (defn merge-params
 50 |   "Merge the provided parameters into the Spark configuration. Returns updated
 51 |   configuration."
 52 |   ^SparkConf
 53 |   [^SparkConf conf params]
 54 |   (reduce-kv
 55 |     (fn set-entry
 56 |       [^SparkConf c ^String k ^String v]
 57 |       (.set c k v))
 58 |     conf
 59 |     params))
 60 | 
 61 | 
 62 | (defn set-param
 63 |   "Set a parameter to a new value in the given Spark configuration. Returns
 64 |   updated configuration."
 65 |   ^SparkConf
 66 |   ([^SparkConf conf ^String k ^String v]
 67 |    (.set conf k v))
 68 |   ^SparkConf
 69 |   ([^SparkConf conf k v & kvs]
 70 |    {:pre [(even? (count kvs))]}
 71 |    (merge-params conf (apply array-map k v kvs))))
 72 | 
 73 | 
 74 | (defn set-param-default
 75 |   "Set a parameter to a new value if it is not already set in the config.
 76 |   Returns an updated configuration."
 77 |   ^SparkConf
 78 |   [^SparkConf conf ^String k ^String v]
 79 |   (.setIfMissing conf k v))
 80 | 
 81 | 
 82 | (defn unset-param
 83 |   "Unset the given parameters on the config. Returns an updated config."
 84 |   ^SparkConf
 85 |   ([^SparkConf conf ^String k]
 86 |    (.remove conf k))
 87 |   ^SparkConf
 88 |   ([^SparkConf conf k & ks]
 89 |    (reduce
 90 |      (fn unset-key
 91 |        [^SparkConf c ^String k]
 92 |        (.remove c k))
 93 |      conf
 94 |      (cons k ks))))
 95 | 
 96 | 
 97 | (defn set-executor-env
 98 |   "Set environment variables to be used when launching executors for this
 99 |   application. Accepts a parameter key and value or a map of parameters.
100 |   Returns an updated configuration."
101 |   ^SparkConf
102 |   ([^SparkConf conf k v]
103 |    (.setExecutorEnv conf k v))
104 |   ^SparkConf
105 |   ([^SparkConf conf env]
106 |    (reduce-kv
107 |      (fn set-entry
108 |        [^SparkConf c k v]
109 |        (.setExecutorEnv c k v))
110 |      conf
111 |      env)))
112 | 
113 | 
114 | (defn master
115 |   "Set the Spark master property. Returns updated configuration."
116 |   ^SparkConf
117 |   [^SparkConf conf ^String master]
118 |   (.setMaster conf master))
119 | 
120 | 
121 | (defn spark-home
122 |   "Set the Spark home path property. Returns updated configuration."
123 |   ^SparkConf
124 |   [^SparkConf conf home]
125 |   (.setSparkHome conf home))
126 | 
127 | 
128 | (defn app-name
129 |   "Set the Spark application name. Returns updated configuration."
130 |   ^SparkConf
131 |   [^SparkConf conf name-str]
132 |   (.setAppName conf name-str))
133 | 
134 | 
135 | (defn jars
136 |   "Set JAR files to distribute to the cluster. Returns updated configuration."
137 |   ^SparkConf
138 |   [^SparkConf conf jars]
139 |   (.setJars conf ^"[Ljava.lang.String;" (into-array String jars)))
140 | 
141 | 
142 | (defn debug-str
143 |   "Return a string containing a representation of the configuration useful for
144 |   debugging."
145 |   [^SparkConf conf]
146 |   (.toDebugString conf))
147 | 


--------------------------------------------------------------------------------
/doc/sparkling.md:
--------------------------------------------------------------------------------
  1 | Migrating from Sparkling
  2 | ========================
  3 | 
  4 | Migrating from Sparkling should require very little work - a few functions have
  5 | changed names, but the API is extremely similar by design. The major change is
  6 | obviously to update the namespaces you're requiring; for example, instead of
  7 | requiring `[sparkling.core :as spark]`, require `[sparkplug.core :as spark]`.
  8 | 
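For example, a typical namespace declaration changes along these lines (the
`sparkling.conf` to `sparkplug.config` mapping shown here follows the rename
notes below):

```clojure
;; Before, with Sparkling:
(ns my.app
  (:require
    [sparkling.conf :as conf]
    [sparkling.core :as spark]))

;; After, with SparkPlug:
(ns my.app
  (:require
    [sparkplug.config :as conf]
    [sparkplug.core :as spark]))
```
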
  9 | Specific changes to be aware of are documented by namespace below.
 10 | 
 11 | 
 12 | ## `sparkling.conf`
 13 | 
 14 | - `get` renamed `get-param`
 15 | - `set` renamed `set-param`
 16 | - `set-if-missing` renamed `set-param-default`
 17 | - `remove` renamed `unset-param`
 18 | - `master` no longer sets `"local[*]"` if provided no arguments
 19 | - `to-string` renamed `debug-str`
 20 | 
 21 | 
 22 | ## `sparkling.function`
 23 | 
 24 | The names of all of the function interop classes changed and their serialization
 25 | is slightly more efficient. Otherwise consumers shouldn't need to change much
 26 | here.
 27 | 
 28 | 
 29 | ## `sparkling.core`
 30 | 
 31 | ### Spark Contexts
 32 | - `spark-context` moved to `sparkplug.context/spark-context`
 33 | - `local-spark-context` not implemented
 34 | - `default-min-partitions` replaced by `sparkplug.context/info`
 35 | - `default-parallelism` replaced by `sparkplug.context/info`
 36 | - `stop` moved to `sparkplug.context/stop!`
 37 | - `with-context` moved to `sparkplug.context/with-context` and now expects a
 38 |   two-element binding vector instead of separate symbol/config args (see the example below).
 39 | 
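A hedged sketch of the new `with-context` form (the config values are
placeholders):

```clojure
(require '[sparkplug.config :as conf]
         '[sparkplug.context :as ctx])

;; The binding vector pairs a context symbol with a config expression;
;; the context is stopped automatically when the body finishes.
(ctx/with-context [sc (-> (conf/spark-conf)
                          (conf/master "local[*]")
                          (conf/app-name "migration-example"))]
  (ctx/info sc))
```
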
 40 | ### RDD Transformations
 41 | - `map-to-pair` renamed `map->pairs`
 42 | - `map-values` renamed `map-vals`
 43 | - `values` renamed `vals`
 44 | - `flat-map` renamed `mapcat`
 45 | - `flat-map-to-pair` renamed `mapcat->pairs`
 46 | - `flat-map-values` renamed `mapcat-vals`
 47 | - `map-partition` renamed `map-partitions`
 48 | - `map-partitions-to-pair` renamed `map-partitions->pairs`
 49 | - `map-partition-with-index` renamed `map-partitions-indexed`
 50 | - `sort-by-key` no longer auto-detects whether the first argument is a
 51 |   comparator - explicitly pass the `ascending?` argument to provide a custom
 52 |   comparison function
 53 | - `sample` has more arities and a different argument signature
 54 | - `zip-with-index` renamed `zip-indexed`
 55 | - `zip-with-unique-id` renamed `zip-unique-ids`
 56 | - `partitionwise-sampled-rdd` not implemented
 57 | - `partitioner-aware-union` not implemented
 58 | - `intersect-by-key` not implemented
 59 | 
 60 | ### RDD Actions
 61 | - `glom` not implemented
 62 | - `collect` returns a vector instead of a mutable Java list
 63 | - `collect-map` not implemented, use `(spark/into {} rdd)` instead
 64 | - `save-as-text-file` moved to `sparkplug.rdd` namespace
 65 | - `histogram` not implemented
 66 | 
 67 | ### RDD Construction
 68 | - `parallelize`/`into-rdd` moved to `sparkplug.rdd/parallelize`
 69 | - `parallelize-pairs`/`into-pair-rdd` moved to `sparkplug.rdd/parallelize-pairs`
 70 | - `text-file` moved to `sparkplug.rdd/text-file`
 71 | - `whole-text-files` moved to `sparkplug.rdd` namespace
 72 | 
 73 | ### RDD Partitioning
 74 | - `hash-partitioner` moved to `sparkplug.rdd` namespace
 75 | - `partitions` moved to `sparkplug.rdd` namespace
 76 | - `partitioner` moved to `sparkplug.rdd` namespace
 77 | - `partition-by` moved to `sparkplug.rdd` namespace
 78 | - `repartition` moved to `sparkplug.rdd` namespace
 80 | - `coalesce` moved to `sparkplug.rdd` namespace
 81 | - `coalesce-max` not implemented
 82 | - `rekey` not implemented
 83 | 
 84 | ### RDD Persistence
 85 | - `STORAGE-LEVELS` moved to `sparkplug.rdd/storage-levels`
 86 | - `cache`/`storage-level!` replaced by `sparkplug.rdd/cache!`
 87 | - `uncache` moved to `sparkplug.rdd/uncache!`
 88 | - `checkpoint` moved to `sparkplug.rdd/checkpoint!`
 89 | 
 90 | ### Misc
 91 | - `tuple` moved to `sparkplug.scala` namespace
 92 | - `count-partitions` not implemented
 93 | - `tuple-by` not implemented
 94 | - `key-by-fn` not implemented
 95 | - `rdd-name` replaced by `sparkplug.rdd/name` and `sparkplug.rdd/set-name` for
 96 |   the read and write operations, respectively
 97 | 
 98 | 
 99 | ## `sparkling.broadcast`
100 | 
101 | - `broadcast` moved to `sparkplug.core/broadcast`
102 | - `value` not implemented, deref the broadcast values instead
103 | 
104 | 
105 | ## `sparkling.accumulator`
106 | 
107 | - `accumulator` replaced with type-specific v2 constructors:
108 |     - `long-accumulator`
109 |     - `double-accumulator`
110 |     - `collection-accumulator`
111 | 


--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
  1 | version: 2.1
  2 | 
  3 | # Common executor configuration
  4 | executors:
  5 |   clojure-java-11:
  6 |     docker:
  7 |       - image: cimg/clojure:1.11.1-openjdk-11.0
  8 |     working_directory: ~/repo
  9 | 
 10 | 
 11 | # Job definitions
 12 | jobs:
 13 |   style:
 14 |     executor: clojure-java-11
 15 |     steps:
 16 |       - checkout
 17 |       - run:
 18 |           name: Install cljstyle
 19 |           environment:
 20 |             CLJSTYLE_VERSION: 0.16.626
 21 |           command: |
 22 |             wget https://github.com/greglook/cljstyle/releases/download/${CLJSTYLE_VERSION}/cljstyle_${CLJSTYLE_VERSION}_linux_amd64.zip
 23 |             unzip cljstyle_${CLJSTYLE_VERSION}_linux_amd64.zip
 24 |       - run:
 25 |           name: Check source formatting
 26 |           command: "./cljstyle check --report"
 27 | 
 28 |   lint:
 29 |     executor: clojure-java-11
 30 |     steps:
 31 |       - checkout
 32 |       - run:
 33 |           name: Install clj-kondo
 34 |           environment:
 35 |             CLJ_KONDO_VERSION: 2024.09.27
 36 |           command: |
 37 |             wget https://github.com/borkdude/clj-kondo/releases/download/v${CLJ_KONDO_VERSION}/clj-kondo-${CLJ_KONDO_VERSION}-linux-amd64.zip
 38 |             unzip clj-kondo-${CLJ_KONDO_VERSION}-linux-amd64.zip
 39 |       - run:
 40 |           name: Lint source code
 41 |           command: "./clj-kondo --lint sparkplug-core/src:sparkplug-core/test"
 42 | 
 43 |   test-spark-3-1-java-11:
 44 |     executor: clojure-java-11
 45 |     steps:
 46 |       - checkout
 47 |       - restore_cache:
 48 |           keys:
 49 |             - v1-test-spark-3.1-java-11-{{ checksum "project.clj" }}
 50 |             - v1-test-spark-3.1-java-11-
 51 |       - run:
 52 |           name: Test projects
 53 |           command: |
 54 |             lein -version
 55 |             lein monolith each do clean, check, install, test
 56 |       - save_cache:
 57 |           key: v1-test-spark-3.1-java-11-{{ checksum "project.clj" }}
 58 |           paths:
 59 |             - ~/.m2
 60 | 
 61 |   test-spark-3-5-java-11:
 62 |     executor: clojure-java-11
 63 |     steps:
 64 |       - checkout
 65 |       - restore_cache:
 66 |           keys:
 67 |             - v1-test-spark-3.5-java-11-{{ checksum "project.clj" }}
 68 |             - v1-test-spark-3.5-java-11-
 69 |       - run:
 70 |           name: Test projects
 71 |           command: |
 72 |             lein -version
 73 |             lein monolith each with-profile -spark-3.1,+spark-3.5 do clean, check, install, test
 74 |       - save_cache:
 75 |           key: v1-test-spark-3.5-java-11-{{ checksum "project.clj" }}
 76 |           paths:
 77 |             - ~/.m2
 78 | 
 79 |   coverage:
 80 |     executor: clojure-java-11
 81 |     steps:
 82 |       - checkout
 83 |       - restore_cache:
 84 |           keys:
 85 |             - v1-coverage-{{ checksum "project.clj" }}
 86 |             - v1-coverage-
 87 |             - v1-test-
 88 |       - run:
 89 |           name: Install projects
 90 |           command: lein monolith each install
 91 |       - run:
 92 |           name: Generate coverage
 93 |           command: lein monolith each :in sparkplug-core with-profile +spark-3.5 cloverage --codecov
 94 |       - save_cache:
 95 |           key: v1-coverage-{{ checksum "project.clj" }}
 96 |           paths:
 97 |             - ~/.m2
 98 |       - store_artifacts:
 99 |           path: sparkplug-core/target/coverage
100 |           destination: coverage
101 |       - run:
102 |           name: Install codecov
103 |           command: |
104 |             sudo apt-get update && sudo apt-get install gpg
105 |             curl https://keybase.io/codecovsecurity/pgp_keys.asc | gpg --no-default-keyring --keyring trustedkeys.gpg --import
106 |             curl -Os https://uploader.codecov.io/latest/linux/codecov
107 |             curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM
108 |             curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig
109 |             gpgv codecov.SHA256SUM.sig codecov.SHA256SUM
110 |             shasum -a 256 -c codecov.SHA256SUM
111 |             chmod +x codecov
112 |       - run:
113 |           name: Publish coverage report
114 |           command: './codecov -f sparkplug-core/target/coverage/codecov.json'
115 | 
116 | 
117 | # Workflow definitions
118 | workflows:
119 |   version: 2
120 |   test:
121 |     jobs:
122 |       - style
123 |       - lint
124 |       - test-spark-3-1-java-11
125 |       - test-spark-3-5-java-11
126 |       - coverage:
127 |           requires:
128 |             - test-spark-3-5-java-11
129 | 


--------------------------------------------------------------------------------
/sparkplug-core/src/clojure/sparkplug/scala.clj:
--------------------------------------------------------------------------------
  1 | (ns sparkplug.scala
  2 |   "Commonly used utilities for interop with Scala objects."
  3 |   (:refer-clojure :exclude [first second])
  4 |   (:require
  5 |     [clojure.walk :as walk])
  6 |   (:import
  7 |     clojure.lang.MapEntry
  8 |     (scala
  9 |       Option
 10 |       Product
 11 |       Some
 12 |       Tuple1
 13 |       Tuple2
 14 |       Tuple3
 15 |       Tuple4
 16 |       Tuple5
 17 |       Tuple6
 18 |       Tuple7
 19 |       Tuple8
 20 |       Tuple9)))
 21 | 
 22 | 
 23 | (defn resolve-option
 24 |   "Resolve an optional type to some value or nil."
 25 |   [^Option o]
 26 |   (when (instance? Some o)
 27 |     (.get ^Some o)))
 28 | 
 29 | 
 30 | ;; ## Tuples
 31 | 
 32 | (defn tuple
 33 |   "Construct a Scala tuple. Supports tuples up to size 9."
 34 |   ([a]
 35 |    (Tuple1. a))
 36 |   ([a b]
 37 |    (Tuple2. a b))
 38 |   ([a b c]
 39 |    (Tuple3. a b c))
 40 |   ([a b c d]
 41 |    (Tuple4. a b c d))
 42 |   ([a b c d e]
 43 |    (Tuple5. a b c d e))
 44 |   ([a b c d e f]
 45 |    (Tuple6. a b c d e f))
 46 |   ([a b c d e f g]
 47 |    (Tuple7. a b c d e f g))
 48 |   ([a b c d e f g h]
 49 |    (Tuple8. a b c d e f g h))
 50 |   ([a b c d e f g h i]
 51 |    (Tuple9. a b c d e f g h i)))
 52 | 
 53 | 
 54 | (defn vec->tuple
 55 |   "Coerce a Clojure vector to a Scala tuple. Supports tuples up to size 9."
 56 |   [v]
 57 |   (cond
 58 |     (instance? MapEntry v)
 59 |     (Tuple2. (key v) (val v))
 60 | 
 61 |     (< (count v) 10)
 62 |     (apply tuple v)
 63 | 
 64 |     :else
 65 |     (throw (IllegalArgumentException.
 66 |              (str "Cannot coerce value to a tuple: " (pr-str v))))))
 67 | 
 68 | 
 69 | (defn tuple->vec
 70 |   "Coerce a Scala tuple to a Clojure vector. Supports tuples up to size 9."
 71 |   [v]
 72 |   (condp instance? v
 73 |     Tuple1
 74 |     (let [t ^Tuple1 v]
 75 |       (vector (._1 t)))
 76 | 
 77 |     Tuple2
 78 |     (let [t ^Tuple2 v]
 79 |       (vector (._1 t) (._2 t)))
 80 | 
 81 |     Tuple3
 82 |     (let [t ^Tuple3 v]
 83 |       (vector (._1 t) (._2 t) (._3 t)))
 84 | 
 85 |     Tuple4
 86 |     (let [t ^Tuple4 v]
 87 |       (vector (._1 t) (._2 t) (._3 t) (._4 t)))
 88 | 
 89 |     Tuple5
 90 |     (let [t ^Tuple5 v]
 91 |       (vector (._1 t) (._2 t) (._3 t) (._4 t) (._5 t)))
 92 | 
 93 |     Tuple6
 94 |     (let [t ^Tuple6 v]
 95 |       (vector (._1 t) (._2 t) (._3 t) (._4 t) (._5 t) (._6 t)))
 96 | 
 97 |     Tuple7
 98 |     (let [t ^Tuple7 v]
 99 |       (vector (._1 t) (._2 t) (._3 t) (._4 t) (._5 t) (._6 t) (._7 t)))
100 | 
101 |     Tuple8
102 |     (let [t ^Tuple8 v]
103 |       (vector (._1 t) (._2 t) (._3 t) (._4 t) (._5 t) (._6 t) (._7 t) (._8 t)))
104 | 
105 |     Tuple9
106 |     (let [t ^Tuple9 v]
107 |       (vector (._1 t) (._2 t) (._3 t) (._4 t) (._5 t) (._6 t) (._7 t) (._8 t) (._9 t)))
108 | 
109 |     (throw (IllegalArgumentException.
110 |              (str "Cannot coerce " (class v) " value to a vector")))))
111 | 
112 | 
113 | (defn from-tuple
114 |   "Coerce a Scala tuple value to a Clojure vector. Recursively walks the
115 |   structure to ensure all nested tuples are converted."
116 |   [t]
117 |   (letfn [(coerce-product
118 |             [x]
119 |             (if (instance? Product x)
120 |               (tuple->vec x)
121 |               x))]
122 |     (walk/prewalk coerce-product t)))
123 | 
124 | 
125 | (defn from-pair
126 |   "Coerce a Scala pair (`Tuple2`) value to a Clojure value. Returns map entry
127 |   values for efficiency. Recursively walks the structure to ensure all nested
128 |   values are Clojure-compatible."
129 |   [^Tuple2 pair]
130 |   (MapEntry. (from-tuple (._1 pair)) (from-tuple (._2 pair))))
131 | 
132 | 
133 | (defn to-pair
134 |   "Coerce a Clojure value to a Scala pair (`Tuple2`)."
135 |   ^Tuple2
136 |   [entry]
137 |   (cond
138 |     ;; Null values can't be coerced.
139 |     (nil? entry)
140 |     (throw (IllegalArgumentException.
141 |              "Cannot coerce nil to a pair value"))
142 | 
143 |     ;; Scala tuples can be returned directly.
144 |     (instance? Tuple2 entry)
145 |     entry
146 | 
147 |     ;; Use key/value from map entries to construct the pair.
148 |     (instance? MapEntry entry)
149 |     (Tuple2. (key entry) (val entry))
150 | 
151 |     ;; Try to generically coerce a vector result.
152 |     (vector? entry)
153 |     (if (= 2 (count entry))
154 |       (Tuple2. (clojure.core/first entry) (clojure.core/second entry))
155 |       (throw (IllegalArgumentException.
156 |                (str "Cannot coerce a vector with " (count entry)
157 |                     " elements to a pair value"))))
158 | 
159 |     ;; Unknown type, can't coerce.
160 |     :else
161 |     (throw (IllegalArgumentException.
162 |              (str "Cannot coerce unknown type " (.getName (class entry))
163 |                   " to a pair value")))))
164 | 
165 | 
166 | (defn first
167 |   "Get the first element of a Scala pair."
168 |   [^Tuple2 t]
169 |   (._1 t))
170 | 
171 | 
172 | (defn second
173 |   "Get the second element of a Scala pair."
174 |   [^Tuple2 t]
175 |   (._2 t))
176 | 
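;; Coercion sketch (illustrative REPL results only):
(comment
  (tuple->vec (tuple 1 2 3))          ;; => [1 2 3]
  (from-pair (to-pair [:k "v"]))      ;; => [:k "v"] (a MapEntry)
  (from-tuple (tuple 1 (tuple 2 3)))) ;; => [1 [2 3]]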


--------------------------------------------------------------------------------
/sparkplug-core/src/clojure/sparkplug/function.clj:
--------------------------------------------------------------------------------
  1 | (ns sparkplug.function
  2 |   "This namespace generates function classes for various kinds of interop with
  3 |   Spark and Scala."
  4 |   (:require
  5 |     [clojure.string :as str])
  6 |   (:import
  7 |     (java.lang.reflect
  8 |       Field
  9 |       Modifier)
 10 |     java.util.HashSet
 11 |     sparkplug.function.SerializableFn))
 12 | 
 13 | 
 14 | ;; ## Namespace Discovery
 15 | 
 16 | (defn- fn-enclosing-class
 17 |   "Given a function object, determine the name of the class which the function
 18 |   is a child of. Usually this is the class representing the namespace where the
 19 |   function is defined."
 20 |   [f]
 21 |   (-> (.getName (class f))
 22 |       (Compiler/demunge)
 23 |       (str/split #"/")
 24 |       (first)
 25 |       (symbol)))
 26 | 
 27 | 
 28 | (defn- class-name?
 29 |   "True if the provided symbol names a class, rather than a namespace."
 30 |   [name-sym]
 31 |   (let [class-name (-> (str name-sym)
 32 |                        (str/replace "-" "_")
 33 |                        (symbol))]
 34 |     (class? (resolve class-name))))
 35 | 
 36 | 
 37 | (defn- type-namespace
 38 |   "Given a symbol naming a record, return a symbol naming its defining
 39 |   namespace if it exists."
 40 |   [name-sym]
 41 |   (let [ns-sym (-> (str name-sym)
 42 |                    (str/replace #"\.[^.]+$" "")
 43 |                    (symbol))]
 44 |     (when (find-ns ns-sym)
 45 |       ns-sym)))
 46 | 
 47 | 
 48 | (defn- fn-namespace
 49 |   "Given a function object, derive the name of the namespace where it was
 50 |   defined."
 51 |   [f]
 52 |   ;; The logic here is to avoid marking class names as namespaces to be
 53 |   ;; required. When using a piece of data as a function, such as a keyword or
 54 |   ;; set, this will actually be a class name like `clojure.lang.Keyword`. This
 55 |   ;; also happens when referencing a function closure defined inside of a
 56 |   ;; record implementation, since the function becomes an inner class; in that
 57 |   ;; case, we _do_ want to mark the record's defining namespace.
 58 |   (let [enclosing (fn-enclosing-class f)]
 59 |     (if (class-name? enclosing)
 60 |       (type-namespace enclosing)
 61 |       enclosing)))
 62 | 
 63 | 
 64 | (defn- walk-object-refs
 65 |   "Walk the given object to find namespaces referenced by vars. Adds discovered
 66 |   reference symbols to `references` and tracks values in `visited`."
 67 |   [^HashSet references ^HashSet visited obj]
 68 |   (when-not (or (nil? obj)
 69 |                 ;; Simple types that can't have namespace references.
 70 |                 (boolean? obj)
 71 |                 (string? obj)
 72 |                 (number? obj)
 73 |                 (keyword? obj)
 74 |                 (symbol? obj)
 75 |                 (instance? clojure.lang.Ref obj)
 76 |                 ;; Nothing to do if we've already visited this object.
 77 |                 (.contains visited obj))
 78 |     (.add visited obj)
 79 |     (cond
 80 |       ;; Vars directly represent a namespace dependency.
 81 |       (var? obj)
 82 |       (let [ns-sym (ns-name (:ns (meta obj)))]
 83 |         (.add references ns-sym))
 84 | 
 85 |       ;; Clojure functions:
 86 |       ;; Try to derive the namespace that defined the function.
 87 |       ;; Functions also have Var references as static fields,
 88 |       ;; and have closed-over objects as non-static fields.
 89 |       (fn? obj)
 90 |       (when-let [ns-sym (fn-namespace obj)]
 91 |         (.add references ns-sym)
 92 |         (doseq [^Field field (.getDeclaredFields (class obj))]
 93 |           (let [value (SerializableFn/accessField obj field)]
 94 |             (walk-object-refs references visited value))))
 95 | 
 96 |       ;; For collection-like objects, (e.g. vectors, maps, records, Java collections),
 97 |       ;; just traverse the objects they contain.
 98 |       (seqable? obj)
 99 |       (doseq [entry obj]
100 |         (walk-object-refs references visited entry))
101 | 
102 |       ;; Otherwise, reflectively traverse the fields of the object for more references.
103 |       :else
104 |       (doseq [^Field field (.getDeclaredFields (class obj))]
105 |         (when-not (Modifier/isStatic (.getModifiers field))
106 |           (let [value (SerializableFn/accessField obj field)]
107 |             (walk-object-refs references visited value)))))))
108 | 
109 | 
110 | (defn namespace-references
111 |   "Walk the given function-like object to find all namespaces referenced by
112 |   closed-over vars. Returns a set of referenced namespace symbols."
113 |   [^Object obj]
114 |   (let [references (HashSet.)
115 |         visited (HashSet.)]
116 |     (walk-object-refs references visited obj)
117 |     (disj (set references) 'clojure.core)))
118 | 
119 | 
120 | ;; ## Function Wrappers
121 | 
122 | (defmacro ^:private gen-function
123 |   "Generate a new constructor for functions of the `fn-name` class that extends
124 |   `SerializableFn` and implements interfaces for compatibility with Spark."
125 |   [fn-name constructor]
126 |   (let [class-sym (symbol (str "sparkplug.function." fn-name))]
127 |     `(defn ~(vary-meta constructor assoc :tag class-sym)
128 |        ~(str "Construct a new serializable " fn-name " function wrapping `f`.")
129 |        [~'f]
130 |        (let [references# (namespace-references ~'f)]
131 |          (new ~class-sym ~'f (mapv str references#))))))
132 | 
133 | 
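;; For example, `(gen-function Fn1 fn1)` below defines a constructor roughly
;; equivalent to:
;;
;;   (defn fn1
;;     "Construct a new serializable Fn1 function wrapping `f`."
;;     [f]
;;     (let [references (namespace-references f)]
;;       (sparkplug.function.Fn1. f (mapv str references))))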
134 | (gen-function Fn1 fn1)
135 | (gen-function Fn2 fn2)
136 | (gen-function Fn3 fn3)
137 | (gen-function ComparatorFn comparator-fn)
138 | (gen-function FlatMapFn1 flat-map-fn)
139 | (gen-function FlatMapFn2 flat-map-fn2)
140 | (gen-function PairFlatMapFn pair-flat-map-fn)
141 | (gen-function PairFn pair-fn)
142 | (gen-function VoidFn void-fn)
143 | 


--------------------------------------------------------------------------------
/sparkplug-core/src/clojure/sparkplug/context.clj:
--------------------------------------------------------------------------------
  1 | (ns sparkplug.context
  2 |   "Functions for working with and creating Spark contexts."
  3 |   (:require
  4 |     [sparkplug.scala :as scala])
  5 |   (:import
  6 |     org.apache.spark.SparkConf
  7 |     org.apache.spark.api.java.JavaSparkContext))
  8 | 
  9 | 
 10 | ;; ## Application Lifecycle
 11 | 
 12 | (defn spark-context
 13 |   "Create a new spark context which takes its settings from the given
 14 |   configuration object."
 15 |   ^JavaSparkContext
 16 |   ([^SparkConf conf]
 17 |    (JavaSparkContext. conf))
 18 |   ^JavaSparkContext
 19 |   ([master app-name]
 20 |    (JavaSparkContext. (str master) (str app-name))))
 21 | 
 22 | 
 23 | (defn set-job-description!
 24 |   "Set a human readable description of the current job."
 25 |   [^JavaSparkContext spark-context description]
 26 |   (.setJobDescription spark-context description))
 27 | 
 28 | 
 29 | (defn set-job-group!
 30 |   "Assign a group ID to all the jobs started by this thread until the group ID
 31 |   is set to a different value or cleared.
 32 | 
 33 |   Often, a unit of execution in an application consists of multiple Spark
 34 |   actions or jobs. Application programmers can use this method to group all
 35 |   those jobs together and give a group description. Once set, the Spark web UI
 36 |   will associate such jobs with this group.
 37 | 
 38 |   The application can later use `cancel-job-group!` to cancel all running jobs
 39 |   in this group. If `interrupt?` is set to true for the job group, then job
 40 |   cancellation will result in the job's executor threads being interrupted."
 41 |   ([^JavaSparkContext spark-context group-id description]
 42 |    (.setJobGroup spark-context group-id description))
 43 |   ([^JavaSparkContext spark-context group-id description interrupt?]
 44 |    (.setJobGroup spark-context group-id description (boolean interrupt?))))
 45 | 
 46 | 
 47 | (defn clear-job-group!
 48 |   "Clear the current thread's job group ID and its description."
 49 |   [^JavaSparkContext spark-context]
 50 |   (.clearJobGroup spark-context))
 51 | 
 52 | 
 53 | (defn cancel-job-group!
 54 |   "Cancel active jobs for the specified group.
 55 | 
 56 |   See `set-job-group!` for more information."
 57 |   [^JavaSparkContext spark-context group-id]
 58 |   (.cancelJobGroup spark-context group-id))
 59 | 
 60 | 
 61 | (defn cancel-all-jobs!
 62 |   "Cancel all jobs that have been scheduled or are running."
 63 |   [^JavaSparkContext spark-context]
 64 |   (.cancelAllJobs spark-context))
 65 | 
 66 | 
 67 | (defn stop!
 68 |   "Shut down the Spark context."
 69 |   [^JavaSparkContext spark-context]
 70 |   (.stop spark-context))
 71 | 
 72 | 
 73 | (defmacro with-context
 74 |   "Evaluate `body` within a new Spark context by constructing one from the
 75 |   given expression. The context is stopped after evaluation is complete."
 76 |   [binding-vec & body]
 77 |   {:pre [(vector? binding-vec) (= 2 (count binding-vec))]}
 78 |   (let [[ctx-sym expr] binding-vec
 79 |         ctx-sym (vary-meta ctx-sym assoc :tag 'org.apache.spark.api.java.JavaSparkContext)]
 80 |     `(let [~ctx-sym (spark-context ~expr)]
 81 |        (try
 82 |          ~@body
 83 |          (finally
 84 |            (stop! ~ctx-sym))))))
 85 | 
 86 | 
 87 | ;; ## Context Introspection
 88 | 
 89 | (defn config
 90 |   "Return the Spark configuration used for the given context."
 91 |   ^SparkConf
 92 |   [^JavaSparkContext spark-context]
 93 |   (.getConf spark-context))
 94 | 
 95 | 
 96 | (defn info
 97 |   "Build a map of information about the Spark context."
 98 |   [^JavaSparkContext spark-context]
 99 |   {:master (.master spark-context)
100 |    :app-name (.appName spark-context)
101 |    :local? (.isLocal spark-context)
102 |    :user (.sparkUser spark-context)
103 |    :start-time (.startTime spark-context)
104 |    :version (.version spark-context)
105 |    :jars (.jars spark-context)
106 |    :default-min-partitions (.defaultMinPartitions spark-context)
107 |    :default-parallelism (.defaultParallelism spark-context)
108 |    :checkpoint-dir (scala/resolve-option (.getCheckpointDir spark-context))})
109 | 
110 | 
111 | (defn get-local-property
112 |   "Get a local property set for this thread, or null if not set."
113 |   [^JavaSparkContext spark-context k]
114 |   (.getLocalProperty spark-context k))
115 | 
116 | 
117 | (defn persistent-rdds
118 |   "Return a Java map of JavaRDDs that have marked themselves as persistent via
119 |   a `cache!` call."
120 |   [^JavaSparkContext spark-context]
121 |   (into {} (.getPersistentRDDs spark-context)))
122 | 
123 | 
124 | ;; ## Context Modifiers
125 | 
126 | (defn add-file!
127 |   "Add a file to be downloaded with this Spark job on every node."
128 |   ([^JavaSparkContext spark-context path]
129 |    (.addFile spark-context path))
130 |   ([^JavaSparkContext spark-context path recursive?]
131 |    (.addFile spark-context path (boolean recursive?))))
132 | 
133 | 
134 | (defn add-jar!
135 |   "Adds a JAR dependency for all tasks to be executed on this SparkContext in
136 |   the future."
137 |   [^JavaSparkContext spark-context path]
138 |   (.addJar spark-context path))
139 | 
140 | 
141 | (defn set-local-property!
142 |   "Set a local property that affects jobs submitted from this thread, and all
143 |   child threads, such as the Spark fair scheduler pool."
144 |   [^JavaSparkContext spark-context k v]
145 |   (.setLocalProperty spark-context k v))
146 | 
147 | 
148 | (defn set-checkpoint-dir!
149 |   "Set the directory under which RDDs are going to be checkpointed."
150 |   [^JavaSparkContext spark-context path]
151 |   (.setCheckpointDir spark-context path))
152 | 
153 | 
154 | (defn set-log-level!
155 |   "Control the Spark application's logging level."
156 |   [^JavaSparkContext spark-context level]
157 |   (.setLogLevel spark-context level))
158 | 


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/function/SerializableFn.java:
--------------------------------------------------------------------------------
  1 | package sparkplug.function;
  2 | 
  3 | 
  4 | import clojure.lang.Compiler;
  5 | import clojure.lang.IFn;
  6 | import clojure.lang.Keyword;
  7 | import clojure.lang.RT;
  8 | import clojure.lang.Symbol;
  9 | import clojure.lang.Var;
 10 | 
 11 | import java.lang.reflect.Field;
 12 | import java.lang.reflect.Modifier;
 13 | 
 14 | import java.io.IOException;
 15 | import java.io.InvalidObjectException;
 16 | import java.io.ObjectInputStream;
 17 | import java.io.ObjectOutputStream;
 18 | import java.io.Serializable;
 19 | 
 20 | import java.util.ArrayList;
 21 | import java.util.Collection;
 22 | import java.util.Collections;
 23 | import java.util.HashSet;
 24 | import java.util.List;
 25 | 
 26 | import org.slf4j.Logger;
 27 | import org.slf4j.LoggerFactory;
 28 | 
 29 | 
 30 | /**
 31 |  * Base class for function classes built for interop with Spark and Scala.
 32 |  *
 33 |  * This class is designed to be serialized across computation boundaries in a
 34 |  * manner compatible with Spark and Kryo, while ensuring that required code is
 35 |  * loaded upon deserialization.
 36 |  */
 37 | public abstract class SerializableFn implements Serializable {
 38 | 
 39 |     private static final Logger logger = LoggerFactory.getLogger(SerializableFn.class);
 40 |     private static final Var require = RT.var("clojure.core", "require");
 41 | 
 42 |     protected IFn f;
 43 |     protected List namespaces;
 44 | 
 45 | 
 46 |     /**
 47 |      * Default empty constructor.
 48 |      */
 49 |     private SerializableFn() {
 50 |     }
 51 | 
 52 | 
 53 |     /**
 54 |      * Construct a new serializable wrapper for the function with an explicit
 55 |      * set of required namespaces.
 56 |      *
 57 |      * @param fn Clojure function to wrap
 58 |      * @param namespaces collection of namespaces required
 59 |      */
 60 |     protected SerializableFn(IFn fn, Collection namespaces) {
 61 |         this.f = fn;
 62 |         List namespaceColl = new ArrayList(namespaces);
 63 |         Collections.sort(namespaceColl);
 64 |         this.namespaces = Collections.unmodifiableList(namespaceColl);
 65 |     }
 66 | 
 67 | 
 68 |     /**
 69 |      * Safely access the value of a field on the given object.
 70 |      *
 71 |      * @param obj Instance to access a field on
 72 |      * @param field Reflective field to access
 73 |      * @return the value of the field, or nil on failure
 74 |      */
 75 |     public static Object accessField(Object obj, Field field) {
 76 |         try {
 77 |             if (!field.isAccessible()) {
 78 |                 field.setAccessible(true);
 79 |             }
 80 |             return field.get(obj);
 81 |         } catch (Exception ex) {
 82 |             logger.trace("Failed to access field " + field.toString() + ": " + ex.getClass().getName());
 83 |             return null;
 84 |         }
 85 |     }
 86 | 
 87 | 
 88 |     /**
 89 |      * Walk a value to convert any deserialized booleans back into the
 90 |      * canonical java.lang.Boolean values.
 91 |      *
 92 |      * @param visited Set of objects already visited by the walk
 93 |      * @param obj Object to walk references of
 94 |      */
 95 |     private void fixBooleans(HashSet visited, Object obj) {
 96 |         // Short-circuit objects which can't have nested values to fix.
 97 |         if ((obj == null)
 98 |                 || (obj instanceof Boolean)
 99 |                 || (obj instanceof String)
100 |                 || (obj instanceof Number)
101 |                 || (obj instanceof Keyword)
102 |                 || (obj instanceof Symbol)
103 |                 || (obj instanceof Var)) {
104 |             return;
105 |         }
106 | 
107 |         // Short-circuit if we've already visited this object.
108 |         if (visited.contains(obj)) {
109 |             return;
110 |         }
111 | 
112 |         visited.add(obj);
113 | 
114 |         // For collection-like objects, just traverse their elements.
115 |         if (obj instanceof Iterable) {
116 |             for (Object el : (Iterable)obj) {
117 |                 fixBooleans(visited, el);
118 |             }
119 |             return;
120 |         }
121 | 
122 |         // Otherwise, look at the object's fields and try to fix any booleans
123 |         // we find and traverse further.
124 |         for (Field field : obj.getClass().getDeclaredFields()) {
125 |             if (!Modifier.isStatic(field.getModifiers())) {
126 |                 Object value = accessField(obj, field);
127 |                 if (value instanceof Boolean) {
128 |                     Boolean canonical = ((Boolean)value).booleanValue() ? Boolean.TRUE : Boolean.FALSE;
129 |                     try {
130 |                         field.set(obj, canonical);
131 |                     } catch (IllegalAccessException ex) {
132 |                         logger.warn("Failed to set boolean field " + field.toString());
133 |                     }
134 |                 } else {
135 |                     fixBooleans(visited, value);
136 |                 }
137 |             }
138 |         }
139 |     }
140 | 
141 | 
142 |     /**
143 |      * Serialize the function to the provided output stream.
144 |      * An unspoken part of the `Serializable` interface.
145 |      *
146 |      * @param out stream to write the function to
147 |      */
148 |     private void writeObject(ObjectOutputStream out) throws IOException {
149 |         try {
150 |             logger.trace("Serializing " + f);
151 |             // Write the function class name
152 |             // This is only used for debugging
153 |             out.writeObject(f.getClass().getName());
154 |             // Write out the referenced namespaces.
155 |             out.writeInt(namespaces.size());
156 |             for (String ns : namespaces) {
157 |                 out.writeObject(ns);
158 |             }
159 |             // Write out the function itself.
160 |             out.writeObject(f);
161 |         } catch (IOException ex) {
162 |             logger.error("Error serializing function " + f, ex);
163 |             throw ex;
164 |         } catch (RuntimeException ex){
165 |             logger.error("Error serializing function " + f, ex);
166 |             throw ex;
167 |         }
168 |     }
169 | 
170 | 
171 |     /**
172 |      * Deserialize a function from the provided input stream.
173 |      * An unspoken part of the `Serializable` interface.
174 |      *
175 |      * @param in stream to read the function from
176 |      */
177 |     private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
178 |         String className = "";
179 |         try {
180 |             // Read the function class name.
181 |             className = (String)in.readObject();
182 |             logger.trace("Deserializing " + className);
183 |             // Read the referenced namespaces and load them.
184 |             int nsCount = in.readInt();
185 |             this.namespaces = new ArrayList(nsCount);
186 |             for (int i = 0; i < nsCount; i++) {
187 |                 String ns = (String)in.readObject();
188 |                 namespaces.add(ns);
189 |                 requireNamespace(ns);
190 |             }
191 |             // Read the function itself.
192 |             this.f = (IFn)in.readObject();
193 |             // Walk the data structure to coerce canonical booleans.
194 |             fixBooleans(new HashSet(), this.f);
195 |         } catch (IOException ex) {
196 |             logger.error("IO error deserializing function " + className, ex);
197 |             throw ex;
198 |         } catch (ClassNotFoundException ex) {
199 |             logger.error("Class error deserializing function " + className, ex);
200 |             throw ex;
201 |         } catch (RuntimeException ex) {
202 |             logger.error("Error deserializing function " + className, ex);
203 |             throw ex;
204 |         }
205 |     }
206 | 
207 | 
208 |     /**
209 |      * Load the namespace specified by the given symbol.
210 |      *
211 |      * @param namespace string designating the namespace to load
212 |      */
213 |     private static void requireNamespace(String namespace) {
214 |         try {
215 |             logger.trace("(require " + namespace + ")");
216 |             synchronized (RT.REQUIRE_LOCK) {
217 |                 Symbol sym = Symbol.intern(namespace);
218 |                 require.invoke(sym);
219 |             }
220 |         } catch (Exception ex) {
221 |             logger.warn("Error loading namespace " + namespace, ex);
222 |         }
223 |     }
224 | 
225 | }
226 | 


--------------------------------------------------------------------------------
/sparkplug-core/src/clojure/sparkplug/rdd.clj:
--------------------------------------------------------------------------------
  1 | (ns sparkplug.rdd
  2 |   "This namespace provides the main API for writing Spark tasks.
  3 | 
  4 |   Most operations in this namespace place the RDD last in the argument list,
  5 |   just like Clojure collection functions. This lets you compose them using the
  6 |   thread-last macro (`->>`), making it simple to migrate existing Clojure
  7 |   code."
  8 |   (:refer-clojure :exclude [empty name partition-by])
  9 |   (:require
 10 |     [clojure.string :as str]
 11 |     [sparkplug.function :as f]
 12 |     [sparkplug.scala :as scala])
 13 |   (:import
 14 |     clojure.lang.Compiler
 15 |     (org.apache.spark
 16 |       HashPartitioner
 17 |       Partitioner)
 18 |     (org.apache.spark.api.java
 19 |       JavaPairRDD
 20 |       JavaRDD
 21 |       JavaRDDLike
 22 |       JavaSparkContext
 23 |       StorageLevels)
 24 |     sparkplug.partition.FnHashPartitioner))
 25 | 
 26 | 
 27 | ;; ## Naming Functions
 28 | 
 29 | ;; Type hints are omitted because `name` is not included in JavaRDDLike.
 30 | (defn name
 31 |   "Return the current name for `rdd`."
 32 |   [rdd]
 33 |   (.name rdd))
 34 | 
 35 | 
 36 | ;; Type hints are omitted because `setName` is not included in JavaRDDLike.
 37 | (defn set-name
 38 |   "Set the name of `rdd` to `name-str`."
 39 |   ^JavaRDDLike
 40 |   [name-str rdd]
 41 |   (.setName rdd name-str))
 42 | 
 43 | 
 44 | (defn- internal-call?
 45 |   "True if a stack-trace element should be ignored because it represents an internal
 46 |   function call that should not be considered a callsite."
 47 |   [^StackTraceElement element]
 48 |   (let [class-name (.getClassName element)]
 49 |     (or (str/starts-with? class-name "sparkplug.")
 50 |         (str/starts-with? class-name "clojure.lang."))))
 51 | 
 52 | 
 53 | (defn- stack-callsite
 54 |   "Find the top element in the current stack trace that is not an internal
 55 |   function call."
 56 |   ^StackTraceElement
 57 |   []
 58 |   (first (remove internal-call? (.getStackTrace (Exception.)))))
 59 | 
 60 | 
 61 | (defn ^:no-doc fn-name
 62 |   "Return the (unmangled) name of the given Clojure function."
 63 |   [f]
 64 |   (Compiler/demunge (.getName (class f))))
 65 | 
 66 | 
 67 | (defn- callsite-name
 68 |   "Generate a name for the callsite of this function by looking at the current
 69 |   stack. Ignores core Clojure and internal function frames."
 70 |   []
 71 |   (let [callsite (stack-callsite)
 72 |         filename (.getFileName callsite)
 73 |         classname (.getClassName callsite)
 74 |         line-number (.getLineNumber callsite)]
 75 |     (format "%s %s:%d" (Compiler/demunge classname) filename line-number)))
 76 | 
 77 | 
 78 | (defn ^:no-doc set-callsite-name
 79 |   "Provide a name for the given RDD by looking at the current stack. Returns
 80 |   the updated RDD if the name could be determined."
 81 |   ^JavaRDD
 82 |   [^JavaRDD rdd & args]
 83 |   (try
 84 |     (let [rdd-name (format "#<%s: %s %s>"
 85 |                            (.getSimpleName (class rdd))
 86 |                            (callsite-name)
 87 |                            (if (seq args)
 88 |                              (str " [" (str/join ", " args) "]")
 89 |                              ""))]
 90 |       (.setName rdd rdd-name))
 91 |     (catch Exception _
 92 |       ;; Ignore errors and return an unnamed RDD.
 93 |       rdd)))
 94 | 
 95 | 
 96 | ;; ## Dataset Construction
 97 | 
 98 | (defn empty
 99 |   "Construct a new empty RDD."
100 |   ^JavaRDD
101 |   [^JavaSparkContext spark-context]
102 |   (.emptyRDD spark-context))
103 | 
104 | 
105 | (defn parallelize
106 |   "Distribute a local collection to form an RDD. Optionally accepts a number
107 |   of partitions to slice the collection into."
108 |   (^JavaRDD
109 |    [^JavaSparkContext spark-context coll]
110 |    (set-callsite-name
111 |      (.parallelize spark-context coll)))
112 |   (^JavaRDD
113 |    [^JavaSparkContext spark-context min-partitions coll]
114 |    (set-callsite-name
115 |      (.parallelize spark-context coll min-partitions)
116 |      min-partitions)))
117 | 
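    |
    | ;; Editor's sketch (not in the original source): a minimal usage example,
    | ;; assuming `sc` is bound to a JavaSparkContext.
    | (comment
    |   ;; Distribute a local range of numbers across four partitions.
    |   (parallelize sc 4 (range 100)))
    |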
118 | 
119 | (defn parallelize-pairs
120 |   "Distributes a local collection to form a pair RDD. Optionally accepts a
121 |   number of partitions to slice the collection into."
 122 |   (^JavaPairRDD
 123 |    [^JavaSparkContext spark-context coll]
124 |    (set-callsite-name
125 |      (.parallelizePairs
126 |        spark-context
127 |        (map scala/to-pair coll))))
 128 |   (^JavaPairRDD
 129 |    [^JavaSparkContext spark-context min-partitions coll]
130 |    (set-callsite-name
131 |      (.parallelizePairs
132 |        spark-context
133 |        (map scala/to-pair coll)
134 |        min-partitions)
135 |      min-partitions)))
136 | 
137 | 
138 | (defn binary-files
139 |   "Read a directory of binary files from the given URL as a pair RDD of paths
140 |   to byte streams."
 141 |   (^JavaPairRDD
 142 |    [^JavaSparkContext spark-context path]
143 |    (.binaryFiles spark-context path))
 144 |   (^JavaPairRDD
 145 |    [^JavaSparkContext spark-context path num-partitions]
146 |    (.binaryFiles spark-context path (int num-partitions))))
147 | 
148 | 
149 | (defn text-file
150 |   "Read a text file from a URL into an RDD of the lines in the file. Optionally
151 |   accepts a number of partitions to slice the file into."
 152 |   (^JavaRDD
 153 |    [^JavaSparkContext spark-context filename]
154 |    (.textFile spark-context filename))
 155 |   (^JavaRDD
 156 |    [^JavaSparkContext spark-context min-partitions filename]
157 |    (.textFile spark-context filename min-partitions)))
158 | 
159 | 
160 | (defn whole-text-files
161 |   "Read a directory of text files from a URL into an RDD. Each element of the
162 |   RDD is a pair of the file path and the full contents of the file."
163 |   (^JavaPairRDD
164 |    [^JavaSparkContext spark-context filename]
165 |    (.wholeTextFiles spark-context filename))
166 |   (^JavaPairRDD
167 |    [^JavaSparkContext spark-context min-partitions filename]
168 |    (.wholeTextFiles spark-context filename min-partitions)))
169 | 
170 | 
171 | (defn save-as-text-file
172 |   "Write the elements of `rdd` as a text file (or set of text files) in a given
173 |   directory `path` in the local filesystem, HDFS or any other Hadoop-supported
174 |   file system. Spark will call toString on each element to convert it to a line
175 |   of text in the file."
176 |   [path ^JavaRDDLike rdd]
177 |   (.saveAsTextFile rdd (str path)))
178 | 
179 | 
180 | ;; ## Partitioning Logic
181 | 
182 | (defn hash-partitioner
183 |   "Construct a partitioner which will hash keys to distribute them uniformly
184 |   over `n` buckets. Optionally accepts a `key-fn` which will be called on each
185 |   key before hashing it."
186 |   (^Partitioner
187 |    [n]
188 |    (HashPartitioner. (int n)))
189 |   (^Partitioner
190 |    [key-fn n]
191 |    (FnHashPartitioner. (int n) (f/fn1 key-fn))))
192 | 
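    |
    | ;; Editor's sketch (not in the original source): hash-partition a pair RDD by a
    | ;; derived key, assuming `pairs` is a JavaPairRDD whose keys are maps containing
    | ;; a `:user-id` entry.
    | (comment
    |   ;; Hash each key's :user-id value into one of 32 buckets.
    |   (partition-by (hash-partitioner :user-id 32) pairs))
    |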
193 | 
194 | (defn partitions
195 |   "Return a vector of the partitions in `rdd`."
196 |   [^JavaRDDLike rdd]
197 |   (into [] (.partitions (.rdd rdd))))
198 | 
199 | 
200 | (defn num-partitions
201 |   "Returns the number of partitions in `rdd`."
202 |   [^JavaRDDLike rdd]
203 |   (.getNumPartitions rdd))
204 | 
205 | 
206 | (defn partitioner
207 |   "Return the partitioner associated with `rdd`, or nil if there is no custom
208 |   partitioner."
209 |   [^JavaRDDLike rdd]
210 |   (scala/resolve-option
211 |     (.partitioner (.rdd rdd))))
212 | 
213 | 
214 | (defn partition-by
215 |   "Return a copy of `rdd` partitioned by the given `partitioner`."
216 |   [^Partitioner partitioner ^JavaPairRDD rdd]
217 |   (set-callsite-name
218 |     (.partitionBy rdd partitioner)
219 |     (.getName (class partitioner))))
220 | 
221 | 
 222 | ;; Parameter type hints are omitted because `repartition` is not in JavaRDDLike.
223 | (defn repartition
224 |   "Returns a new `rdd` with exactly `n` partitions.
225 | 
226 |   This method can increase or decrease the level of parallelism in this RDD.
227 |   Internally, this uses a shuffle to redistribute data.
228 | 
229 |   If you are decreasing the number of partitions in this RDD, consider using
230 |   `coalesce`, which can avoid performing a shuffle."
231 |   ^JavaRDDLike
232 |   [n rdd]
233 |   (set-callsite-name
234 |     (.repartition rdd (int n))
235 |     (int n)))
236 | 
237 | 
238 | (defn repartition-and-sort-within-partitions
239 |   "Repartition the RDD according to the given partitioner and, within each
240 |   resulting partition, sort records by their keys. This is more efficient than
241 |   calling repartition and then sorting within each partition because it can
242 |   push the sorting down into the shuffle machinery."
243 |   (^JavaPairRDD
244 |    [^Partitioner partitioner ^JavaPairRDD pair-rdd]
245 |    (.repartitionAndSortWithinPartitions pair-rdd partitioner))
246 |   (^JavaPairRDD
247 |    [^Partitioner partitioner ^java.util.Comparator comparator ^JavaPairRDD pair-rdd]
248 |    (.repartitionAndSortWithinPartitions pair-rdd partitioner comparator)))
249 | 
250 | 
251 | ;; Type hints are omitted because `coalesce` is not included in JavaRDDLike.
252 | (defn coalesce
253 |   "Decrease the number of partitions in `rdd` to `n`. Useful for running
254 |   operations more efficiently after filtering down a large dataset."
255 |   ([num-partitions rdd]
256 |    (coalesce num-partitions false rdd))
257 |   ([num-partitions shuffle? rdd]
258 |    (set-callsite-name
259 |      (.coalesce rdd (int num-partitions) (boolean shuffle?))
260 |      (int num-partitions)
261 |      (boolean shuffle?))))
262 | 
263 | 
264 | ;; ## Storage Management
265 | 
266 | (def storage-levels
267 |   "Keyword mappings for available RDD storage levels."
268 |   {:memory-only           StorageLevels/MEMORY_ONLY
269 |    :memory-only-ser       StorageLevels/MEMORY_ONLY_SER
270 |    :memory-and-disk       StorageLevels/MEMORY_AND_DISK
271 |    :memory-and-disk-ser   StorageLevels/MEMORY_AND_DISK_SER
272 |    :disk-only             StorageLevels/DISK_ONLY
273 |    :memory-only-2         StorageLevels/MEMORY_ONLY_2
274 |    :memory-only-ser-2     StorageLevels/MEMORY_ONLY_SER_2
275 |    :memory-and-disk-2     StorageLevels/MEMORY_AND_DISK_2
276 |    :memory-and-disk-ser-2 StorageLevels/MEMORY_AND_DISK_SER_2
277 |    :disk-only-2           StorageLevels/DISK_ONLY_2
278 |    :none                  StorageLevels/NONE})
279 | 
280 | 
281 | (defn storage-level
282 |   "Return the keyword representing the storage level in the `storage-levels`
283 |   map, or the raw value if not found."
284 |   [^JavaRDD rdd]
285 |   (let [level (.getStorageLevel rdd)]
286 |     (or (->> storage-levels
287 |              (filter #(= level (val %)))
288 |              (map key)
289 |              (first))
290 |         level)))
291 | 
292 | 
293 | ;; Type hints are omitted because `cache` and `persist` are not included in
294 | ;; JavaRDDLike.
295 | (defn cache!
296 |   "Sets the storage level of `rdd` to persist its values across operations
297 |   after the first time it is computed. By default, this uses the `:memory-only`
298 |   level, but an alternate may be specified by `level`.
299 | 
300 |   This can only be used to assign a new storage level if the RDD does not have
301 |   a storage level set already."
302 |   ([rdd]
303 |    (.cache rdd))
304 |   ([level rdd]
305 |    {:pre [(contains? storage-levels level)]}
306 |    (.persist rdd (get storage-levels level))))
307 | 
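    |
    | ;; Editor's sketch (not in the original source): persist an RDD in memory with
    | ;; spill-to-disk, assuming `users` is an existing RDD.
    | (comment
    |   (cache! :memory-and-disk users))
    |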
308 | 
309 | ;; Type hints are omitted because `unpersist` is not included in JavaRDDLike.
310 | (defn uncache!
311 |   "Mark `rdd` as non-persistent, and remove all blocks for it from memory and
312 |   disk. Blocks until all data has been removed unless `blocking?` is provided
313 |   and false."
314 |   ([rdd]
315 |    (.unpersist rdd))
316 |   ([blocking? rdd]
317 |    (.unpersist rdd (boolean blocking?))))
318 | 
319 | 
320 | (defn checkpointed?
321 |   "True if `rdd` has been marked for checkpointing."
322 |   [^JavaRDDLike rdd]
323 |   (.isCheckpointed rdd))
324 | 
325 | 
326 | (defn checkpoint!
327 |   "Mark `rdd` for checkpointing. It will be saved to a file inside the
328 |   checkpoint directory set on the Spark context and all references to its
329 |   parent RDDs will be removed.
330 | 
331 |   This function must be called before any job has been executed on this RDD. It
332 |   is strongly recommended that this RDD is persisted in memory, otherwise
333 |   saving it to a file will require recomputation."
334 |   [^JavaRDDLike rdd]
335 |   (.checkpoint rdd))
336 | 


--------------------------------------------------------------------------------
/sparkplug-core/src/clojure/sparkplug/kryo.clj:
--------------------------------------------------------------------------------
  1 | (ns sparkplug.kryo
  2 |   "Functions for managing object serialization with Kryo.
  3 | 
  4 |   To configure a new Kryo instance, this class looks up all resources in
  5 |   directories named `sparkplug/kryo/registry/` on the classpath. The files are
  6 |   read in sorted order, one line at a time. Each line should be tab-separated
  7 |   and begin with the desired action:
  8 | 
  9 |   - `require        {{namespace}}`
 10 |     Require a namespace to load code or for other side effects.
 11 |   - `register       {{class}}`
 12 |     Register the named class with default serialization. The class name may be
 13 |     suffixed with `[]` pairs to indicate array class types.
 14 |   - `register       {{class}}     {{serializer}}`
 15 |     Register the named class with the given serializer. The serializer may
 16 |     either be the name of a class to instantiate with the default constructor,
 17 |     or a qualified function var which will be resolved and called with no
 18 |     arguments to return a `Serializer` instance.
 19 |   - `configure      {{config-fn}}`
 20 |     Resolve the named function and call it on the Kryo instance to directly
 21 |     configure it.
 22 | 
 23 |   Blank lines or lines beginning with a hash (#) are ignored."
 24 |   (:require
 25 |     [clojure.java.classpath :as classpath]
 26 |     [clojure.java.io :as io]
 27 |     [clojure.string :as str]
 28 |     [clojure.tools.logging :as log])
 29 |   (:import
 30 |     (clojure.lang
 31 |       BigInt
 32 |       IPersistentMap
 33 |       IPersistentSet
 34 |       IPersistentVector
 35 |       ISeq
 36 |       Keyword
 37 |       Named
 38 |       PersistentTreeMap
 39 |       PersistentTreeSet
 40 |       Ratio
 41 |       StringSeq
 42 |       Var)
 43 |     (com.esotericsoftware.kryo
 44 |       Kryo
 45 |       Serializer)
 46 |     (com.esotericsoftware.kryo.io
 47 |       Input
 48 |       Output)
 49 |     java.io.File
 50 |     java.math.BigInteger
 51 |     (java.util.jar
 52 |       JarFile)
 53 |     org.objenesis.strategy.StdInstantiatorStrategy))
 54 | 
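    |
    | ;; Editor's sketch (not in the original source): a hypothetical registry file,
    | ;; e.g. `resources/sparkplug/kryo/registry/my_lib.conf` on the classpath. The
    | ;; namespaces, classes, and vars below are placeholders, and the columns must be
    | ;; separated by tabs (spaces are shown here only for readability):
    | ;;
    | ;;   require     my.lib.serializers
    | ;;   register    my.lib.Record    my.lib.serializers/record-serializer
    | ;;   register    my.lib.Event
    | ;;   configure   my.lib.serializers/tune-kryo!
    |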
 55 | 
 56 | ;; ## Registry Files
 57 | 
 58 | (def ^:const registry-prefix
 59 |   "SparkPlug registry files must be available under this directory path."
 60 |   "sparkplug/kryo/registry")
 61 | 
 62 | 
 63 | (def ^:const registry-extension
 64 |   "SparkPlug registry file extension."
 65 |   ".conf")
 66 | 
 67 | 
 68 | (defn- registry-path?
 69 |   "True if the given path is a valid registry file name."
 70 |   [path]
 71 |   (and (str/starts-with? path registry-prefix)
 72 |        (str/ends-with? path registry-extension)))
 73 | 
 74 | 
 75 | (defn- relative-suffix
 76 |   "Return the suffix in `b` if it is prefixed by `a`."
 77 |   [a b]
 78 |   (let [a (str a "/")
 79 |         b (str b)]
 80 |     (when (str/starts-with? b a)
 81 |       (subs b (count a)))))
 82 | 
 83 | 
 84 | (defn- read-dir-file
 85 |   "Read a file from the given directory. Returns a map of registry data."
 86 |   [^File dir path]
 87 |   (let [file (io/file dir path)]
 88 |     (log/debug "Reading registry configuration from file" (str file))
 89 |     {:path (str dir)
 90 |      :name path
 91 |      :text (slurp file)}))
 92 | 
 93 | 
 94 | (defn- find-dir-files
 95 |   "Find all files in the given directory matching the registry prefix."
 96 |   [^File dir]
 97 |   (->> (file-seq dir)
 98 |        (keep (partial relative-suffix dir))
 99 |        (filter registry-path?)
100 |        (sort)
101 |        (map (partial read-dir-file dir))))
102 | 
103 | 
104 | (defn- read-jar-entry
105 |   "Read an entry in the given jar. Returns a map of registry data if the entry
106 |   is in the jar."
107 |   [^JarFile jar entry-name]
108 |   (when-let [entry (.getEntry jar entry-name)]
109 |     (log/debugf "Reading registry configuration from JAR entry %s!%s"
110 |                 (.getName jar) entry-name)
111 |     {:path (.getName jar)
112 |      :name entry-name
113 |      :text (slurp (.getInputStream jar entry))}))
114 | 
115 | 
116 | (defn- find-jar-entries
117 |   "Find all entries in the given JAR file matching the registry prefix."
118 |   [^JarFile jar]
119 |   (->> (classpath/filenames-in-jar jar)
120 |        (filter registry-path?)
121 |        (sort)
122 |        (keep (partial read-jar-entry jar))))
123 | 
124 | 
125 | (defn- find-classpath-files
126 |   "Find all config files on the classpath within the registry prefix."
127 |   []
128 |   (concat (mapcat find-dir-files (classpath/classpath-directories))
129 |           (mapcat find-jar-entries (classpath/classpath-jarfiles))))
130 | 
131 | 
132 | (defn- parse-registry-line
133 |   "Parse a line from a registry file. Returns a map of information with the
134 |   given line number as `:line`, an action `:type` keyword, and any remaining
135 |   `:args` as a sequence of strings. Returns nil if the line is blank or a
136 |   comment."
137 |   [line-number line]
138 |   (when-not (or (str/blank? line)
139 |                 (str/starts-with? line "#"))
140 |     (let [[action-type & args] (str/split line #"\t")]
141 |       {:line line-number
142 |        :type (keyword action-type)
143 |        :args (vec args)})))
144 | 
145 | 
146 | (defn- parse-registry-actions
147 |   "Parse the text content of the given registry data map. Returns an updated map
148 |   with `:text` removed and `:actions` set to the parsed lines."
149 |   [registry]
150 |   (let [actions (->>
151 |                   (:text registry)
152 |                   (str/split-lines)
153 |                   (map-indexed parse-registry-line)
154 |                   (remove nil?)
155 |                   (vec))]
156 |     (-> registry
157 |         (assoc :actions actions)
158 |         (dissoc :text))))
159 | 
160 | 
161 | (defn classpath-registries
162 |   "Return a sequence of registry file maps from the classpath. Returns a sorted
163 |   sequence with a single entry per distinct config name. Files earlier on the
164 |   classpath will take precedence."
165 |   []
166 |   (->>
167 |     (find-classpath-files)
168 |     (map (juxt :name parse-registry-actions))
169 |     (reverse)
170 |     (into (sorted-map))
171 |     (vals)))
172 | 
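    |
    | ;; Editor's sketch (not in the original source): the returned maps look roughly
    | ;; like the following, with illustrative placeholder values.
    | (comment
    |   (classpath-registries))
    | ;; => ({:path "/path/to/my-lib.jar"
    | ;;      :name "sparkplug/kryo/registry/my_lib.conf"
    | ;;      :actions [{:line 2, :type :register, :args ["my.lib.Record"]}]}
    | ;;     ...)
    |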
173 | 
174 | ;; ## Registry Actions
175 | 
176 | (defn- load-require-action
177 |   "Prepare a `require` action from a registry. Requires the namespace and
178 |   returns nil."
179 |   [args]
180 |   ;; Check arguments.
181 |   (when-not (= 1 (count args))
182 |     (throw (ex-info (str "require action takes exactly one argument, not "
183 |                          (count args))
184 |                     {:type ::bad-action})))
185 |   (when (str/includes? (first args) "/")
186 |     (throw (ex-info "require action argument should not be namespaced"
187 |                     {:type ::bad-action})))
188 |   ;; Require the namespace code.
189 |   (let [ns-sym (symbol (first args))]
190 |     (log/debug "Requiring namespace" ns-sym)
191 |     (require ns-sym))
192 |   ;; Nothing to do per-kryo instance afterwards.
193 |   nil)
194 | 
195 | 
196 | (defn- convert-array-class
197 |   "Determine the base class and number of nested arrays for a class name like
198 |   `String[][]`. Returns a rewritten string in a form that the classloader will
199 |   understand like `[[LString;`."
200 |   [class-name]
201 |   (loop [class-name class-name
202 |          arrays 0]
203 |     (if (str/ends-with? class-name "[]")
204 |       (recur (subs class-name 0 (- (count class-name) 2))
205 |              (inc arrays))
206 |       (if (zero? arrays)
207 |         class-name
208 |         (str (str/join (repeat arrays \[))
209 |              "L" class-name ";")))))
210 | 
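    |
    | ;; Editor's note: for example, (convert-array-class "java.lang.String[][]")
    | ;; returns "[[Ljava.lang.String;", while a name with no trailing `[]` pairs is
    | ;; returned unchanged.
    |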
211 | 
212 | (defn- load-register-action
213 |   "Prepare a `register` action from a registry at load-time. Loads the class to
 214 |   register and any serializer, and returns a function which will register the
215 |   class with a Kryo instance."
216 |   [args]
217 |   (when-not (<= 1 (count args) 2)
218 |     (throw (ex-info (str "register action takes one or two arguments, not "
219 |                          (count args))
220 |                     {:type ::bad-action})))
221 |   (when (and (second args) (not (str/includes? (second args) "/")))
222 |     (throw (ex-info "register action serializer should be a namespaced symbol"
223 |                     {:type ::bad-action})))
224 |   (let [[class-name serializer-name] args]
225 |     (log/debugf "Registering class %s with %s serializer"
226 |                 class-name
227 |                 (or serializer-name "default"))
228 |     ;; Load the class to register.
229 |     (let [target-class (Class/forName (convert-array-class class-name))]
230 |       (if serializer-name
231 |         (if (str/includes? serializer-name "/")
232 |           ;; Resolve the named function to construct a new serializer instance.
233 |           (if-let [constructor (requiring-resolve (symbol serializer-name))]
234 |             (fn register
235 |               [^Kryo kryo]
236 |               (let [serializer ^Serializer (constructor)]
237 |                 (.register kryo target-class serializer)))
238 |             (throw (ex-info (str "Could not resolve serializer constructor function "
239 |                                  serializer-name)
240 |                             {:type ::bad-action})))
241 |           ;; Assume the serializer is a class name and construct an instance.
242 |           (let [serializer-class (Class/forName serializer-name)]
243 |             (fn register
244 |               [^Kryo kryo]
245 |               (let [serializer ^Serializer (.newInstance serializer-class)]
246 |                 (.register kryo target-class serializer)))))
247 |         ;; No serializer, register with defaults.
248 |         (fn register
249 |           [^Kryo kryo]
250 |           (.register kryo target-class))))))
251 | 
252 | 
253 | (defn- load-configure-action
254 |   "Prepare a `configure` action from a registry at load-time. Resolves the
255 |   configuration function and returns it."
256 |   [args]
257 |   (when-not (= 1 (count args))
258 |     (throw (ex-info (str "configure action takes exactly one argument, not "
259 |                          (count args))
260 |                     {:type ::bad-action})))
261 |   (when-not (str/includes? (first args) "/")
262 |     (throw (ex-info "configure action function should be a namespaced symbol"
263 |                     {:type ::bad-action})))
264 |   (let [var-name (symbol (first args))]
265 |     (log/debug "Configuring Kryo with function" var-name)
266 |     (or (requiring-resolve var-name)
267 |         (throw (ex-info (str "Could not resolve configuration function "
268 |                              var-name)
269 |                         {:type ::bad-action})))))
270 | 
271 | 
272 | (defn- load-action
273 |   "Load the configuration `action` as read from the given `registry`.
274 |   Dispatches on action type."
275 |   [registry action]
276 |   (let [{:keys [path name]} registry
277 |         {:keys [line type args]} action]
278 |     (try
279 |       (case type
280 |         :require
281 |         (load-require-action args)
282 | 
283 |         :register
284 |         (load-register-action args)
285 | 
286 |         :configure
287 |         (load-configure-action args)
288 | 
289 |         (throw (ex-info (str "Unsupported registry action " (pr-str type))
290 |                         {:type ::bad-action})))
291 |       (catch Exception ex
292 |         (let [message (format "Failed to load %s action on line %s of %s in %s"
293 |                               (clojure.core/name type) line name path)
294 |               cause (when (not= ::bad-action (:type (ex-data ex)))
295 |                       ex)]
296 |           (log/error message (ex-message ex))
297 |           (throw (ex-info (str message ": " (ex-message ex))
298 |                           {:path path
299 |                            :name name
300 |                            :line line
301 |                            :type type
302 |                            :args args}
303 |                           cause)))))))
304 | 
305 | 
306 | (defn- load-registry
307 |   "Process the given registry file map and returns a sequence of all
308 |   loaded configuration functions."
309 |   [registry]
310 |   (log/debugf "Loading registry %s in %s" (:name registry) (:path registry))
311 |   (into []
312 |         (keep (partial load-action registry))
313 |         (:actions registry)))
314 | 
315 | 
316 | (defn load-configuration
317 |   "Walk the classpath and load configuration actions from all discovered
318 |   registries. Returns a function which can be called on a Kryo serializer to
319 |   configure it."
320 |   []
321 |   (let [actions (into [] (mapcat load-registry) (classpath-registries))]
322 |     (fn configure!
323 |       [^Kryo kryo]
324 |       (.setInstantiatorStrategy kryo (StdInstantiatorStrategy.))
325 |       (doseq [f actions]
326 |         (f kryo)))))
327 | 
328 | 
329 | (defn initialize
330 |   "Creates a new Kryo instance and configures it with classpath registry
331 |   actions."
332 |   ^Kryo
333 |   []
334 |   (let [configure! (load-configuration)]
335 |     (doto (Kryo.)
336 |       (configure!))))
337 | 
338 | 
339 | ;; ## Serialization Logic
340 | 
341 | ;; For types that are already registered with efficient serializers, see:
342 | ;; https://github.com/EsotericSoftware/kryo/blob/master/src/com/esotericsoftware/kryo/Kryo.java
343 | ;; https://github.com/twitter/chill/blob/v0.9.3/chill-java/src/main/java/com/twitter/chill/java/PackageRegistrar.java
344 | ;; https://github.com/twitter/chill/blob/v0.9.3/chill-scala/src/main/scala/com/twitter/chill/ScalaKryoInstantiator.scala
345 | 
346 | (defmacro defserializer
347 |   "Define a new constructor for a Kryo Serializer with the given `write` and
348 |   `read` method implementations."
349 |   [name-sym class-sym immutable? & body]
350 |   ;; TODO: a spec for this macro would be better than these assertions
351 |   {:pre [(symbol? name-sym)
352 |          (symbol? class-sym)
353 |          (boolean? immutable?)
354 |          (= 2 (count body))
355 |          (every? list? body)
356 |          (= #{'read 'write} (set (map first body)))]}
357 |   (let [tagged #(vary-meta %1 assoc :tag (if (instance? Class %2)
358 |                                            (.getName ^Class %2)
359 |                                            (str %2)))
360 |         name-sym (tagged name-sym Serializer)
361 |         body-methods (into {} (map (juxt first identity)) body)
362 |         write-form (get body-methods 'write)
363 |         read-form (get body-methods 'read)]
364 |     `(defn ~name-sym
365 |        ~(str "Construct a new Kryo serializer for " class-sym " values.")
366 |        []
367 |        (proxy [Serializer] [false ~immutable?]
368 | 
369 |          (write
370 |            ~(let [[kryo-sym output-sym value-sym] (second write-form)]
371 |               [(tagged kryo-sym Kryo)
372 |                (tagged output-sym Output)
373 |                (tagged value-sym class-sym)])
374 |            ~@(nnext write-form))
375 | 
376 |          (read
377 |            ~(let [[kryo-sym input-sym target-sym] (second read-form)]
378 |               [(tagged kryo-sym Kryo)
379 |                (tagged input-sym Input)
380 |                (tagged target-sym Class)])
381 |            ~@(nnext read-form))))))
382 | 
383 | 
384 | ;; ### Core Serializers
385 | 
386 | (defserializer ident-serializer
387 |   Named true
388 | 
389 |   (write
390 |     [kryo output value]
391 |     (let [named-str (if (keyword? value)
392 |                       (subs (str value) 1)
393 |                       (str value))]
394 |       (.writeString output named-str)))
395 | 
396 |   (read
397 |     [kryo input target-class]
398 |     (let [named-str (.readString input)]
399 |       (if (identical? Keyword target-class)
400 |         (keyword named-str)
401 |         (symbol named-str)))))
402 | 
403 | 
404 | (defn- write-biginteger
405 |   "Write a BigInteger to the Kryo output."
406 |   [^Output output ^BigInteger value]
407 |   (let [int-bytes (.toByteArray value)]
408 |     (.writeVarInt output (alength int-bytes) true)
409 |     (.writeBytes output int-bytes)))
410 | 
411 | 
412 | (defn- read-biginteger
413 |   "Read a BigInteger value from the Kryo input."
414 |   [^Input input]
415 |   (let [length (.readVarInt input true)
416 |         int-bytes (.readBytes input length)]
417 |     (BigInteger. int-bytes)))
418 | 
419 | 
420 | (defserializer bigint-serializer
421 |   BigInt true
422 | 
423 |   (write
424 |     [kryo output value]
425 |     (write-biginteger output (biginteger value)))
426 | 
427 |   (read
428 |     [kryo input _]
429 |     (bigint (read-biginteger input))))
430 | 
431 | 
432 | (defserializer ratio-serializer
433 |   Ratio true
434 | 
435 |   (write
436 |     [kryo output value]
437 |     (write-biginteger output (numerator value))
438 |     (write-biginteger output (denominator value)))
439 | 
440 |   (read
441 |     [kryo input _]
442 |     (/ (read-biginteger input)
443 |        (read-biginteger input))))
444 | 
445 | 
446 | (defserializer var-serializer
447 |   Var false
448 | 
449 |   (write
450 |     [kryo output value]
451 |     (.writeString output (str (symbol value))))
452 | 
453 |   (read
454 |     [kryo input _]
455 |     (let [var-sym (symbol (.readString input))]
456 |       (requiring-resolve var-sym))))
457 | 
458 | 
459 | ;; ### Sequence Serializers
460 | 
461 | (defn- write-sequence
462 |   "Write a sequence of values to the Kryo output."
463 |   [^Kryo kryo ^Output output coll]
464 |   (.writeVarInt output (count coll) true)
465 |   (doseq [x coll]
466 |     (.writeClassAndObject kryo output x)))
467 | 
468 | 
469 | (defn- read-sequence
470 |   "Read a lazy sequence of values from the Kryo output."
471 |   [^Kryo kryo ^Input input]
472 |   (let [length (.readVarInt input true)]
473 |     (repeatedly length #(.readClassAndObject kryo input))))
474 | 
475 | 
476 | (defserializer sequence-serializer
477 |   ISeq true
478 | 
479 |   (write
480 |     [kryo output coll]
481 |     (write-sequence kryo output coll))
482 | 
483 |   (read
484 |     [kryo input _]
485 |     (apply list (read-sequence kryo input))))
486 | 
487 | 
488 | (defserializer vector-serializer
489 |   IPersistentVector true
490 | 
491 |   (write
492 |     [kryo output coll]
493 |     (write-sequence kryo output coll))
494 | 
495 |   (read
496 |     [kryo input _]
497 |     (into [] (read-sequence kryo input))))
498 | 
499 | 
500 | (defserializer string-seq-serializer
501 |   StringSeq true
502 | 
503 |   (write
504 |     [kryo output coll]
505 |     (.writeString output (str/join coll)))
506 | 
507 |   (read
508 |     [kryo input _]
509 |     (seq (.readString input))))
510 | 
511 | 
512 | ;; ### Set Serializers
513 | 
514 | (defserializer set-serializer
515 |   IPersistentSet true
516 | 
517 |   (write
518 |     [kryo output coll]
519 |     (write-sequence kryo output coll))
520 | 
521 |   (read
522 |     [kryo input _]
523 |     (into #{} (read-sequence kryo input))))
524 | 
525 | 
526 | (defserializer ordered-set-serializer
527 |   PersistentTreeSet true
528 | 
529 |   (write
530 |     [kryo output coll]
531 |     (.writeClassAndObject kryo output (.comparator coll))
532 |     (write-sequence kryo output coll))
533 | 
534 |   (read
535 |     [kryo input _]
536 |     (let [cmp (.readClassAndObject kryo input)]
537 |       (into (sorted-set-by cmp) (read-sequence kryo input)))))
538 | 
539 | 
540 | ;; ### Map Serializers
541 | 
542 | (defn- write-kvs
543 |   "Write a sequence of key/value pairs to the Kryo output."
544 |   [^Kryo kryo ^Output output coll]
545 |   (.writeVarInt output (count coll) true)
546 |   (doseq [[k v] coll]
547 |     (.writeClassAndObject kryo output k)
548 |     (.writeClassAndObject kryo output v)))
549 | 
550 | 
551 | (defn- read-kvs
552 |   "Read a lazy sequence of key/value pairs from the Kryo output."
553 |   [^Kryo kryo ^Input input]
554 |   (let [length (.readVarInt input true)]
555 |     (repeatedly length #(clojure.lang.MapEntry.
556 |                           (.readClassAndObject kryo input)
557 |                           (.readClassAndObject kryo input)))))
558 | 
559 | 
560 | (defserializer map-serializer
561 |   IPersistentMap true
562 | 
563 |   (write
564 |     [kryo output coll]
565 |     (write-kvs kryo output coll))
566 | 
567 |   (read
568 |     [kryo input _]
569 |     (into {} (read-kvs kryo input))))
570 | 
571 | 
572 | (defserializer ordered-map-serializer
573 |   PersistentTreeMap true
574 | 
575 |   (write
576 |     [kryo output coll]
577 |     (.writeClassAndObject kryo output (.comparator coll))
578 |     (write-kvs kryo output coll))
579 | 
580 |   (read
581 |     [kryo input _]
582 |     (let [cmp (.readClassAndObject kryo input)]
 583 |       (into (sorted-map-by cmp) (read-kvs kryo input)))))
584 | 
585 | 
586 | ;; ## Serialization Utilities
587 | 
588 | ;; These are handy for tests and repl usage, but aren't actually used directly
589 | ;; by the library.
590 | 
591 | (defn encode
592 |   "Serialize the given object into a byte arary using the Kryo codec."
593 |   ^bytes
594 |   [^Kryo kryo obj]
595 |   (let [output (Output. 512 8192)]
596 |     (.writeClassAndObject kryo output obj)
597 |     (.toBytes output)))
598 | 
599 | 
600 | (defn decode
601 |   "Deserialize the given byte array using the Kryo codec."
602 |   [^Kryo kryo ^bytes data]
603 |   (let [input (Input. data)]
604 |     (.readClassAndObject kryo input)))
605 | 
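    |
    | ;; Editor's sketch (not in the original source): a round-trip check using the
    | ;; classpath-configured Kryo instance.
    | (comment
    |   (let [kryo (initialize)]
    |     (decode kryo (encode kryo {:id 123, :tags #{:a :b}}))))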


--------------------------------------------------------------------------------
/sparkplug-core/src/clojure/sparkplug/core.clj:
--------------------------------------------------------------------------------
  1 | (ns sparkplug.core
  2 |   "This namespace provides the main API for writing Spark tasks.
  3 | 
  4 |   Most operations in this namespace place the RDD last in the argument list,
  5 |   just like Clojure collection functions. This lets you compose them using the
  6 |   thread-last macro (`->>`), making it simple to migrate existing Clojure
  7 |   code."
  8 |   (:refer-clojure :exclude [count distinct filter first group-by into keys map
  9 |                             mapcat max min reduce sort-by take vals])
 10 |   (:require
 11 |     [clojure.core :as c]
 12 |     [sparkplug.function :as f]
 13 |     [sparkplug.rdd :as rdd]
 14 |     [sparkplug.scala :as scala])
 15 |   (:import
 16 |     org.apache.spark.Partitioner
 17 |     (org.apache.spark.api.java
 18 |       JavaPairRDD
 19 |       JavaRDD
 20 |       JavaRDDLike
 21 |       JavaSparkContext)
 22 |     org.apache.spark.broadcast.Broadcast
 23 |     sparkplug.broadcast.DerefBroadcast
 24 |     sparkplug.core.UnionHelper))
 25 | 
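    |
    | ;; Editor's sketch (not in the original source): composing transformations with
    | ;; the thread-last macro, assuming `sc` is bound to a JavaSparkContext.
    | (comment
    |   (->> (rdd/parallelize sc (range 1000))
    |        (filter even?)
    |        (map inc)
    |        (into [])))
    |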
 26 | 
 27 | ;; ## Broadcast Variables
 28 | 
 29 | (defn broadcast
 30 |   "Broadcast a read-only variable to the cluster, returning a reference for
 31 |   reading it in distributed functions. The variable data will be sent to each
 32 |   node in the cluster only once.
 33 | 
 34 |   The returned broadcast value can be resolved with `deref` or the `@` reader
 35 |   macro."
 36 |   ^Broadcast
 37 |   [^JavaSparkContext spark-context value]
 38 |   (let [broadcast (.broadcast spark-context value)]
 39 |     (DerefBroadcast. broadcast (class value))))
 40 | 
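    |
    | ;; Editor's sketch (not in the original source): ship a small lookup table to
    | ;; the executors and deref it inside a task, assuming `sc` and an RDD of event
    | ;; maps `events` already exist.
    | (comment
    |   (let [countries (broadcast sc {"us" "United States", "de" "Germany"})]
    |     (map #(assoc % :country (get @countries (:country-code %))) events)))
    |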
 41 | 
 42 | ;; ## RDD Transformations
 43 | 
 44 | ;; Parameter type hints are omitted because `filter` is not in JavaRDDLike.
 45 | (defn filter
 46 |   "Filter the elements of `rdd` to the ones which satisfy the predicate `f`."
 47 |   ^JavaRDDLike
 48 |   [f rdd]
 49 |   (rdd/set-callsite-name
 50 |     (.filter rdd (f/fn1 (comp boolean f)))
 51 |     (rdd/fn-name f)))
 52 | 
 53 | 
 54 | (defn map
 55 |   "Map the function `f` over each element of `rdd`. Returns a new RDD
 56 |   representing the transformed elements."
 57 |   ^JavaRDDLike
 58 |   [f ^JavaRDDLike rdd]
 59 |   (rdd/set-callsite-name
 60 |     (.map rdd (f/fn1 f))
 61 |     (rdd/fn-name f)))
 62 | 
 63 | 
 64 | (defn mapcat
 65 |   "Map the function `f` over each element in `rdd` to produce a sequence of
 66 |   results. Returns an RDD representing the concatenation of all element
 67 |   results."
 68 |   ^JavaRDD
 69 |   [f ^JavaRDDLike rdd]
 70 |   (rdd/set-callsite-name
 71 |     (.flatMap rdd (f/flat-map-fn f))
 72 |     (rdd/fn-name f)))
 73 | 
 74 | 
 75 | (defn map-partitions
 76 |   "Map the function `f` over each partition in `rdd`, producing a sequence of
 77 |   results. Returns an RDD representing the concatenation of all the partition
 78 |   results. The function will be called with an iterator of the elements of each
 79 |   partition."
 80 |   (^JavaRDDLike
 81 |    [f ^JavaRDDLike rdd]
 82 |    (map-partitions f false rdd))
 83 |   (^JavaRDDLike
 84 |    [f preserve-partitioning? ^JavaRDDLike rdd]
 85 |    (rdd/set-callsite-name
 86 |      (.mapPartitions
 87 |        rdd
 88 |        (f/flat-map-fn f)
 89 |        (boolean preserve-partitioning?))
 90 |      (rdd/fn-name f))))
 91 | 
 92 | 
 93 | (defn map-partitions-indexed
 94 |   "Map the function `f` over each partition in `rdd`, producing a sequence of
 95 |   results. Returns an RDD representing the concatenation of all the partition
 96 |   results. The function will be called with the partition index and an iterator
 97 |   of the elements of each partition."
 98 |   ^JavaRDD
 99 |   [f ^JavaRDDLike rdd]
100 |   (rdd/set-callsite-name
101 |     (.mapPartitionsWithIndex rdd (f/fn2 f) true)
102 |     (rdd/fn-name f)))
103 | 
104 | 
 105 | ;; Parameter type hints are omitted because `distinct` is not in JavaRDDLike.
106 | (defn distinct
107 |   "Construct an RDD containing only a single copy of each distinct element in
108 |   `rdd`. Optionally accepts a number of partitions to size the resulting RDD
109 |   with."
110 |   (^JavaRDDLike
111 |    [rdd]
112 |    (rdd/set-callsite-name
113 |      (.distinct rdd)))
114 |   (^JavaRDDLike
115 |    [num-partitions rdd]
116 |    (rdd/set-callsite-name
117 |      (.distinct rdd (int num-partitions))
118 |      (int num-partitions))))
119 | 
120 | 
 121 | ;; Parameter type hints are omitted because `sample` is not in JavaRDDLike.
122 | (defn sample
123 |   "Generate a randomly sampled subset of `rdd` with roughly `fraction` of the
 124 |   original elements. Callers may optionally specify whether to sample with
 125 |   replacement and provide a random seed to make the sampling deterministic."
126 |   (^JavaRDDLike
127 |    [fraction rdd]
128 |    (rdd/set-callsite-name
129 |      (.sample rdd true (double fraction))
130 |      (double fraction)))
131 |   (^JavaRDDLike
132 |    [fraction replacement? rdd]
133 |    (rdd/set-callsite-name
134 |      (.sample rdd (boolean replacement?) (double fraction))
135 |      (double fraction)
136 |      (boolean replacement?)))
137 |   (^JavaRDDLike
138 |    [fraction replacement? seed rdd]
139 |    (rdd/set-callsite-name
140 |      (.sample rdd (boolean replacement?) (double fraction) (long seed))
141 |      (double fraction)
142 |      (boolean replacement?)
143 |      (long seed))))
144 | 
145 | 
146 | (defn sort-by
147 |   "Reorder the elements of `rdd` so that they are sorted according to the given
148 |   key function. The result may be ordered ascending or descending, depending on
149 |   `ascending?`."
150 |   (^JavaRDD
151 |    [f ^JavaRDD rdd]
152 |    (sort-by f true rdd))
153 |   (^JavaRDD
154 |    [f ascending? ^JavaRDD rdd]
155 |    (sort-by f ascending? (.getNumPartitions rdd) rdd))
156 |   (^JavaRDD
157 |    [f ascending? num-partitions ^JavaRDD rdd]
158 |    (rdd/set-callsite-name
159 |      (.sortBy rdd
160 |               (f/fn1 f)
161 |               (boolean ascending?)
162 |               num-partitions)
163 |      (rdd/fn-name f)
164 |      (boolean ascending?)
165 |      (int num-partitions))))
166 | 
167 | 
168 | ;; ## Pair RDD Transformations
169 | 
170 | (defn keys
171 |   "Transform `rdd` by replacing each pair with its key. Returns a new RDD
172 |   representing the keys."
173 |   ^JavaRDD
174 |   [^JavaPairRDD rdd]
175 |   (rdd/set-callsite-name (.keys rdd)))
176 | 
177 | 
178 | (defn vals
179 |   "Transform `rdd` by replacing each pair with its value. Returns a new RDD
180 |   representing the values."
181 |   ^JavaRDD
182 |   [^JavaPairRDD rdd]
183 |   (rdd/set-callsite-name (.values rdd)))
184 | 
185 | 
186 | (defn key-by
187 |   "Creates pairs from the elements in `rdd` by using `f` to compute a key for
188 |   each value."
189 |   ^JavaPairRDD
190 |   [f ^JavaRDDLike rdd]
191 |   (rdd/set-callsite-name
192 |     (.mapToPair rdd (f/pair-fn (juxt f identity)))
193 |     (rdd/fn-name f)))
194 | 
195 | 
196 | (defn map->pairs
197 |   "Map the function `f` over each element of `rdd`. Returns a new pair RDD
198 |   representing the transformed elements."
199 |   ^JavaPairRDD
200 |   [f ^JavaRDDLike rdd]
201 |   (rdd/set-callsite-name
202 |     (.mapToPair rdd (f/pair-fn f))
203 |     (rdd/fn-name f)))
204 | 
205 | 
206 | (defn mapcat->pairs
207 |   "Map the function `f` over each element in `rdd` to produce a sequence of
208 |   key-value pairs. Returns a new pair RDD representing the concatenation of all
209 |   result pairs."
210 |   ^JavaPairRDD
211 |   [f ^JavaRDDLike rdd]
212 |   (rdd/set-callsite-name
213 |     (.flatMapToPair rdd (f/pair-flat-map-fn f))
214 |     (rdd/fn-name f)))
215 | 
216 | 
217 | (defn map-partitions->pairs
218 |   "Map the function `f` over each partition in `rdd`, producing a sequence of
219 |   key-value pairs. The function will be called with an iterator of the elements
220 |   of the partition."
221 |   (^JavaPairRDD
222 |    [f ^JavaRDDLike rdd]
223 |    (map-partitions->pairs f false rdd))
224 |   (^JavaPairRDD
225 |    [f preserve-partitioning? ^JavaRDDLike rdd]
226 |    (rdd/set-callsite-name
227 |      (.mapPartitionsToPair
228 |        rdd
229 |        (f/pair-flat-map-fn f)
230 |        (boolean preserve-partitioning?))
231 |      (rdd/fn-name f)
232 |      (boolean preserve-partitioning?))))
233 | 
234 | 
235 | (defn map-vals
236 |   "Map the function `f` over each value of the pairs in `rdd`. Returns a new
237 |   pair RDD representing the transformed pairs."
238 |   ^JavaPairRDD
239 |   [f ^JavaPairRDD rdd]
240 |   (rdd/set-callsite-name
241 |     (.mapValues rdd (f/fn1 f))
242 |     (rdd/fn-name f)))
243 | 
244 | 
245 | (defn mapcat-vals
246 |   "Map the function `f` over each value of the pairs in `rdd` to produce a
247 |   collection of values. Returns a new pair RDD representing the concatenated
248 |   keys and values."
249 |   ^JavaPairRDD
250 |   [f ^JavaPairRDD rdd]
251 |   (rdd/set-callsite-name
252 |     (.flatMapValues rdd (f/fn1 f))
253 |     (rdd/fn-name f)))
254 | 
255 | 
256 | (defn zip-indexed
257 |   "Zip the elements in `rdd` with their indices. Returns a new pair RDD with
258 |   the element/index tuples.
259 | 
260 |   The ordering is first based on the partition index and then the ordering of
261 |   items within each partition. So the first item in the first partition gets
262 |   index 0, and the last item in the last partition receives the largest index.
263 | 
 264 |   This method needs to trigger a Spark job when `rdd` contains more than one
265 |   partition."
266 |   ^JavaPairRDD
267 |   [^JavaRDDLike rdd]
268 |   (rdd/set-callsite-name
269 |     (.zipWithIndex rdd)))
270 | 
271 | 
272 | (defn zip-unique-ids
273 |   "Zip the elements in `rdd` with unique long identifiers. Returns a new pair
274 |   RDD with the element/id tuples.
275 | 
276 |   Items in the kth partition will get ids `k`, `n+k`, `2*n+k`, ..., where `n`
277 |   is the number of partitions. So the ids won't be sequential and there may be
 278 |   gaps, but this method _won't_ trigger a Spark job, unlike `zip-indexed`."
279 |   ^JavaPairRDD
280 |   [^JavaRDDLike rdd]
281 |   (rdd/set-callsite-name
282 |     (.zipWithUniqueId rdd)))
283 | 
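    |
    | ;; Editor's note: for example, with `n` = 3 partitions, items in partition 0 get
    | ;; ids 0, 3, 6, ..., partition 1 gets 1, 4, 7, ..., and partition 2 gets
    | ;; 2, 5, 8, ...
    |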
284 | 
285 | ;; ## Multi-RDD Functions
286 | 
287 | (defn cartesian
288 |   "Construct an RDD representing the cartesian product of two RDDs. Returns a
289 |   new pair RDD containing all combinations of elements between the datasets."
290 |   ^JavaPairRDD
291 |   [^JavaRDDLike rdd1 ^JavaRDDLike rdd2]
292 |   (rdd/set-callsite-name
293 |     (.cartesian rdd1 rdd2)))
294 | 
295 | 
296 | ;; Type hints are omitted because `union` is not included in JavaRDDLike.
297 | (defn union
298 |   "Construct a union of the elements in the provided RDDs. Any identical
299 |   elements will appear multiple times."
300 |   [rdd1 & rdds]
301 |   (let [ctx (JavaSparkContext/fromSparkContext (.context ^JavaRDDLike rdd1))]
302 |     (rdd/set-callsite-name
303 |       (condp instance? rdd1
304 |         JavaRDD
305 |         (UnionHelper/unionJavaRDDs ctx (into-array JavaRDD (list* rdd1 rdds)))
306 | 
307 |         JavaPairRDD
308 |         (UnionHelper/unionJavaPairRDDs ctx (into-array JavaPairRDD (list* rdd1 rdds)))
309 | 
310 |         (throw
311 |           (IllegalArgumentException.
312 |             (str "Unsupported type for RDD union: " (.getName (class rdd1)))))))))
313 | 
314 | 
 315 | ;; Type hints are omitted because `intersection` is not included in JavaRDDLike.
316 | (defn intersection
317 |   "Construct an RDD representing the intersection of elements which are in both
318 |   RDDs."
319 |   [rdd1 rdd2]
320 |   (rdd/set-callsite-name
321 |     (.intersection rdd1 rdd2)))
322 | 
323 | 
 324 | ;; Parameter type hints are omitted because `subtract` is not in JavaRDDLike.
325 | (defn subtract
326 |   "Remove all elements from `rdd1` that are present in `rdd2`."
327 |   ^JavaRDDLike
328 |   [rdd1 rdd2]
329 |   (rdd/set-callsite-name
330 |     (.subtract rdd1 rdd2)))
331 | 
332 | 
333 | (defn subtract-by-key
334 |   "Construct an RDD representing all pairs in `rdd1` for which there is no pair
335 |   with a matching key in `rdd2`."
336 |   ^JavaPairRDD
337 |   [^JavaPairRDD rdd1 ^JavaPairRDD rdd2]
338 |   (rdd/set-callsite-name
339 |     (.subtractByKey rdd1 rdd2)))
340 | 
341 | 
342 | (defn cogroup
343 |   "Produe a new RDD containing an element for each key `k` in the given pair
344 |   RDDs mapped to a tuple of the values from all RDDs as lists.
345 | 
346 |   If the input RDDs have types `(K, A)`, `(K, B)`, and `(K, C)`, the grouped
347 |   RDD will have type `(K, (list(A), list(B), list(C)))`."
348 |   (^JavaPairRDD
349 |    [^JavaPairRDD rdd1 ^JavaPairRDD rdd2]
350 |    (rdd/set-callsite-name
351 |      (.cogroup rdd1 rdd2)))
352 |   (^JavaPairRDD
353 |    [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 ^JavaPairRDD rdd3]
354 |    (rdd/set-callsite-name
355 |      (.cogroup rdd1 rdd2 rdd3)))
356 |   (^JavaPairRDD
357 |    [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 ^JavaPairRDD rdd3 ^JavaPairRDD rdd4]
358 |    (rdd/set-callsite-name
359 |      (.cogroup rdd1 rdd2 rdd3 rdd4))))
360 | 
361 | 
362 | (defn cogroup-partitioned
363 |   "Produe a new RDD containing an element for each key `k` in the given pair
364 |   RDDs mapped to a tuple of the values from all RDDs as lists. The resulting
365 |   RDD partitions may be controlled by setting `partitions` to an integer number
366 |   or a `Partitioner` instance.
367 | 
368 |   If the input RDDs have types `(K, A)`, `(K, B)`, and `(K, C)`, the grouped
369 |   RDD will have type `(K, (List(A), List(B), List(C)))`."
370 |   (^JavaPairRDD
371 |    [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 partitions]
372 |    (if (instance? Partitioner partitions)
373 |      (rdd/set-callsite-name
374 |        (.cogroup rdd1 rdd2 ^Partitioner partitions)
375 |        (class partitions))
376 |      (rdd/set-callsite-name
377 |        (.cogroup rdd1 rdd2 (int partitions))
378 |        (int partitions))))
379 |   (^JavaPairRDD
380 |    [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 ^JavaPairRDD rdd3 partitions]
381 |    (if (instance? Partitioner partitions)
382 |      (rdd/set-callsite-name
383 |        (.cogroup rdd1 rdd2 rdd3 ^Partitioner partitions)
384 |        (class partitions))
385 |      (rdd/set-callsite-name
386 |        (.cogroup rdd1 rdd2 rdd3 (int partitions))
387 |        (int partitions))))
388 |   (^JavaPairRDD
389 |    [^JavaPairRDD rdd1
390 |     ^JavaPairRDD rdd2
391 |     ^JavaPairRDD rdd3
392 |     ^JavaPairRDD rdd4
393 |     partitions]
394 |    (if (instance? Partitioner partitions)
395 |      (rdd/set-callsite-name
396 |        (.cogroup rdd1 rdd2 rdd3 rdd4 ^Partitioner partitions)
397 |        (class partitions))
398 |      (rdd/set-callsite-name
399 |        (.cogroup rdd1 rdd2 rdd3 rdd4 (int partitions))
400 |        (int partitions)))))
401 | 
402 | 
403 | (defn join
404 |   "Construct an RDD containing all pairs of elements with matching keys in
405 |   `rdd1` and `rdd2`. Each pair of elements will be returned as a tuple of
406 |   `(k, (v, w))`, where `(k, v)` is in `rdd1` and `(k, w)` is in `rdd2`.
407 | 
408 |   Performs a hash join across the cluster. Optionally, `partitions` may be
409 |   provided as an integer number or a partitioner instance to control the
410 |   partitioning of the resulting RDD."
411 |   (^JavaPairRDD
412 |    [^JavaPairRDD rdd1 ^JavaPairRDD rdd2]
413 |    (rdd/set-callsite-name
414 |      (.join rdd1 rdd2)))
415 |   (^JavaPairRDD
416 |    [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 partitions]
417 |    (if (instance? Partitioner partitions)
418 |      (rdd/set-callsite-name
419 |        (.join rdd1 rdd2 ^Partitioner partitions)
420 |        (class partitions))
421 |      (rdd/set-callsite-name
422 |        (.join rdd1 rdd2 (int partitions))
423 |        (int partitions)))))
424 | 
425 | 
426 | (defn left-outer-join
427 |   "Perform a left outer join of `rdd1` and `rdd2`.
428 | 
429 |   For each element `(k, v)` in `rdd1`, the resulting RDD will either contain
430 |   all pairs `(k, (v, Some(w)))` for `(k, w)` in `rdd2`, or the pair
431 |   `(k, (v, None))` if no elements in `rdd2` have key `k`.
432 | 
433 |   Hash-partitions the resulting RDD using the existing partitioner/parallelism
 434 |   level unless `partitions` is provided as an integer number or a
435 |   partitioner instance."
436 |   (^JavaPairRDD
437 |    [^JavaPairRDD rdd1 ^JavaPairRDD rdd2]
438 |    (rdd/set-callsite-name
439 |      (.leftOuterJoin rdd1 rdd2)))
440 |   (^JavaPairRDD
441 |    [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 partitions]
442 |    (if (instance? Partitioner partitions)
443 |      (rdd/set-callsite-name
444 |        (.leftOuterJoin rdd1 rdd2 ^Partitioner partitions)
445 |        (class partitions))
446 |      (rdd/set-callsite-name
447 |        (.leftOuterJoin rdd1 rdd2 (int partitions))
448 |        (int partitions)))))
449 | 
450 | 
451 | (defn right-outer-join
452 |   "Perform a right outer join of `rdd1` and `rdd2`.
453 | 
454 |   For each element `(k, w)` in `rdd2`, the resulting RDD will either contain
455 |   all pairs `(k, (Some(v), w))` for `(k, v)` in `rdd1`, or the pair
456 |   `(k, (None, w))` if no elements in `rdd1` have key `k`.
457 | 
458 |   Hash-partitions the resulting RDD using the existing partitioner/parallelism
 459 |   level unless `partitions` is provided as an integer number or a
460 |   partitioner instance."
461 |   (^JavaPairRDD
462 |    [^JavaPairRDD rdd1 ^JavaPairRDD rdd2]
463 |    (rdd/set-callsite-name
464 |      (.rightOuterJoin rdd1 rdd2)))
465 |   (^JavaPairRDD
466 |    [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 partitions]
467 |    (if (instance? Partitioner partitions)
468 |      (rdd/set-callsite-name
469 |        (.rightOuterJoin rdd1 rdd2 ^Partitioner partitions)
470 |        (class partitions))
471 |      (rdd/set-callsite-name
472 |        (.rightOuterJoin rdd1 rdd2 (int partitions))
473 |        (int partitions)))))
474 | 
475 | 
476 | (defn full-outer-join
477 |   "Perform a full outer join of `rdd1` and `rdd2`.
478 | 
479 |   For each element `(k, v)` in `rdd1`, the resulting RDD will either contain all
480 |   pairs `(k, (Some(v), Some(w)))` for `(k, w)` in `rdd2`, or the pair
 481 |   `(k, (Some(v), None))` if no elements in `rdd2` have key `k`. Similarly, for
482 |   each element `(k, w)` in `rdd2`, the resulting RDD will either contain all
483 |   pairs `(k, (Some(v), Some(w)))` for `v` in `rdd1`, or the pair
484 |   `(k, (None, Some(w)))` if no elements in `rdd1` have key `k`.
485 | 
486 |   Hash-partitions the resulting RDD using the existing partitioner/parallelism
 487 |   level unless `partitions` is provided as an integer number or a
488 |   partitioner instance."
489 |   (^JavaPairRDD
490 |    [^JavaPairRDD rdd1 ^JavaPairRDD rdd2]
491 |    (rdd/set-callsite-name
492 |      (.fullOuterJoin rdd1 rdd2)))
493 |   (^JavaPairRDD
494 |    [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 partitions]
495 |    (if (instance? Partitioner partitions)
496 |      (rdd/set-callsite-name
497 |        (.fullOuterJoin rdd1 rdd2 ^Partitioner partitions)
498 |        (class partitions))
499 |      (rdd/set-callsite-name
500 |        (.fullOuterJoin rdd1 rdd2 (int partitions))
501 |        (int partitions)))))
502 | 
503 | 
504 | ;; ## Pair RDD Aggregation
505 | 
506 | (defn aggregate-by-key
507 |   "When called on an RDD of (K, V) pairs, returns an RDD of (K, U) pairs where
508 |   the values for each key are aggregated using the given 2-arg aggregator
509 |   function, 2-arg combiner function, and a neutral zero value. Allows an
 510 |   aggregated value type that is different from the input value type, while
511 |   avoiding unnecessary allocations. The number of reduce tasks is configurable
512 |   by optionally passing a number of partitions or a partitioner."
513 |   (^JavaPairRDD
514 |    [aggregator combiner zero ^JavaPairRDD rdd]
515 |    (.aggregateByKey rdd zero (f/fn2 aggregator) (f/fn2 combiner)))
516 |   (^JavaPairRDD
517 |    [aggregator combiner zero partitioner-or-num-partitions ^JavaPairRDD rdd]
518 |    (if (instance? Partitioner partitioner-or-num-partitions)
519 |      (.aggregateByKey
520 |        rdd
521 |        zero
522 |        ^Partitioner partitioner-or-num-partitions
523 |        (f/fn2 aggregator)
524 |        (f/fn2 combiner))
525 |      (.aggregateByKey
526 |        rdd
527 |        zero
528 |        (int partitioner-or-num-partitions)
529 |        (f/fn2 aggregator)
530 |        (f/fn2 combiner)))))
531 | 
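    |
    | ;; Editor's sketch (not in the original source): count the values for each key,
    | ;; assuming `pairs` is an existing JavaPairRDD. The aggregated type (a count)
    | ;; differs from the input value type.
    | (comment
    |   (aggregate-by-key (fn [acc _v] (inc acc)) + 0 pairs))
    |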
532 | 
533 | (defn group-by
534 |   "Group the elements of `rdd` using a key function `f`. Returns a pair RDD
535 |   with each generated key and all matching elements as a value sequence."
536 |   (^JavaPairRDD
537 |    [f ^JavaRDDLike rdd]
538 |    (rdd/set-callsite-name
539 |      (.groupBy rdd (f/fn1 f))
540 |      (rdd/fn-name f)))
541 |   (^JavaPairRDD
542 |    [f num-partitions ^JavaRDDLike rdd]
543 |    (rdd/set-callsite-name
544 |      (.groupBy rdd (f/fn1 f) (int num-partitions))
545 |      (rdd/fn-name f)
546 |      num-partitions)))
547 | 
548 | 
549 | (defn group-by-key
550 |   "Group the entries in the pair `rdd` by key. Returns a new pair RDD with one
551 |   entry per key, containing all of the matching values as a sequence."
552 |   (^JavaPairRDD
553 |    [^JavaPairRDD rdd]
554 |    (rdd/set-callsite-name
555 |      (.groupByKey rdd)))
556 |   (^JavaPairRDD
557 |    [num-partitions ^JavaPairRDD rdd]
558 |    (rdd/set-callsite-name
559 |      (.groupByKey rdd (int num-partitions))
560 |      num-partitions)))
561 | 
562 | 
563 | (defn reduce-by-key
564 |   "Aggregate the pairs of `rdd` which share a key by combining all of the
565 |   values with the reducing function `f`. Returns a new pair RDD with one entry
566 |   per unique key, holding the aggregated values."
567 |   ^JavaPairRDD
568 |   [f ^JavaPairRDD rdd]
569 |   (rdd/set-callsite-name
570 |     (.reduceByKey rdd (f/fn2 f))
571 |     (rdd/fn-name f)))
572 | 
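    |
    | ;; Editor's sketch (not in the original source): a word count, assuming `lines`
    | ;; is an existing RDD of strings.
    | (comment
    |   (->> lines
    |        (mapcat #(re-seq #"\w+" %))
    |        (map->pairs (fn [word] [word 1]))
    |        (reduce-by-key +)))
    |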
573 | 
574 | (defn combine-by-key
575 |   "Combine the elements for each key using a set of aggregation functions.
576 | 
577 |   If `rdd` contains pairs of `(K, V)`, the resulting RDD will contain pairs of
578 |   type `(K, C)`. Callers must provide three functions:
579 |   - `seq-fn` which turns a V into a C (for example, `vector`)
580 |   - `conj-fn` to add a V to a C (for example, `conj`)
581 |   - `merge-fn` to combine two C's into a single result"
582 |   (^JavaPairRDD
583 |    [seq-fn conj-fn merge-fn ^JavaPairRDD rdd]
584 |    (rdd/set-callsite-name
585 |      (.combineByKey rdd
586 |                     (f/fn1 seq-fn)
587 |                     (f/fn2 conj-fn)
588 |                     (f/fn2 merge-fn))
589 |      (rdd/fn-name seq-fn)
590 |      (rdd/fn-name conj-fn)
591 |      (rdd/fn-name merge-fn)))
592 |   (^JavaPairRDD
593 |    [seq-fn conj-fn merge-fn num-partitions ^JavaPairRDD rdd]
594 |    (rdd/set-callsite-name
595 |      (.combineByKey rdd
596 |                     (f/fn1 seq-fn)
597 |                     (f/fn2 conj-fn)
598 |                     (f/fn2 merge-fn)
599 |                     (int num-partitions))
600 |      (rdd/fn-name seq-fn)
601 |      (rdd/fn-name conj-fn)
602 |      (rdd/fn-name merge-fn)
603 |      num-partitions)))
604 | 
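    |
    | ;; Editor's sketch (not in the original source): collect all values for each key
    | ;; into a vector, assuming `pairs` is an existing JavaPairRDD. Note the use of
    | ;; `c/into` to merge two vectors, since `into` is shadowed in this namespace.
    | (comment
    |   (combine-by-key vector conj c/into pairs))
    |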
605 | 
606 | (defn sort-by-key
607 |   "Reorder the elements of `rdd` so that they are sorted according to their
608 |   natural order or the given comparator `f` if provided. The result may be
609 |   ordered ascending or descending, depending on `ascending?`."
610 |   (^JavaPairRDD
611 |    [^JavaPairRDD rdd]
612 |    (rdd/set-callsite-name
613 |      (.sortByKey rdd true)))
614 |   (^JavaPairRDD
615 |    [ascending? ^JavaPairRDD rdd]
616 |    (rdd/set-callsite-name
617 |      (.sortByKey rdd (boolean ascending?))
618 |      (boolean ascending?)))
619 |   (^JavaPairRDD
620 |    [compare-fn ascending? ^JavaPairRDD rdd]
621 |    (rdd/set-callsite-name
622 |      (.sortByKey rdd
623 |                  (f/comparator-fn compare-fn)
624 |                  (boolean ascending?))
625 |      (rdd/fn-name compare-fn)
626 |      (boolean ascending?)))
627 |   (^JavaPairRDD
628 |    [compare-fn ascending? num-partitions ^JavaPairRDD rdd]
629 |    (rdd/set-callsite-name
630 |      (.sortByKey rdd
631 |                  (f/comparator-fn compare-fn)
632 |                  (boolean ascending?)
633 |                  (int num-partitions))
634 |      (rdd/fn-name compare-fn)
635 |      (boolean ascending?)
636 |      (int num-partitions))))
637 | 
638 | 
639 | ;; ## RDD Actions
640 | 
641 | (defn collect
642 |   "Collect the elements of `rdd` into a vector on the driver. Be careful not to
643 |   realize large datasets with this, as the driver will likely run out of
644 |   memory.
645 | 
646 |   This is an action that causes computation."
647 |   [^JavaRDDLike rdd]
648 |   (vec (.collect rdd)))
649 | 
650 | 
651 | (defn into
652 |   "Collect the elements of `rdd` into a collection on the driver. Behaves like
653 |   `clojure.core/into`, including accepting an optional transducer.
654 |   Automatically coerces Scala tuples into Clojure vectors.
655 | 
656 |   Be careful not to realize large datasets with this, as the driver will likely
657 |   run out of memory.
658 | 
659 |   This is an action that causes computation."
660 |   ([coll ^JavaRDDLike rdd]
661 |    (into coll identity rdd))
662 |   ([coll xf ^JavaRDDLike rdd]
663 |    (c/into coll
664 |            (comp (c/map scala/from-tuple) xf)
665 |            (.collect rdd))))
666 | 
667 | 
668 | (defn foreach
669 |   "Apply the function `f` to all elements of `rdd`. The function will run on
670 |   the executors where the data resides.
671 | 
672 |   Consider `foreach-partition` for efficiency if handling an element requires
673 |   costly resource acquisition such as a database connection.
674 | 
675 |   This is an action that causes computation."
676 |   [f ^JavaRDDLike rdd]
677 |   (.foreach rdd (f/void-fn f)))
678 | 
679 | 
680 | (defn foreach-partition
681 |   "Apply the function `f` to all elements of `rdd` by calling it with a
682 |   sequence of each partition's elements. The function will run on the executors
683 |   where the data resides.
684 | 
685 |   This is an action that causes computation."
686 |   [f ^JavaRDDLike rdd]
687 |   (.foreachPartition rdd (f/void-fn (comp f iterator-seq))))
688 | 
689 | 
690 | (defn count
691 |   "Count the number of elements in `rdd`.
692 | 
693 |   This is an action that causes computation."
694 |   [^JavaRDDLike rdd]
695 |   (.count rdd))
696 | 
697 | 
698 | (defn first
699 |   "Find the first element of `rdd`.
700 | 
701 |   This is an action that causes computation."
702 |   [^JavaRDDLike rdd]
703 |   (.first rdd))
704 | 
705 | 
706 | (defn min
707 |   "Find the minimum element in `rdd` in the ordering defined by `compare-fn`.
708 | 
709 |   This is an action that causes computation."
710 |   ([^JavaRDDLike rdd]
711 |    (min compare rdd))
712 |   ([compare-fn ^JavaRDDLike rdd]
713 |    (.min rdd (f/comparator-fn compare-fn))))
714 | 
715 | 
716 | (defn max
717 |   "Find the maximum element in `rdd` in the ordering defined by `compare-fn`.
718 | 
719 |   This is an action that causes computation."
720 |   ([^JavaRDDLike rdd]
721 |    (max compare rdd))
722 |   ([compare-fn ^JavaRDDLike rdd]
723 |    (.max rdd (f/comparator-fn compare-fn))))
724 | 
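;; Usage sketch (not part of the original source). Assumes `nums` is an RDD of
;; numbers and `people` an RDD of maps with an `:age` key, built elsewhere.
(comment
  ;; Natural ordering via `compare`.
  (min nums)
  ;; Custom ordering: the oldest person by age.
  (max (fn [a b] (compare (:age a) (:age b))) people))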
725 | 
726 | (defn take
727 |   "Take the first `n` elements of the RDD.
728 | 
729 |   This currently scans the partitions _one by one_ on the **driver**, so it
730 |   will be slow if a lot of elements are required. In that case, use `collect`
731 |   to get the whole RDD instead.
732 | 
733 |   This is an action that causes computation."
734 |   [n ^JavaRDDLike rdd]
735 |   (.take rdd (int n)))
736 | 
737 | 
738 | (defn take-ordered
739 |   "Take the first `n` (smallest) elements from this RDD as defined by the
740 |   elements' natural order or specified comparator.
741 | 
742 |   This currently scans the partitions _one by one_ on the **driver**, so it
743 |   will be slow if a lot of elements are required. In that case, use `collect`
744 |   to get the whole RDD instead.
745 | 
746 |   This is an action that causes computation."
747 |   ([n ^JavaRDDLike rdd]
748 |    (.takeOrdered rdd (int n)))
749 |   ([n compare-fn ^JavaRDDLike rdd]
750 |    (.takeOrdered rdd (int n) (f/comparator-fn compare-fn))))
751 | 
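;; Usage sketch (not part of the original source), reusing the assumed `nums`
;; RDD of numbers.
(comment
  ;; The five smallest values by natural order.
  (take-ordered 5 nums)
  ;; The five largest values, by reversing the comparison.
  (take-ordered 5 (fn [a b] (compare b a)) nums))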
752 | 
753 | (defn reduce
754 |   "Aggregate the elements of `rdd` using the function `f`. The reducing
755 |   function must accept two arguments and should be commutative and associative
756 |   so that it can be computed correctly in parallel.
757 | 
758 |   This is an action that causes computation."
759 |   [f ^JavaRDDLike rdd]
760 |   (.reduce rdd (f/fn2 f)))
761 | 
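;; Usage sketch (not part of the original source), reusing the assumed `nums`
;; RDD. `+` is commutative and associative, so partial sums from different
;; partitions can be combined in any order.
(comment
  (reduce + nums))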
762 | 
763 | (defn fold
764 |   "Aggregate the elements of each partition in `rdd`, followed by the results
765 |   for all the partitions, by using the given associative function `f` and a
766 |   neutral `zero` value.
767 | 
768 |   This is an action that causes computation."
769 |   [f zero ^JavaRDDLike rdd]
770 |   (.fold rdd zero (f/fn2 f)))
771 | 
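;; Usage sketch (not part of the original source). The zero value must be a
;; neutral element for `f` (here 0 for `+`), since it seeds every partition.
(comment
  (fold + 0 nums))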
772 | 
773 | (defn aggregate
774 |   "Aggregate the elements of each partition in `rdd` using `aggregator`, then
775 |   merge the results for all partitions using `combiner`. Both functions will be
776 |   seeded with the neutral `zero` value.
777 | 
778 |   This is an action that causes computation."
779 |   [aggregator combiner zero ^JavaRDDLike rdd]
780 |   (.aggregate rdd zero (f/fn2 aggregator) (f/fn2 combiner)))
781 | 
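;; Usage sketch (not part of the original source), computing a [sum count]
;; pair over the assumed `nums` RDD in one pass: the aggregator folds elements
;; into a per-partition accumulator, and the combiner merges the accumulators.
(comment
  (aggregate
    (fn [[sum n] x] [(+ sum x) (inc n)])
    (fn [[sum1 n1] [sum2 n2]] [(+ sum1 sum2) (+ n1 n2)])
    [0 0]
    nums))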
782 | 
783 | ;; ## Pair RDD Actions
784 | 
785 | (defn lookup
786 |   "Find all values in the `rdd` pairs whose keys is `k`. The key must be
787 |   serializable with the Java serializer (not Kryo) for this to work.
788 | 
789 |   This is an action that causes computation."
790 |   [^JavaPairRDD rdd k]
791 |   (vec (.lookup rdd k)))
792 | 
793 | 
794 | (defn count-by-key
795 |   "Count the distinct key values in `rdd`. Returns a map of keys to integer
796 |   counts.
797 | 
798 |   This is an action that causes computation."
799 |   [^JavaPairRDD rdd]
800 |   (c/into {} (.countByKey rdd)))
801 | 
802 | 
803 | (defn count-by-value
804 |   "Count the distinct values in `rdd`. Returns a map of values to integer
805 |   counts.
806 | 
807 |   This is an action that causes computation."
808 |   [^JavaRDDLike rdd]
809 |   (c/into {} (.countByValue rdd)))
810 | 


--------------------------------------------------------------------------------