├── .circleci └── config.yml ├── .clj-kondo ├── config.edn └── hooks │ └── sparkplug │ └── function.clj ├── .cljstyle ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── cluster ├── .dockerignore ├── .gitignore ├── Dockerfile ├── README.md ├── code │ └── .keep ├── docker-compose.yml ├── spark-env.sh └── submit.sh ├── doc ├── serialization.md └── sparkling.md ├── project.clj ├── sparkplug-core ├── .gitignore ├── README.md ├── dev-resources │ └── log4j.properties ├── dev │ └── user.clj ├── project.clj ├── resources │ └── sparkplug │ │ └── kryo │ │ └── registry │ │ ├── clojure.conf │ │ └── sparkplug.conf ├── src │ ├── clojure │ │ └── sparkplug │ │ │ ├── accumulator.clj │ │ │ ├── config.clj │ │ │ ├── context.clj │ │ │ ├── core.clj │ │ │ ├── function.clj │ │ │ ├── kryo.clj │ │ │ ├── rdd.clj │ │ │ └── scala.clj │ └── java │ │ └── sparkplug │ │ ├── broadcast │ │ └── DerefBroadcast.java │ │ ├── core │ │ └── UnionHelper.java │ │ ├── function │ │ ├── ComparatorFn.java │ │ ├── FlatMapFn1.java │ │ ├── FlatMapFn2.java │ │ ├── Fn1.java │ │ ├── Fn2.java │ │ ├── Fn3.java │ │ ├── PairFlatMapFn.java │ │ ├── PairFn.java │ │ ├── SerializableFn.java │ │ └── VoidFn.java │ │ ├── kryo │ │ └── ClassPathRegistrator.java │ │ └── partition │ │ └── FnHashPartitioner.java └── test │ └── sparkplug │ ├── core_test.clj │ ├── function │ └── test_fns.clj │ ├── function_test.clj │ └── kryo_test.clj └── sparkplug-repl ├── .gitignore ├── README.md ├── project.clj └── src └── sparkplug └── repl ├── main.clj └── work.clj /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | # Common executor configuration 4 | executors: 5 | clojure-java-11: 6 | docker: 7 | - image: cimg/clojure:1.11.1-openjdk-11.0 8 | working_directory: ~/repo 9 | 10 | 11 | # Job definitions 12 | jobs: 13 | style: 14 | executor: clojure-java-11 15 | steps: 16 | - checkout 17 | - run: 18 | name: Install cljstyle 19 | environment: 20 | CLJSTYLE_VERSION: 0.16.626 21 | command: | 22 | wget https://github.com/greglook/cljstyle/releases/download/${CLJSTYLE_VERSION}/cljstyle_${CLJSTYLE_VERSION}_linux_amd64.zip 23 | unzip cljstyle_${CLJSTYLE_VERSION}_linux_amd64.zip 24 | - run: 25 | name: Check source formatting 26 | command: "./cljstyle check --report" 27 | 28 | lint: 29 | executor: clojure-java-11 30 | steps: 31 | - checkout 32 | - run: 33 | name: Install clj-kondo 34 | environment: 35 | CLJ_KONDO_VERSION: 2024.09.27 36 | command: | 37 | wget https://github.com/borkdude/clj-kondo/releases/download/v${CLJ_KONDO_VERSION}/clj-kondo-${CLJ_KONDO_VERSION}-linux-amd64.zip 38 | unzip clj-kondo-${CLJ_KONDO_VERSION}-linux-amd64.zip 39 | - run: 40 | name: Lint source code 41 | command: "./clj-kondo --lint sparkplug-core/src:sparkplug-core/test" 42 | 43 | test-spark-3-1-java-11: 44 | executor: clojure-java-11 45 | steps: 46 | - checkout 47 | - restore_cache: 48 | keys: 49 | - v1-test-spark-3.1-java-11-{{ checksum "project.clj" }} 50 | - v1-test-spark-3.1-java-11- 51 | - run: 52 | name: Test projects 53 | command: | 54 | lein -version 55 | lein monolith each do clean, check, install, test 56 | - save_cache: 57 | key: v1-test-{{ checksum "project.clj" }} 58 | paths: 59 | - ~/.m2 60 | 61 | test-spark-3-5-java-11: 62 | executor: clojure-java-11 63 | steps: 64 | - checkout 65 | - restore_cache: 66 | keys: 67 | - v1-test-spark-3.5-java-11-{{ checksum "project.clj" }} 68 | - v1-test-spark-3.5-java-11- 69 | - run: 70 | name: Test projects 71 | command: | 72 | lein -version 73 | lein monolith each 
with-profile -spark-3.1,+spark-3.5 do clean, check, install, test 74 | - save_cache: 75 | key: v1-test-spark-3.5-java-11-{{ checksum "project.clj" }} 76 | paths: 77 | - ~/.m2 78 | 79 | coverage: 80 | executor: clojure-java-11 81 | steps: 82 | - checkout 83 | - restore_cache: 84 | keys: 85 | - v1-coverage-{{ checksum "project.clj" }} 86 | - v1-coverage- 87 | - v1-test- 88 | - run: 89 | name: Install projects 90 | command: lein monolith each install 91 | - run: 92 | name: Generate coverage 93 | command: lein monolith each :in sparkplug-core with-profile +spark-3.5 cloverage --codecov 94 | - save_cache: 95 | key: v1-coverage-{{ checksum "project.clj" }} 96 | paths: 97 | - ~/.m2 98 | - store_artifacts: 99 | path: sparkplug-core/target/coverage 100 | destination: coverage 101 | - run: 102 | name: Install codecov 103 | command: | 104 | sudo apt-get update && sudo apt-get install gpg 105 | curl https://keybase.io/codecovsecurity/pgp_keys.asc | gpg --no-default-keyring --keyring trustedkeys.gpg --import 106 | curl -Os https://uploader.codecov.io/latest/linux/codecov 107 | curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM 108 | curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig 109 | gpgv codecov.SHA256SUM.sig codecov.SHA256SUM 110 | shasum -a 256 -c codecov.SHA256SUM 111 | chmod +x codecov 112 | - run: 113 | name: Publish coverage report 114 | command: './codecov -f sparkplug-core/target/coverage/codecov.json' 115 | 116 | 117 | # Workflow definitions 118 | workflows: 119 | version: 2 120 | test: 121 | jobs: 122 | - style 123 | - lint 124 | - test-spark-3-1-java-11 125 | - test-spark-3-5-java-11 126 | - coverage: 127 | requires: 128 | - test-spark-3-5-java-11 129 | -------------------------------------------------------------------------------- /.clj-kondo/config.edn: -------------------------------------------------------------------------------- 1 | {:linters 2 | {:consistent-alias 3 | {:level :warning 4 | :aliases {clojure.java.io io 5 | clojure.set set 6 | clojure.string str 7 | clojure.tools.logging log 8 | sparkplug.function f 9 | sparkplug.kryo kryo 10 | sparkplug.rdd rdd}}} 11 | 12 | :lint-as 13 | {clojure.test.check.clojure-test/defspec clj-kondo.lint-as/def-catch-all 14 | sparkplug.context/with-context clojure.core/let 15 | sparkplug.kryo/defserializer clj-kondo.lint-as/def-catch-all} 16 | 17 | :hooks 18 | {:analyze-call {sparkplug.function/gen-function hooks.sparkplug.function/gen-function}}} 19 | -------------------------------------------------------------------------------- /.clj-kondo/hooks/sparkplug/function.clj: -------------------------------------------------------------------------------- 1 | (ns hooks.sparkplug.function 2 | (:require 3 | [clj-kondo.hooks-api :as api])) 4 | 5 | 6 | (defn gen-function 7 | "Macro analysis for `sparkplug.function/gen-function`." 
8 | [form] 9 | (let [name-sym (-> form :node :children (nth 2)) 10 | constructor (api/list-node 11 | [(api/token-node 'defn) 12 | name-sym 13 | (api/vector-node 14 | [(api/token-node '_f)])])] 15 | {:node constructor})) 16 | -------------------------------------------------------------------------------- /.cljstyle: -------------------------------------------------------------------------------- 1 | ;; vim: ft=clojure 2 | {:files 3 | {:ignore #{"checkouts" "target"}} 4 | 5 | :rules 6 | {:namespaces 7 | {:import-break-width 80} 8 | 9 | :indentation 10 | {:indents {for-all [[:block 1]] 11 | with-context [[:block 1]]}}}} 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | classes 3 | checkouts 4 | .clj-kondo/.cache 5 | .lein-* 6 | .nrepl-port 7 | pom.xml 8 | pom.xml.asc 9 | *.jar 10 | *.class 11 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Change Log 2 | ========== 3 | 4 | All notable changes to this project will be documented in this file, which 5 | follows the conventions of [keepachangelog.com](http://keepachangelog.com/). 6 | This project adheres to [Semantic Versioning](http://semver.org/). 7 | 8 | ## [Unreleased] 9 | 10 | ... 11 | 12 | 13 | ## [1.1.0] - 2024-10-10 14 | 15 | ### Changed 16 | - Sparkplug is now tested with Java 11 + Spark 3.1.3, and Java 11 + Spark 3.5.1. 17 | Java 8 test coverage was dropped. 18 | - Bump Clojure to 1.12.0. 19 | - Update various dependency versions. 20 | - Add clj-kondo linting to test suite. 21 | - Fix bug when serializing functions which close over a boolean value. 22 | [#27](https://github.com/amperity/sparkplug/issues/27) 23 | [#28](https://github.com/amperity/sparkplug/pull/28) 24 | 25 | 26 | ## [1.0.0] - 2022-05-31 27 | 28 | ### Changed 29 | - Update some project dependencies to the latest versions. 30 | - The `sparkplug-sql` sub-project, which has been empty since its creation 31 | over two years ago, has been removed for now. 32 | 33 | 34 | ## [0.1.9] - 2022-04-25 35 | 36 | ### Changed 37 | - Sparkplug is now tested with Spark 3.1.3 and Spark 3.2.1. 38 | Spark 2.4.x and related dependencies were dropped. 39 | - The `sparkplug-ml` sub-project, which has been empty since its creation 40 | over two years ago, has been removed for now. 41 | 42 | ### Fixed 43 | - Correctly detect namespace to require when serializing a closure defined 44 | inside a record type. 45 | [#23](https://github.com/amperity/sparkplug/pull/23) 46 | 47 | 48 | ## [0.1.8] - 2021-08-06 49 | 50 | ### Fixed 51 | - `sparkplug.core/union` now works with Spark 3. 52 | [#21](https://github.com/amperity/sparkplug/pull/21) 53 | 54 | 55 | [Unreleased]: https://github.com/amperity/sparkplug/compare/1.1.0...HEAD 56 | [1.1.0]: https://github.com/amperity/sparkplug/compare/1.0.0...1.1.0 57 | [1.0.0]: https://github.com/amperity/sparkplug/compare/0.1.9...1.0.0 58 | [0.1.9]: https://github.com/amperity/sparkplug/compare/0.1.8...0.1.9 59 | [0.1.8]: https://github.com/amperity/sparkplug/compare/0.1.7...0.1.8 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Amperity, Inc. 
2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Clojure Spark API 2 | ================= 3 | 4 | [![CircleCI](https://dl.circleci.com/status-badge/img/gh/amperity/sparkplug/tree/main.svg?style=shield)](https://dl.circleci.com/status-badge/redirect/gh/amperity/sparkplug/tree/main) 5 | [![codecov](https://codecov.io/gh/amperity/sparkplug/branch/master/graph/badge.svg)](https://codecov.io/gh/amperity/sparkplug) 6 | [![cljdoc](https://cljdoc.org/badge/amperity/sparkplug)](https://cljdoc.org/d/amperity/sparkplug/CURRENT) 7 | 8 | SparkPlug is a Clojure API for [Apache Spark](http://spark.apache.org/). 9 | 10 | 11 | ## Installation 12 | 13 | Library releases are published on Clojars. To use the latest version with 14 | Leiningen, add the following dependency to your project: 15 | 16 | [![Clojars Project](https://clojars.org/amperity/sparkplug/latest-version.svg)](https://clojars.org/amperity/sparkplug) 17 | 18 | This will pull in the omnibus package, which in turn depends on each subproject 19 | of the same version. You may instead depend on the subprojects directly if you 20 | wish to omit some functionality, such as Spark SQL or Machine Learning 21 | dependencies. 22 | 23 | 24 | ## Usage 25 | 26 | The sparkplug-core package provides functions for working with RDDs, broadcasts, 27 | and accumulators with the classic Spark context API. 28 | See the [cljdoc](https://cljdoc.org/d/amperity/sparkplug-core/CURRENT) for API docs. 29 | 30 | 31 | ## License 32 | 33 | Licensed under the Apache License, Version 2.0. See the [LICENSE](LICENSE) file 34 | for rights and restrictions. 
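
For a quick orientation, here is a minimal sketch of the classic context API described in the Usage section above (configuration, context lifecycle, and RDD transformations). This is only an illustrative example, assuming a local master and that `sparkplug.rdd/parallelize` accepts the context followed by a collection; adapt the namespace and entry point to your own project.

```clojure
(ns example.core
  "Minimal SparkPlug usage sketch."
  (:require
    [sparkplug.config :as conf]
    [sparkplug.context :as ctx]
    [sparkplug.core :as spark]
    [sparkplug.rdd :as rdd]))


(defn -main
  []
  ;; Build a local configuration and run a small word-count style job.
  (ctx/with-context [sc (-> (conf/spark-conf)
                            (conf/master "local[2]")
                            (conf/app-name "sparkplug-example"))]
    (->> (rdd/parallelize sc ["a" "b" "a" "c"])
         (spark/map->pairs #(vector % 1))
         (spark/reduce-by-key +)
         (spark/into {})
         (println))))
```
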
35 | -------------------------------------------------------------------------------- /cluster/.dockerignore: -------------------------------------------------------------------------------- 1 | /data 2 | /jars 3 | -------------------------------------------------------------------------------- /cluster/.gitignore: -------------------------------------------------------------------------------- 1 | /code 2 | /data 3 | -------------------------------------------------------------------------------- /cluster/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM eclipse-temurin:11-jdk 2 | 3 | RUN apt update 4 | RUN apt install -yy ca-certificates wget bash procps coreutils python3 5 | RUN update-ca-certificates 6 | 7 | RUN mkdir -p /opt 8 | WORKDIR /opt 9 | 10 | ARG HADOOP_VERSION 11 | RUN wget https://dlcdn.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz && \ 12 | tar -xzvf hadoop-${HADOOP_VERSION}.tar.gz && \ 13 | rm hadoop-${HADOOP_VERSION}.tar.gz && \ 14 | mv hadoop-${HADOOP_VERSION} hadoop 15 | 16 | ARG SPARK_VERSION 17 | ARG SPARK_VARIANT=without-hadoop 18 | RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${SPARK_VARIANT}.tgz && \ 19 | tar -xzvf spark-${SPARK_VERSION}-bin-${SPARK_VARIANT}.tgz && \ 20 | rm spark-${SPARK_VERSION}-bin-${SPARK_VARIANT}.tgz && \ 21 | mv spark-${SPARK_VERSION}-bin-${SPARK_VARIANT} spark 22 | 23 | ENV HADOOP_HOME=/opt/hadoop 24 | ENV SPARK_HOME=/opt/spark 25 | ADD spark-env.sh /opt/spark/conf/spark-env.sh 26 | 27 | RUN mkdir -p /tmp/spark-events 28 | -------------------------------------------------------------------------------- /cluster/README.md: -------------------------------------------------------------------------------- 1 | Docker Spark Cluster 2 | ==================== 3 | 4 | A simple spark-master and two-worker cluster for use in testing and debugging 5 | deployed Spark applications. This setup surfaces serialization and classpath 6 | issues that do not occur in local development contexts. 7 | 8 | 9 | ## Usage 10 | 11 | Initialize the cluster, containing a master and one worker: 12 | 13 | ```shell 14 | docker compose up -d 15 | ``` 16 | 17 | You can submit an application with the submit script: 18 | 19 | ```shell 20 | cp $PROJECT/target/uberjar/my-app.jar cluster/code/ 21 | ./submit.sh my-app.jar 22 | ``` 23 | 24 | You can also submit an application using the Spark master's REST API. 
First, 25 | create a JSON file with the request body: 26 | 27 | ```json 28 | { 29 | "action": "CreateSubmissionRequest", 30 | "appArgs": ["file:///data/hamlet.txt"], 31 | "appResource": "file:///mnt/code/my-app.jar", 32 | "clientSparkVersion": "3.5.1", 33 | "environmentVariables": {"SPARK_ENV_LOADED": "1"}, 34 | "mainClass": "my_app.main", 35 | "sparkProperties": 36 | { 37 | "spark.app.name": "my-app", 38 | "spark.submit.deployMode": "cluster", 39 | "spark.jars": "file:///mnt/code/my-app.jar", 40 | "spark.driver.cores": 1, 41 | "spark.driver.memory": "1G", 42 | "spark.driver.supervise": "false", 43 | "spark.executor.cores": 1, 44 | "spark.executor.count": 1, 45 | "spark.executor.memory": "1G", 46 | "spark.logConf": "true" 47 | } 48 | } 49 | ``` 50 | 51 | Then submit it to the scheduling HTTP endpoint: 52 | 53 | ```shell 54 | curl http://localhost:6066/v1/submissions/create --data @request.json 55 | ``` 56 | 57 | ## Endpoints 58 | 59 | All of these are from docker host: 60 | 61 | * spark-master [http:8080](http://localhost:8080) 62 | * spark-driver [http:4040](http://localhost:4040) (when an application is running) 63 | * legacy submission [spark:7077](spark://localhost:7077) 64 | * REST API submission [spark:6066](spark://localhost:6066) 65 | -------------------------------------------------------------------------------- /cluster/code/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amperity/sparkplug/7235ba5e8abb52a89b3ce5b26a39fbd70aba3bb5/cluster/code/.keep -------------------------------------------------------------------------------- /cluster/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | master: 3 | build: 4 | context: . 5 | dockerfile: Dockerfile 6 | args: 7 | HADOOP_VERSION: 3.3.5 8 | SPARK_VERSION: 3.5.1 9 | command: /opt/spark/sbin/start-master.sh 10 | restart: on-failure 11 | hostname: master 12 | environment: 13 | SPARK_PUBLIC_DNS: localhost 14 | SPARK_MASTER_PORT: 7077 15 | SPARK_MASTER_WEBUI_PORT: 8080 16 | SPARK_MASTER_OPTS: "-Dspark.master.rest.enabled=true" 17 | expose: 18 | - 6066 19 | - 7001 20 | - 7002 21 | - 7003 22 | - 7004 23 | - 7005 24 | - 7006 25 | - 7077 26 | ports: 27 | - 6066:6066 28 | - 7077:7077 29 | - 8080:8080 30 | volumes: 31 | - ./code:/mnt/code 32 | 33 | worker-1: 34 | build: 35 | context: . 
36 | dockerfile: Dockerfile 37 | args: 38 | HADOOP_VERSION: 3.3.5 39 | SPARK_VERSION: 3.5.1 40 | command: /opt/spark/sbin/start-worker.sh spark://master:7077 41 | restart: on-failure 42 | hostname: worker-1 43 | environment: 44 | SPARK_PUBLIC_DNS: localhost 45 | SPARK_WORKER_PORT: 8881 46 | SPARK_WORKER_WEBUI_PORT: 8081 47 | SPARK_WORKER_CORES: 2 48 | SPARK_WORKER_MEMORY: 2g 49 | links: 50 | - master 51 | depends_on: 52 | - master 53 | expose: 54 | - 4040 55 | - 7012 56 | - 7013 57 | - 7014 58 | - 7015 59 | - 7016 60 | - 8881 61 | ports: 62 | - 4040:4040 63 | - 8081:8081 64 | - 8881:8881 65 | volumes: 66 | - ./code:/mnt/code 67 | - ./data:/data 68 | 69 | repl: 70 | image: eclipse-temurin:11-jdk 71 | command: java -jar /sparkplug-repl.jar 72 | restart: on-failure 73 | hostname: repl 74 | environment: 75 | SPARKPLUG_REPL_MASTER: spark://master:7077 76 | SPARKPLUG_REPL_PORT: 8765 77 | ports: 78 | - 4050:4040 79 | - 8765:8765 80 | volumes: 81 | - ./code/sparkplug-repl.jar:/sparkplug-repl.jar 82 | - ./data:/data 83 | 84 | networks: 85 | default: 86 | ipam: 87 | config: 88 | - subnet: "10.128.99.0/24" 89 | -------------------------------------------------------------------------------- /cluster/spark-env.sh: -------------------------------------------------------------------------------- 1 | # Spark environment customizations 2 | 3 | export SPARK_DIST_CLASSPATH=$(/opt/hadoop/bin/hadoop classpath) 4 | export SPARK_NO_DAEMONIZE=1 5 | -------------------------------------------------------------------------------- /cluster/submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | APP_DRIVER="$1" 4 | 5 | if [[ -z $APP_DRIVER ]]; then 6 | echo "No application driver code provided!" >&2 7 | exit 1 8 | fi 9 | 10 | if [[ ! -f code/$APP_DRIVER ]]; then 11 | echo "Couldn't find code/$APP_DRIVER - did you copy it in place?" >&2 12 | exit 2 13 | fi 14 | 15 | docker compose exec master \ 16 | /opt/spark/bin/spark-submit \ 17 | --master spark://master:7077 \ 18 | /mnt/code/$APP_DRIVER 19 | -------------------------------------------------------------------------------- /doc/serialization.md: -------------------------------------------------------------------------------- 1 | ## Serialization 2 | 3 | A major concern of SparkPlug is reliable and efficient serialization for Spark 4 | programs written in Clojure. 5 | 6 | Under the umbrella of serialization, there are two separate problems: task functions, 7 | and task results. 8 | 9 | 10 | ### Task functions 11 | 12 | These are the functions you pass to RDD transformations, like map and filter. 13 | When you invoke an action on the resulting RDD, the driver will serialize these 14 | functions and broadcast them to executors. Executors must be able to 15 | deserialize the functions and run them across multiple threads. 16 | 17 | Due to challenges of serializing functions, Spark uses built-in Java serialization 18 | for task functions. The main difficulty with Clojure functions is that they have 19 | implicit dependencies on namespaces and Vars being available at runtime. If Clojure 20 | functions are not serialized correctly, your application is bound to crash with 21 | confusing errors like "attempting to call unbound fn". To address this, 22 | SparkPlug takes this approach: 23 | * On the driver side: Any function passed to an RDD transformation (map, 24 | filter, etc.) is serialized along with a list of the namespaces that it 25 | implicitly depends on. 
This list is built by reflecting on the _function 26 | object_ itself, instead of analyzing code. 27 | * On the executor side: When the function is deserialized, first require each 28 | of those namespaces to ensure they are available before calling the function. 29 | It's important to synchronize these requires, because `clojure.core/require` 30 | is not thread-safe! Without synchronization, it's likely to result in 31 | non-deterministic "unbound fn" and "unbound Var" errors. 32 | 33 | 34 | ### Task results 35 | 36 | This refers to the data produced by executing tasks. Executors will either send 37 | results back to the driver (as in a "collect" action), or pass them on to the 38 | next stage for executors to read again. 39 | 40 | For task results of Clojure data, such as keywords, maps, and vectors, 41 | Java serialization with `java.io.Serializable` is very suboptimal. 42 | For example, the keyword `:a` gets encoded to a whopping 218 bytes, and 43 | the empty vector `[]` becomes 405 bytes! 44 | 45 | SparkPlug solves this using Spark's support for [Kryo serialization](https://github.com/EsotericSoftware/kryo), 46 | by defining custom serializers and a registrator to handle common Clojure data types. 47 | To use SparkPlug's Kryo serialization, set these Spark properties: 48 | 49 | | Property | Value | 50 | | ------------------------ | -------------------------------------------- | 51 | | `spark.serializer` | `org.apache.spark.serializer.KryoSerializer` | 52 | | `spark.kryo.registrator` | `sparkplug.kryo.ClassPathRegistrator` | 53 | 54 | For convencience, SparkPlug's configuration builder functions include these 55 | properties by default. 56 | 57 | The registrator is also extensible, so that applications can easily add more 58 | serializers and have them included in the registry. See the 59 | [sparkplug.kryo](https://cljdoc.org/d/amperity/sparkplug-core/CURRENT/api/sparkplug.kryo) 60 | namespace for details. 61 | 62 | 63 | ## Tips 64 | 65 | Since task functions are serialized with `java.io.Serializable`, any Clojure 66 | data _closed over by_ a task function is also serialized this way. If you need 67 | to close over a relatively large piece of Clojure data in a task function, such 68 | as a static lookup table, using a broadcast variable will provide much better 69 | performance because it will use the same serialization path as task results. 70 | 71 | If you are caching RDDs of Clojure data, consider using a serialized storage 72 | level. This will use Kryo serialization, and will save a lot of memory on executors. 73 | The tradeoff is that this increases CPU time to access the data. 74 | -------------------------------------------------------------------------------- /doc/sparkling.md: -------------------------------------------------------------------------------- 1 | Migrating from Sparkling 2 | ======================== 3 | 4 | Migrating from Sparkling should require very little work - a few functions have 5 | changed names, but the API is extremely similar by design. The major change is 6 | obviously to update the namespaces you're requiring; for example, instead of 7 | requiring `[sparkling.core :as spark]`, require `[sparkplug.core :as spark]`. 8 | 9 | Specific changes to be aware of are documented by namespace below. 
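
As an illustration of the namespace change mentioned above, a typical `ns` form before and after migration might look like the following sketch (the aliases and the exact set of required namespaces are only examples):

```clojure
;; Before: Sparkling
(ns my-app.jobs
  (:require
    [sparkling.conf :as conf]
    [sparkling.core :as spark]))

;; After: SparkPlug
(ns my-app.jobs
  (:require
    [sparkplug.config :as conf]
    [sparkplug.context :as ctx]
    [sparkplug.core :as spark]
    [sparkplug.rdd :as rdd]))
```
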
10 | 11 | 12 | ## `sparkling.conf` 13 | 14 | - `get` renamed `get-param` 15 | - `set` renamed `set-param` 16 | - `set-if-missing` renamed `set-param-default` 17 | - `remove` renamed `unset-param` 18 | - `master` no longer sets `"local[*]"` if provided no arguments 19 | - `to-string` renamed `debug-str` 20 | 21 | 22 | ## `sparkling.function` 23 | 24 | The names of all of the function interop classes changed and their serialization 25 | is slightly more efficient. Otherwise consumers shouldn't need to change much 26 | here. 27 | 28 | 29 | ## `sparkling.core` 30 | 31 | ### Spark Contexts 32 | - `spark-context` moved to `sparkplug.context/spark-context` 33 | - `local-spark-context` not implemented 34 | - `default-min-partitions` replaced by `sparkplug.context/info` 35 | - `default-parallelism` replaced by `sparkplug.context/info` 36 | - `stop` moved to `sparkplug.context/stop!` 37 | - `with-context` moved to `sparkplug.context/with-context` and now expects a 38 | two-element binding vector instead of separate symbol/config args. 39 | 40 | ### RDD Transformations 41 | - `map-to-pair` renamed `map->pairs` 42 | - `map-values` renamed `map-vals` 43 | - `values` renamed `vals` 44 | - `flat-map` renamed `mapcat` 45 | - `flat-map-to-pair` renamed `mapcat->pairs` 46 | - `flat-map-values` renamed `mapcat-vals` 47 | - `map-partition` renamed `map-partitions` 48 | - `map-partitions-to-pair` renamed `map-partitions->pairs` 49 | - `map-partition-with-index` renamed `map-partitions-indexed` 50 | - `sort-by-key` no longer auto-detects whether the first argument is a 51 | comparator - explicitly pass the `ascending?` argument to provide a custom 52 | comparison function 53 | - `sample` has more arities and a different argument signature 54 | - `zip-with-index` renamed `zip-indexed` 55 | - `zip-with-unique-id` renamed `zip-unique-ids` 56 | - `partitionwise-sampled-rdd` not implemented 57 | - `partitioner-aware-union` not implemented 58 | - `intersect-by-key` not implemented 59 | 60 | ### RDD Actions 61 | - `glom` not implemented 62 | - `collect` returns a vector instead of a mutable Java list 63 | - `collect-map` not implemented, use `(spark/into {} rdd)` instead 64 | - `save-as-text-file` moved to `sparkplug.rdd` namespace 65 | - `histogram` not implemented 66 | 67 | ### RDD Construction 68 | - `parallelize`/`into-rdd` moved to `sparkplug.rdd/parallelize` 69 | - `parallelize-pairs`/`into-pair-rdd` moved to `sparkplug.rdd/parallelize-pairs` 70 | - `text-file` moved to `sparkplug.rdd/text-file` 71 | - `whole-text-files` moved to `sparkplug.rdd` namespace 72 | 73 | ### RDD Partitioning 74 | - `hash-partitioner` moved to `sparkplug.rdd` namespace 75 | - `partitions` moved to `sparkplug.rdd` namespace 76 | - `partitioner` moved to `sparkplug.rdd` namespace 77 | - `partition-by` moved to `sparkplug.rdd` namespace 78 | - `repartition` moved to `sparkplug.rdd` namespace 79 | - `repartition` moved to `sparkplug.rdd` namespace 80 | - `coalesce` moved to `sparkplug.rdd` namespace 81 | - `coalesce-max` not implemented 82 | - `rekey` not implemented 83 | 84 | ### RDD Persistence 85 | - `STORAGE-LEVELS` moved to `sparkplug.rdd/storage-levels` 86 | - `cache`/`storage-level!` replaced by `sparkplug.rdd/cache!` 87 | - `uncache` moved to `sparkplug.rdd/uncache!` 88 | - `checkpoint` moved to `sparkplug.rdd/checkpoint!` 89 | 90 | ### Misc 91 | - `tuple` moved to `sparkplug.scala` namespace 92 | - `count-partitions` not implemented 93 | - `tuple-by` not implemented 94 | - `key-by-fn` not implemented 95 | - `rdd-name` replaced by 
`sparkplug.rdd/name` and `sparkplug.rdd/set-name` for 96 | the read and write operations, respectively 97 | 98 | 99 | ## `sparkling.broadcast` 100 | 101 | - `broadcast` moved to `sparkplug.core/broadcast` 102 | - `value` not implemented, deref the broadcast values instead 103 | 104 | 105 | ## `sparkling.accumulator` 106 | 107 | - `accumulator` replaced with type-specific v2 constructors: 108 | - `long-accumulator` 109 | - `double-accumulator` 110 | - `collection-accumulator` 111 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject amperity/sparkplug "1.1.0" 2 | :description "Clojure API for Apache Spark" 3 | :url "https://github.com/amperity/sparkplug" 4 | :license {:name "Apache License 2.0" 5 | :url "http://www.apache.org/licenses/LICENSE-2.0"} 6 | 7 | :deploy-repositories {"releases" {:url "https://repo.clojars.org"}} 8 | :deploy-branches ["main"] 9 | :pedantic? :warn 10 | 11 | :plugins 12 | [[lein-cloverage "1.2.2"] 13 | [lein-monolith "1.7.0"]] 14 | 15 | :dependencies 16 | [[org.clojure/clojure "1.12.0"] 17 | [amperity/sparkplug-core "1.1.0"]] 18 | 19 | :profiles 20 | {:dev 21 | {:dependencies 22 | [[org.clojure/test.check "1.1.1"]]}} 23 | 24 | :monolith 25 | {:project-dirs ["sparkplug-core" 26 | "sparkplug-repl"] 27 | :inherit [:deploy-branches 28 | :pedantic?]}) 29 | -------------------------------------------------------------------------------- /sparkplug-core/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | /.lein-* 5 | /.nrepl-port 6 | pom.xml 7 | pom.xml.asc 8 | *.jar 9 | *.class 10 | -------------------------------------------------------------------------------- /sparkplug-core/README.md: -------------------------------------------------------------------------------- 1 | Spark Core API 2 | ============== 3 | 4 | [![cljdoc](https://cljdoc.org/badge/amperity/sparkplug-core)](https://cljdoc.org/d/amperity/sparkplug-core/CURRENT) 5 | 6 | This library contains the core API for working with Spark. If you want to get 7 | the basic building blocks of a Spark application, you can use this directly. 8 | 9 | 10 | ## Installation 11 | 12 | Library releases are published on Clojars. 
To use the latest version with 13 | Leiningen, add the following dependency to your project: 14 | 15 | [![Clojars Project](https://clojars.org/amperity/sparkplug-core/latest-version.svg)](https://clojars.org/amperity/sparkplug-core) 16 | -------------------------------------------------------------------------------- /sparkplug-core/dev-resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # log4j config for clojure development 2 | log4j.rootLogger=WARN, stdout 3 | 4 | # Console appender 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %5p %c{2}:%L - %m%n 8 | 9 | log4j.logger.sparkplug=DEBUG 10 | log4j.logger.sparkplug.kryo=INFO 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 13 | log4j.logger.org.eclipse.jetty=WARN 14 | -------------------------------------------------------------------------------- /sparkplug-core/dev/user.clj: -------------------------------------------------------------------------------- 1 | (ns user 2 | (:require 3 | [clojure.java.io :as io] 4 | [clojure.repl :refer :all] 5 | [clojure.stacktrace :refer [print-cause-trace]] 6 | [clojure.string :as str] 7 | [clojure.tools.namespace.repl :refer [refresh]] 8 | [sparkplug.config :as conf] 9 | [sparkplug.context :as ctx] 10 | [sparkplug.core :as spark] 11 | [sparkplug.function :as f] 12 | [sparkplug.function.test-fns :as test-fns] 13 | [sparkplug.kryo :as kryo] 14 | [sparkplug.rdd :as rdd] 15 | [sparkplug.scala :as scala]) 16 | (:import 17 | com.esotericsoftware.kryo.Kryo 18 | (java.io 19 | ByteArrayInputStream 20 | ByteArrayOutputStream 21 | ObjectInputStream 22 | ObjectOutputStream))) 23 | 24 | 25 | (def local-conf 26 | (-> (conf/spark-conf) 27 | (conf/master "local[*]") 28 | (conf/app-name "user"))) 29 | 30 | 31 | (def spark-context nil) 32 | 33 | 34 | (defn letter-frequencies 35 | "Calculate the number of times each letter appears in the given text." 36 | ([] 37 | (letter-frequencies "/usr/share/dict/words")) 38 | ([path] 39 | (ctx/with-context [ctx (-> (conf/spark-conf) 40 | (conf/master "local[2]") 41 | (conf/app-name "letter-frequencies"))] 42 | (alter-var-root #'spark-context (constantly ctx)) 43 | (-> 44 | (->> 45 | (rdd/text-file ctx (str "file://" path)) 46 | (spark/map str/lower-case) 47 | (spark/mapcat seq) 48 | (spark/map->pairs #(vector % 1)) 49 | (spark/reduce-by-key +) 50 | (spark/into {})) 51 | (as-> result 52 | (do (println "Done, press enter to continue...") 53 | (read-line) 54 | result)))))) 55 | 56 | 57 | (def kryo 58 | (delay (kryo/initialize))) 59 | 60 | 61 | (defn inspect-bytes 62 | [data] 63 | (->> 64 | (seq data) 65 | (map #(let [c (char (if (neg? %) 66 | (+ % 256) 67 | %))] 68 | (if (<= 32 (int c)) 69 | c 70 | \.))) 71 | (partition-all 32) 72 | (map str/join) 73 | (str/join "\n") 74 | (println))) 75 | 76 | 77 | (defn serialize 78 | [f] 79 | (let [baos (ByteArrayOutputStream.)] 80 | (with-open [out (ObjectOutputStream. baos)] 81 | (.writeObject out f)) 82 | (.toByteArray baos))) 83 | 84 | 85 | (defn deserialize 86 | [bs] 87 | (with-open [in (ObjectInputStream. (ByteArrayInputStream. 
bs))] 88 | (.readObject in))) 89 | -------------------------------------------------------------------------------- /sparkplug-core/project.clj: -------------------------------------------------------------------------------- 1 | (defproject amperity/sparkplug-core "1.1.0" 2 | :description "Clojure API for Apache Spark" 3 | :url "https://github.com/amperity/sparkplug" 4 | :scm {:dir ".."} 5 | :license {:name "Apache License 2.0" 6 | :url "http://www.apache.org/licenses/LICENSE-2.0"} 7 | 8 | :monolith/inherit true 9 | 10 | :dependencies 11 | [[org.clojure/clojure "1.12.0"] 12 | [org.clojure/java.classpath "1.1.0"] 13 | [org.clojure/tools.logging "1.3.0"]] 14 | 15 | :source-paths ["src/clojure"] 16 | :java-source-paths ["src/java"] 17 | 18 | :profiles 19 | {:default 20 | [:base :system :user :provided :spark-3.5 :dev] 21 | 22 | :dev 23 | {:dependencies 24 | [[org.clojure/test.check "1.1.1"] 25 | [org.slf4j/slf4j-api "2.0.16"] 26 | [org.slf4j/slf4j-simple "2.0.16"]] 27 | :jvm-opts ["-Xmx2g" 28 | "-XX:-OmitStackTraceInFastThrow" 29 | "-Dorg.slf4j.simpleLogger.defaultLogLevel=warn" 30 | "-Dorg.slf4j.simpleLogger.log.org.apache=warn"]} 31 | 32 | :repl 33 | {:source-paths ["dev"] 34 | :aot [sparkplug.function.test-fns] 35 | :dependencies 36 | [[org.clojure/tools.namespace "1.5.0"]]} 37 | 38 | :test 39 | {:aot [sparkplug.function.test-fns]} 40 | 41 | :spark-3.1 42 | ^{:pom-scope :provided} 43 | {:dependencies 44 | [[org.apache.spark/spark-core_2.12 "3.1.3"]]} 45 | 46 | :spark-3.5 47 | ^{:pom-scope :provided} 48 | {:dependencies 49 | [[org.apache.spark/spark-core_2.12 "3.5.1" 50 | :exclusions [org.apache.logging.log4j/log4j-slf4j2-impl]] 51 | 52 | ;; Conflict resolution 53 | [com.fasterxml.jackson.core/jackson-core "2.15.2"] 54 | [com.google.code.findbugs/jsr305 "3.0.2"]]}}) 55 | -------------------------------------------------------------------------------- /sparkplug-core/resources/sparkplug/kryo/registry/clojure.conf: -------------------------------------------------------------------------------- 1 | # Clojure language types 2 | 3 | # Value types 4 | register clojure.lang.BigInt sparkplug.kryo/bigint-serializer 5 | register clojure.lang.Keyword sparkplug.kryo/ident-serializer 6 | register clojure.lang.Symbol sparkplug.kryo/ident-serializer 7 | register clojure.lang.Ratio sparkplug.kryo/ratio-serializer 8 | register clojure.lang.Var sparkplug.kryo/var-serializer 9 | 10 | # Sequences 11 | register clojure.lang.Cons sparkplug.kryo/sequence-serializer 12 | register clojure.lang.PersistentList$EmptyList sparkplug.kryo/sequence-serializer 13 | register clojure.lang.PersistentList sparkplug.kryo/sequence-serializer 14 | register clojure.lang.LazySeq sparkplug.kryo/sequence-serializer 15 | register clojure.lang.IteratorSeq sparkplug.kryo/sequence-serializer 16 | register clojure.lang.ArraySeq sparkplug.kryo/sequence-serializer 17 | register clojure.lang.PersistentVector$ChunkedSeq sparkplug.kryo/sequence-serializer 18 | register clojure.lang.StringSeq sparkplug.kryo/string-seq-serializer 19 | 20 | # Vectors 21 | register clojure.lang.MapEntry sparkplug.kryo/vector-serializer 22 | register clojure.lang.PersistentVector sparkplug.kryo/vector-serializer 23 | register clojure.lang.APersistentVector$SubVector sparkplug.kryo/vector-serializer 24 | 25 | # Maps 26 | register clojure.lang.PersistentArrayMap sparkplug.kryo/map-serializer 27 | register clojure.lang.PersistentHashMap sparkplug.kryo/map-serializer 28 | register clojure.lang.PersistentStructMap sparkplug.kryo/map-serializer 29 | register 
clojure.lang.PersistentTreeMap sparkplug.kryo/ordered-map-serializer 30 | 31 | # Sets 32 | register clojure.lang.PersistentHashSet sparkplug.kryo/set-serializer 33 | register clojure.lang.PersistentTreeSet sparkplug.kryo/ordered-map-serializer 34 | 35 | # Others 36 | register clojure.lang.MethodImplCache 37 | register clojure.lang.RT$DefaultComparator 38 | -------------------------------------------------------------------------------- /sparkplug-core/resources/sparkplug/kryo/registry/sparkplug.conf: -------------------------------------------------------------------------------- 1 | # SparkPlug types 2 | 3 | # Functions 4 | register sparkplug.function.Fn1 5 | register sparkplug.function.Fn2 6 | register sparkplug.function.Fn3 7 | register sparkplug.function.FlatMapFn1 8 | register sparkplug.function.FlatMapFn2 9 | register sparkplug.function.PairFn 10 | register sparkplug.function.PairFlatMapFn 11 | register sparkplug.function.ComparatorFn 12 | register sparkplug.function.VoidFn 13 | 14 | # Misc 15 | register sparkplug.broadcast.DerefBroadcast 16 | -------------------------------------------------------------------------------- /sparkplug-core/src/clojure/sparkplug/accumulator.clj: -------------------------------------------------------------------------------- 1 | (ns sparkplug.accumulator 2 | "Functions for working with Accumulator objects which can aggregate values 3 | across executors." 4 | (:refer-clojure :exclude [count empty? name reset!]) 5 | (:require 6 | [sparkplug.scala :as scala]) 7 | (:import 8 | org.apache.spark.api.java.JavaSparkContext 9 | (org.apache.spark.util 10 | AccumulatorV2 11 | DoubleAccumulator 12 | LongAccumulator))) 13 | 14 | 15 | ;; ## Constructors 16 | 17 | (defn long-accumulator 18 | "Create and register a long accumulator, which starts with 0 and accumulates 19 | inputs by summing them." 20 | ([^JavaSparkContext spark-context] 21 | (.longAccumulator (.sc spark-context))) 22 | ([^JavaSparkContext spark-context acc-name] 23 | (.longAccumulator (.sc spark-context) acc-name))) 24 | 25 | 26 | (defn double-accumulator 27 | "Create and register a double accumulator, which starts with 0.0 and 28 | accumulates inputs by summing them." 29 | ([^JavaSparkContext spark-context] 30 | (.doubleAccumulator (.sc spark-context))) 31 | ([^JavaSparkContext spark-context acc-name] 32 | (.doubleAccumulator (.sc spark-context) acc-name))) 33 | 34 | 35 | (defn collection-accumulator 36 | "Create and register a collection accumulator, which starts with empty list 37 | and accumulates inputs by adding them into the list." 38 | ([^JavaSparkContext spark-context] 39 | (.collectionAccumulator (.sc spark-context))) 40 | ([^JavaSparkContext spark-context acc-name] 41 | (.collectionAccumulator (.sc spark-context) acc-name))) 42 | 43 | 44 | ;; ## Accumulator Methods 45 | 46 | (defn name 47 | "Return the name of the accumulator, if any." 48 | [^AccumulatorV2 acc] 49 | (scala/resolve-option (.name acc))) 50 | 51 | 52 | (defn value 53 | "Return the current value of the accumulator. This can only be called by the 54 | driver." 55 | [^AccumulatorV2 acc] 56 | (.value acc)) 57 | 58 | 59 | (defn empty? 60 | "True if the accumulator has not had any values added to it." 61 | [^AccumulatorV2 acc] 62 | (.isZero acc)) 63 | 64 | 65 | (defn add! 66 | "Add an element to the accumulated value." 67 | [^AccumulatorV2 acc v] 68 | (.add acc v)) 69 | 70 | 71 | (defn merge! 72 | "Merge an accumulator `b` into `a`. Both accumulators must have the same 73 | type." 
74 | [^AccumulatorV2 a ^AccumulatorV2 b] 75 | (.merge a b)) 76 | 77 | 78 | (defn reset! 79 | "Reset the accumulator to its empty or zero value." 80 | [^AccumulatorV2 acc] 81 | (.reset acc)) 82 | 83 | 84 | ;; ## Numeric Accumulators 85 | 86 | (defn count 87 | "Return the number of values added to the accumulator. The accumulator must 88 | hold either long or double values." 89 | [acc] 90 | (condp instance? acc 91 | LongAccumulator 92 | (.count ^LongAccumulator acc) 93 | 94 | DoubleAccumulator 95 | (.count ^DoubleAccumulator acc) 96 | 97 | (throw (IllegalArgumentException. 98 | (str "Cannot call count on accumulator type " 99 | (class acc)))))) 100 | 101 | 102 | (defn sum 103 | "Return the sum of all the values added to the accumulator. The accumulator 104 | must hold either long or double values." 105 | [acc] 106 | (condp instance? acc 107 | LongAccumulator 108 | (.sum ^LongAccumulator acc) 109 | 110 | DoubleAccumulator 111 | (.sum ^DoubleAccumulator acc) 112 | 113 | (throw (IllegalArgumentException. 114 | (str "Cannot call sum on accumulator type " 115 | (class acc)))))) 116 | 117 | 118 | (defn avg 119 | "Return the average of all the values added to the accumulator. The 120 | accumulator must hold either long or double values." 121 | [acc] 122 | (condp instance? acc 123 | LongAccumulator 124 | (.avg ^LongAccumulator acc) 125 | 126 | DoubleAccumulator 127 | (.avg ^DoubleAccumulator acc) 128 | 129 | (throw (IllegalArgumentException. 130 | (str "Cannot call avg on accumulator type " 131 | (class acc)))))) 132 | -------------------------------------------------------------------------------- /sparkplug-core/src/clojure/sparkplug/config.clj: -------------------------------------------------------------------------------- 1 | (ns sparkplug.config 2 | "Functions for working with Spark configuration." 3 | (:import 4 | org.apache.spark.SparkConf)) 5 | 6 | 7 | ;; ## Constructor 8 | 9 | (defn spark-conf 10 | "Construct a new Spark configuration. Optionally accepts a boolean to control 11 | whether default configuration is loaded from the system properties." 12 | ^SparkConf 13 | ([] 14 | (spark-conf true)) 15 | ^SparkConf 16 | ([defaults?] 17 | (-> (SparkConf. (boolean defaults?)) 18 | (.set "spark.serializer" "org.apache.spark.serializer.KryoSerializer") 19 | (.set "spark.kryo.registrator" "sparkplug.kryo.ClassPathRegistrator")))) 20 | 21 | 22 | ;; ## Property Accessors 23 | 24 | (defn contains-key? 25 | "True if the given spark configuration contains the named parameter." 26 | [^SparkConf conf ^String k] 27 | (.contains conf k)) 28 | 29 | 30 | (defn get-all 31 | "Get all configuration parameters as a map." 32 | [^SparkConf conf] 33 | (into {} 34 | (map (fn tuple->entry 35 | [^scala.Tuple2 entry] 36 | [(._1 entry) (._2 entry)])) 37 | (.getAll conf))) 38 | 39 | 40 | (defn get-param 41 | "Get a configuration parameter `k` in `conf`. If not set, this throws a 42 | `NoSuchElementException` or returns `not-found` if provided." 43 | ([^SparkConf conf ^String k] 44 | (.get conf k)) 45 | ([^SparkConf conf ^String k ^String not-found] 46 | (.get conf k not-found))) 47 | 48 | 49 | (defn merge-params 50 | "Merge the provided parameters into the Spark configuration. Returns updated 51 | configuration." 52 | ^SparkConf 53 | [^SparkConf conf params] 54 | (reduce-kv 55 | (fn set-entry 56 | [^SparkConf c ^String k ^String v] 57 | (.set c k v)) 58 | conf 59 | params)) 60 | 61 | 62 | (defn set-param 63 | "Set a parameter to a new value in the given Spark configuration. Returns 64 | updated configuration." 
65 | ^SparkConf 66 | ([^SparkConf conf ^String k ^String v] 67 | (.set conf k v)) 68 | ^SparkConf 69 | ([^SparkConf conf k v & kvs] 70 | {:pre [(even? (count kvs))]} 71 | (merge-params conf (apply array-map k v kvs)))) 72 | 73 | 74 | (defn set-param-default 75 | "Set a parameter to a new value if it is not already set in the config. 76 | Returns an updated configuration." 77 | ^SparkConf 78 | [^SparkConf conf ^String k ^String v] 79 | (.setIfMissing conf k v)) 80 | 81 | 82 | (defn unset-param 83 | "Unset the given parameters on the config. Returns an updated config." 84 | ^SparkConf 85 | ([^SparkConf conf ^String k] 86 | (.remove conf k)) 87 | ^SparkConf 88 | ([^SparkConf conf k & ks] 89 | (reduce 90 | (fn unset-key 91 | [^SparkConf c ^String k] 92 | (.remove c k)) 93 | conf 94 | (cons k ks)))) 95 | 96 | 97 | (defn set-executor-env 98 | "Set environment variables to be used when launching executors for this 99 | application. Accepts a parameter key and value or a map of parameters. 100 | Returns an updated configuration." 101 | ^SparkConf 102 | ([^SparkConf conf k v] 103 | (.setExecutorEnv conf k v)) 104 | ^SparkConf 105 | ([^SparkConf conf env] 106 | (reduce-kv 107 | (fn set-entry 108 | [^SparkConf c k v] 109 | (.setExecutorEnv c k v)) 110 | conf 111 | env))) 112 | 113 | 114 | (defn master 115 | "Set the Spark master property. Returns updated configuration." 116 | ^SparkConf 117 | [^SparkConf conf ^String master] 118 | (.setMaster conf master)) 119 | 120 | 121 | (defn spark-home 122 | "Set the Spark home path property. Returns updated configuration." 123 | ^SparkConf 124 | [^SparkConf conf home] 125 | (.setSparkHome conf home)) 126 | 127 | 128 | (defn app-name 129 | "Set the Spark application name. Returns updated configuration." 130 | ^SparkConf 131 | [^SparkConf conf name-str] 132 | (.setAppName conf name-str)) 133 | 134 | 135 | (defn jars 136 | "Set JAR files to distribute to the cluster. Returns updated configuration." 137 | ^SparkConf 138 | [^SparkConf conf jars] 139 | (.setJars conf ^"[Ljava.lang.String;" (into-array String jars))) 140 | 141 | 142 | (defn debug-str 143 | "Return a string containing a representation of the configuration useful for 144 | debugging." 145 | [^SparkConf conf] 146 | (.toDebugString conf)) 147 | -------------------------------------------------------------------------------- /sparkplug-core/src/clojure/sparkplug/context.clj: -------------------------------------------------------------------------------- 1 | (ns sparkplug.context 2 | "Functions for working with and creating Spark contexts." 3 | (:require 4 | [sparkplug.scala :as scala]) 5 | (:import 6 | org.apache.spark.SparkConf 7 | org.apache.spark.api.java.JavaSparkContext)) 8 | 9 | 10 | ;; ## Application Lifecycle 11 | 12 | (defn spark-context 13 | "Create a new spark context which takes its settings from the given 14 | configuration object." 15 | ^JavaSparkContext 16 | ([^SparkConf conf] 17 | (JavaSparkContext. conf)) 18 | ^JavaSparkContext 19 | ([master app-name] 20 | (JavaSparkContext. (str master) (str app-name)))) 21 | 22 | 23 | (defn set-job-description! 24 | "Set a human readable description of the current job." 25 | [^JavaSparkContext spark-context description] 26 | (.setJobDescription spark-context description)) 27 | 28 | 29 | (defn set-job-group! 30 | "Assign a group ID to all the jobs started by this thread until the group ID 31 | is set to a different value or cleared. 32 | 33 | Often, a unit of execution in an application consists of multiple Spark 34 | actions or jobs. 
Application programmers can use this method to group all 35 | those jobs together and give a group description. Once set, the Spark web UI 36 | will associate such jobs with this group. 37 | 38 | The application can later use `cancel-job-group!` to cancel all running jobs 39 | in this group. If `interrupt?` is set to true for the job group, then job 40 | cancellation will result in the job's executor threads being interrupted." 41 | ([^JavaSparkContext spark-context group-id description] 42 | (.setJobGroup spark-context group-id description)) 43 | ([^JavaSparkContext spark-context group-id description interrupt?] 44 | (.setJobGroup spark-context group-id description (boolean interrupt?)))) 45 | 46 | 47 | (defn clear-job-group! 48 | "Clear the current thread's job group ID and its description." 49 | [^JavaSparkContext spark-context] 50 | (.clearJobGroup spark-context)) 51 | 52 | 53 | (defn cancel-job-group! 54 | "Cancel active jobs for the specified group. 55 | 56 | See `set-job-group!` for more information." 57 | [^JavaSparkContext spark-context group-id] 58 | (.cancelJobGroup spark-context group-id)) 59 | 60 | 61 | (defn cancel-all-jobs! 62 | "Cancel all jobs that have been scheduled or are running." 63 | [^JavaSparkContext spark-context] 64 | (.cancelAllJobs spark-context)) 65 | 66 | 67 | (defn stop! 68 | "Shut down the Spark context." 69 | [^JavaSparkContext spark-context] 70 | (.stop spark-context)) 71 | 72 | 73 | (defmacro with-context 74 | "Evaluate `body` within a new Spark context by constructing one from the 75 | given expression. The context is stopped after evaluation is complete." 76 | [binding-vec & body] 77 | {:pre [(vector? binding-vec) (= 2 (count binding-vec))]} 78 | (let [[ctx-sym expr] binding-vec 79 | ctx-sym (vary-meta ctx-sym assoc :tag 'org.apache.spark.api.java.JavaSparkContext)] 80 | `(let [~ctx-sym (spark-context ~expr)] 81 | (try 82 | ~@body 83 | (finally 84 | (stop! ~ctx-sym)))))) 85 | 86 | 87 | ;; ## Context Introspection 88 | 89 | (defn config 90 | "Return the Spark configuration used for the given context." 91 | ^SparkConf 92 | [^JavaSparkContext spark-context] 93 | (.getConf spark-context)) 94 | 95 | 96 | (defn info 97 | "Build a map of information about the Spark context." 98 | [^JavaSparkContext spark-context] 99 | {:master (.master spark-context) 100 | :app-name (.appName spark-context) 101 | :local? (.isLocal spark-context) 102 | :user (.sparkUser spark-context) 103 | :start-time (.startTime spark-context) 104 | :version (.version spark-context) 105 | :jars (.jars spark-context) 106 | :default-min-partitions (.defaultMinPartitions spark-context) 107 | :default-parallelism (.defaultParallelism spark-context) 108 | :checkpoint-dir (scala/resolve-option (.getCheckpointDir spark-context))}) 109 | 110 | 111 | (defn get-local-property 112 | "Get a local property set for this thread, or null if not set." 113 | [^JavaSparkContext spark-context k] 114 | (.getLocalProperty spark-context k)) 115 | 116 | 117 | (defn persistent-rdds 118 | "Return a Java map of JavaRDDs that have marked themselves as persistent via 119 | a `cache!` call." 120 | [^JavaSparkContext spark-context] 121 | (into {} (.getPersistentRDDs spark-context))) 122 | 123 | 124 | ;; ## Context Modifiers 125 | 126 | (defn add-file! 127 | "Add a file to be downloaded with this Spark job on every node." 128 | ([^JavaSparkContext spark-context path] 129 | (.addFile spark-context path)) 130 | ([^JavaSparkContext spark-context path recursive?] 
131 | (.addFile spark-context path (boolean recursive?)))) 132 | 133 | 134 | (defn add-jar! 135 | "Adds a JAR dependency for all tasks to be executed on this SparkContext in 136 | the future." 137 | [^JavaSparkContext spark-context path] 138 | (.addJar spark-context path)) 139 | 140 | 141 | (defn set-local-property! 142 | "Set a local property that affects jobs submitted from this thread, and all 143 | child threads, such as the Spark fair scheduler pool." 144 | [^JavaSparkContext spark-context k v] 145 | (.setLocalProperty spark-context k v)) 146 | 147 | 148 | (defn set-checkpoint-dir! 149 | "Set the directory under which RDDs are going to be checkpointed." 150 | [^JavaSparkContext spark-context path] 151 | (.setCheckpointDir spark-context path)) 152 | 153 | 154 | (defn set-log-level! 155 | "Control the Spark application's logging level." 156 | [^JavaSparkContext spark-context level] 157 | (.setLogLevel spark-context level)) 158 | -------------------------------------------------------------------------------- /sparkplug-core/src/clojure/sparkplug/core.clj: -------------------------------------------------------------------------------- 1 | (ns sparkplug.core 2 | "This namespace provides the main API for writing Spark tasks. 3 | 4 | Most operations in this namespace place the RDD last in the argument list, 5 | just like Clojure collection functions. This lets you compose them using the 6 | thread-last macro (`->>`), making it simple to migrate existing Clojure 7 | code." 8 | (:refer-clojure :exclude [count distinct filter first group-by into keys map 9 | mapcat max min reduce sort-by take vals]) 10 | (:require 11 | [clojure.core :as c] 12 | [sparkplug.function :as f] 13 | [sparkplug.rdd :as rdd] 14 | [sparkplug.scala :as scala]) 15 | (:import 16 | org.apache.spark.Partitioner 17 | (org.apache.spark.api.java 18 | JavaPairRDD 19 | JavaRDD 20 | JavaRDDLike 21 | JavaSparkContext) 22 | org.apache.spark.broadcast.Broadcast 23 | sparkplug.broadcast.DerefBroadcast 24 | sparkplug.core.UnionHelper)) 25 | 26 | 27 | ;; ## Broadcast Variables 28 | 29 | (defn broadcast 30 | "Broadcast a read-only variable to the cluster, returning a reference for 31 | reading it in distributed functions. The variable data will be sent to each 32 | cluster only once. 33 | 34 | The returned broadcast value can be resolved with `deref` or the `@` reader 35 | macro." 36 | ^Broadcast 37 | [^JavaSparkContext spark-context value] 38 | (let [broadcast (.broadcast spark-context value)] 39 | (DerefBroadcast. broadcast (class value)))) 40 | 41 | 42 | ;; ## RDD Transformations 43 | 44 | ;; Type hints are omitted because `filter` is not included in JavaRDDLike. 45 | (defn filter 46 | "Filter the elements of `rdd` to the ones which satisfy the predicate `f`." 47 | ^JavaRDDLike 48 | [f rdd] 49 | (rdd/set-callsite-name 50 | (.filter rdd (f/fn1 (comp boolean f))) 51 | (rdd/fn-name f))) 52 | 53 | 54 | (defn map 55 | "Map the function `f` over each element of `rdd`. Returns a new RDD 56 | representing the transformed elements." 57 | ^JavaRDDLike 58 | [f ^JavaRDDLike rdd] 59 | (rdd/set-callsite-name 60 | (.map rdd (f/fn1 f)) 61 | (rdd/fn-name f))) 62 | 63 | 64 | (defn mapcat 65 | "Map the function `f` over each element in `rdd` to produce a sequence of 66 | results. Returns an RDD representing the concatenation of all element 67 | results." 
68 | ^JavaRDD 69 | [f ^JavaRDDLike rdd] 70 | (rdd/set-callsite-name 71 | (.flatMap rdd (f/flat-map-fn f)) 72 | (rdd/fn-name f))) 73 | 74 | 75 | (defn map-partitions 76 | "Map the function `f` over each partition in `rdd`, producing a sequence of 77 | results. Returns an RDD representing the concatenation of all the partition 78 | results. The function will be called with an iterator of the elements of each 79 | partition." 80 | (^JavaRDDLike 81 | [f ^JavaRDDLike rdd] 82 | (map-partitions f false rdd)) 83 | (^JavaRDDLike 84 | [f preserve-partitioning? ^JavaRDDLike rdd] 85 | (rdd/set-callsite-name 86 | (.mapPartitions 87 | rdd 88 | (f/flat-map-fn f) 89 | (boolean preserve-partitioning?)) 90 | (rdd/fn-name f)))) 91 | 92 | 93 | (defn map-partitions-indexed 94 | "Map the function `f` over each partition in `rdd`, producing a sequence of 95 | results. Returns an RDD representing the concatenation of all the partition 96 | results. The function will be called with the partition index and an iterator 97 | of the elements of each partition." 98 | ^JavaRDD 99 | [f ^JavaRDDLike rdd] 100 | (rdd/set-callsite-name 101 | (.mapPartitionsWithIndex rdd (f/fn2 f) true) 102 | (rdd/fn-name f))) 103 | 104 | 105 | ;; Type hints are omitted because `distinct` is not included in JavaRDDLike. 106 | (defn distinct 107 | "Construct an RDD containing only a single copy of each distinct element in 108 | `rdd`. Optionally accepts a number of partitions to size the resulting RDD 109 | with." 110 | (^JavaRDDLike 111 | [rdd] 112 | (rdd/set-callsite-name 113 | (.distinct rdd))) 114 | (^JavaRDDLike 115 | [num-partitions rdd] 116 | (rdd/set-callsite-name 117 | (.distinct rdd (int num-partitions)) 118 | (int num-partitions)))) 119 | 120 | 121 | ;; Type hints are omitted because `sample` is not included in JavaRDDLike. 122 | (defn sample 123 | "Generate a randomly sampled subset of `rdd` with roughly `fraction` of the 124 | original elements. Callers can optionally select whether the sample happens 125 | with replacement, and a random seed to control the sample." 126 | (^JavaRDDLike 127 | [fraction rdd] 128 | (rdd/set-callsite-name 129 | (.sample rdd true (double fraction)) 130 | (double fraction))) 131 | (^JavaRDDLike 132 | [fraction replacement? rdd] 133 | (rdd/set-callsite-name 134 | (.sample rdd (boolean replacement?) (double fraction)) 135 | (double fraction) 136 | (boolean replacement?))) 137 | (^JavaRDDLike 138 | [fraction replacement? seed rdd] 139 | (rdd/set-callsite-name 140 | (.sample rdd (boolean replacement?) (double fraction) (long seed)) 141 | (double fraction) 142 | (boolean replacement?) 143 | (long seed)))) 144 | 145 | 146 | (defn sort-by 147 | "Reorder the elements of `rdd` so that they are sorted according to the given 148 | key function. The result may be ordered ascending or descending, depending on 149 | `ascending?`." 150 | (^JavaRDD 151 | [f ^JavaRDD rdd] 152 | (sort-by f true rdd)) 153 | (^JavaRDD 154 | [f ascending? ^JavaRDD rdd] 155 | (sort-by f ascending? (.getNumPartitions rdd) rdd)) 156 | (^JavaRDD 157 | [f ascending? num-partitions ^JavaRDD rdd] 158 | (rdd/set-callsite-name 159 | (.sortBy rdd 160 | (f/fn1 f) 161 | (boolean ascending?) 162 | num-partitions) 163 | (rdd/fn-name f) 164 | (boolean ascending?) 165 | (int num-partitions)))) 166 | 167 | 168 | ;; ## Pair RDD Transformations 169 | 170 | (defn keys 171 | "Transform `rdd` by replacing each pair with its key. Returns a new RDD 172 | representing the keys." 
173 | ^JavaRDD 174 | [^JavaPairRDD rdd] 175 | (rdd/set-callsite-name (.keys rdd))) 176 | 177 | 178 | (defn vals 179 | "Transform `rdd` by replacing each pair with its value. Returns a new RDD 180 | representing the values." 181 | ^JavaRDD 182 | [^JavaPairRDD rdd] 183 | (rdd/set-callsite-name (.values rdd))) 184 | 185 | 186 | (defn key-by 187 | "Creates pairs from the elements in `rdd` by using `f` to compute a key for 188 | each value." 189 | ^JavaPairRDD 190 | [f ^JavaRDDLike rdd] 191 | (rdd/set-callsite-name 192 | (.mapToPair rdd (f/pair-fn (juxt f identity))) 193 | (rdd/fn-name f))) 194 | 195 | 196 | (defn map->pairs 197 | "Map the function `f` over each element of `rdd`. Returns a new pair RDD 198 | representing the transformed elements." 199 | ^JavaPairRDD 200 | [f ^JavaRDDLike rdd] 201 | (rdd/set-callsite-name 202 | (.mapToPair rdd (f/pair-fn f)) 203 | (rdd/fn-name f))) 204 | 205 | 206 | (defn mapcat->pairs 207 | "Map the function `f` over each element in `rdd` to produce a sequence of 208 | key-value pairs. Returns a new pair RDD representing the concatenation of all 209 | result pairs." 210 | ^JavaPairRDD 211 | [f ^JavaRDDLike rdd] 212 | (rdd/set-callsite-name 213 | (.flatMapToPair rdd (f/pair-flat-map-fn f)) 214 | (rdd/fn-name f))) 215 | 216 | 217 | (defn map-partitions->pairs 218 | "Map the function `f` over each partition in `rdd`, producing a sequence of 219 | key-value pairs. The function will be called with an iterator of the elements 220 | of the partition." 221 | (^JavaPairRDD 222 | [f ^JavaRDDLike rdd] 223 | (map-partitions->pairs f false rdd)) 224 | (^JavaPairRDD 225 | [f preserve-partitioning? ^JavaRDDLike rdd] 226 | (rdd/set-callsite-name 227 | (.mapPartitionsToPair 228 | rdd 229 | (f/pair-flat-map-fn f) 230 | (boolean preserve-partitioning?)) 231 | (rdd/fn-name f) 232 | (boolean preserve-partitioning?)))) 233 | 234 | 235 | (defn map-vals 236 | "Map the function `f` over each value of the pairs in `rdd`. Returns a new 237 | pair RDD representing the transformed pairs." 238 | ^JavaPairRDD 239 | [f ^JavaPairRDD rdd] 240 | (rdd/set-callsite-name 241 | (.mapValues rdd (f/fn1 f)) 242 | (rdd/fn-name f))) 243 | 244 | 245 | (defn mapcat-vals 246 | "Map the function `f` over each value of the pairs in `rdd` to produce a 247 | collection of values. Returns a new pair RDD representing the concatenated 248 | keys and values." 249 | ^JavaPairRDD 250 | [f ^JavaPairRDD rdd] 251 | (rdd/set-callsite-name 252 | (.flatMapValues rdd (f/fn1 f)) 253 | (rdd/fn-name f))) 254 | 255 | 256 | (defn zip-indexed 257 | "Zip the elements in `rdd` with their indices. Returns a new pair RDD with 258 | the element/index tuples. 259 | 260 | The ordering is first based on the partition index and then the ordering of 261 | items within each partition. So the first item in the first partition gets 262 | index 0, and the last item in the last partition receives the largest index. 263 | 264 | This method needs to trigger a spark job when `rdd` contains more than one 265 | partition." 266 | ^JavaPairRDD 267 | [^JavaRDDLike rdd] 268 | (rdd/set-callsite-name 269 | (.zipWithIndex rdd))) 270 | 271 | 272 | (defn zip-unique-ids 273 | "Zip the elements in `rdd` with unique long identifiers. Returns a new pair 274 | RDD with the element/id tuples. 275 | 276 | Items in the kth partition will get ids `k`, `n+k`, `2*n+k`, ..., where `n` 277 | is the number of partitions. So the ids won't be sequential and there may be 278 | gaps, but this method _won't_ trigger a spark job, unlike `zip-indexed`." 
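;; Worked example of the id scheme above: with n = 3 partitions, items in
;; partition 0 receive ids 0, 3, 6, ...; partition 1 receives 1, 4, 7, ...; and
;; partition 2 receives 2, 5, 8, and so on: unique, but not contiguous.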
279 | ^JavaPairRDD 280 | [^JavaRDDLike rdd] 281 | (rdd/set-callsite-name 282 | (.zipWithUniqueId rdd))) 283 | 284 | 285 | ;; ## Multi-RDD Functions 286 | 287 | (defn cartesian 288 | "Construct an RDD representing the cartesian product of two RDDs. Returns a 289 | new pair RDD containing all combinations of elements between the datasets." 290 | ^JavaPairRDD 291 | [^JavaRDDLike rdd1 ^JavaRDDLike rdd2] 292 | (rdd/set-callsite-name 293 | (.cartesian rdd1 rdd2))) 294 | 295 | 296 | ;; Type hints are omitted because `union` is not included in JavaRDDLike. 297 | (defn union 298 | "Construct a union of the elements in the provided RDDs. Any identical 299 | elements will appear multiple times." 300 | [rdd1 & rdds] 301 | (let [ctx (JavaSparkContext/fromSparkContext (.context ^JavaRDDLike rdd1))] 302 | (rdd/set-callsite-name 303 | (condp instance? rdd1 304 | JavaRDD 305 | (UnionHelper/unionJavaRDDs ctx (into-array JavaRDD (list* rdd1 rdds))) 306 | 307 | JavaPairRDD 308 | (UnionHelper/unionJavaPairRDDs ctx (into-array JavaPairRDD (list* rdd1 rdds))) 309 | 310 | (throw 311 | (IllegalArgumentException. 312 | (str "Unsupported type for RDD union: " (.getName (class rdd1))))))))) 313 | 314 | 315 | ;; Type hints are omitted because `intersection` is not included in JavaRDDLike. 316 | (defn intersection 317 | "Construct an RDD representing the intersection of elements which are in both 318 | RDDs." 319 | [rdd1 rdd2] 320 | (rdd/set-callsite-name 321 | (.intersection rdd1 rdd2))) 322 | 323 | 324 | ;; Type hints are omitted because `subtract` is not included in JavaRDDLike. 325 | (defn subtract 326 | "Remove all elements from `rdd1` that are present in `rdd2`." 327 | ^JavaRDDLike 328 | [rdd1 rdd2] 329 | (rdd/set-callsite-name 330 | (.subtract rdd1 rdd2))) 331 | 332 | 333 | (defn subtract-by-key 334 | "Construct an RDD representing all pairs in `rdd1` for which there is no pair 335 | with a matching key in `rdd2`." 336 | ^JavaPairRDD 337 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2] 338 | (rdd/set-callsite-name 339 | (.subtractByKey rdd1 rdd2))) 340 | 341 | 342 | (defn cogroup 343 | "Produce a new RDD containing an element for each key `k` in the given pair 344 | RDDs mapped to a tuple of the values from all RDDs as lists. 345 | 346 | If the input RDDs have types `(K, A)`, `(K, B)`, and `(K, C)`, the grouped 347 | RDD will have type `(K, (list(A), list(B), list(C)))`." 348 | (^JavaPairRDD 349 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2] 350 | (rdd/set-callsite-name 351 | (.cogroup rdd1 rdd2))) 352 | (^JavaPairRDD 353 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 ^JavaPairRDD rdd3] 354 | (rdd/set-callsite-name 355 | (.cogroup rdd1 rdd2 rdd3))) 356 | (^JavaPairRDD 357 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 ^JavaPairRDD rdd3 ^JavaPairRDD rdd4] 358 | (rdd/set-callsite-name 359 | (.cogroup rdd1 rdd2 rdd3 rdd4)))) 360 | 361 | 362 | (defn cogroup-partitioned 363 | "Produce a new RDD containing an element for each key `k` in the given pair 364 | RDDs mapped to a tuple of the values from all RDDs as lists. The resulting 365 | RDD partitions may be controlled by setting `partitions` to an integer number 366 | or a `Partitioner` instance. 367 | 368 | If the input RDDs have types `(K, A)`, `(K, B)`, and `(K, C)`, the grouped 369 | RDD will have type `(K, (List(A), List(B), List(C)))`." 370 | (^JavaPairRDD 371 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 partitions] 372 | (if (instance?
Partitioner partitions) 373 | (rdd/set-callsite-name 374 | (.cogroup rdd1 rdd2 ^Partitioner partitions) 375 | (class partitions)) 376 | (rdd/set-callsite-name 377 | (.cogroup rdd1 rdd2 (int partitions)) 378 | (int partitions)))) 379 | (^JavaPairRDD 380 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 ^JavaPairRDD rdd3 partitions] 381 | (if (instance? Partitioner partitions) 382 | (rdd/set-callsite-name 383 | (.cogroup rdd1 rdd2 rdd3 ^Partitioner partitions) 384 | (class partitions)) 385 | (rdd/set-callsite-name 386 | (.cogroup rdd1 rdd2 rdd3 (int partitions)) 387 | (int partitions)))) 388 | (^JavaPairRDD 389 | [^JavaPairRDD rdd1 390 | ^JavaPairRDD rdd2 391 | ^JavaPairRDD rdd3 392 | ^JavaPairRDD rdd4 393 | partitions] 394 | (if (instance? Partitioner partitions) 395 | (rdd/set-callsite-name 396 | (.cogroup rdd1 rdd2 rdd3 rdd4 ^Partitioner partitions) 397 | (class partitions)) 398 | (rdd/set-callsite-name 399 | (.cogroup rdd1 rdd2 rdd3 rdd4 (int partitions)) 400 | (int partitions))))) 401 | 402 | 403 | (defn join 404 | "Construct an RDD containing all pairs of elements with matching keys in 405 | `rdd1` and `rdd2`. Each pair of elements will be returned as a tuple of 406 | `(k, (v, w))`, where `(k, v)` is in `rdd1` and `(k, w)` is in `rdd2`. 407 | 408 | Performs a hash join across the cluster. Optionally, `partitions` may be 409 | provided as an integer number or a partitioner instance to control the 410 | partitioning of the resulting RDD." 411 | (^JavaPairRDD 412 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2] 413 | (rdd/set-callsite-name 414 | (.join rdd1 rdd2))) 415 | (^JavaPairRDD 416 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 partitions] 417 | (if (instance? Partitioner partitions) 418 | (rdd/set-callsite-name 419 | (.join rdd1 rdd2 ^Partitioner partitions) 420 | (class partitions)) 421 | (rdd/set-callsite-name 422 | (.join rdd1 rdd2 (int partitions)) 423 | (int partitions))))) 424 | 425 | 426 | (defn left-outer-join 427 | "Perform a left outer join of `rdd1` and `rdd2`. 428 | 429 | For each element `(k, v)` in `rdd1`, the resulting RDD will either contain 430 | all pairs `(k, (v, Some(w)))` for `(k, w)` in `rdd2`, or the pair 431 | `(k, (v, None))` if no elements in `rdd2` have key `k`. 432 | 433 | Hash-partitions the resulting RDD using the existing partitioner/parallelism 434 | level unless `partitions` is be provided as an integer number or a 435 | partitioner instance." 436 | (^JavaPairRDD 437 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2] 438 | (rdd/set-callsite-name 439 | (.leftOuterJoin rdd1 rdd2))) 440 | (^JavaPairRDD 441 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 partitions] 442 | (if (instance? Partitioner partitions) 443 | (rdd/set-callsite-name 444 | (.leftOuterJoin rdd1 rdd2 ^Partitioner partitions) 445 | (class partitions)) 446 | (rdd/set-callsite-name 447 | (.leftOuterJoin rdd1 rdd2 (int partitions)) 448 | (int partitions))))) 449 | 450 | 451 | (defn right-outer-join 452 | "Perform a right outer join of `rdd1` and `rdd2`. 453 | 454 | For each element `(k, w)` in `rdd2`, the resulting RDD will either contain 455 | all pairs `(k, (Some(v), w))` for `(k, v)` in `rdd1`, or the pair 456 | `(k, (None, w))` if no elements in `rdd1` have key `k`. 457 | 458 | Hash-partitions the resulting RDD using the existing partitioner/parallelism 459 | level unless `partitions` is be provided as an integer number or a 460 | partitioner instance." 
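;; A small sketch of the join helpers in this section (the names, data, and
;; `spark`/`rdd` aliases are illustrative assumptions):
;;
;;   (def users  (rdd/parallelize-pairs spark-context [[1 "amy"] [2 "bo"]]))
;;   (def orders (rdd/parallelize-pairs spark-context [[1 :book] [1 :pen]]))
;;
;;   (spark/into [] (spark/join users orders))
;;   ;; => e.g. [[1 ["amy" :book]] [1 ["amy" :pen]]]; key 2 is dropped because
;;   ;; `join` is inner. The outer joins keep unmatched keys instead, wrapping
;;   ;; the possibly-missing side in an optional value as the docstrings describe.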
461 | (^JavaPairRDD 462 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2] 463 | (rdd/set-callsite-name 464 | (.rightOuterJoin rdd1 rdd2))) 465 | (^JavaPairRDD 466 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 partitions] 467 | (if (instance? Partitioner partitions) 468 | (rdd/set-callsite-name 469 | (.rightOuterJoin rdd1 rdd2 ^Partitioner partitions) 470 | (class partitions)) 471 | (rdd/set-callsite-name 472 | (.rightOuterJoin rdd1 rdd2 (int partitions)) 473 | (int partitions))))) 474 | 475 | 476 | (defn full-outer-join 477 | "Perform a full outer join of `rdd1` and `rdd2`. 478 | 479 | For each element `(k, v)` in `rdd1`, the resulting RDD will either contain all 480 | pairs `(k, (Some(v), Some(w)))` for `(k, w)` in `rdd2`, or the pair 481 | `(k, (Some(v), None))` if no elements in other have key `k`. Similarly, for 482 | each element `(k, w)` in `rdd2`, the resulting RDD will either contain all 483 | pairs `(k, (Some(v), Some(w)))` for `v` in `rdd1`, or the pair 484 | `(k, (None, Some(w)))` if no elements in `rdd1` have key `k`. 485 | 486 | Hash-partitions the resulting RDD using the existing partitioner/parallelism 487 | level unless `partitions` is be provided as an integer number or a 488 | partitioner instance." 489 | (^JavaPairRDD 490 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2] 491 | (rdd/set-callsite-name 492 | (.fullOuterJoin rdd1 rdd2))) 493 | (^JavaPairRDD 494 | [^JavaPairRDD rdd1 ^JavaPairRDD rdd2 partitions] 495 | (if (instance? Partitioner partitions) 496 | (rdd/set-callsite-name 497 | (.fullOuterJoin rdd1 rdd2 ^Partitioner partitions) 498 | (class partitions)) 499 | (rdd/set-callsite-name 500 | (.fullOuterJoin rdd1 rdd2 (int partitions)) 501 | (int partitions))))) 502 | 503 | 504 | ;; ## Pair RDD Aggregation 505 | 506 | (defn aggregate-by-key 507 | "When called on an RDD of (K, V) pairs, returns an RDD of (K, U) pairs where 508 | the values for each key are aggregated using the given 2-arg aggregator 509 | function, 2-arg combiner function, and a neutral zero value. Allows an 510 | aggregated value type that is different than the input value type, while 511 | avoiding unnecessary allocations. The number of reduce tasks is configurable 512 | by optionally passing a number of partitions or a partitioner." 513 | (^JavaPairRDD 514 | [aggregator combiner zero ^JavaPairRDD rdd] 515 | (.aggregateByKey rdd zero (f/fn2 aggregator) (f/fn2 combiner))) 516 | (^JavaPairRDD 517 | [aggregator combiner zero partitioner-or-num-partitions ^JavaPairRDD rdd] 518 | (if (instance? Partitioner partitioner-or-num-partitions) 519 | (.aggregateByKey 520 | rdd 521 | zero 522 | ^Partitioner partitioner-or-num-partitions 523 | (f/fn2 aggregator) 524 | (f/fn2 combiner)) 525 | (.aggregateByKey 526 | rdd 527 | zero 528 | (int partitioner-or-num-partitions) 529 | (f/fn2 aggregator) 530 | (f/fn2 combiner))))) 531 | 532 | 533 | (defn group-by 534 | "Group the elements of `rdd` using a key function `f`. Returns a pair RDD 535 | with each generated key and all matching elements as a value sequence." 536 | (^JavaPairRDD 537 | [f ^JavaRDDLike rdd] 538 | (rdd/set-callsite-name 539 | (.groupBy rdd (f/fn1 f)) 540 | (rdd/fn-name f))) 541 | (^JavaPairRDD 542 | [f num-partitions ^JavaRDDLike rdd] 543 | (rdd/set-callsite-name 544 | (.groupBy rdd (f/fn1 f) (int num-partitions)) 545 | (rdd/fn-name f) 546 | num-partitions))) 547 | 548 | 549 | (defn group-by-key 550 | "Group the entries in the pair `rdd` by key. Returns a new pair RDD with one 551 | entry per key, containing all of the matching values as a sequence." 
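;; Sketch of the aggregation helpers below (assumed `spark`/`rdd` aliases and a
;; live `spark-context`, as elsewhere): a word count with `reduce-by-key` keeps
;; one value per key, whereas `group-by-key` would retain every matching value
;; as a sequence and shuffle more data.
;;
;;   (->> (rdd/parallelize spark-context ["a" "b" "a"])
;;        (spark/map->pairs (fn [w] [w 1]))
;;        (spark/reduce-by-key +)
;;        (spark/into {}))
;;   ;; => {"a" 2, "b" 1}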
552 | (^JavaPairRDD 553 | [^JavaPairRDD rdd] 554 | (rdd/set-callsite-name 555 | (.groupByKey rdd))) 556 | (^JavaPairRDD 557 | [num-partitions ^JavaPairRDD rdd] 558 | (rdd/set-callsite-name 559 | (.groupByKey rdd (int num-partitions)) 560 | num-partitions))) 561 | 562 | 563 | (defn reduce-by-key 564 | "Aggregate the pairs of `rdd` which share a key by combining all of the 565 | values with the reducing function `f`. Returns a new pair RDD with one entry 566 | per unique key, holding the aggregated values." 567 | ^JavaPairRDD 568 | [f ^JavaPairRDD rdd] 569 | (rdd/set-callsite-name 570 | (.reduceByKey rdd (f/fn2 f)) 571 | (rdd/fn-name f))) 572 | 573 | 574 | (defn combine-by-key 575 | "Combine the elements for each key using a set of aggregation functions. 576 | 577 | If `rdd` contains pairs of `(K, V)`, the resulting RDD will contain pairs of 578 | type `(K, C)`. Callers must provide three functions: 579 | - `seq-fn` which turns a V into a C (for example, `vector`) 580 | - `conj-fn` to add a V to a C (for example, `conj`) 581 | - `merge-fn` to combine two C's into a single result" 582 | (^JavaPairRDD 583 | [seq-fn conj-fn merge-fn ^JavaPairRDD rdd] 584 | (rdd/set-callsite-name 585 | (.combineByKey rdd 586 | (f/fn1 seq-fn) 587 | (f/fn2 conj-fn) 588 | (f/fn2 merge-fn)) 589 | (rdd/fn-name seq-fn) 590 | (rdd/fn-name conj-fn) 591 | (rdd/fn-name merge-fn))) 592 | (^JavaPairRDD 593 | [seq-fn conj-fn merge-fn num-partitions ^JavaPairRDD rdd] 594 | (rdd/set-callsite-name 595 | (.combineByKey rdd 596 | (f/fn1 seq-fn) 597 | (f/fn2 conj-fn) 598 | (f/fn2 merge-fn) 599 | (int num-partitions)) 600 | (rdd/fn-name seq-fn) 601 | (rdd/fn-name conj-fn) 602 | (rdd/fn-name merge-fn) 603 | num-partitions))) 604 | 605 | 606 | (defn sort-by-key 607 | "Reorder the elements of `rdd` so that they are sorted according to their 608 | natural order or the given comparator `f` if provided. The result may be 609 | ordered ascending or descending, depending on `ascending?`." 610 | (^JavaPairRDD 611 | [^JavaPairRDD rdd] 612 | (rdd/set-callsite-name 613 | (.sortByKey rdd true))) 614 | (^JavaPairRDD 615 | [ascending? ^JavaPairRDD rdd] 616 | (rdd/set-callsite-name 617 | (.sortByKey rdd (boolean ascending?)) 618 | (boolean ascending?))) 619 | (^JavaPairRDD 620 | [compare-fn ascending? ^JavaPairRDD rdd] 621 | (rdd/set-callsite-name 622 | (.sortByKey rdd 623 | (f/comparator-fn compare-fn) 624 | (boolean ascending?)) 625 | (rdd/fn-name compare-fn) 626 | (boolean ascending?))) 627 | (^JavaPairRDD 628 | [compare-fn ascending? num-partitions ^JavaPairRDD rdd] 629 | (rdd/set-callsite-name 630 | (.sortByKey rdd 631 | (f/comparator-fn compare-fn) 632 | (boolean ascending?) 633 | (int num-partitions)) 634 | (rdd/fn-name compare-fn) 635 | (boolean ascending?) 636 | (int num-partitions)))) 637 | 638 | 639 | ;; ## RDD Actions 640 | 641 | (defn collect 642 | "Collect the elements of `rdd` into a vector on the driver. Be careful not to 643 | realize large datasets with this, as the driver will likely run out of 644 | memory. 645 | 646 | This is an action that causes computation." 647 | [^JavaRDDLike rdd] 648 | (vec (.collect rdd))) 649 | 650 | 651 | (defn into 652 | "Collect the elements of `rdd` into a collection on the driver. Behaves like 653 | `clojure.core/into`, including accepting an optional transducer. 654 | Automatically coerces Scala tuples into Clojure vectors. 655 | 656 | Be careful not to realize large datasets with this, as the driver will likely 657 | run out of memory. 658 | 659 | This is an action that causes computation." 
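;; For example (a sketch; `word-counts` is an assumed pair RDD of [word n]
;; entries): collect only the entries seen more than once, as a Clojure map.
;;
;;   (spark/into {} (filter #(< 1 (second %))) word-counts)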
660 | ([coll ^JavaRDDLike rdd] 661 | (into coll identity rdd)) 662 | ([coll xf ^JavaRDDLike rdd] 663 | (c/into coll 664 | (comp (c/map scala/from-tuple) xf) 665 | (.collect rdd)))) 666 | 667 | 668 | (defn foreach 669 | "Apply the function `f` to all elements of `rdd`. The function will run on 670 | the executors where the data resides. 671 | 672 | Consider `foreach-partition` for efficiency if handling an element requires 673 | costly resource acquisition such as a database connection. 674 | 675 | This is an action that causes computation." 676 | [f ^JavaRDDLike rdd] 677 | (.foreach rdd (f/void-fn f))) 678 | 679 | 680 | (defn foreach-partition 681 | "Apply the function `f` to all elements of `rdd` by calling it with a 682 | sequence of each partition's elements. The function will run on the executors 683 | where the data resides. 684 | 685 | This is an action that causes computation." 686 | [f ^JavaRDDLike rdd] 687 | (.foreachPartition rdd (f/void-fn (comp f iterator-seq)))) 688 | 689 | 690 | (defn count 691 | "Count the number of elements in `rdd`. 692 | 693 | This is an action that causes computation." 694 | [^JavaRDDLike rdd] 695 | (.count rdd)) 696 | 697 | 698 | (defn first 699 | "Find the first element of `rdd`. 700 | 701 | This is an action that causes computation." 702 | [^JavaRDDLike rdd] 703 | (.first rdd)) 704 | 705 | 706 | (defn min 707 | "Find the minimum element in `rdd` in the ordering defined by `compare-fn`. 708 | 709 | This is an action that causes computation." 710 | ([^JavaRDDLike rdd] 711 | (min compare rdd)) 712 | ([compare-fn ^JavaRDDLike rdd] 713 | (.min rdd (f/comparator-fn compare-fn)))) 714 | 715 | 716 | (defn max 717 | "Find the maximum element in `rdd` in the ordering defined by `compare-fn`. 718 | 719 | This is an action that causes computation." 720 | ([^JavaRDDLike rdd] 721 | (max compare rdd)) 722 | ([compare-fn ^JavaRDDLike rdd] 723 | (.max rdd (f/comparator-fn compare-fn)))) 724 | 725 | 726 | (defn take 727 | "Take the first `n` elements of the RDD. 728 | 729 | This currently scans the partitions _one by one_ on the **driver**, so it 730 | will be slow if a lot of elements are required. In that case, use `collect` 731 | to get the whole RDD instead. 732 | 733 | This is an action that causes computation." 734 | [n ^JavaRDDLike rdd] 735 | (.take rdd (int n))) 736 | 737 | 738 | (defn take-ordered 739 | "Take the first `n` (smallest) elements from this RDD as defined by the 740 | elements' natural order or specified comparator. 741 | 742 | This currently scans the partitions _one by one_ on the **driver**, so it 743 | will be slow if a lot of elements are required. In that case, use `collect` 744 | to get the whole RDD instead. 745 | 746 | This is an action that causes computation." 747 | ([n ^JavaRDDLike rdd] 748 | (.takeOrdered rdd (int n))) 749 | ([n compare-fn ^JavaRDDLike rdd] 750 | (.takeOrdered rdd (int n) (f/comparator-fn compare-fn)))) 751 | 752 | 753 | (defn reduce 754 | "Aggregate the elements of `rdd` using the function `f`. The reducing 755 | function must accept two arguments and should be commutative and associative 756 | so that it can be computed correctly in parallel. 757 | 758 | This is an action that causes computation." 759 | [f ^JavaRDDLike rdd] 760 | (.reduce rdd (f/fn2 f))) 761 | 762 | 763 | (defn fold 764 | "Aggregate the elements of each partition in `rdd`, followed by the results 765 | for all the partitions, by using the given associative function `f` and a 766 | neutral `zero` value. 
767 | 768 | This is an action that causes computation." 769 | [f zero ^JavaRDDLike rdd] 770 | (.fold rdd zero (f/fn2 f))) 771 | 772 | 773 | (defn aggregate 774 | "Aggregate the elements of each partition in `rdd` using `aggregator`, then 775 | merge the results for all partitions using `combiner`. Both functions will be 776 | seeded with the neutral `zero` value. 777 | 778 | This is an action that causes computation." 779 | [aggregator combiner zero ^JavaRDDLike rdd] 780 | (.aggregate rdd zero (f/fn2 aggregator) (f/fn2 combiner))) 781 | 782 | 783 | ;; ## Pair RDD Actions 784 | 785 | (defn lookup 786 | "Find all values in the `rdd` pairs whose keys is `k`. The key must be 787 | serializable with the Java serializer (not Kryo) for this to work. 788 | 789 | This is an action that causes computation." 790 | [^JavaPairRDD rdd k] 791 | (vec (.lookup rdd k))) 792 | 793 | 794 | (defn count-by-key 795 | "Count the distinct key values in `rdd`. Returns a map of keys to integer 796 | counts. 797 | 798 | This is an action that causes computation." 799 | [^JavaPairRDD rdd] 800 | (c/into {} (.countByKey rdd))) 801 | 802 | 803 | (defn count-by-value 804 | "Count the distinct values in `rdd`. Returns a map of values to integer 805 | counts. 806 | 807 | This is an action that causes computation." 808 | [^JavaRDDLike rdd] 809 | (c/into {} (.countByValue rdd))) 810 | -------------------------------------------------------------------------------- /sparkplug-core/src/clojure/sparkplug/function.clj: -------------------------------------------------------------------------------- 1 | (ns sparkplug.function 2 | "This namespace generates function classes for various kinds of interop with 3 | Spark and Scala." 4 | (:require 5 | [clojure.string :as str]) 6 | (:import 7 | (java.lang.reflect 8 | Field 9 | Modifier) 10 | java.util.HashSet 11 | sparkplug.function.SerializableFn)) 12 | 13 | 14 | ;; ## Namespace Discovery 15 | 16 | (defn- fn-enclosing-class 17 | "Given a function object, determine the name of the class which the function 18 | is a child of. Usually this is the class representing the namespace where the 19 | function is defined." 20 | [f] 21 | (-> (.getName (class f)) 22 | (Compiler/demunge) 23 | (str/split #"/") 24 | (first) 25 | (symbol))) 26 | 27 | 28 | (defn- class-name? 29 | "True if the provided symbol names a class, rather than a namespace." 30 | [name-sym] 31 | (let [class-name (-> (str name-sym) 32 | (str/replace "-" "_") 33 | (symbol))] 34 | (class? (resolve class-name)))) 35 | 36 | 37 | (defn- type-namespace 38 | "Given a symbol naming a record, return a symbol naming its defining 39 | namespace if it exists." 40 | [name-sym] 41 | (let [ns-sym (-> (str name-sym) 42 | (str/replace #"\.[^.]+$" "") 43 | (symbol))] 44 | (when (find-ns ns-sym) 45 | ns-sym))) 46 | 47 | 48 | (defn- fn-namespace 49 | "Given a function object, derive the name of the namespace where it was 50 | defined." 51 | [f] 52 | ;; The logic here is to avoid marking class names as namespaces to be 53 | ;; required. When using a piece of data as a function, such as a keyword or 54 | ;; set, this will actually be a class name like `clojure.lang.Keyword`. This 55 | ;; also happens when referencing a function closure defined inside of a 56 | ;; record implementation, since the function becomes an inner class; in that 57 | ;; case, we _do_ want to mark the record's defining namespace. 58 | (let [enclosing (fn-enclosing-class f)] 59 | (if (class-name? 
enclosing) 60 | (type-namespace enclosing) 61 | enclosing))) 62 | 63 | 64 | (defn- walk-object-refs 65 | "Walk the given object to find namespaces referenced by vars. Adds discovered 66 | reference symbols to `references` and tracks values in `visited`." 67 | [^HashSet references ^HashSet visited obj] 68 | (when-not (or (nil? obj) 69 | ;; Simple types that can't have namespace references. 70 | (boolean? obj) 71 | (string? obj) 72 | (number? obj) 73 | (keyword? obj) 74 | (symbol? obj) 75 | (instance? clojure.lang.Ref obj) 76 | ;; Nothing to do if we've already visited this object. 77 | (.contains visited obj)) 78 | (.add visited obj) 79 | (cond 80 | ;; Vars directly represent a namespace dependency. 81 | (var? obj) 82 | (let [ns-sym (ns-name (:ns (meta obj)))] 83 | (.add references ns-sym)) 84 | 85 | ;; Clojure functions: 86 | ;; Try to derive the namespace that defined the function. 87 | ;; Functions also have Var references as static fields, 88 | ;; and have closed-over objects as non-static fields. 89 | (fn? obj) 90 | (when-let [ns-sym (fn-namespace obj)] 91 | (.add references ns-sym) 92 | (doseq [^Field field (.getDeclaredFields (class obj))] 93 | (let [value (SerializableFn/accessField obj field)] 94 | (walk-object-refs references visited value)))) 95 | 96 | ;; For collection-like objects, (e.g. vectors, maps, records, Java collections), 97 | ;; just traverse the objects they contain. 98 | (seqable? obj) 99 | (doseq [entry obj] 100 | (walk-object-refs references visited entry)) 101 | 102 | ;; Otherwise, reflectively traverse the fields of the object for more references. 103 | :else 104 | (doseq [^Field field (.getDeclaredFields (class obj))] 105 | (when-not (Modifier/isStatic (.getModifiers field)) 106 | (let [value (SerializableFn/accessField obj field)] 107 | (walk-object-refs references visited value))))))) 108 | 109 | 110 | (defn namespace-references 111 | "Walk the given function-like object to find all namespaces referenced by 112 | closed-over vars. Returns a set of referenced namespace symbols." 113 | [^Object obj] 114 | (let [references (HashSet.) 115 | visited (HashSet.)] 116 | (walk-object-refs references visited obj) 117 | (disj (set references) 'clojure.core))) 118 | 119 | 120 | ;; ## Function Wrappers 121 | 122 | (defmacro ^:private gen-function 123 | "Generate a new constructor for functions of the `fn-name` class that extends 124 | `SerializableFn` and implements interfaces for compatibility with Spark." 125 | [fn-name constructor] 126 | (let [class-sym (symbol (str "sparkplug.function." fn-name))] 127 | `(defn ~(vary-meta constructor assoc :tag class-sym) 128 | ~(str "Construct a new serializable " fn-name " function wrapping `f`.") 129 | [~'f] 130 | (let [references# (namespace-references ~'f)] 131 | (new ~class-sym ~'f (mapv str references#)))))) 132 | 133 | 134 | (gen-function Fn1 fn1) 135 | (gen-function Fn2 fn2) 136 | (gen-function Fn3 fn3) 137 | (gen-function ComparatorFn comparator-fn) 138 | (gen-function FlatMapFn1 flat-map-fn) 139 | (gen-function FlatMapFn2 flat-map-fn2) 140 | (gen-function PairFlatMapFn pair-flat-map-fn) 141 | (gen-function PairFn pair-fn) 142 | (gen-function VoidFn void-fn) 143 | -------------------------------------------------------------------------------- /sparkplug-core/src/clojure/sparkplug/kryo.clj: -------------------------------------------------------------------------------- 1 | (ns sparkplug.kryo 2 | "Functions for managing object serialization with Kryo. 
3 | 4 | To configure a new Kryo instance, this class looks up all resources in 5 | directories named `sparkplug/kryo/registry/` on the classpath. The files are 6 | read in sorted order, one line at a time. Each line should be tab-separated 7 | and begin with the desired action: 8 | 9 | - `require {{namespace}}` 10 | Require a namespace to load code or for other side effects. 11 | - `register {{class}}` 12 | Register the named class with default serialization. The class name may be 13 | suffixed with `[]` pairs to indicate array class types. 14 | - `register {{class}} {{serializer}}` 15 | Register the named class with the given serializer. The serializer may 16 | either be the name of a class to instantiate with the default constructor, 17 | or a qualified function var be resolved and called with no arguments to 18 | return a `Serializer` instance. 19 | - `configure {{config-fn}}` 20 | Resolve the named function and call it on the Kryo instance to directly 21 | configure it. 22 | 23 | Blank lines or lines beginning with a hash (#) are ignored." 24 | (:require 25 | [clojure.java.classpath :as classpath] 26 | [clojure.java.io :as io] 27 | [clojure.string :as str] 28 | [clojure.tools.logging :as log]) 29 | (:import 30 | (clojure.lang 31 | BigInt 32 | IPersistentMap 33 | IPersistentSet 34 | IPersistentVector 35 | ISeq 36 | Keyword 37 | Named 38 | PersistentTreeMap 39 | PersistentTreeSet 40 | Ratio 41 | StringSeq 42 | Var) 43 | (com.esotericsoftware.kryo 44 | Kryo 45 | Serializer) 46 | (com.esotericsoftware.kryo.io 47 | Input 48 | Output) 49 | java.io.File 50 | java.math.BigInteger 51 | (java.util.jar 52 | JarFile) 53 | org.objenesis.strategy.StdInstantiatorStrategy)) 54 | 55 | 56 | ;; ## Registry Files 57 | 58 | (def ^:const registry-prefix 59 | "SparkPlug registry files must be available under this directory path." 60 | "sparkplug/kryo/registry") 61 | 62 | 63 | (def ^:const registry-extension 64 | "SparkPlug registry file extension." 65 | ".conf") 66 | 67 | 68 | (defn- registry-path? 69 | "True if the given path is a valid registry file name." 70 | [path] 71 | (and (str/starts-with? path registry-prefix) 72 | (str/ends-with? path registry-extension))) 73 | 74 | 75 | (defn- relative-suffix 76 | "Return the suffix in `b` if it is prefixed by `a`." 77 | [a b] 78 | (let [a (str a "/") 79 | b (str b)] 80 | (when (str/starts-with? b a) 81 | (subs b (count a))))) 82 | 83 | 84 | (defn- read-dir-file 85 | "Read a file from the given directory. Returns a map of registry data." 86 | [^File dir path] 87 | (let [file (io/file dir path)] 88 | (log/debug "Reading registry configuration from file" (str file)) 89 | {:path (str dir) 90 | :name path 91 | :text (slurp file)})) 92 | 93 | 94 | (defn- find-dir-files 95 | "Find all files in the given directory matching the registry prefix." 96 | [^File dir] 97 | (->> (file-seq dir) 98 | (keep (partial relative-suffix dir)) 99 | (filter registry-path?) 100 | (sort) 101 | (map (partial read-dir-file dir)))) 102 | 103 | 104 | (defn- read-jar-entry 105 | "Read an entry in the given jar. Returns a map of registry data if the entry 106 | is in the jar." 107 | [^JarFile jar entry-name] 108 | (when-let [entry (.getEntry jar entry-name)] 109 | (log/debugf "Reading registry configuration from JAR entry %s!%s" 110 | (.getName jar) entry-name) 111 | {:path (.getName jar) 112 | :name entry-name 113 | :text (slurp (.getInputStream jar entry))})) 114 | 115 | 116 | (defn- find-jar-entries 117 | "Find all entries in the given JAR file matching the registry prefix." 
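;; An illustrative registry file matching the format described in the namespace
;; docstring above (hypothetical path resources/sparkplug/kryo/registry/my-app.conf;
;; fields are separated by tab characters):
;;
;;   # load application code before registering classes
;;   require	my.app.types
;;   register	my.app.types.Widget
;;   register	java.time.Instant[]
;;   register	my.app.types.Gadget	my.app.kryo/gadget-serializer
;;   configure	my.app.kryo/tune-kryo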
118 | [^JarFile jar] 119 | (->> (classpath/filenames-in-jar jar) 120 | (filter registry-path?) 121 | (sort) 122 | (keep (partial read-jar-entry jar)))) 123 | 124 | 125 | (defn- find-classpath-files 126 | "Find all config files on the classpath within the registry prefix." 127 | [] 128 | (concat (mapcat find-dir-files (classpath/classpath-directories)) 129 | (mapcat find-jar-entries (classpath/classpath-jarfiles)))) 130 | 131 | 132 | (defn- parse-registry-line 133 | "Parse a line from a registry file. Returns a map of information with the 134 | given line number as `:line`, an action `:type` keyword, and any remaining 135 | `:args` as a sequence of strings. Returns nil if the line is blank or a 136 | comment." 137 | [line-number line] 138 | (when-not (or (str/blank? line) 139 | (str/starts-with? line "#")) 140 | (let [[action-type & args] (str/split line #"\t")] 141 | {:line line-number 142 | :type (keyword action-type) 143 | :args (vec args)}))) 144 | 145 | 146 | (defn- parse-registry-actions 147 | "Parse the text content of the given registry data map. Returns an updated map 148 | with `:text` removed and `:actions` set to the parsed lines." 149 | [registry] 150 | (let [actions (->> 151 | (:text registry) 152 | (str/split-lines) 153 | (map-indexed parse-registry-line) 154 | (remove nil?) 155 | (vec))] 156 | (-> registry 157 | (assoc :actions actions) 158 | (dissoc :text)))) 159 | 160 | 161 | (defn classpath-registries 162 | "Return a sequence of registry file maps from the classpath. Returns a sorted 163 | sequence with a single entry per distinct config name. Files earlier on the 164 | classpath will take precedence." 165 | [] 166 | (->> 167 | (find-classpath-files) 168 | (map (juxt :name parse-registry-actions)) 169 | (reverse) 170 | (into (sorted-map)) 171 | (vals))) 172 | 173 | 174 | ;; ## Registry Actions 175 | 176 | (defn- load-require-action 177 | "Prepare a `require` action from a registry. Requires the namespace and 178 | returns nil." 179 | [args] 180 | ;; Check arguments. 181 | (when-not (= 1 (count args)) 182 | (throw (ex-info (str "require action takes exactly one argument, not " 183 | (count args)) 184 | {:type ::bad-action}))) 185 | (when (str/includes? (first args) "/") 186 | (throw (ex-info "require action argument should not be namespaced" 187 | {:type ::bad-action}))) 188 | ;; Require the namespace code. 189 | (let [ns-sym (symbol (first args))] 190 | (log/debug "Requiring namespace" ns-sym) 191 | (require ns-sym)) 192 | ;; Nothing to do per-kryo instance afterwards. 193 | nil) 194 | 195 | 196 | (defn- convert-array-class 197 | "Determine the base class and number of nested arrays for a class name like 198 | `String[][]`. Returns a rewritten string in a form that the classloader will 199 | understand like `[[LString;`." 200 | [class-name] 201 | (loop [class-name class-name 202 | arrays 0] 203 | (if (str/ends-with? class-name "[]") 204 | (recur (subs class-name 0 (- (count class-name) 2)) 205 | (inc arrays)) 206 | (if (zero? arrays) 207 | class-name 208 | (str (str/join (repeat arrays \[)) 209 | "L" class-name ";"))))) 210 | 211 | 212 | (defn- load-register-action 213 | "Prepare a `register` action from a registry at load-time. Loads the class to 214 | register and any serialzer and returns a function which will register the 215 | class with a Kryo instance." 
216 | [args] 217 | (when-not (<= 1 (count args) 2) 218 | (throw (ex-info (str "register action takes one or two arguments, not " 219 | (count args)) 220 | {:type ::bad-action}))) 221 | (when (and (second args) (not (str/includes? (second args) "/"))) 222 | (throw (ex-info "register action serializer should be a namespaced symbol" 223 | {:type ::bad-action}))) 224 | (let [[class-name serializer-name] args] 225 | (log/debugf "Registering class %s with %s serializer" 226 | class-name 227 | (or serializer-name "default")) 228 | ;; Load the class to register. 229 | (let [target-class (Class/forName (convert-array-class class-name))] 230 | (if serializer-name 231 | (if (str/includes? serializer-name "/") 232 | ;; Resolve the named function to construct a new serializer instance. 233 | (if-let [constructor (requiring-resolve (symbol serializer-name))] 234 | (fn register 235 | [^Kryo kryo] 236 | (let [serializer ^Serializer (constructor)] 237 | (.register kryo target-class serializer))) 238 | (throw (ex-info (str "Could not resolve serializer constructor function " 239 | serializer-name) 240 | {:type ::bad-action}))) 241 | ;; Assume the serializer is a class name and construct an instance. 242 | (let [serializer-class (Class/forName serializer-name)] 243 | (fn register 244 | [^Kryo kryo] 245 | (let [serializer ^Serializer (.newInstance serializer-class)] 246 | (.register kryo target-class serializer))))) 247 | ;; No serializer, register with defaults. 248 | (fn register 249 | [^Kryo kryo] 250 | (.register kryo target-class)))))) 251 | 252 | 253 | (defn- load-configure-action 254 | "Prepare a `configure` action from a registry at load-time. Resolves the 255 | configuration function and returns it." 256 | [args] 257 | (when-not (= 1 (count args)) 258 | (throw (ex-info (str "configure action takes exactly one argument, not " 259 | (count args)) 260 | {:type ::bad-action}))) 261 | (when-not (str/includes? (first args) "/") 262 | (throw (ex-info "configure action function should be a namespaced symbol" 263 | {:type ::bad-action}))) 264 | (let [var-name (symbol (first args))] 265 | (log/debug "Configuring Kryo with function" var-name) 266 | (or (requiring-resolve var-name) 267 | (throw (ex-info (str "Could not resolve configuration function " 268 | var-name) 269 | {:type ::bad-action}))))) 270 | 271 | 272 | (defn- load-action 273 | "Load the configuration `action` as read from the given `registry`. 274 | Dispatches on action type." 275 | [registry action] 276 | (let [{:keys [path name]} registry 277 | {:keys [line type args]} action] 278 | (try 279 | (case type 280 | :require 281 | (load-require-action args) 282 | 283 | :register 284 | (load-register-action args) 285 | 286 | :configure 287 | (load-configure-action args) 288 | 289 | (throw (ex-info (str "Unsupported registry action " (pr-str type)) 290 | {:type ::bad-action}))) 291 | (catch Exception ex 292 | (let [message (format "Failed to load %s action on line %s of %s in %s" 293 | (clojure.core/name type) line name path) 294 | cause (when (not= ::bad-action (:type (ex-data ex))) 295 | ex)] 296 | (log/error message (ex-message ex)) 297 | (throw (ex-info (str message ": " (ex-message ex)) 298 | {:path path 299 | :name name 300 | :line line 301 | :type type 302 | :args args} 303 | cause))))))) 304 | 305 | 306 | (defn- load-registry 307 | "Process the given registry file map and returns a sequence of all 308 | loaded configuration functions." 
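;; A hypothetical target for a `configure` registry action, as resolved by
;; `load-configure-action` above: any function of one Kryo argument works.
;;
;;   (ns my.app.kryo)
;;
;;   (defn tune-kryo
;;     "Example configuration hook referenced from a registry file."
;;     [^com.esotericsoftware.kryo.Kryo kryo]
;;     (.setRegistrationRequired kryo false))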
309 | [registry] 310 | (log/debugf "Loading registry %s in %s" (:name registry) (:path registry)) 311 | (into [] 312 | (keep (partial load-action registry)) 313 | (:actions registry))) 314 | 315 | 316 | (defn load-configuration 317 | "Walk the classpath and load configuration actions from all discovered 318 | registries. Returns a function which can be called on a Kryo serializer to 319 | configure it." 320 | [] 321 | (let [actions (into [] (mapcat load-registry) (classpath-registries))] 322 | (fn configure! 323 | [^Kryo kryo] 324 | (.setInstantiatorStrategy kryo (StdInstantiatorStrategy.)) 325 | (doseq [f actions] 326 | (f kryo))))) 327 | 328 | 329 | (defn initialize 330 | "Creates a new Kryo instance and configures it with classpath registry 331 | actions." 332 | ^Kryo 333 | [] 334 | (let [configure! (load-configuration)] 335 | (doto (Kryo.) 336 | (configure!)))) 337 | 338 | 339 | ;; ## Serialization Logic 340 | 341 | ;; For types that are already registered with efficient serializers, see: 342 | ;; https://github.com/EsotericSoftware/kryo/blob/master/src/com/esotericsoftware/kryo/Kryo.java 343 | ;; https://github.com/twitter/chill/blob/v0.9.3/chill-java/src/main/java/com/twitter/chill/java/PackageRegistrar.java 344 | ;; https://github.com/twitter/chill/blob/v0.9.3/chill-scala/src/main/scala/com/twitter/chill/ScalaKryoInstantiator.scala 345 | 346 | (defmacro defserializer 347 | "Define a new constructor for a Kryo Serializer with the given `write` and 348 | `read` method implementations." 349 | [name-sym class-sym immutable? & body] 350 | ;; TODO: a spec for this macro would be better than these assertions 351 | {:pre [(symbol? name-sym) 352 | (symbol? class-sym) 353 | (boolean? immutable?) 354 | (= 2 (count body)) 355 | (every? list? body) 356 | (= #{'read 'write} (set (map first body)))]} 357 | (let [tagged #(vary-meta %1 assoc :tag (if (instance? Class %2) 358 | (.getName ^Class %2) 359 | (str %2))) 360 | name-sym (tagged name-sym Serializer) 361 | body-methods (into {} (map (juxt first identity)) body) 362 | write-form (get body-methods 'write) 363 | read-form (get body-methods 'read)] 364 | `(defn ~name-sym 365 | ~(str "Construct a new Kryo serializer for " class-sym " values.") 366 | [] 367 | (proxy [Serializer] [false ~immutable?] 368 | 369 | (write 370 | ~(let [[kryo-sym output-sym value-sym] (second write-form)] 371 | [(tagged kryo-sym Kryo) 372 | (tagged output-sym Output) 373 | (tagged value-sym class-sym)]) 374 | ~@(nnext write-form)) 375 | 376 | (read 377 | ~(let [[kryo-sym input-sym target-sym] (second read-form)] 378 | [(tagged kryo-sym Kryo) 379 | (tagged input-sym Input) 380 | (tagged target-sym Class)]) 381 | ~@(nnext read-form)))))) 382 | 383 | 384 | ;; ### Core Serializers 385 | 386 | (defserializer ident-serializer 387 | Named true 388 | 389 | (write 390 | [kryo output value] 391 | (let [named-str (if (keyword? value) 392 | (subs (str value) 1) 393 | (str value))] 394 | (.writeString output named-str))) 395 | 396 | (read 397 | [kryo input target-class] 398 | (let [named-str (.readString input)] 399 | (if (identical? Keyword target-class) 400 | (keyword named-str) 401 | (symbol named-str))))) 402 | 403 | 404 | (defn- write-biginteger 405 | "Write a BigInteger to the Kryo output." 406 | [^Output output ^BigInteger value] 407 | (let [int-bytes (.toByteArray value)] 408 | (.writeVarInt output (alength int-bytes) true) 409 | (.writeBytes output int-bytes))) 410 | 411 | 412 | (defn- read-biginteger 413 | "Read a BigInteger value from the Kryo input." 
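;; REPL sketch of this namespace's helpers (assumes the bundled registry files
;; under sparkplug/kryo/registry are on the classpath; `encode` and `decode`
;; are defined near the end of this file):
;;
;;   (def kryo (initialize))
;;   (decode kryo (encode kryo {:a [1 2/3 'b]}))
;;   ;; => {:a [1 2/3 b]}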
414 | [^Input input] 415 | (let [length (.readVarInt input true) 416 | int-bytes (.readBytes input length)] 417 | (BigInteger. int-bytes))) 418 | 419 | 420 | (defserializer bigint-serializer 421 | BigInt true 422 | 423 | (write 424 | [kryo output value] 425 | (write-biginteger output (biginteger value))) 426 | 427 | (read 428 | [kryo input _] 429 | (bigint (read-biginteger input)))) 430 | 431 | 432 | (defserializer ratio-serializer 433 | Ratio true 434 | 435 | (write 436 | [kryo output value] 437 | (write-biginteger output (numerator value)) 438 | (write-biginteger output (denominator value))) 439 | 440 | (read 441 | [kryo input _] 442 | (/ (read-biginteger input) 443 | (read-biginteger input)))) 444 | 445 | 446 | (defserializer var-serializer 447 | Var false 448 | 449 | (write 450 | [kryo output value] 451 | (.writeString output (str (symbol value)))) 452 | 453 | (read 454 | [kryo input _] 455 | (let [var-sym (symbol (.readString input))] 456 | (requiring-resolve var-sym)))) 457 | 458 | 459 | ;; ### Sequence Serializers 460 | 461 | (defn- write-sequence 462 | "Write a sequence of values to the Kryo output." 463 | [^Kryo kryo ^Output output coll] 464 | (.writeVarInt output (count coll) true) 465 | (doseq [x coll] 466 | (.writeClassAndObject kryo output x))) 467 | 468 | 469 | (defn- read-sequence 470 | "Read a lazy sequence of values from the Kryo output." 471 | [^Kryo kryo ^Input input] 472 | (let [length (.readVarInt input true)] 473 | (repeatedly length #(.readClassAndObject kryo input)))) 474 | 475 | 476 | (defserializer sequence-serializer 477 | ISeq true 478 | 479 | (write 480 | [kryo output coll] 481 | (write-sequence kryo output coll)) 482 | 483 | (read 484 | [kryo input _] 485 | (apply list (read-sequence kryo input)))) 486 | 487 | 488 | (defserializer vector-serializer 489 | IPersistentVector true 490 | 491 | (write 492 | [kryo output coll] 493 | (write-sequence kryo output coll)) 494 | 495 | (read 496 | [kryo input _] 497 | (into [] (read-sequence kryo input)))) 498 | 499 | 500 | (defserializer string-seq-serializer 501 | StringSeq true 502 | 503 | (write 504 | [kryo output coll] 505 | (.writeString output (str/join coll))) 506 | 507 | (read 508 | [kryo input _] 509 | (seq (.readString input)))) 510 | 511 | 512 | ;; ### Set Serializers 513 | 514 | (defserializer set-serializer 515 | IPersistentSet true 516 | 517 | (write 518 | [kryo output coll] 519 | (write-sequence kryo output coll)) 520 | 521 | (read 522 | [kryo input _] 523 | (into #{} (read-sequence kryo input)))) 524 | 525 | 526 | (defserializer ordered-set-serializer 527 | PersistentTreeSet true 528 | 529 | (write 530 | [kryo output coll] 531 | (.writeClassAndObject kryo output (.comparator coll)) 532 | (write-sequence kryo output coll)) 533 | 534 | (read 535 | [kryo input _] 536 | (let [cmp (.readClassAndObject kryo input)] 537 | (into (sorted-set-by cmp) (read-sequence kryo input))))) 538 | 539 | 540 | ;; ### Map Serializers 541 | 542 | (defn- write-kvs 543 | "Write a sequence of key/value pairs to the Kryo output." 544 | [^Kryo kryo ^Output output coll] 545 | (.writeVarInt output (count coll) true) 546 | (doseq [[k v] coll] 547 | (.writeClassAndObject kryo output k) 548 | (.writeClassAndObject kryo output v))) 549 | 550 | 551 | (defn- read-kvs 552 | "Read a lazy sequence of key/value pairs from the Kryo output." 553 | [^Kryo kryo ^Input input] 554 | (let [length (.readVarInt input true)] 555 | (repeatedly length #(clojure.lang.MapEntry. 
556 | (.readClassAndObject kryo input) 557 | (.readClassAndObject kryo input))))) 558 | 559 | 560 | (defserializer map-serializer 561 | IPersistentMap true 562 | 563 | (write 564 | [kryo output coll] 565 | (write-kvs kryo output coll)) 566 | 567 | (read 568 | [kryo input _] 569 | (into {} (read-kvs kryo input)))) 570 | 571 | 572 | (defserializer ordered-map-serializer 573 | PersistentTreeMap true 574 | 575 | (write 576 | [kryo output coll] 577 | (.writeClassAndObject kryo output (.comparator coll)) 578 | (write-kvs kryo output coll)) 579 | 580 | (read 581 | [kryo input _] 582 | (let [cmp (.readClassAndObject kryo input)] 583 | (into (sorted-map-by cmp) (read-kvs kryo input))))) 584 | 585 | 586 | ;; ## Serialization Utilities 587 | 588 | ;; These are handy for tests and repl usage, but aren't actually used directly 589 | ;; by the library. 590 | 591 | (defn encode 592 | "Serialize the given object into a byte array using the Kryo codec." 593 | ^bytes 594 | [^Kryo kryo obj] 595 | (let [output (Output. 512 8192)] 596 | (.writeClassAndObject kryo output obj) 597 | (.toBytes output))) 598 | 599 | 600 | (defn decode 601 | "Deserialize the given byte array using the Kryo codec." 602 | [^Kryo kryo ^bytes data] 603 | (let [input (Input. data)] 604 | (.readClassAndObject kryo input))) 605 | -------------------------------------------------------------------------------- /sparkplug-core/src/clojure/sparkplug/rdd.clj: -------------------------------------------------------------------------------- 1 | (ns sparkplug.rdd 2 | "This namespace provides the main API for writing Spark tasks. 3 | 4 | Most operations in this namespace place the RDD last in the argument list, 5 | just like Clojure collection functions. This lets you compose them using the 6 | thread-last macro (`->>`), making it simple to migrate existing Clojure 7 | code." 8 | (:refer-clojure :exclude [empty name partition-by]) 9 | (:require 10 | [clojure.string :as str] 11 | [sparkplug.function :as f] 12 | [sparkplug.scala :as scala]) 13 | (:import 14 | clojure.lang.Compiler 15 | (org.apache.spark 16 | HashPartitioner 17 | Partitioner) 18 | (org.apache.spark.api.java 19 | JavaPairRDD 20 | JavaRDD 21 | JavaRDDLike 22 | JavaSparkContext 23 | StorageLevels) 24 | sparkplug.partition.FnHashPartitioner)) 25 | 26 | 27 | ;; ## Naming Functions 28 | 29 | ;; Type hints are omitted because `name` is not included in JavaRDDLike. 30 | (defn name 31 | "Return the current name for `rdd`." 32 | [rdd] 33 | (.name rdd)) 34 | 35 | 36 | ;; Type hints are omitted because `setName` is not included in JavaRDDLike. 37 | (defn set-name 38 | "Set the name of `rdd` to `name-str`." 39 | ^JavaRDDLike 40 | [name-str rdd] 41 | (.setName rdd name-str)) 42 | 43 | 44 | (defn- internal-call? 45 | "True if a stack-trace element should be ignored because it represents an internal 46 | function call that should not be considered a callsite." 47 | [^StackTraceElement element] 48 | (let [class-name (.getClassName element)] 49 | (or (str/starts-with? class-name "sparkplug.") 50 | (str/starts-with? class-name "clojure.lang.")))) 51 | 52 | 53 | (defn- stack-callsite 54 | "Find the top element in the current stack trace that is not an internal 55 | function call." 56 | ^StackTraceElement 57 | [] 58 | (first (remove internal-call? (.getStackTrace (Exception.))))) 59 | 60 | 61 | (defn ^:no-doc fn-name 62 | "Return the (unmangled) name of the given Clojure function."
63 | [f] 64 | (Compiler/demunge (.getName (class f)))) 65 | 66 | 67 | (defn- callsite-name 68 | "Generate a name for the callsite of this function by looking at the current 69 | stack. Ignores core Clojure and internal function frames." 70 | [] 71 | (let [callsite (stack-callsite) 72 | filename (.getFileName callsite) 73 | classname (.getClassName callsite) 74 | line-number (.getLineNumber callsite)] 75 | (format "%s %s:%d" (Compiler/demunge classname) filename line-number))) 76 | 77 | 78 | (defn ^:no-doc set-callsite-name 79 | "Provide a name for the given RDD by looking at the current stack. Returns 80 | the updated RDD if the name could be determined." 81 | ^JavaRDD 82 | [^JavaRDD rdd & args] 83 | (try 84 | (let [rdd-name (format "#<%s: %s %s>" 85 | (.getSimpleName (class rdd)) 86 | (callsite-name) 87 | (if (seq args) 88 | (str " [" (str/join ", " args) "]") 89 | ""))] 90 | (.setName rdd rdd-name)) 91 | (catch Exception _ 92 | ;; Ignore errors and return an unnamed RDD. 93 | rdd))) 94 | 95 | 96 | ;; ## Dataset Construction 97 | 98 | (defn empty 99 | "Construct a new empty RDD." 100 | ^JavaRDD 101 | [^JavaSparkContext spark-context] 102 | (.emptyRDD spark-context)) 103 | 104 | 105 | (defn parallelize 106 | "Distribute a local collection to form an RDD. Optionally accepts a number 107 | of partitions to slice the collection into." 108 | (^JavaRDD 109 | [^JavaSparkContext spark-context coll] 110 | (set-callsite-name 111 | (.parallelize spark-context coll))) 112 | (^JavaRDD 113 | [^JavaSparkContext spark-context min-partitions coll] 114 | (set-callsite-name 115 | (.parallelize spark-context coll min-partitions) 116 | min-partitions))) 117 | 118 | 119 | (defn parallelize-pairs 120 | "Distributes a local collection to form a pair RDD. Optionally accepts a 121 | number of partitions to slice the collection into." 122 | ^JavaPairRDD 123 | ([^JavaSparkContext spark-context coll] 124 | (set-callsite-name 125 | (.parallelizePairs 126 | spark-context 127 | (map scala/to-pair coll)))) 128 | ^JavaPairRDD 129 | ([^JavaSparkContext spark-context min-partitions coll] 130 | (set-callsite-name 131 | (.parallelizePairs 132 | spark-context 133 | (map scala/to-pair coll) 134 | min-partitions) 135 | min-partitions))) 136 | 137 | 138 | (defn binary-files 139 | "Read a directory of binary files from the given URL as a pair RDD of paths 140 | to byte streams." 141 | ^JavaPairRDD 142 | ([^JavaSparkContext spark-context path] 143 | (.binaryFiles spark-context path)) 144 | ^JavaPairRDD 145 | ([^JavaSparkContext spark-context path num-partitions] 146 | (.binaryFiles spark-context path (int num-partitions)))) 147 | 148 | 149 | (defn text-file 150 | "Read a text file from a URL into an RDD of the lines in the file. Optionally 151 | accepts a number of partitions to slice the file into." 152 | ^JavaRDD 153 | ([^JavaSparkContext spark-context filename] 154 | (.textFile spark-context filename)) 155 | ^JavaRDD 156 | ([^JavaSparkContext spark-context min-partitions filename] 157 | (.textFile spark-context filename min-partitions))) 158 | 159 | 160 | (defn whole-text-files 161 | "Read a directory of text files from a URL into an RDD. Each element of the 162 | RDD is a pair of the file path and the full contents of the file." 
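;; Hedged example of the file helpers above (the paths are illustrative
;; assumptions only):
;;
;;   (->> (text-file spark-context "hdfs:///data/logs/*.txt")
;;        (save-as-text-file "hdfs:///tmp/logs-copy"))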
163 | (^JavaPairRDD 164 | [^JavaSparkContext spark-context filename] 165 | (.wholeTextFiles spark-context filename)) 166 | (^JavaPairRDD 167 | [^JavaSparkContext spark-context min-partitions filename] 168 | (.wholeTextFiles spark-context filename min-partitions))) 169 | 170 | 171 | (defn save-as-text-file 172 | "Write the elements of `rdd` as a text file (or set of text files) in a given 173 | directory `path` in the local filesystem, HDFS or any other Hadoop-supported 174 | file system. Spark will call toString on each element to convert it to a line 175 | of text in the file." 176 | [path ^JavaRDDLike rdd] 177 | (.saveAsTextFile rdd (str path))) 178 | 179 | 180 | ;; ## Partitioning Logic 181 | 182 | (defn hash-partitioner 183 | "Construct a partitioner which will hash keys to distribute them uniformly 184 | over `n` buckets. Optionally accepts a `key-fn` which will be called on each 185 | key before hashing it." 186 | (^Partitioner 187 | [n] 188 | (HashPartitioner. (int n))) 189 | (^Partitioner 190 | [key-fn n] 191 | (FnHashPartitioner. (int n) (f/fn1 key-fn)))) 192 | 193 | 194 | (defn partitions 195 | "Return a vector of the partitions in `rdd`." 196 | [^JavaRDDLike rdd] 197 | (into [] (.partitions (.rdd rdd)))) 198 | 199 | 200 | (defn num-partitions 201 | "Returns the number of partitions in `rdd`." 202 | [^JavaRDDLike rdd] 203 | (.getNumPartitions rdd)) 204 | 205 | 206 | (defn partitioner 207 | "Return the partitioner associated with `rdd`, or nil if there is no custom 208 | partitioner." 209 | [^JavaRDDLike rdd] 210 | (scala/resolve-option 211 | (.partitioner (.rdd rdd)))) 212 | 213 | 214 | (defn partition-by 215 | "Return a copy of `rdd` partitioned by the given `partitioner`." 216 | [^Partitioner partitioner ^JavaPairRDD rdd] 217 | (set-callsite-name 218 | (.partitionBy rdd partitioner) 219 | (.getName (class partitioner)))) 220 | 221 | 222 | ;; Type hints are omitted because `repartition` is not included in JavaRDDLike. 223 | (defn repartition 224 | "Returns a new `rdd` with exactly `n` partitions. 225 | 226 | This method can increase or decrease the level of parallelism in this RDD. 227 | Internally, this uses a shuffle to redistribute data. 228 | 229 | If you are decreasing the number of partitions in this RDD, consider using 230 | `coalesce`, which can avoid performing a shuffle." 231 | ^JavaRDDLike 232 | [n rdd] 233 | (set-callsite-name 234 | (.repartition rdd (int n)) 235 | (int n))) 236 | 237 | 238 | (defn repartition-and-sort-within-partitions 239 | "Repartition the RDD according to the given partitioner and, within each 240 | resulting partition, sort records by their keys. This is more efficient than 241 | calling repartition and then sorting within each partition because it can 242 | push the sorting down into the shuffle machinery." 243 | (^JavaPairRDD 244 | [^Partitioner partitioner ^JavaPairRDD pair-rdd] 245 | (.repartitionAndSortWithinPartitions pair-rdd partitioner)) 246 | (^JavaPairRDD 247 | [^Partitioner partitioner ^java.util.Comparator comparator ^JavaPairRDD pair-rdd] 248 | (.repartitionAndSortWithinPartitions pair-rdd partitioner comparator))) 249 | 250 | 251 | ;; Type hints are omitted because `coalesce` is not included in JavaRDDLike. 252 | (defn coalesce 253 | "Decrease the number of partitions in `rdd` to `n`. Useful for running 254 | operations more efficiently after filtering down a large dataset." 255 | ([num-partitions rdd] 256 | (coalesce num-partitions false rdd)) 257 | ([num-partitions shuffle? 
rdd] 258 | (set-callsite-name 259 | (.coalesce rdd (int num-partitions) (boolean shuffle?)) 260 | (int num-partitions) 261 | (boolean shuffle?)))) 262 | 263 | 264 | ;; ## Storage Management 265 | 266 | (def storage-levels 267 | "Keyword mappings for available RDD storage levels." 268 | {:memory-only StorageLevels/MEMORY_ONLY 269 | :memory-only-ser StorageLevels/MEMORY_ONLY_SER 270 | :memory-and-disk StorageLevels/MEMORY_AND_DISK 271 | :memory-and-disk-ser StorageLevels/MEMORY_AND_DISK_SER 272 | :disk-only StorageLevels/DISK_ONLY 273 | :memory-only-2 StorageLevels/MEMORY_ONLY_2 274 | :memory-only-ser-2 StorageLevels/MEMORY_ONLY_SER_2 275 | :memory-and-disk-2 StorageLevels/MEMORY_AND_DISK_2 276 | :memory-and-disk-ser-2 StorageLevels/MEMORY_AND_DISK_SER_2 277 | :disk-only-2 StorageLevels/DISK_ONLY_2 278 | :none StorageLevels/NONE}) 279 | 280 | 281 | (defn storage-level 282 | "Return the keyword representing the storage level in the `storage-levels` 283 | map, or the raw value if not found." 284 | [^JavaRDD rdd] 285 | (let [level (.getStorageLevel rdd)] 286 | (or (->> storage-levels 287 | (filter #(= level (val %))) 288 | (map key) 289 | (first)) 290 | level))) 291 | 292 | 293 | ;; Type hints are omitted because `cache` and `persist` are not included in 294 | ;; JavaRDDLike. 295 | (defn cache! 296 | "Sets the storage level of `rdd` to persist its values across operations 297 | after the first time it is computed. By default, this uses the `:memory-only` 298 | level, but an alternate may be specified by `level`. 299 | 300 | This can only be used to assign a new storage level if the RDD does not have 301 | a storage level set already." 302 | ([rdd] 303 | (.cache rdd)) 304 | ([level rdd] 305 | {:pre [(contains? storage-levels level)]} 306 | (.persist rdd (get storage-levels level)))) 307 | 308 | 309 | ;; Type hints are omitted because `unpersist` is not included in JavaRDDLike. 310 | (defn uncache! 311 | "Mark `rdd` as non-persistent, and remove all blocks for it from memory and 312 | disk. Blocks until all data has been removed unless `blocking?` is provided 313 | and false." 314 | ([rdd] 315 | (.unpersist rdd)) 316 | ([blocking? rdd] 317 | (.unpersist rdd (boolean blocking?)))) 318 | 319 | 320 | (defn checkpointed? 321 | "True if `rdd` has been marked for checkpointing." 322 | [^JavaRDDLike rdd] 323 | (.isCheckpointed rdd)) 324 | 325 | 326 | (defn checkpoint! 327 | "Mark `rdd` for checkpointing. It will be saved to a file inside the 328 | checkpoint directory set on the Spark context and all references to its 329 | parent RDDs will be removed. 330 | 331 | This function must be called before any job has been executed on this RDD. It 332 | is strongly recommended that this RDD is persisted in memory, otherwise 333 | saving it to a file will require recomputation." 334 | [^JavaRDDLike rdd] 335 | (.checkpoint rdd)) 336 | -------------------------------------------------------------------------------- /sparkplug-core/src/clojure/sparkplug/scala.clj: -------------------------------------------------------------------------------- 1 | (ns sparkplug.scala 2 | "Commonly used utilities for interop with Scala objects." 3 | (:refer-clojure :exclude [first second]) 4 | (:require 5 | [clojure.walk :as walk]) 6 | (:import 7 | clojure.lang.MapEntry 8 | (scala 9 | Option 10 | Product 11 | Some 12 | Tuple1 13 | Tuple2 14 | Tuple3 15 | Tuple4 16 | Tuple5 17 | Tuple6 18 | Tuple7 19 | Tuple8 20 | Tuple9))) 21 | 22 | 23 | (defn resolve-option 24 | "Resolve an optional type to some value or nil." 
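;; For example (a sketch): `(resolve-option (scala.Some. 1))` yields 1, while
;; passing Scala's `None` yields nil.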
25 | [^Option o] 26 | (when (instance? Some o) 27 | (.get ^Some o))) 28 | 29 | 30 | ;; ## Tuples 31 | 32 | (defn tuple 33 | "Construct a Scala tuple. Supports tuples up to size 9." 34 | ([a] 35 | (Tuple1. a)) 36 | ([a b] 37 | (Tuple2. a b)) 38 | ([a b c] 39 | (Tuple3. a b c)) 40 | ([a b c d] 41 | (Tuple4. a b c d)) 42 | ([a b c d e] 43 | (Tuple5. a b c d e)) 44 | ([a b c d e f] 45 | (Tuple6. a b c d e f)) 46 | ([a b c d e f g] 47 | (Tuple7. a b c d e f g)) 48 | ([a b c d e f g h] 49 | (Tuple8. a b c d e f g h)) 50 | ([a b c d e f g h i] 51 | (Tuple9. a b c d e f g h i))) 52 | 53 | 54 | (defn vec->tuple 55 | "Coerce a Clojure vector to a Scala tuple. Supports tuples up to size 9." 56 | [v] 57 | (cond 58 | (instance? MapEntry v) 59 | (Tuple2. (key v) (val v)) 60 | 61 | (< (count v) 10) 62 | (apply tuple v) 63 | 64 | :else 65 | (throw (IllegalArgumentException. 66 | (str "Cannot coerce value to a tuple: " (pr-str v)))))) 67 | 68 | 69 | (defn tuple->vec 70 | "Coerce a Scala tuple to a Clojure vector. Supports tuples up to size 9." 71 | [v] 72 | (condp instance? v 73 | Tuple1 74 | (let [t ^Tuple1 v] 75 | (vector (._1 t))) 76 | 77 | Tuple2 78 | (let [t ^Tuple2 v] 79 | (vector (._1 t) (._2 t))) 80 | 81 | Tuple3 82 | (let [t ^Tuple3 v] 83 | (vector (._1 t) (._2 t) (._3 t))) 84 | 85 | Tuple4 86 | (let [t ^Tuple4 v] 87 | (vector (._1 t) (._2 t) (._3 t) (._4 t))) 88 | 89 | Tuple5 90 | (let [t ^Tuple5 v] 91 | (vector (._1 t) (._2 t) (._3 t) (._4 t) (._5 t))) 92 | 93 | Tuple6 94 | (let [t ^Tuple6 v] 95 | (vector (._1 t) (._2 t) (._3 t) (._4 t) (._5 t) (._6 t))) 96 | 97 | Tuple7 98 | (let [t ^Tuple7 v] 99 | (vector (._1 t) (._2 t) (._3 t) (._4 t) (._5 t) (._6 t) (._7 t))) 100 | 101 | Tuple8 102 | (let [t ^Tuple8 v] 103 | (vector (._1 t) (._2 t) (._3 t) (._4 t) (._5 t) (._6 t) (._7 t) (._8 t))) 104 | 105 | Tuple9 106 | (let [t ^Tuple9 v] 107 | (vector (._1 t) (._2 t) (._3 t) (._4 t) (._5 t) (._6 t) (._7 t) (._8 t) (._9 t))) 108 | 109 | (throw (IllegalArgumentException. 110 | (str "Cannot coerce " (class v) " value to a vector"))))) 111 | 112 | 113 | (defn from-tuple 114 | "Coerce a Scala tuple value to a Clojure vector. Recursively walks the 115 | structure to ensure all nested tuples are converted." 116 | [t] 117 | (letfn [(coerce-product 118 | [x] 119 | (if (instance? Product x) 120 | (tuple->vec x) 121 | x))] 122 | (walk/prewalk coerce-product t))) 123 | 124 | 125 | (defn from-pair 126 | "Coerce a Scala pair (`Tuple2`) value to a Clojure value. Returns map entry 127 | values for efficiency. Recursively walks the structure to ensure all nested 128 | values are Clojure-compatible." 129 | [^Tuple2 pair] 130 | (MapEntry. (from-tuple (._1 pair)) (from-tuple (._2 pair)))) 131 | 132 | 133 | (defn to-pair 134 | "Coerce a Clojure value to a Scala pair (`Tuple2`)." 135 | ^Tuple2 136 | [entry] 137 | (cond 138 | ;; Null values can't be coerced. 139 | (nil? entry) 140 | (throw (IllegalArgumentException. 141 | "Cannot coerce nil to a pair value")) 142 | 143 | ;; Scala tuples can be returned directly. 144 | (instance? Tuple2 entry) 145 | entry 146 | 147 | ;; Use key/value from map entries to construct the pair. 148 | (instance? MapEntry entry) 149 | (Tuple2. (key entry) (val entry)) 150 | 151 | ;; Try to generically coerce a vector result. 152 | (vector? entry) 153 | (if (= 2 (count entry)) 154 | (Tuple2. (clojure.core/first entry) (clojure.core/second entry)) 155 | (throw (IllegalArgumentException. 
156 | (str "Cannot coerce a vector with " (count entry) 157 | " elements to a pair value")))) 158 | 159 | ;; Unknown type, can't coerce. 160 | :else 161 | (throw (IllegalArgumentException. 162 | (str "Cannot coerce unknown type " (.getName (class entry)) 163 | " to a pair value"))))) 164 | 165 | 166 | (defn first 167 | "Get the first element of a Scala pair." 168 | [^Tuple2 t] 169 | (._1 t)) 170 | 171 | 172 | (defn second 173 | "Get the second element of a Scala pair." 174 | [^Tuple2 t] 175 | (._2 t)) 176 | -------------------------------------------------------------------------------- /sparkplug-core/src/java/sparkplug/broadcast/DerefBroadcast.java: -------------------------------------------------------------------------------- 1 | package sparkplug.broadcast; 2 | 3 | 4 | import clojure.lang.IDeref; 5 | 6 | import org.apache.spark.broadcast.Broadcast; 7 | 8 | import scala.reflect.ClassTag; 9 | 10 | 11 | /** 12 | * This class extends Spark's broadcast type so that it can be used with the 13 | * Clojure
deref
 function and reader macro.
14 |  */
15 | public class DerefBroadcast<T> extends Broadcast<T> implements IDeref {
16 | 
17 |     public final Broadcast<T> wrapped;
18 | 
19 | 
20 |     /**
21 |      * Construct a new DerefBroadcast wrapping the given broadcast value.
22 |      */
23 |     public DerefBroadcast(Broadcast<T> wrapped, Class<T> cls) {
24 |         super(wrapped.id(), ClassTag.apply(cls));
25 |         this.wrapped = wrapped;
26 |     }
27 | 
28 | 
29 |     @Override
30 |     public boolean equals(Object other) {
31 |         if (this == other) {
32 |             return true;
33 |         } else if (other instanceof DerefBroadcast) {
34 |             DerefBroadcast db = (DerefBroadcast)other;
35 |             return wrapped.equals(db.wrapped);
36 |         } else {
37 |             return false;
38 |         }
39 |     }
40 | 
41 | 
42 |     @Override
43 |     public int hashCode() {
44 |         return wrapped.hashCode();
45 |     }
46 | 
47 | 
48 |     @Override
49 |     public String toString() {
50 |         return wrapped.toString();
51 |     }
52 | 
53 | 
54 |     @Override
55 |     public Object deref() {
56 |         return wrapped.value();
57 |     }
58 | 
59 | 
60 |     @Override
61 |     public T getValue() {
62 |         return wrapped.value();
63 |     }
64 | 
65 | 
66 |     @Override
67 |     public void doUnpersist(boolean blocking) {
68 |         wrapped.doUnpersist(blocking);
69 |     }
70 | 
71 | 
72 |     @Override
73 |     public void doDestroy(boolean blocking) {
74 |         wrapped.doDestroy(blocking);
75 |     }
76 | 
77 | }
78 | 
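Because `DerefBroadcast` implements `IDeref`, a wrapped broadcast value can be read with `@`/`deref` from Clojure. A minimal interop sketch, assuming a live `JavaSparkContext` bound to `sc` (in practice the wrapping is done by the library's broadcast helper rather than by hand):

```clojure
(import '(org.apache.spark.api.java JavaSparkContext)
        '(sparkplug.broadcast DerefBroadcast))

;; Broadcast a small lookup table and wrap it so it supports IDeref.
(let [raw (.broadcast ^JavaSparkContext sc {:a 1 :b 2})
      wrapped (DerefBroadcast. raw clojure.lang.IPersistentMap)]
  ;; Both forms read the broadcast value.
  [@wrapped (.getValue wrapped)])
;; => [{:a 1 :b 2} {:a 1 :b 2}]
```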


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/core/UnionHelper.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.core;
 2 | 
 3 | import org.apache.spark.api.java.JavaPairRDD;
 4 | import org.apache.spark.api.java.JavaRDD;
 5 | import org.apache.spark.api.java.JavaSparkContext;
 6 | 
 7 | /**
 8 |  * This is a simple wrapper to call the `union` method on `JavaSparkContext`.
 9 |  *
10 |  * It is written in Java because:
11 |  *
12 |  * The non-varargs version of `union` was removed in Spark 3, leaving the varargs version as the
13 |  * only one that is compatible with both Spark 2 and Spark 3. See:
14 |  * 
15 |  *
16 |  * Unfortunately, Clojure is unable to call the varargs version, due to a compiler bug. Doing so
17 |  * will fail with errors such as:
18 |  *
19 |  * IllegalArgumentException: Can't call public method of non-public class: public final
20 |  * org.apache.spark.api.java.JavaPairRDD
21 |  * org.apache.spark.api.java.JavaSparkContextVarargsWorkaround.union(org.apache.spark.api.java.JavaPairRDD[])
22 |  *
23 |  * See: 
24 |  */
25 | public class UnionHelper {
26 |   public static JavaRDD unionJavaRDDs(JavaSparkContext jsc, JavaRDD[] rdds) {
27 |     return jsc.union(rdds);
28 |   }
29 | 
30 |   public static JavaPairRDD unionJavaPairRDDs(JavaSparkContext jsc, JavaPairRDD[] rdds) {
31 |     return jsc.union(rdds);
32 |   }
33 | }
34 | 
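From Clojure, the helper is called with a Java array of RDDs, which sidesteps the varargs problem described in the comment above; `sparkplug.core/union` (exercised in the tests later in this repo) builds on it. A rough sketch, assuming `sc` is a live `JavaSparkContext`:

```clojure
(import '(org.apache.spark.api.java JavaRDD JavaSparkContext)
        '(sparkplug.core UnionHelper))

;; Union two plain RDDs by passing them as a Java array.
(let [rdd1 (.parallelize ^JavaSparkContext sc [1 2 3])
      rdd2 (.parallelize ^JavaSparkContext sc [4 5 6])]
  (UnionHelper/unionJavaRDDs sc (into-array JavaRDD [rdd1 rdd2])))
```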


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/function/ComparatorFn.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.function;
 2 | 
 3 | 
 4 | import clojure.lang.IFn;
 5 | 
 6 | import java.util.Collection;
 7 | import java.util.Comparator;
 8 | 
 9 | 
10 | /**
11 |  * Compatibility wrapper for a `Comparator` of two arguments.
12 |  */
13 | public class ComparatorFn extends SerializableFn implements Comparator {
14 | 
15 |     public ComparatorFn(IFn f, Collection namespaces) {
16 |         super(f, namespaces);
17 |     }
18 | 
19 | 
20 |     @Override
21 |     @SuppressWarnings("unchecked")
22 |     public int compare(Object v1, Object v2) {
23 |         return (int)f.invoke(v1, v2);
24 |     }
25 | 
26 | }
27 | 
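The wrapper simply delegates to the Clojure function, so anything with `compare` semantics can be lifted into a `java.util.Comparator`. A minimal sketch, constructing the wrapper directly (in library code the namespace list is normally derived automatically):

```clojure
(import '(sparkplug.function ComparatorFn))

;; Wrap clojure.core/compare so Spark APIs that need a Comparator can use it.
(def clj-comparator
  (ComparatorFn. compare ["clojure.core"]))

(.compare clj-comparator 1 2)   ;; => -1
(.compare clj-comparator :b :a) ;; => positive
```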


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/function/FlatMapFn1.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.function;
 2 | 
 3 | 
 4 | import clojure.lang.IFn;
 5 | 
 6 | import java.util.Iterator;
 7 | import java.util.Collection;
 8 | 
 9 | import org.apache.spark.api.java.function.FlatMapFunction;
10 | 
11 | 
12 | /**
13 |  * Compatibility wrapper for a Spark `FlatMapFunction` of one argument.
14 |  */
15 | public class FlatMapFn1 extends SerializableFn implements FlatMapFunction {
16 | 
17 |     public FlatMapFn1(IFn f, Collection namespaces) {
18 |         super(f, namespaces);
19 |     }
20 | 
21 | 
22 |     @Override
23 |     @SuppressWarnings("unchecked")
24 |     public Iterator call(Object v1) throws Exception {
25 |         Collection results = (Collection)f.invoke(v1);
26 |         return results.iterator();
27 |     }
28 | 
29 | }
30 | 
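The contract here is that the wrapped Clojure function must return something implementing `java.util.Collection`, whose iterator Spark then consumes. A small sketch of that behavior, calling the wrapper directly:

```clojure
(require '[clojure.string :as str])
(import '(sparkplug.function FlatMapFn1))

;; A flat-map step that splits a string into its words.
(def split-words
  (FlatMapFn1. (fn [s] (str/split s #"\s+")) ["clojure.string"]))

;; `.call` returns a java.util.Iterator over the results.
(iterator-seq (.call split-words "to be or not"))
;; => ("to" "be" "or" "not")
```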


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/function/FlatMapFn2.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.function;
 2 | 
 3 | 
 4 | import clojure.lang.IFn;
 5 | 
 6 | import java.util.Iterator;
 7 | import java.util.Collection;
 8 | 
 9 | import org.apache.spark.api.java.function.FlatMapFunction2;
10 | 
11 | 
12 | /**
13 |  * Compatibility wrapper for a Spark `FlatMapFunction2` of two arguments.
14 |  */
15 | public class FlatMapFn2 extends SerializableFn implements FlatMapFunction2 {
16 | 
17 |     public FlatMapFn2(IFn f, Collection namespaces) {
18 |         super(f, namespaces);
19 |     }
20 | 
21 | 
22 |     @Override
23 |     @SuppressWarnings("unchecked")
24 |     public Iterator call(Object v1, Object v2) throws Exception {
25 |         Collection results = (Collection)f.invoke(v1, v2);
26 |         return results.iterator();
27 |     }
28 | 
29 | }
30 | 


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/function/Fn1.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.function;
 2 | 
 3 | 
 4 | import clojure.lang.IFn;
 5 | 
 6 | import java.util.Collection;
 7 | 
 8 | import org.apache.spark.api.java.function.Function;
 9 | 
10 | 
11 | /**
12 |  * Compatibility wrapper for a Spark `Function` of one argument.
13 |  */
14 | public class Fn1 extends SerializableFn implements Function {
15 | 
16 |     public Fn1(IFn f, Collection namespaces) {
17 |         super(f, namespaces);
18 |     }
19 | 
20 | 
21 |     @Override
22 |     @SuppressWarnings("unchecked")
23 |     public Object call(Object v1) throws Exception {
24 |         return f.invoke(v1);
25 |     }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/function/Fn2.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.function;
 2 | 
 3 | 
 4 | import clojure.lang.IFn;
 5 | 
 6 | import java.util.Collection;
 7 | 
 8 | import org.apache.spark.api.java.function.Function2;
 9 | 
10 | 
11 | /**
12 |  * Compatibility wrapper for a Spark `Function2` of two arguments.
13 |  */
14 | public class Fn2 extends SerializableFn implements Function2 {
15 | 
16 |     public Fn2(IFn f, Collection namespaces) {
17 |         super(f, namespaces);
18 |     }
19 | 
20 | 
21 |     @Override
22 |     @SuppressWarnings("unchecked")
23 |     public Object call(Object v1, Object v2) throws Exception {
24 |         return f.invoke(v1, v2);
25 |     }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/function/Fn3.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.function;
 2 | 
 3 | 
 4 | import clojure.lang.IFn;
 5 | 
 6 | import java.util.Collection;
 7 | 
 8 | import org.apache.spark.api.java.function.Function3;
 9 | 
10 | 
11 | /**
12 |  * Compatibility wrapper for a Spark `Function3` of three arguments.
13 |  */
14 | public class Fn3 extends SerializableFn implements Function3 {
15 | 
16 |     public Fn3(IFn f, Collection namespaces) {
17 |         super(f, namespaces);
18 |     }
19 | 
20 | 
21 |     @Override
22 |     @SuppressWarnings("unchecked")
23 |     public Object call(Object v1, Object v2, Object v3) throws Exception {
24 |         return f.invoke(v1, v2, v3);
25 |     }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/function/PairFlatMapFn.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.function;
 2 | 
 3 | 
 4 | import clojure.lang.IFn;
 5 | 
 6 | import java.util.Collection;
 7 | import java.util.Iterator;
 8 | 
 9 | import org.apache.spark.api.java.function.PairFlatMapFunction;
10 | 
11 | import scala.Tuple2;
12 | 
13 | 
14 | /**
15 |  * Compatibility wrapper for a Spark `PairFlatMapFunction` of one argument
16 |  * which returns a sequence of pairs.
17 |  */
18 | public class PairFlatMapFn extends SerializableFn implements PairFlatMapFunction<Object, Object, Object> {
19 | 
20 |     public PairFlatMapFn(IFn f, Collection namespaces) {
21 |         super(f, namespaces);
22 |     }
23 | 
24 | 
25 |     @Override
26 |     @SuppressWarnings("unchecked")
27 |     public Iterator<Tuple2<Object, Object>> call(Object v1) throws Exception {
28 |         Collection<Object> result = (Collection<Object>)f.invoke(v1);
29 |         Iterator<Object> results = result.iterator();
30 |         return new Iterator<Tuple2<Object, Object>>() {
31 |             public boolean hasNext() {
32 |                 return results.hasNext();
33 |             }
34 | 
35 |             public Tuple2<Object, Object> next() {
36 |                 return PairFn.coercePair(f, results.next());
37 |             }
38 |         };
39 |     }
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/function/PairFn.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.function;
 2 | 
 3 | 
 4 | import clojure.lang.IFn;
 5 | import clojure.lang.IMapEntry;
 6 | import clojure.lang.IPersistentVector;
 7 | 
 8 | import java.util.Collection;
 9 | 
10 | import org.apache.spark.api.java.function.PairFunction;
11 | 
12 | import scala.Tuple2;
13 | 
14 | 
15 | /**
16 |  * Compatibility wrapper for a Spark `PairFunction` of one argument which
17 |  * returns a pair.
18 |  */
19 | public class PairFn extends SerializableFn implements PairFunction {
20 | 
21 |     public PairFn(IFn f, Collection namespaces) {
22 |         super(f, namespaces);
23 |     }
24 | 
25 | 
26 |     @Override
27 |     @SuppressWarnings("unchecked")
28 |     public Tuple2 call(Object v1) throws Exception {
29 |         return coercePair(f, f.invoke(v1));
30 |     }
31 | 
32 | 
33 |     /**
34 |      * Coerce a result value into a Scala `Tuple2` as the result of a function.
35 |      *
36 |      * @param f the function which produced the result, to report in error messages
37 |      * @param result object to try to coerce
38 |      * @return a Scala tuple with two values
39 |      */
40 |     public static Tuple2 coercePair(IFn f, Object result) {
41 |         // Null can't be coerced.
42 |         if (result == null) {
43 |             throw new RuntimeException("Wrapped pair function " + f + " returned a null");
44 |         // Scala tuples can be returned directly.
45 |         } else if (result instanceof Tuple2) {
46 |             return (Tuple2)result;
47 |         // Use key/value from Clojure map entries to construct a tuple.
48 |         } else if (result instanceof IMapEntry) {
49 |             IMapEntry entry = (IMapEntry)result;
50 |             return new Tuple2(entry.key(), entry.val());
51 |         // Try to generically coerce a sequential result into a tuple.
52 |         } else if (result instanceof IPersistentVector) {
53 |             IPersistentVector vector = (IPersistentVector)result;
54 |             if (vector.count() != 2) {
55 |                 throw new RuntimeException("Wrapped pair function " + f + " returned a vector without exactly two values: " + vector.count());
56 |             }
57 |             return new Tuple2(vector.nth(0), vector.nth(1));
58 |         // Unknown type, can't coerce.
59 |         } else {
60 |             throw new RuntimeException("Wrapped pair function " + f + " returned an invalid pair type: " + result.getClass().getName());
61 |         }
62 |     }
63 | 
64 | }
65 | 
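The `coercePair` rules mirror `sparkplug.scala/to-pair`: Scala tuples pass through, map entries are converted via key/value, and two-element vectors are coerced positionally. A quick sketch of the accepted return shapes (the `identity` argument stands in for the wrapped function and is only used for error reporting):

```clojure
(require '[sparkplug.scala :as scala])
(import '(sparkplug.function PairFn))

;; All three shapes coerce to a scala.Tuple2.
(PairFn/coercePair identity (scala/tuple :k 1))   ;; already a tuple
(PairFn/coercePair identity (first {:k 1}))       ;; a map entry
(PairFn/coercePair identity [:k 1])               ;; a two-element vector

;; Anything else (nil, a three-element vector, ...) throws a RuntimeException.
```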


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/function/SerializableFn.java:
--------------------------------------------------------------------------------
  1 | package sparkplug.function;
  2 | 
  3 | 
  4 | import clojure.lang.Compiler;
  5 | import clojure.lang.IFn;
  6 | import clojure.lang.Keyword;
  7 | import clojure.lang.RT;
  8 | import clojure.lang.Symbol;
  9 | import clojure.lang.Var;
 10 | 
 11 | import java.lang.reflect.Field;
 12 | import java.lang.reflect.Modifier;
 13 | 
 14 | import java.io.IOException;
 15 | import java.io.InvalidObjectException;
 16 | import java.io.ObjectInputStream;
 17 | import java.io.ObjectOutputStream;
 18 | import java.io.Serializable;
 19 | 
 20 | import java.util.ArrayList;
 21 | import java.util.Collection;
 22 | import java.util.Collections;
 23 | import java.util.HashSet;
 24 | import java.util.List;
 25 | 
 26 | import org.slf4j.Logger;
 27 | import org.slf4j.LoggerFactory;
 28 | 
 29 | 
 30 | /**
 31 |  * Base class for function classes built for interop with Spark and Scala.
 32 |  *
 33 |  * This class is designed to be serialized across computation boundaries in a
 34 |  * manner compatible with Spark and Kryo, while ensuring that required code is
 35 |  * loaded upon deserialization.
 36 |  */
 37 | public abstract class SerializableFn implements Serializable {
 38 | 
 39 |     private static final Logger logger = LoggerFactory.getLogger(SerializableFn.class);
 40 |     private static final Var require = RT.var("clojure.core", "require");
 41 | 
 42 |     protected IFn f;
 43 |     protected List namespaces;
 44 | 
 45 | 
 46 |     /**
 47 |      * Default empty constructor.
 48 |      */
 49 |     private SerializableFn() {
 50 |     }
 51 | 
 52 | 
 53 |     /**
 54 |      * Construct a new serializable wrapper for the function with an explicit
 55 |      * set of required namespaces.
 56 |      *
 57 |      * @param fn Clojure function to wrap
 58 |      * @param namespaces collection of namespaces required
 59 |      */
 60 |     protected SerializableFn(IFn fn, Collection namespaces) {
 61 |         this.f = fn;
 62 |         List namespaceColl = new ArrayList(namespaces);
 63 |         Collections.sort(namespaceColl);
 64 |         this.namespaces = Collections.unmodifiableList(namespaceColl);
 65 |     }
 66 | 
 67 | 
 68 |     /**
 69 |      * Safely access the value of a field on the given object.
 70 |      *
 71 |      * @param obj Instance to access a field on
 72 |      * @param field Reflective field to access
 73 |      * @return the value of the field, or nil on failure
 74 |      */
 75 |     public static Object accessField(Object obj, Field field) {
 76 |         try {
 77 |             if (!field.isAccessible()) {
 78 |                 field.setAccessible(true);
 79 |             }
 80 |             return field.get(obj);
 81 |         } catch (Exception ex) {
 82 |             logger.trace("Failed to access field " + field.toString() + ": " + ex.getClass().getName());
 83 |             return null;
 84 |         }
 85 |     }
 86 | 
 87 | 
 88 |     /**
 89 |      * Walk a value to convert any deserialized booleans back into the
 90 |      * canonical java.lang.Boolean values.
 91 |      *
 92 |      * @param visited Set of objects already visited by the walk
 93 |      * @param obj Object to walk references of
 94 |      */
 95 |     private void fixBooleans(HashSet visited, Object obj) {
 96 |         // Short-circuit objects which can't have nested values to fix.
 97 |         if ((obj == null)
 98 |                 || (obj instanceof Boolean)
 99 |                 || (obj instanceof String)
100 |                 || (obj instanceof Number)
101 |                 || (obj instanceof Keyword)
102 |                 || (obj instanceof Symbol)
103 |                 || (obj instanceof Var)) {
104 |             return;
105 |         }
106 | 
107 |         // Short-circuit if we've already visited this object.
108 |         if (visited.contains(obj)) {
109 |             return;
110 |         }
111 | 
112 |         visited.add(obj);
113 | 
114 |         // For collection-like objects, just traverse their elements.
115 |         if (obj instanceof Iterable) {
116 |             for (Object el : (Iterable)obj) {
117 |                 fixBooleans(visited, el);
118 |             }
119 |             return;
120 |         }
121 | 
122 |         // Otherwise, look at the object's fields and try to fix any booleans
123 |         // we find and traverse further.
124 |         for (Field field : obj.getClass().getDeclaredFields()) {
125 |             if (!Modifier.isStatic(field.getModifiers())) {
126 |                 Object value = accessField(obj, field);
127 |                 if (value instanceof Boolean) {
128 |                     Boolean canonical = ((Boolean)value).booleanValue() ? Boolean.TRUE : Boolean.FALSE;
129 |                     try {
130 |                         field.set(obj, canonical);
131 |                     } catch (IllegalAccessException ex) {
132 |                         logger.warn("Failed to set boolean field " + field.toString());
133 |                     }
134 |                 } else {
135 |                     fixBooleans(visited, value);
136 |                 }
137 |             }
138 |         }
139 |     }
140 | 
141 | 
142 |     /**
143 |      * Serialize the function to the provided output stream.
144 |      * An unspoken part of the `Serializable` interface.
145 |      *
146 |      * @param out stream to write the function to
147 |      */
148 |     private void writeObject(ObjectOutputStream out) throws IOException {
149 |         try {
150 |             logger.trace("Serializing " + f);
151 |             // Write the function class name
152 |             // This is only used for debugging
153 |             out.writeObject(f.getClass().getName());
154 |             // Write out the referenced namespaces.
155 |             out.writeInt(namespaces.size());
156 |             for (String ns : namespaces) {
157 |                 out.writeObject(ns);
158 |             }
159 |             // Write out the function itself.
160 |             out.writeObject(f);
161 |         } catch (IOException ex) {
162 |             logger.error("Error serializing function " + f, ex);
163 |             throw ex;
164 |         } catch (RuntimeException ex){
165 |             logger.error("Error serializing function " + f, ex);
166 |             throw ex;
167 |         }
168 |     }
169 | 
170 | 
171 |     /**
172 |      * Deserialize a function from the provided input stream.
173 |      * An unspoken part of the `Serializable` interface.
174 |      *
175 |      * @param in stream to read the function from
176 |      */
177 |     private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
178 |         String className = "";
179 |         try {
180 |             // Read the function class name.
181 |             className = (String)in.readObject();
182 |             logger.trace("Deserializing " + className);
183 |             // Read the referenced namespaces and load them.
184 |             int nsCount = in.readInt();
185 |             this.namespaces = new ArrayList(nsCount);
186 |             for (int i = 0; i < nsCount; i++) {
187 |                 String ns = (String)in.readObject();
188 |                 namespaces.add(ns);
189 |                 requireNamespace(ns);
190 |             }
191 |             // Read the function itself.
192 |             this.f = (IFn)in.readObject();
193 |             // Walk the data structure to coerce canonical booleans.
194 |             fixBooleans(new HashSet(), this.f);
195 |         } catch (IOException ex) {
196 |             logger.error("IO error deserializing function " + className, ex);
197 |             throw ex;
198 |         } catch (ClassNotFoundException ex) {
199 |             logger.error("Class error deserializing function " + className, ex);
200 |             throw ex;
201 |         } catch (RuntimeException ex) {
202 |             logger.error("Error deserializing function " + className, ex);
203 |             throw ex;
204 |         }
205 |     }
206 | 
207 | 
208 |     /**
209 |      * Load the namespace specified by the given symbol.
210 |      *
211 |      * @param namespace string designating the namespace to load
212 |      */
213 |     private static void requireNamespace(String namespace) {
214 |         try {
215 |             logger.trace("(require " + namespace + ")");
216 |             synchronized (RT.REQUIRE_LOCK) {
217 |                 Symbol sym = Symbol.intern(namespace);
218 |                 require.invoke(sym);
219 |             }
220 |         } catch (Exception ex) {
221 |             logger.warn("Error loading namespace " + namespace, ex);
222 |         }
223 |     }
224 | 
225 | }
226 | 
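The namespace list written alongside the function is what lets an executor `require` the defining code before the function object itself is read back. `sparkplug.function/namespace-references` (exercised in the tests below) derives that list by walking the function's closed-over values; a rough sketch of how it could feed a wrapper, with the string conversion shown here being an assumption of the sketch rather than the library's exact plumbing:

```clojure
(require '[clojure.string :as str]
         '[sparkplug.function :as f])
(import '(sparkplug.function Fn1))

(def shout
  (fn [s] (str/upper-case s)))

;; Namespaces referenced by the closure; these are written out before the
;; function during serialization so deserialization can (require ...) them.
(f/namespace-references shout)
;; => a set of namespace symbols referenced by the closure

;; The same information can be passed explicitly when constructing a wrapper.
(Fn1. shout (map name (f/namespace-references shout)))
```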


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/function/VoidFn.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.function;
 2 | 
 3 | 
 4 | import clojure.lang.IFn;
 5 | 
 6 | import java.util.Collection;
 7 | 
 8 | import org.apache.spark.api.java.function.VoidFunction;
 9 | 
10 | 
11 | /**
12 |  * Compatibility wrapper for a Spark `VoidFunction` of one argument.
13 |  */
14 | public class VoidFn extends SerializableFn implements VoidFunction {
15 | 
16 |     public VoidFn(IFn f, Collection namespaces) {
17 |         super(f, namespaces);
18 |     }
19 | 
20 | 
21 |     @Override
22 |     @SuppressWarnings("unchecked")
23 |     public void call(Object v1) throws Exception {
24 |         f.invoke(v1);
25 |     }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/kryo/ClassPathRegistrator.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.kryo;
 2 | 
 3 | 
 4 | import clojure.lang.IFn;
 5 | import clojure.lang.RT;
 6 | import clojure.lang.Symbol;
 7 | 
 8 | import com.esotericsoftware.kryo.Kryo;
 9 | 
10 | import org.apache.spark.serializer.KryoRegistrator;
11 | 
12 | 
13 | /**
14 |  * Spark interop class to register types for serialization with Kryo.
15 |  */
16 | public class ClassPathRegistrator implements KryoRegistrator {
17 | 
18 |     /**
19 |      * Wrapper class to efficiently ensure the configuration function is only
20 |      * loaded once.
21 |      */
22 |     private static class Singleton {
23 | 
24 |         private static final IFn configure;
25 | 
26 |         static {
27 |             IFn resolve = RT.var("clojure.core", "requiring-resolve");
28 |             Symbol name = Symbol.intern("sparkplug.kryo", "load-configuration");
29 |             IFn loader = (IFn)resolve.invoke(name);
30 |             configure = (IFn)loader.invoke();
31 |         }
32 | 
33 |     }
34 | 
35 | 
36 |     @Override
37 |     public void registerClasses(Kryo kryo) {
38 | 
39 |         IFn configure = Singleton.configure;
40 | 
41 |         if (configure == null) {
42 |             throw new RuntimeException("Could not construct kryo configuration function!");
43 |         }
44 | 
45 |         configure.invoke(kryo);
46 | 
47 |     }
48 | 
49 | }
50 | 
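To activate this registrator, point Spark's Kryo settings at the class when building the configuration; the registry `.conf` files under `resources/sparkplug/kryo/registry` are the kind of thing the loaded configuration function discovers. A sketch using the helpers from `sparkplug.config`:

```clojure
(require '[sparkplug.config :as conf])

;; Enable Kryo serialization and register classes found on the classpath.
(def kryo-conf
  (-> (conf/spark-conf)
      (conf/master "local[*]")
      (conf/app-name "kryo-example")
      (conf/set-param "spark.serializer"
                      "org.apache.spark.serializer.KryoSerializer")
      (conf/set-param "spark.kryo.registrator"
                      "sparkplug.kryo.ClassPathRegistrator")))
```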


--------------------------------------------------------------------------------
/sparkplug-core/src/java/sparkplug/partition/FnHashPartitioner.java:
--------------------------------------------------------------------------------
 1 | package sparkplug.partition;
 2 | 
 3 | import static clojure.lang.Util.hasheq;
 4 | import org.apache.spark.Partitioner;
 5 | import org.slf4j.Logger;
 6 | import org.slf4j.LoggerFactory;
 7 | 
 8 | import sparkplug.function.Fn1;
 9 | 
10 | 
11 | /**
12 |  * A Partitioner similar to Spark's HashPartitioner, which also accepts a key
13 |  * function to translate an Object into a hashable key, and uses Clojure's
14 |  * hash function instead of Object.hashCode().
15 |  */
16 | public class FnHashPartitioner extends Partitioner {
17 | 
18 |     private static final Logger logger = LoggerFactory.getLogger(FnHashPartitioner.class);
19 | 
20 |     private final int numPartitions;
21 |     private final Fn1 keyFn;
22 | 
23 |     public FnHashPartitioner(int numPartitions, Fn1 keyFn) {
24 |         if (numPartitions <= 0) {
25 |             throw new IllegalArgumentException("Number of partitions must be positive, got " + numPartitions);
26 |         }
27 |         if (keyFn == null) {
28 |             throw new IllegalArgumentException("Key function must not be null");
29 |         }
30 |         this.numPartitions = numPartitions;
31 |         this.keyFn = keyFn;
32 |     }
33 | 
34 |     @Override
35 |     public int numPartitions() {
36 |         return this.numPartitions;
37 |     }
38 | 
39 |     @Override
40 |     public int getPartition(Object key) {
41 |         Object transformedKey = null;
42 |         try {
43 |             transformedKey = this.keyFn.call(key);
44 |         } catch (Exception e) {
45 |             logger.error("Key function threw an exception, so this key will be hashed as if it were null."
46 |                          + " This is likely to cause skewed partitioning.", e);
47 |         }
48 | 
49 |         return Math.floorMod(hasheq(transformedKey), this.numPartitions);
50 |     }
51 | 
52 | }
53 | 
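From Clojure this is usually reached through `rdd/hash-partitioner` (shown earlier in `rdd.clj`), which constructs an `FnHashPartitioner` whenever a key function is supplied. A short sketch, assuming `pairs` is an existing `JavaPairRDD` whose keys are maps containing a `:user-id`:

```clojure
(require '[sparkplug.rdd :as rdd])

;; Partition by the :user-id field of each key, using Clojure's hasheq.
(def by-user
  (rdd/partition-by (rdd/hash-partitioner :user-id 32) pairs))

(rdd/num-partitions by-user)
;; => 32
```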


--------------------------------------------------------------------------------
/sparkplug-core/test/sparkplug/core_test.clj:
--------------------------------------------------------------------------------
 1 | (ns sparkplug.core-test
 2 |   (:require
 3 |     [clojure.test :refer [deftest is testing use-fixtures]]
 4 |     [sparkplug.config :as conf]
 5 |     [sparkplug.context :as context]
 6 |     [sparkplug.core :as spark]
 7 |     [sparkplug.rdd :as rdd]))
 8 | 
 9 | 
10 | (def ^:dynamic *sc*
11 |   nil)
12 | 
13 | 
14 | (def local-conf
15 |   (-> (conf/spark-conf)
16 |       (conf/master "local[*]")
17 |       (conf/app-name "user")
18 |       (conf/set-param "spark.ui.enabled" "false")))
19 | 
20 | 
21 | (defn spark-context-fixture
22 |   [f]
23 |   (context/with-context [sc local-conf]
24 |     (binding [*sc* sc]
25 |       (f))))
26 | 
27 | 
28 | (use-fixtures :once spark-context-fixture)
29 | 
30 | 
31 | (deftest core-transforms
32 |   (testing "aggregate-by-key"
33 |     (is (= [[1 (reduce + (range 10))]]
34 |            (->> (rdd/parallelize-pairs *sc* (map vector (repeat 10 1) (range 10)))
35 |                 (spark/aggregate-by-key + + 0)
36 |                 (spark/into []))))
37 |     (is (= [[1 (reduce + (range 10))]]
38 |            (->> (rdd/parallelize-pairs *sc* (map vector (repeat 10 1) (range 10)))
39 |                 (spark/aggregate-by-key + + 0 2)
40 |                 (spark/into []))))
41 |     (is (= [[1 (reduce + (range 10))]]
42 |            (->> (rdd/parallelize-pairs *sc* (map vector (repeat 10 1) (range 10)))
43 |                 (spark/aggregate-by-key + + 0 (rdd/hash-partitioner 2))
44 |                 (spark/into [])))))
45 | 
46 |   (testing "sort-by"
47 |     (is (= (vec (reverse (range 10)))
48 |            (->> (rdd/parallelize *sc* (shuffle (range 10)))
49 |                 (spark/sort-by -)
50 |                 (spark/into []))
51 |            (->> (rdd/parallelize *sc* (shuffle (range 10)))
52 |                 (spark/sort-by identity false)
53 |                 (spark/into [])))))
54 | 
55 |   (testing "union"
56 |     (is (= #{:a :b}
57 |            (spark/into #{} (spark/union (rdd/parallelize *sc* [:a :b])))))
58 |     (is (= #{:a :b :c :d}
59 |            (spark/into
60 |              #{}
61 |              (spark/union
62 |                (rdd/parallelize *sc* [:a :b])
63 |                (rdd/parallelize *sc* [:c :d])))))
64 |     (is (= #{:a :b :c :d :e :f}
65 |            (spark/into
66 |              #{}
67 |              (spark/union
68 |                (rdd/parallelize *sc* [:a :b])
69 |                (rdd/parallelize *sc* [:c :d])
70 |                (rdd/parallelize *sc* [:e :f])))))
71 |     (is (= #{[:a :b]}
72 |            (spark/into #{} (spark/union (rdd/parallelize-pairs *sc* [[:a :b]])))))
73 |     (is (= #{[:a :b] [:c :d]}
74 |            (spark/into
75 |              #{}
76 |              (spark/union
77 |                (rdd/parallelize-pairs *sc* [[:a :b]])
78 |                (rdd/parallelize-pairs *sc* [[:c :d]])))))
79 |     (is (= #{[:a :b] [:c :d] [:e :f]}
80 |            (spark/into
81 |              #{}
82 |              (spark/union
83 |                (rdd/parallelize-pairs *sc* [[:a :b]])
84 |                (rdd/parallelize-pairs *sc* [[:c :d]])
85 |                (rdd/parallelize-pairs *sc* [[:e :f]])))))))
86 | 


--------------------------------------------------------------------------------
/sparkplug-core/test/sparkplug/function/test_fns.clj:
--------------------------------------------------------------------------------
 1 | (ns sparkplug.function.test-fns
 2 |   "AOT-compiled test functions.")
 3 | 
 4 | 
 5 | (defn bool-closure
 6 |   [b]
 7 |   (fn inner
 8 |     [x]
 9 |     (when b
10 |       x)))
11 | 


--------------------------------------------------------------------------------
/sparkplug-core/test/sparkplug/function_test.clj:
--------------------------------------------------------------------------------
  1 | (ns sparkplug.function-test
  2 |   (:require
  3 |     [clojure.test :refer [are deftest is testing]]
  4 |     [sparkplug.function :as f]
  5 |     [sparkplug.function.test-fns :as test-fns])
  6 |   (:import
  7 |     (java.io
  8 |       ByteArrayInputStream
  9 |       ByteArrayOutputStream
 10 |       ObjectInputStream
 11 |       ObjectOutputStream)))
 12 | 
 13 | 
 14 | (def this-ns
 15 |   (ns-name *ns*))
 16 | 
 17 | 
 18 | (defprotocol TestProto
 19 | 
 20 |   (proto-method [this])
 21 | 
 22 |   (get-closure [this]))
 23 | 
 24 | 
 25 | (defrecord TestRecord
 26 |   [example-fn]
 27 | 
 28 |   TestProto
 29 | 
 30 |   (proto-method
 31 |     [_]
 32 |     (example-fn))
 33 | 
 34 | 
 35 |   (get-closure
 36 |     [_]
 37 |     (fn inside-fn
 38 |       []
 39 |       nil)))
 40 | 
 41 | 
 42 | (deftest resolve-namespace-references
 43 |   (are [expected-references obj] (= expected-references (f/namespace-references obj))
 44 | 
 45 |     ;; Simple data
 46 |     #{} nil
 47 |     #{} :keyword
 48 |     #{} 5
 49 |     #{} true
 50 |     #{} "str"
 51 |     #{} 'sym
 52 | 
 53 |     ;; Functions
 54 |     #{this-ns}
 55 |     (fn [])
 56 | 
 57 |     #{this-ns 'sparkplug.function}
 58 |     (fn []
 59 |       (f/namespace-references (fn [])))
 60 | 
 61 |     #{this-ns 'sparkplug.function}
 62 |     (fn []
 63 |       (let [x (f/namespace-references (fn []))]
 64 |         (x)))
 65 | 
 66 |     #{this-ns}
 67 |     [(fn [])]
 68 | 
 69 |     #{this-ns}
 70 |     (list (fn []))
 71 | 
 72 |     #{this-ns}
 73 |     (doto (java.util.ArrayList.)
 74 |       (.add (fn [])))
 75 | 
 76 |     #{this-ns}
 77 |     (doto (java.util.HashMap.)
 78 |       (.put "key" (fn [])))
 79 | 
 80 |     #{this-ns}
 81 |     {:key (fn [])}
 82 | 
 83 |     #{this-ns}
 84 |     {:key {:nested (fn [])}}
 85 | 
 86 |     #{this-ns}
 87 |     {:key {:nested [(fn [])]}}
 88 | 
 89 |     ;; Record fields.
 90 |     #{this-ns 'sparkplug.function}
 91 |     (->TestRecord
 92 |       (fn []
 93 |         (f/namespace-references nil)))
 94 | 
 95 |     ;; Function that closes over an object invoking a protocol method.
 96 |     #{this-ns 'sparkplug.function}
 97 |     (let [inst (->TestRecord
 98 |                  (fn []
 99 |                    (f/namespace-references nil)))]
100 |       (fn [] (proto-method inst)))
101 | 
102 |     ;; Function closure defined inside a record class.
103 |     #{this-ns}
104 |     (let [x (->TestRecord nil)]
105 |       (get-closure x))))
106 | 
107 | 
108 | ;; This is a regression test which ensures that decoded functions which close
109 | ;; over a boolean value are updated to use the canonical `Boolean` static
110 | ;; instances. Otherwise, users see bugs where a false value evaluates as truthy.
111 | (deftest canonical-booleans
112 |   (letfn [(serialize
113 |             [f]
114 |             (let [baos (ByteArrayOutputStream.)]
115 |               (with-open [out (ObjectOutputStream. baos)]
116 |                 (.writeObject out f))
117 |               (.toByteArray baos)))
118 | 
119 |           (deserialize
120 |             [bs]
121 |             (with-open [in (ObjectInputStream. (ByteArrayInputStream. bs))]
122 |               (.readObject in)))]
123 |     (testing "closure over true value"
124 |       (let [original-fn (f/fn1 (test-fns/bool-closure true))
125 |             decoded-fn (-> original-fn serialize deserialize)]
126 |         (testing "original behavior"
127 |           (is (= :x (.call original-fn :x))
128 |               "should return value"))
129 |         (testing "decoded behavior"
130 |           (is (= :x (.call decoded-fn :x))
131 |               "should return value"))))
132 |     (testing "closure over false value"
133 |       (let [original-fn (f/fn1 (test-fns/bool-closure false))
134 |             decoded-fn (-> original-fn serialize deserialize)]
135 |         (testing "original behavior"
136 |           (is (nil? (.call original-fn :x))
137 |               "should not return value"))
138 |         (testing "decoded behavior"
139 |           (is (nil? (.call decoded-fn :x))
140 |               "should not return value"))))))
141 | 


--------------------------------------------------------------------------------
/sparkplug-core/test/sparkplug/kryo_test.clj:
--------------------------------------------------------------------------------
 1 | (ns sparkplug.kryo-test
 2 |   (:require
 3 |     [clojure.test :refer [deftest is]]
 4 |     [clojure.test.check.clojure-test :refer [defspec]]
 5 |     [clojure.test.check.generators :as gen]
 6 |     [clojure.test.check.properties :as prop]
 7 |     [sparkplug.kryo :as kryo]))
 8 | 
 9 | 
10 | (deftest classpath-search
11 |   (let [registries (kryo/classpath-registries)]
12 |     (is (sequential? registries))
13 |     (is (<= 2 (count registries)))))
14 | 
15 | 
16 | (def kryo (kryo/initialize))
17 | 
18 | 
19 | (defspec clojure-data-roundtrip
20 |   {:num-tests 1000
21 |    :max-size 20}
22 |   (prop/for-all [x gen/any-equatable]
23 |     (is (= x (->> x (kryo/encode kryo) (kryo/decode kryo))))))
24 | 


--------------------------------------------------------------------------------
/sparkplug-repl/.gitignore:
--------------------------------------------------------------------------------
 1 | /target
 2 | /classes
 3 | /checkouts
 4 | /.lein-*
 5 | /.nrepl-port
 6 | pom.xml
 7 | pom.xml.asc
 8 | *.jar
 9 | *.class
10 | 


--------------------------------------------------------------------------------
/sparkplug-repl/README.md:
--------------------------------------------------------------------------------
 1 | Spark REPL
 2 | ==========
 3 | 
 4 | This project provides a server that offers an interactive REPL experience while
 5 | connected to a Spark cluster.
 6 | 
 7 | 
 8 | ## Usage
 9 | 
10 | First, build the REPL uberjar and copy it into the Docker cluster:
11 | 
12 | ```shell
13 | lein uberjar
14 | cp target/uberjar/sparkplug-repl.jar ../cluster/code
15 | ```
16 | 
17 | Next, start up the REPL container in another terminal:
18 | 
19 | ```
20 | $ docker-compose up repl
21 | ```
22 | 
23 | Finally, connect to the REPL running in the container:
24 | 
25 | ```
26 | $ lein repl :connect 8765
27 | ```
28 | 
29 | If all goes well, you should see the prompt:
30 | 
31 | ```
32 | sparkplug.repl.work=>
33 | ```
34 | 
35 | The currently running Spark application context is available via the
36 | `spark-context` var, and the WebUI runs on http://localhost:4050/. When you're
37 | done with the REPL you can hit `^D` (Control + D) to hang up and leave the
38 | container running, or call `(exit!)` to shut it down cleanly and stop the Spark
39 | application.
40 | 
41 | 
42 | ## Limitations
43 | 
44 | Currently, you cannot use any dynamically-defined functions. Because these
45 | classes are defined locally, Spark won't be able to deserialize the instances on
46 | the executors.
47 | 
48 | If you can express your logic in terms of existing higher-order functions, this
49 | will still work:
50 | 
51 | ```clojure
52 | ;; won't work
53 | (spark/map->pairs #(vector % 1))
54 | 
55 | ;; will work!
56 | (spark/map->pairs (juxt identity (constantly 1)))
57 | ```
58 | 


--------------------------------------------------------------------------------
/sparkplug-repl/project.clj:
--------------------------------------------------------------------------------
 1 | (defproject amperity/sparkplug-repl "1.1.0"
 2 |   :description "Clojure REPL for Spark exploration"
 3 |   :url "https://github.com/amperity/sparkplug"
 4 |   :scm {:dir ".."}
 5 |   :license {:name "Apache License 2.0"
 6 |             :url "http://www.apache.org/licenses/LICENSE-2.0"}
 7 | 
 8 |   :monolith/inherit true
 9 | 
10 |   :dependencies
11 |   [[org.clojure/clojure "1.12.0"]
12 |    [amperity/sparkplug-core "1.1.0"]
13 |    [mvxcvi/whidbey "2.2.1"]
14 |    [nrepl "1.3.0"]]
15 | 
16 |   :main sparkplug.repl.main
17 | 
18 |   :profiles
19 |   {:default
20 |    [:base :system :user :provided :spark-3.5 :dev]
21 | 
22 |    :repl
23 |    {:repl-options
24 |     {:custom-init (whidbey.repl/update-print-fn!)
25 |      :init-ns sparkplug.repl.work}}
26 | 
27 |    :spark-3.1
28 |    ^{:pom-scope :provided}
29 |    {:dependencies
30 |     [[org.apache.spark/spark-core_2.12 "3.1.3"]
31 |      [org.apache.spark/spark-sql_2.12 "3.1.3"]]}
32 | 
33 |    :spark-3.5
34 |    ^{:pom-scope :provided}
35 |    {:dependencies
36 |     [[org.apache.spark/spark-core_2.12 "3.5.1"]
37 |      [org.apache.spark/spark-sql_2.12 "3.5.1"]]}
38 | 
39 |    :uberjar
40 |    {:target-path "target/uberjar"
41 |     :uberjar-name "sparkplug-repl.jar"
42 |     :aot :all}})
43 | 


--------------------------------------------------------------------------------
/sparkplug-repl/src/sparkplug/repl/main.clj:
--------------------------------------------------------------------------------
  1 | (ns sparkplug.repl.main
  2 |   (:gen-class)
  3 |   (:require
  4 |     [clojure.java.io :as io]
  5 |     [clojure.tools.logging :as log]
  6 |     [nrepl.middleware :as middleware]
  7 |     [nrepl.middleware.session :as mw-session]
  8 |     [nrepl.server :as server]
  9 |     [sparkplug.config :as conf]
 10 |     [sparkplug.context :as ctx]
 11 |     [sparkplug.core :as spark]
 12 |     [whidbey.repl :as whidbey]))
 13 | 
 14 | 
 15 | (def whidbey-opts
 16 |   {:width 200
 17 |    :namespace-maps true
 18 |    :color-scheme {:nil [:blue]}
 19 |    :tag-types {java.lang.Class {'java/class #(symbol (.getName ^Class %))}
 20 |                java.time.Instant {'inst str}}})
 21 | 
 22 | 
 23 | ;; ## REPL Middleware
 24 | 
 25 | (def repl-ns 'sparkplug.repl.work)
 26 | 
 27 | 
 28 | (defn wrap-repl-init
 29 |   "Middleware constructor which ensures the admin-repl system namespace is
 30 |   loaded and available before configuring the new session to use it."
 31 |   [handler]
 32 |   (with-local-vars [sentinel nil]
 33 |     (fn [{:keys [session] :as msg}]
 34 |       (when-not (@session sentinel)
 35 |         (swap! session assoc
 36 |                #'*ns*
 37 |                (try
 38 |                  (require repl-ns)
 39 |                  (create-ns repl-ns)
 40 |                  (catch Throwable t
 41 |                    (log/error t "Failed to switch to repl-ns" repl-ns)
 42 |                    (create-ns 'user)))
 43 |                sentinel true))
 44 |       (handler msg))))
 45 | 
 46 | 
 47 | (middleware/set-descriptor!
 48 |   #'wrap-repl-init
 49 |   {:requires #{#'mw-session/session}
 50 |    :expects #{"eval"}})
 51 | 
 52 | 
 53 | ;; ## Spark Lifecycle
 54 | 
 55 | (defn- initialize-context!
 56 |   "Construct a new Spark context and intern it in the repl namespace."
 57 |   [master]
 58 |   (require repl-ns)
 59 |   (let [ctx (-> (conf/spark-conf)
 60 |                 (conf/master master)
 61 |                 (conf/app-name "sparkplug-repl")
 62 |                 (conf/jars ["sparkplug-repl.jar"])
 63 |                 (ctx/spark-context))]
 64 |     (intern repl-ns 'spark-context ctx)))
 65 | 
 66 | 
 67 | (defn- stop-context!
 68 |   "Stop the running Spark context, if any."
 69 |   []
 70 |   (let [ctx-var (ns-resolve repl-ns 'spark-context)]
 71 |     (when-let [ctx (and ctx-var @ctx-var)]
 72 |       (ctx/stop! ctx))))
 73 | 
 74 | 
 75 | ;; ## Main Entry
 76 | 
 77 | (def nrepl-server nil)
 78 | (def exit-promise (promise))
 79 | 
 80 | 
 81 | (defn -main
 82 |   "Main entry point for launching the nREPL server."
 83 |   [& args]
 84 |   (let [master (or (System/getenv "SPARKPLUG_REPL_MASTER")
 85 |                    "local[*]")
 86 |         port (-> (System/getenv "SPARKPLUG_REPL_PORT")
 87 |                  (or "8765")
 88 |                  (Integer/parseInt))]
 89 |     (try
 90 |       (whidbey/init! whidbey-opts)
 91 |       (catch Exception ex
 92 |         (log/warn ex "Failed to initialize whidbey middleware!")))
 93 |     (try
 94 |       (log/info "Initializing Spark context...")
 95 |       (require repl-ns)
 96 |       (initialize-context! master)
 97 |       (catch Exception ex
 98 |         (log/error ex "Failed to initialize Spark context!")
 99 |         (System/exit 10)))
100 |     (log/info "Starting nrepl server on port:" port)
101 |     (let [server (server/start-server
102 |                    :bind "0.0.0.0"
103 |                    :port port
104 |                    :handler (server/default-handler #'wrap-repl-init))]
105 |       (alter-var-root #'nrepl-server (constantly server)))
106 |     @exit-promise
107 |     (log/info "Stopping Spark context...")
108 |     (stop-context!)
109 |     (log/info "Stopping nrepl server...")
110 |     (server/stop-server nrepl-server)
111 |     (System/exit 0)))
112 | 


--------------------------------------------------------------------------------
/sparkplug-repl/src/sparkplug/repl/work.clj:
--------------------------------------------------------------------------------
 1 | (ns sparkplug.repl.work
 2 |   (:require
 3 |     [clojure.java.io :as io]
 4 |     [clojure.repl :refer :all]
 5 |     [clojure.set :as set]
 6 |     [clojure.stacktrace :refer [print-cause-trace]]
 7 |     [clojure.string :as str]
 8 |     [sparkplug.config :as conf]
 9 |     [sparkplug.context :as ctx]
10 |     [sparkplug.core :as spark]
11 |     [sparkplug.function :as f]
12 |     [sparkplug.kryo :as kryo]
13 |     [sparkplug.rdd :as rdd]
14 |     [sparkplug.scala :as scala]))
15 | 
16 | 
17 | (def spark-context
18 |   "The currently active Spark context."
19 |   nil)
20 | 
21 | 
22 | (defn exit!
23 |   "Exit the running REPL gracefully."
24 |   []
25 |   (deliver @(resolve 'sparkplug.repl.main/exit-promise) :exit))
26 | 


--------------------------------------------------------------------------------